In [14]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import classification_report

In [126]:
# Read both CSV files, using the PassengerId column as the row index of each.
csv_options = {'index_col': 'PassengerId'}
train = pd.read_csv('../data/train.csv', **csv_options)
test = pd.read_csv('../data/test.csv', **csv_options)

In [127]:
# Columns selected as model inputs; they are passed through pd.get_dummies
# before being handed to the classifiers.
features = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Sex',
]

In [128]:
# Fill values are computed from the training set only and reused for the test
# set, so no test-set statistics are used when filling gaps.
age_imputation = train.Age.mean()
fare_imputation = train.Fare.mean()

# train set
# Assign the filled column back rather than calling fillna(inplace=True) on the
# attribute-accessed column: the in-place form operates on an intermediate
# object, which pandas warns about and which leaves the frame unchanged when
# Copy-on-Write is enabled.
train['Age'] = train['Age'].fillna(age_imputation)

train_X = pd.get_dummies(train[features])
train_Y = train.Survived

# test set
test['Age'] = test['Age'].fillna(age_imputation)
test['Fare'] = test['Fare'].fillna(fare_imputation)

# One-hot encode the test features, then align them to the training columns:
# the two frames are encoded independently, so a category level present in only
# one of them would otherwise yield a column set the fitted models cannot accept.
test_X = pd.get_dummies(test[features]).reindex(columns=train_X.columns, fill_value=0)

In [64]:
# Two independent random forests, each with its own fixed seed: one for
# passengers in classes 1 and 2, one for passengers in class 3.
rf_12 = RandomForestClassifier(random_state=55, n_jobs=-1)
rf_3 = RandomForestClassifier(random_state=44, n_jobs=-1)

# Hyper-parameter grids. 'sqrt' replaces the former 'auto' value: for a
# classifier 'auto' means sqrt(n_features), and the 'auto' spelling was
# deprecated in scikit-learn 1.1 and removed in 1.3, whereas 'sqrt' is accepted
# by every version.
pg_12 = {'n_estimators': [100, 250],
         'min_samples_split': [4, 8],
         'max_features': [None, 'sqrt']}

pg_3 = {'n_estimators': [500, 1000],
        'min_samples_split': [4, 8],
        'max_features': [None, 'sqrt']}

# Exhaustive search over each grid, using GridSearchCV's default
# cross-validation and scoring.
cv_12 = GridSearchCV(rf_12, pg_12)
cv_3 = GridSearchCV(rf_3, pg_3)

# Boolean row masks that split the training data by passenger class.
is_Pclass_12 = train_X.Pclass.isin([1, 2])
is_Pclass_3 = (train_X.Pclass == 3)
train_X_12 = train_X[is_Pclass_12]
train_X_3 = train_X[is_Pclass_3]
train_Y_12 = train_Y[is_Pclass_12]
train_Y_3 = train_Y[is_Pclass_3]

# Fit each search on its own subset only.
cv_12.fit(train_X_12, train_Y_12)
cv_3.fit(train_X_3, train_Y_3)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=44, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [500, 1000], 'min_samples_split': [4, 8], 'max_features': [None, 'auto']},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [68]:
# In-sample predictions: each grid-searched model scores the same training
# rows it was fitted on.
pred_train_12, pred_train_3 = (
    cv_12.predict(train_X_12),
    cv_3.predict(train_X_3),
)

In [93]:
# Pair each training prediction with its passenger id, one frame per class
# group, indexed by PassengerId.
result_12 = pd.DataFrame(
    {'PassengerId': train[is_Pclass_12].index, 'Survived': pred_train_12}
).set_index(['PassengerId'])
result_3 = pd.DataFrame(
    {'PassengerId': train[is_Pclass_3].index, 'Survived': pred_train_3}
).set_index(['PassengerId'])

In [109]:
# Stack the class 1-2 rows on top of the class 3 rows in a single frame.
result = pd.concat([result_12, result_3], axis=0)

In [111]:
# Attach the true labels next to the predictions; the Series is aligned to the
# frame on its PassengerId index.
result = result.assign(train_Y=train_Y)

In [114]:
# Precision / recall / F1 of the predictions against the true labels.
# NOTE: `result` holds predictions for the rows the models were trained on,
# so these figures are not an out-of-sample estimate.
report = classification_report(result['train_Y'], result['Survived'])
print(report)

             precision    recall  f1-score   support

          0       0.92      0.97      0.94       549
          1       0.94      0.86      0.90       342

avg / total       0.93      0.92      0.92       891



## Submission

In [129]:
# Class-membership masks for the test passengers. These rebind the mask names
# that were previously built from the training data.
is_Pclass_3 = test.Pclass.eq(3)
is_Pclass_12 = test.Pclass.isin([1, 2])

# Encoded test features, split by the same class grouping used for training.
test_X_3 = test_X.loc[is_Pclass_3]
test_X_12 = test_X.loc[is_Pclass_12]

In [130]:
# Each test subset is scored by the model fitted on the matching class group.
pred_test_12, pred_test_3 = (
    cv_12.predict(test_X_12),
    cv_3.predict(test_X_3),
)

In [131]:
# Pair each test prediction with its passenger id, one frame per class group,
# indexed by PassengerId.
result_12 = pd.DataFrame(
    {'PassengerId': test[is_Pclass_12].index, 'Survived': pred_test_12}
).set_index(['PassengerId'])
result_3 = pd.DataFrame(
    {'PassengerId': test[is_Pclass_3].index, 'Survived': pred_test_3}
).set_index(['PassengerId'])

In [132]:
# Combine both class groups into the final prediction table (class 1-2 rows
# first, then class 3 rows).
result = pd.concat([result_12, result_3], axis=0)

In [134]:
# Write the predictions (PassengerId index plus Survived column) to the
# submission file.
submission_path = '../submissions/Simples3.csv'
result.to_csv(submission_path)