In [11]:
import numpy as np
import pandas as pd
from sklearn import cross_validation, datasets, svm
from sklearn import cross_validation, tree, metrics
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble.weight_boosting import AdaBoostClassifier
from sklearn.externals.six.moves import xrange
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.grid_search import GridSearchCV
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


In [12]:
# Load proc_data.csv and cast its 'id' column to integers in one chained expression.
dataset = pd.read_csv('proc_data.csv').assign(id=lambda frame: frame['id'].astype(int))

In [13]:
# Partition the combined table into two subsets using the 'type' marker column.
is_train_row = dataset['type'] == 'train'
is_test_row = dataset['type'] == 'test'
train = dataset[is_train_row]
test = dataset[is_test_row]

In [14]:
# Show the number of non-null entries in every column of the training subset;
# columns whose count is below the row count contain missing values.
train.count()

age              891
cabin            204
class            891
embarked         889
fare             891
id               891
name             891
parch            891
sex              891
sibsp            891
survived         891
ticket           891
type             891
surname          891
title            891
embarked_at_C    891
embarked_at_Q    891
embarked_at_S    891
female           891
male             891
class_1.0        891
class_2.0        891
class_3.0        891
in_cabin         891
family_size      891
family_id        891
family_id_int    891
Master           891
Miss             891
Mr               891
Mrs              891
Rev              891
farebin          891
dtype: int64

In [15]:
# Column names used as model inputs, grouped by theme for readability.
features = [
    'age', 'class', 'fare', 'family_size',
    'embarked_at_C', 'embarked_at_Q', 'embarked_at_S',
    'female',
    'class_1.0', 'class_2.0', 'class_3.0',
    'in_cabin',
    'Master', 'Miss', 'Mrs',
]

In [16]:
# Target vector and feature matrix for the training rows, as plain NumPy arrays.
y = train['survived'].values
X = train.loc[:, features].values

In [17]:
# Feature matrix for the test rows, built from the same `features` column list.
X_test = test.loc[:, features].values

# Keep the ids as a one-column frame with a fresh 0..n-1 index so they can be
# aligned positionally with prediction arrays later on.
test_ids = test[['id']].reset_index(drop=True)

In [31]:
# Fit a random forest on the full training set and report two accuracy figures.
rnd = RandomForestClassifier(
    n_estimators=1000,
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=3,
    oob_score=True,     # also compute the out-of-bag accuracy estimate
    random_state=100,   # fixed seed so the fit is repeatable
)
rnd.fit(X, y)

# Accuracy on the very rows the model was trained on (an optimistic figure).
# print() is called as a function so the cell runs on Python 2 and Python 3
# and matches the print(...) style used elsewhere in this notebook.
y_predicted = rnd.predict(X)
print(accuracy_score(y, y_predicted))

# Out-of-bag accuracy: each row is scored only by trees that did not see it.
print(rnd.oob_score_)

0.849607182941
0.824915824916


In [28]:
# Predict the test rows with the fitted forest and write the submission file.
y_test_predicted = rnd.predict(X_test)

# Integer labels in a named column; the default 0..n-1 index lines up with test_ids.
predictions = pd.DataFrame({'Survived': y_test_predicted}).astype(int)

# Rename the id column to the submission header and attach the labels by index.
submission = test_ids.rename(columns={'id': 'PassengerId'}).join(predictions)
submission.to_csv('rf md 1000.csv', sep=',', index=False)

In [21]:
%%time
rf = RandomForestClassifier(oob_score = True)
params = {
    'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 15],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [1, 2, 3, 4, 5],
    'random_state': [3, 4, 5, 6, 7],
    'n_estimators':[1000]
}
gs = GridSearchCV(rf, params, scoring='accuracy', n_jobs=-1)
gs.fit(X, y)

CPU times: user 8.86 s, sys: 1.57 s, total: 10.4 s
Wall time: 10min 40s


In [23]:
# Echo the fitted search object so its configuration is displayed.
# To list the evaluated parameter combinations from best to worst instead,
# uncomment the following line:
#sorted(gs.grid_scores_, key=lambda x: x.mean_validation_score, reverse=True)
gs

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid={'min_samples_split': [1, 2, 3, 4, 5], 'n_estimators': [1000], 'random_state': [3, 4, 5, 6, 7], 'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 15], 'min_samples_leaf': [1, 2, 3, 4, 5]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='accuracy', verbose=0)

**Gradient Boosting Classifier**

In [32]:
from sklearn.ensemble import GradientBoostingClassifier

# Baseline gradient-boosting model. With subsample < 1 and max_features < 1
# every fit draws random rows and columns, so the seed is fixed to make the
# cross-validation scores below repeatable from run to run.
gb = GradientBoostingClassifier(
    learning_rate=0.1,
    max_features=0.5,
    n_estimators=1000,
    max_depth=3,
    subsample=0.5,
    random_state=100,
)

# 5-fold cross-validated accuracy, fitted on 4 worker processes.
scores = cross_validation.cross_val_score(gb, X, y, cv=5, n_jobs=4, scoring='accuracy')
print("Gradient Boosted Trees CV scores:")
print("min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(scores.min(), scores.mean(), scores.max()))

Gradient Boosted Trees CV scores:
min: 0.793, mean: 0.825, max: 0.859


**Grid Search to find optimal parameters**

In [33]:
%%time
gb = GradientBoostingClassifier()
params = {
    'learning_rate': [0.05, 0.1, 0.5, 0.8, 1],
    'max_features': [0.5, 1],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'subsample':[0.2, 0.5, 0.8, 1],
    'n_estimators':[100, 500]
}
gs = GridSearchCV(gb, params, cv=5, scoring='accuracy', n_jobs=4)
gs.fit(X, y)

CPU times: user 4.67 s, sys: 921 ms, total: 5.59 s
Wall time: 3min 54s


In [35]:
# Rank every evaluated parameter combination by mean validation score, best first.
ranked = list(gs.grid_scores_)
ranked.sort(key=lambda entry: entry.mean_validation_score, reverse=True)
ranked

[mean: 0.83838, std: 0.02130, params: {'max_features': 0.5, 'n_estimators': 500, 'learning_rate': 0.05, 'max_depth': 3, 'subsample': 0.8},
 mean: 0.83838, std: 0.02162, params: {'max_features': 1, 'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 4, 'subsample': 0.5},
 mean: 0.83726, std: 0.02931, params: {'max_features': 1, 'n_estimators': 100, 'learning_rate': 0.05, 'max_depth': 3, 'subsample': 0.8},
 mean: 0.83726, std: 0.02393, params: {'max_features': 1, 'n_estimators': 500, 'learning_rate': 0.05, 'max_depth': 3, 'subsample': 0.5},
 mean: 0.83726, std: 0.02996, params: {'max_features': 0.5, 'n_estimators': 100, 'learning_rate': 0.05, 'max_depth': 5, 'subsample': 0.2},
 mean: 0.83726, std: 0.02535, params: {'max_features': 0.5, 'n_estimators': 100, 'learning_rate': 0.05, 'max_depth': 5, 'subsample': 0.8},
 mean: 0.83614, std: 0.01839, params: {'max_features': 0.5, 'n_estimators': 500, 'learning_rate': 0.05, 'max_depth': 3, 'subsample': 1},
 mean: 0.83614, std: 0.01759, params

**Run the model using the hyperparameters recommended by the grid search**

In [None]:
# Final gradient-boosting model. The settings are the top-ranked combination
# reported by the grid search (mean CV accuracy 0.83838): learning_rate=0.05,
# max_features=0.5, n_estimators=500, max_depth=3, subsample=0.8.
# The seed is fixed so the submission file can be regenerated exactly.
gb = GradientBoostingClassifier(
    learning_rate=0.05,
    max_features=0.5,
    n_estimators=500,
    max_depth=3,
    subsample=0.8,
    random_state=100,
)
gb.fit(X, y)

# Predict the test rows and pair each label with its passenger id. Both frames
# carry a 0..n-1 index, so the column-wise concat aligns them row by row.
y_test_predicted = gb.predict(X_test)
predictions = pd.DataFrame(y_test_predicted)
submission = pd.concat([test_ids, predictions], axis=1)
submission.columns = ['PassengerId', 'Survived']
submission['Survived'] = submission['Survived'].astype(int)
submission.to_csv('submission13.csv', sep=',', index=False)