## Building an Ensemble Predictor with RF and AdaBoost

In [18]:
import os
# Switch to the project folder so the local Titanic* modules and the relative
# 'Data/' output path resolve.
# Raw string: the path contains backslashes, and in a normal literal '\U'
# starts a unicode escape, which is a SyntaxError on Python 3.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
os.chdir(r'C:\Users\Lundi\Documents\Programming\Python\Kaggle\Titanic - 2015')

import TitanicPreprocessor as tp
import TitanicPredictor as tpred

import sklearn.ensemble as skl_ensemble
import sklearn.cross_validation as skl_cv
import sklearn.preprocessing as skl_pre
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Load the prepared data from the project's preprocessor module.
# X and y are used below to fit and cross-validate the models, X_real_test is
# the set predicted for the submission, and X_test_ids becomes the
# 'PassengerId' column of the submission file.
X, y, X_real_test, X_test_ids = tp.getData()

In [6]:
# Hold out 20% of the rows. random_state is pinned so the same rows land in the
# hold-out set on every run; unseeded, the split changes each time the cell runs.
# NOTE(review): X_train / X_test / y_train / y_test are not referenced by any
# other cell visible in this notebook.
X_train, X_test, y_train, y_test = skl_cv.train_test_split(
    X, y, test_size=0.2, random_state=0)

## Random Forest

In [19]:
# Random forest: 1000 entropy-criterion trees, each capped at depth 5 with at
# least 2 samples per leaf, considering 4 features at every split.
# random_state is pinned so repeated runs grow the same forest; without it the
# cross-validated scores and the final predictions drift from run to run.
rf_clf = skl_ensemble.RandomForestClassifier(
    max_features=4,
    n_estimators=1000,
    criterion='entropy',
    max_depth=5,
    min_samples_leaf=2,
    random_state=0
)

In [7]:
# Out-of-fold predictions for the random forest over 10 folds: each row of X is
# predicted by a clone of rf_clf fitted on the remaining folds.
y_pred_rf = skl_cv.cross_val_predict(
    estimator=rf_clf,
    X=X,
    y=y,
    cv=10
)

## AdaBoost

#### Stump

In [20]:
# Weak learner: a depth-1 entropy tree (a decision stump).
dt_stump_clf = DecisionTreeClassifier(criterion='entropy', max_depth=1, min_samples_leaf=1)
# Boost up to 2883 stumps with a small learning rate (values presumably tuned
# elsewhere; the notebook does not show how).
# random_state is set on the booster so repeated fits give the same ensemble;
# unseeded, results can differ between runs.
ada_dt_stump_clf = AdaBoostClassifier(
    base_estimator=dt_stump_clf,
    learning_rate=0.01,
    n_estimators=2883,
    random_state=0)

In [4]:
# Out-of-fold predictions for the boosted stumps over 10 folds.
# This is slow: the booster is refitted once per fold.
y_pred_ada_stump = skl_cv.cross_val_predict(
    estimator=ada_dt_stump_clf,
    X=X,
    y=y,
    cv=10
)

KeyboardInterrupt: 

#### Larger Tree

In [21]:
# Base learner: a depth-3 entropy tree, i.e. deeper than a single-split stump.
dt_clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=1)
# Boost up to 2766 of these trees with a very small learning rate (values
# presumably tuned elsewhere; the notebook does not show how).
# random_state is set on the booster so repeated fits give the same ensemble;
# unseeded, results can differ between runs.
ada_dt_clf = AdaBoostClassifier(
    base_estimator=dt_clf,
    learning_rate=0.001,
    n_estimators=2766,
    random_state=0)

In [9]:
# Out-of-fold predictions for the boosted depth-3 trees over 10 folds.
y_pred_ada_dt = skl_cv.cross_val_predict(
    estimator=ada_dt_clf,
    X=X,
    y=y,
    cv=10
)

## Combining Predictions

In [28]:
# Stack the three prediction vectors as labelled rows, then transpose so that
# each model becomes a column and each sample a row.
combined_predictions = pd.DataFrame(
    [y_pred_rf, y_pred_ada_stump, y_pred_ada_dt],
    index=['random_forest', 'ada_stump', 'ada_dt']
).T
combined_predictions

Unnamed: 0,random_forest,ada_stump,ada_dt
0,0,0,0
1,1,1,1
2,0,1,0
3,1,1,1
4,0,0,0
5,0,0,0
6,0,0,0
7,0,0,1
8,0,1,0
9,1,1,1


In [29]:
# Pairwise correlation between the models' predictions: the closer to 1, the
# less the models disagree and the less a vote between them can add.
combined_predictions.corr(method='pearson')

Unnamed: 0,random_forest,ada_stump,ada_dt
random_forest,1.0,0.830122,0.874387
ada_stump,0.830122,1.0,0.825108
ada_dt,0.874387,0.825108,1.0


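The three models' predictions are heavily correlated (pairwise correlations of about 0.83 to 0.87), so the ensemble most likely won't do significantly better than the individual models.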

In [30]:
def getAccuracies(combined_y, y_actual):
    """Score every model's predictions, and their majority vote, against the truth.

    Parameters
    ----------
    combined_y : pandas.DataFrame
        One column of predicted labels per model, one row per sample.
        The frame is not modified.
    y_actual : array-like
        True labels, compared element-wise with each column of ``combined_y``.

    Returns
    -------
    pandas.Series
        Fraction of predictions equal to ``y_actual`` for every column of
        ``combined_y``, followed by a ``'majority_vote'`` entry for the
        row-wise most frequent prediction.
    """
    # Score a copy: adding the vote column to the caller's frame would leak a
    # 'majority_vote' column into later cells and let a second call count the
    # previous vote as if it were another model.
    scored = combined_y.copy()
    # Most frequent label in each row; on a tie, the label value_counts lists first.
    scored['majority_vote'] = scored.apply(lambda row: row.value_counts().idxmax(), axis=1)
    return scored.apply(lambda col_predictions: np.mean(y_actual == col_predictions), axis=0)

In [31]:
# Accuracy of each model's out-of-fold predictions, and of their majority vote,
# measured against y.
getAccuracies(combined_y=combined_predictions, y_actual=y)

random_forest    0.833895
ada_stump        0.808081
ada_dt           0.823793
majority_vote    0.831650
dtype: float64

## Predicting unknowns

In [22]:
# Refit every model on all of (X, y) before predicting X_real_test.
ada_dt_clf.fit(X, y)
ada_dt_stump_clf.fit(X, y)
rf_clf.fit(X, y)

# Predictions for X_real_test get their own names. Reusing y_pred_rf,
# y_pred_ada_stump and y_pred_ada_dt here would overwrite the cross-validated
# arrays that the "Combining Predictions" cells read, so re-running those cells
# afterwards would silently work on the wrong data.
y_test_pred_ada_dt = ada_dt_clf.predict(X_real_test)
y_test_pred_ada_stump = ada_dt_stump_clf.predict(X_real_test)
y_test_pred_rf = rf_clf.predict(X_real_test)

# One column per model, one row per sample of X_real_test.
# combined_predictions keeps its name because the export cell reads it.
combined_predictions = pd.DataFrame([y_test_pred_rf, y_test_pred_ada_stump, y_test_pred_ada_dt]).T
combined_predictions.columns = ['random_forest', 'ada_stump','ada_dt']

# Final label per row: the value predicted by most of the three models.
combined_predictions['majority_vote'] = combined_predictions.apply(lambda row: row.value_counts().idxmax(), axis=1)

In [25]:
# The ensemble's answer is the majority-vote column.
y_pred_final = combined_predictions['majority_vote']

# Two-column submission frame: ids in the first column, predictions in the second.
test_output = pd.DataFrame([X_test_ids, y_pred_final]).T
test_output.columns = ['PassengerId','Survived']

# Relative path: resolved against the current working directory.
test_output.to_csv('Data/test_output.csv', index=False)

# The preview goes last: a cell only renders its final expression, so a
# head() call in the middle of the cell is evaluated and thrown away.
test_output.head()

Tied with top: Test Accuracy = 0.79426