# Adding second RF model to Stacked Generalization

To add a second model to the stacked generalization, I need a way to tune the second model against the error of the blend of the two models, rather than against the second model's individual error.

Let's create a second method using Random Forests:

In [1]:
import os

# Raw string: the backslashes of the Windows path are kept literally. A plain
# literal containing "\U" is a SyntaxError on Python 3 (unicode escape).
os.chdir(r'C:\Users\Lundi\Documents\Programming\Python\Kaggle\Titanic - 2015\Stacked Generalization')
# Imported after the chdir; presumably metaLearning.py lives in that directory
# and is found through the working directory -- confirm.
import metaLearning as meta

# Short alias for the metaLearning attribute of the project module.
meta_learning = meta.metaLearning

In [58]:
# Move to the project root so the relative 'Stacked Generalization/Data/...'
# paths used further down resolve. Raw string: "\U" in a plain literal is a
# SyntaxError on Python 3, and raw keeps every backslash literal.
os.chdir(r'C:\Users\Lundi\Documents\Programming\Python\Kaggle\Titanic - 2015')
# Alternative project location on a second (macOS) machine:
#os.chdir('/Users/alexsutherland/Documents/Programming/Python/Kaggle/Titanic---2015')

import TitanicPreprocessor as tp
import TitanicPredictor as tpred
import sklearn.ensemble as skl_ensemble
import sklearn.grid_search as skl_gs
import sklearn.cross_validation as skl_cv
import sklearn.linear_model as skl_lm
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data through the project preprocessor. Judging by the names these
# are training features, training labels, test features and test ids --
# confirm against TitanicPreprocessor.getData.
X, y, X_test, X_test_ids = tp.getData()
# Hold out 25% of the training data as the "probe" set; random_state=0 fixes
# the split so it is the same on every run.
X_train, X_probe, y_train, y_probe = skl_cv.train_test_split(X, y, test_size=0.25, random_state = 0)

## Creating a param grid for Random Forest

In [5]:
# Base random forest; later cells overwrite its hyper-parameters via set_params.
rf_clf = skl_ensemble.RandomForestClassifier()

# One grid of candidate hyper-parameters (2 * 2 * 4 * 1 * 5 = 80 combinations).
rf_param_grid = [
    {
        'n_estimators': [10, 100],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 1, 3, 5],
        'min_samples_leaf': [1],
        'max_features': ['auto', 2, 3, 4, 5],
    },
]

# Exhaustive search over the grid with 5-fold cross-validation.
gs_rf_clf = skl_gs.GridSearchCV(rf_clf, cv=5, param_grid=rf_param_grid)

In [6]:
# Run the grid search on the full (un-split) training data X, y.
# NOTE(review): the fitted search (e.g. its best parameters) is not read by
# any of the later cells shown here; the blend loop picks its own parameters.
gs_rf_clf.fit(X, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [10, 100], 'max_features': ['auto', 2, 3, 4, 5], 'criterion': ['gini', 'entropy'], 'max_depth': [None, 1, 3, 5], 'min_samples_leaf': [1]}],
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

## Blending with logistic regression model

Let's first create a loop that runs through the parameter grid and predicts with each model:

In [60]:
# Fit the forest once per parameter combination and score the probe set.
# Every pass overwrites y_rf_pred_probs, so only the predictions of the last
# combination are left once the loop finishes.
for params in skl_gs.ParameterGrid(rf_param_grid):
    rf_clf.set_params(**params).fit(X_train, y_train)
    # Column 1 of predict_proba is the probability of the second class
    # (presumably "survived" -- confirm the label encoding).
    y_rf_pred_probs = pd.DataFrame({
        'rf_pred_prob_survival': rf_clf.predict_proba(X_probe)[:, 1],
        'id': X_probe.index,
    })

Now, I need to combine the random forest predictions with the logistic regression predictions and predict with a logistic regression model:

In [62]:
# Probe-set probabilities written earlier by the logistic regression model.
log_reg_probe_pred_probs = pd.read_csv('Stacked Generalization/Data/log_reg_probe_pred.csv')

# Join both models' probabilities on the shared 'id' column, then use that
# column as the row index so only the two probability columns remain.
combined_pred_prob = (
    log_reg_probe_pred_probs
    .merge(y_rf_pred_probs, on='id')
    .set_index('id')
)
combined_pred_prob.head(2)

Unnamed: 0_level_0,lr_prob_survival,rf_pred_prob_survival
id,Unnamed: 1_level_1,Unnamed: 2_level_1
495,0.227339,0.166219
648,0.212925,0.112776


With both predictions, let's run a logistic regression model:

In [67]:
# Blender: logistic regression over the two models' probe-set probabilities.
lr_clf = skl_lm.LogisticRegression()
lr_clf.fit(combined_pred_prob, y_probe)

# 10-fold cross-validated score of the blender; report mean +/- std.
# NOTE(review): rows of combined_pred_prob are assumed to be in the same
# order as y_probe -- confirm the CSV was written in probe order.
blend_cv_scores = skl_cv.cross_val_score(lr_clf, combined_pred_prob, y_probe, cv=10)
print("%0.2f+/-%0.2f" % (blend_cv_scores.mean(), blend_cv_scores.std()))

0.83+/-0.05


Putting this all together:

In [76]:
# Search the random forest grid for the combination whose predictions blend
# best with the logistic regression predictions, scored by the blender's
# mean 10-fold cross-validation score on the probe set.
# NOTE(review): rf_clf appears to be unseeded (the earlier repr shows
# random_state=None), so the scores will vary between runs.
lr_clf = skl_lm.LogisticRegression()
blend_results_list = []

for params in skl_gs.ParameterGrid(rf_param_grid):
    # Base model: forest with this combination, fitted on the training split.
    rf_clf.set_params(**params).fit(X_train, y_train)
    y_rf_pred_probs = pd.DataFrame({
        'rf_pred_prob_survival': rf_clf.predict_proba(X_probe)[:, 1],
        'id': X_probe.index,
    })

    # Blender input: both models' probe probabilities, joined on 'id'.
    combined_pred_prob = (
        log_reg_probe_pred_probs
        .merge(y_rf_pred_probs, on='id')
        .set_index('id')
    )

    lr_clf.fit(combined_pred_prob, y_probe)
    blend_cv_scores = skl_cv.cross_val_score(lr_clf, combined_pred_prob, y_probe, cv=10)

    blend_results_list.append([blend_cv_scores.mean(), params])

# Column 0 holds the mean blend score, 'params' the forest parameters.
blend_results = pd.DataFrame(blend_results_list, columns=[0, 'params'])

### Generating probe predictions for RF

In [81]:
# Row of blend_results with the highest mean blend score (column 0).
# .loc replaces the deprecated .ix indexer (removed in pandas 1.0).
best_blend = blend_results.loc[blend_results[0].idxmax(), :]

# Fresh forest configured with the winning parameters. The row is addressed
# by its 'params' label instead of relying on best_blend[1] falling back to
# positional lookup on a label index.
best_rf_clf = skl_ensemble.RandomForestClassifier()
best_rf_clf.set_params(**best_blend['params'])

# Fit on the training split only, so the probe rows are not seen in training.
best_rf_clf.fit(X_train, y_train)


y_probe_pred = best_rf_clf.predict(X_probe)
y_probe_pred_probs = best_rf_clf.predict_proba(X_probe)

# Keep column 1 of predict_proba (probability of the second class), attach
# the probe row ids and write the result for the blending step.
y_pred_prob_survived = pd.DataFrame(pd.DataFrame(y_probe_pred_probs)[1])
y_pred_prob_survived['id'] = X_probe.index
y_pred_prob_survived.columns = ['rf_prob_survival','id']
y_pred_prob_survived.to_csv('Stacked Generalization/Data/rf_probe_pred.csv', index=False)

## Blending test predictions with RF

In [80]:
# Row of blend_results with the highest mean blend score (column 0).
# .loc replaces the deprecated .ix indexer (removed in pandas 1.0).
best_blend = blend_results.loc[blend_results[0].idxmax(), :]

# Fresh forest configured with the winning parameters. The row is addressed
# by its 'params' label instead of relying on best_blend[1] falling back to
# positional lookup on a label index.
best_rf_clf = skl_ensemble.RandomForestClassifier()
best_rf_clf.set_params(**best_blend['params'])

# For the test predictions the forest is refit on all of the training data.
best_rf_clf.fit(X, y)
y_pred = best_rf_clf.predict(X_test)
y_pred_probs = best_rf_clf.predict_proba(X_test)

# Keep column 1 of predict_proba (probability of the second class), attach
# the test ids and write the result for the blending step.
# NOTE(review): the column is named 'rf_perc_survival' here but
# 'rf_prob_survival' in the probe file -- confirm downstream readers expect it.
y_pred_prob_survived = pd.DataFrame(pd.DataFrame(y_pred_probs)[1])
y_pred_prob_survived['test_id'] = X_test_ids
y_pred_prob_survived.columns = ['rf_perc_survival','test_id']
y_pred_prob_survived.to_csv('Stacked Generalization/Data/rf_test_pred.csv', index=False)