In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np

%matplotlib inline

# Datasets

In [2]:
# Load the engineered training table (the following cell reads this same file again together with the test table).
train = pd.read_csv('../data/train_final.csv')

In [34]:
# Load the engineered train/test tables.
train = pd.read_csv('../data/train_final.csv')
test = pd.read_csv('../data/test_final.csv')

target_column = "Survived"
id_column = "PassengerId"

# Model input columns, each mapped to a flag that is later handed to the
# encoder as its `categorical_features` mask (True = treat as categorical).
features = dict(
    Pclass=True,
    Sex=True,
    AgeImputedByPclassAndSex=False,
    SibSp=False,
    Parch=False,
    Fare=False,
    FamilySize=False,
    EmbarkedImputedByPclassAndSex=True,
    # Title=True,  -- removed -> impute?
    NDiedPerFamilyNamePclass=False,
    NSurvivedPerFamilyNamePclass=False,
    DiffSurvivedDiedPerFamilyNamePclass=False,
    AgeClassPerCut4Bins=True,
    AgeClassPerQCut4Quartiles=True,
    AgeClassPerFaixaEtaria=True,
    AgeClassPerChuto=True,
)
# keys() and values() of an unmodified dict iterate in the same order, so the
# column names and their flags stay aligned position by position.
features_columns = list(features.keys())
categorical_features = tuple(features.values())

train_Y = train[target_column]
train_X = pd.get_dummies(train[features_columns])

# get_dummies only creates indicator columns for the categories it actually
# sees, so encoding train and test independently can produce different column
# sets. Force the test matrix onto the training layout: indicator columns
# missing from test are added as all-zero, and columns unknown to the model
# are dropped.
test_X = pd.get_dummies(test[features_columns]).reindex(
    columns=train_X.columns, fill_value=0
)

# Gerando evaluation set

In [35]:
# Split the training rows into four stratified folds and keep only the first
# split: its test part becomes the evaluation set, the rest the development set.
eval_skf = StratifiedKFold(train_Y, n_folds=4, random_state=1)
dev_ixs, eval_ixs = next(iter(eval_skf))

In [36]:
# The fold indices produced by StratifiedKFold are row positions, not index
# labels, so select with .iloc. With the default RangeIndex this picks the same
# rows as .loc did, and it stays correct if the frame is ever re-indexed,
# filtered or shuffled upstream.
eval_X = train_X.iloc[eval_ixs, :]
eval_Y = train_Y.iloc[eval_ixs]

dev_X = train_X.iloc[dev_ixs, :]
dev_Y = train_Y.iloc[dev_ixs]

# treinando

In [71]:
# Grid-search a random forest over tree count, split/leaf size limits and the
# split criterion, scoring each combination with 3-fold cross-validation on
# the development rows.
param_grid = {
    "n_estimators": [50, 100, 200],
    "min_samples_split": [3, 5, 10],
    "min_samples_leaf": [3, 5, 10],
    "criterion": ['gini', 'entropy'],
}
rf = RandomForestClassifier(random_state=1)

cv = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3)

cv.fit(dev_X, dev_Y)

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [50, 100, 200], 'min_samples_split': [3, 5, 10], 'criterion': ['gini', 'entropy'], 'min_samples_leaf': [3, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [60]:
# Categorical columns are currently expanded with pandas get_dummies, so the
# encoder below is built but left out of the pipeline.
# TODO: move the transformation functions from the previous step into the pipeline.
enc = OneHotEncoder(sparse=False, categorical_features=categorical_features)

# Univariate filter keeping the 20 best-scoring columns.
# NOTE(review): f_regression is a regression score function while the target
# here is a class label; f_classif may be the intended scorer -- confirm.
anova_filter = SelectKBest(f_regression, k=20)

# Base learners for the ensemble; all share the same seed.
rf = RandomForestClassifier(random_state=1)
rf1 = RandomForestClassifier(random_state=1)
rf2 = RandomForestClassifier(random_state=1)
gb = GradientBoostingClassifier(random_state=1)
voting = VotingClassifier(estimators=[("rf1", rf1), ("rf2", rf2), ("gb", gb)])

# Stratified folds over the development labels, used as the grid-search CV.
skf = StratifiedKFold(dev_Y, random_state=1)

# Feature selection followed by the voting ensemble (two forests + one GBM).
anova_2rf1gb = Pipeline(steps=[
    # ('hot', enc),
    ('anova', anova_filter),
    ('voting', voting),
])

# rf1 is pinned to a single configuration; rf2 and gb are searched.
param_grid_2rf1gb = {
    "voting__rf1__n_estimators": [100],
    "voting__rf1__min_samples_split": [10],
    "voting__rf1__min_samples_leaf": [10],
    "voting__rf1__criterion": ["entropy"],
    "voting__rf2__n_estimators": [10, 50],
    "voting__rf2__min_samples_split": [3, 5],
    "voting__rf2__min_samples_leaf": [3, 5],
    "voting__rf2__criterion": ["gini"],
    "voting__gb__learning_rate": [0.1, 0.05],
    "voting__gb__n_estimators": [100, 500],
}

cv_2rf1gb = GridSearchCV(
    estimator=anova_2rf1gb,
    param_grid=param_grid_2rf1gb,
    cv=skf,
)

cv_2rf1gb.fit(dev_X, dev_Y)

GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[0 0 ..., 1 0], n_folds=3, shuffle=False, random_state=1),
       error_score='raise',
       estimator=Pipeline(steps=[('anova', SelectKBest(k=20, score_func=<function f_regression at 0x00000061E45D6EA0>)), ('voting', VotingClassifier(estimators=[('rf1', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
           ...subsample=1.0, verbose=0,
              warm_start=False))],
         voting='hard', weights=None))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'voting__rf2__criterion': ['gini'], 'voting__rf1__min_samples_split': [10], 'voting__gb__n_estimators': [100, 500], 'voting__rf2__min_samples_leaf': [3, 5], 'voting__rf2__n_estimators': [10, 50], 'voting__rf1__n_estimators': [100], 'voting__rf1__min_samples_leaf': [10], 'voting__rf2__min_samples_split': [3, 5], 'voting__rf1__criterion': ['entropy'], 'vot

# analisando desempenho no evaluation set

In [72]:
# Per-class precision / recall / F1 of the grid-searched random forest (`cv`) on the held-out evaluation rows.
# NOTE(review): `cv` is later refit on the full training set; re-running this cell after that refit
# would score rows the model was trained on -- keep execution order in mind.
print(classification_report(eval_Y, cv.predict(eval_X)))

             precision    recall  f1-score   support

          0       0.96      0.99      0.98       138
          1       0.99      0.94      0.96        86

avg / total       0.97      0.97      0.97       224



In [73]:
# Hyper-parameter combination selected by the random-forest grid search.
cv.best_params_

{'criterion': 'gini',
 'min_samples_leaf': 3,
 'min_samples_split': 3,
 'n_estimators': 100}

# predizendo

In [74]:
# Re-run the grid search on the whole training set (development + evaluation rows) before predicting on test.
cv.fit(train_X, train_Y)

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [50, 100, 200], 'min_samples_split': [3, 5, 10], 'criterion': ['gini', 'entropy'], 'min_samples_leaf': [3, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [75]:
# Impute missing test fares with the mean fare of the training set.
# Assign the result back explicitly: an in-place fillna on the `test_X.Fare`
# accessor is a chained mutation that is not guaranteed to reach `test_X`
# (it is a silent no-op under pandas copy-on-write).
test_X["Fare"] = test_X["Fare"].fillna(train_X["Fare"].mean())

In [76]:
# Predict the target for every test row with the refit grid-search model.
predictions = cv.predict(test_X)

# Build the submission table with an explicit column order (id first, then the
# prediction) so the CSV layout does not depend on dict ordering or on the
# installed pandas version.
submission = pd.DataFrame(
    {id_column: test[id_column], target_column: predictions},
    columns=[id_column, target_column],
)

submission.to_csv("../submissions/Migrando do PDI pro python v2.csv", index=False)