In [156]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import VotingClassifier
import numpy as np

%matplotlib inline

# Datasets

In [141]:
# Load the prepared train / test tables.
train = pd.read_csv('../data/train_final.csv')
test = pd.read_csv('../data/test_final.csv')

target_column = "Survived"
id_column = "PassengerId"

# Model inputs: column name -> flag. The flags are later passed to
# OneHotEncoder as its `categorical_features` mask.
features = {
    "Pclass": True,
    "Fare": False,
    "TitleClass": True,
    "FamilySize": False,
    "AgeClass": True,
    "SexClass": True,
    "EmbarkedClass": True,
    "pctFamilySurvived": False,
    "nFamilySurvived": False,
    "nFamilyDied": False,
}

# Names and flags are both read in the dict's own iteration order, so
# position i of one always corresponds to position i of the other.
features_columns = list(features.keys())
categorical_features = tuple(features.values())

train_Y = train[target_column]
train_X = train[features_columns]

test_X = test[features_columns]

# Gerando evaluation set

In [149]:
# Split the training labels into 4 stratified folds and take the first
# (train, test) index pair: the test part becomes the evaluation set, the
# train part becomes the development set.
eval_skf = StratifiedKFold(train_Y, n_folds=4, random_state=1)
dev_ixs, eval_ixs = next(iter(eval_skf))

In [150]:
# StratifiedKFold yields integer *positions*, so rows are selected with .iloc.
# .loc would treat the numbers as index labels, which only matches while the
# frame keeps the default 0..n-1 index produced by read_csv.
eval_X = train_X.iloc[eval_ixs, :]
eval_Y = train_Y.iloc[eval_ixs]

dev_X = train_X.iloc[dev_ixs, :]
dev_Y = train_Y.iloc[dev_ixs]

# treinando

In [157]:
# Pre-processing: one-hot encode the columns flagged in categorical_features,
# then keep the 10 best-scoring columns.
enc = OneHotEncoder(sparse=False, categorical_features=categorical_features)
anova_filter = SelectKBest(f_regression, k=10)

# Two random forests combined by a voting ensemble; each forest gets its own
# hyper-parameter ranges in the grid below.
rf1 = RandomForestClassifier(random_state=1)
rf2 = RandomForestClassifier(random_state=1)
voting = VotingClassifier([("rf1", rf1), ("rf2", rf2)])

# Cross-validation folds for the search, built from the development labels.
skf = StratifiedKFold(dev_Y, random_state=1)

anova_rf = Pipeline([
    ('hot', enc),
    ('anova', anova_filter),
    ('voting', voting),
])

# Grid keys are resolved starting from the pipeline, so each forest has to be
# addressed as <pipeline step>__<estimator name>__<parameter>. The previous
# keys ("rf1__...", "rf2__...") omitted the "voting" step and therefore did
# not name any parameter of the pipeline.
param_grid = {
    "voting__rf1__n_estimators": [100],
    "voting__rf1__min_samples_split": [10],
    "voting__rf1__min_samples_leaf": [10],
    "voting__rf1__criterion": ["entropy"],
    "voting__rf2__n_estimators": [10, 50],
    "voting__rf2__min_samples_split": [3, 5],
    "voting__rf2__min_samples_leaf": [3, 5],
    "voting__rf2__criterion": ["gini"],
}

cv = GridSearchCV(anova_rf, param_grid=param_grid, cv=skf)

cv.fit(dev_X, dev_Y)

ValueError: dictionary update sequence element #0 has length 0; 2 is required

# analisando desempenho no evaluation set

In [153]:
# Per-class precision / recall / F1 of the fitted search on the held-out
# evaluation rows. print() is kept so the multi-line report renders as a table.
print(classification_report(y_true=eval_Y, y_pred=cv.predict(eval_X)))

             precision    recall  f1-score   support

          0       0.98      0.97      0.97       138
          1       0.95      0.97      0.96        86

avg / total       0.97      0.97      0.97       224



# predizendo

In [154]:
# The search object was created with folds built from dev_Y. Those folds only
# index the first len(dev_Y) positions and are stratified on the development
# labels, so they do not describe the full training set. Rebuild the folds
# from train_Y before searching again on all training rows.
cv.set_params(cv=StratifiedKFold(train_Y, random_state=1))
cv.fit(train_X, train_Y)

GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[0 0 ..., 0 0], n_folds=3, shuffle=False, random_state=1),
       error_score='raise',
       estimator=Pipeline(steps=[('hot', OneHotEncoder(categorical_features=(False, True, True, False, False, False, True, True, False, True),
       dtype=<class 'float'>, handle_unknown='error', n_values='auto',
       sparse=False)), ('anova', SelectKBest(k=10, score_func=<function f_regression at 0x0000001B67477EA0>)), (...estimators=10, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'rf__criterion': ['gini', 'entropy'], 'rf__min_samples_split': [3, 5, 10], 'rf__n_estimators': [50, 100], 'rf__min_samples_leaf': [3, 5, 10], 'rf__class_weight': [{0: 1, 1: 1}, {0: 4, 1: 1}]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [155]:
# Predict the target for the test rows with the refitted search.
predictions = cv.predict(test_X)

# Pin the column order (id first, prediction second). Without `columns=` the
# order comes from the dict and differs between Python / pandas versions
# (alphabetical in older pandas, insertion order in newer ones).
submission = pd.DataFrame(
    {id_column: test[id_column], target_column: predictions},
    columns=[id_column, target_column],
)

# index=False: the row index is not part of the submission file.
submission.to_csv("../submissions/Testando algumas tools do scikit v6.csv", index=False)