In [15]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

In [5]:
# Read the training data from 'train.csv' (resolved relative to the working directory) into `data`.
data = pd.read_csv('train.csv')

In [8]:
# Preview three randomly drawn rows. No seed is passed, so the rows shown differ between runs.
data.sample(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
417,418,1,2,"Silven, Miss. Lyyli Karoliina",female,18.0,0,2,250652,13.0,,S
766,767,0,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C
803,804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C


In [66]:
# Preprocessing + model pipeline.
#
# The step names used below ('preprocessor', 'num', 'cat', 'imputer', 'scaler',
# 'onehot', 'classifier') are part of this cell's interface: grid-search keys such
# as 'preprocessor__num__imputer__strategy' are built from them, so do not rename.

# Numeric columns: fill missing values with the column median, then standardise.
numeric_features = ['Age', 'Fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: replace missing values with the literal label 'missing',
# then one-hot encode. Categories unseen during fit are ignored at transform time
# instead of raising.
categorical_features = ['Embarked', 'Sex', 'Pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer. Columns that are not
# listed in either group are not passed to any transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full estimator: preprocessing followed by a random forest.
# random_state is fixed so that the fitted forest, the hyper-parameter search
# built on top of it and the resulting predictions are reproducible across runs.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [64]:
# Feature frame: every column except the row identifier, the target and the
# free-text passenger name.
X = data.drop(columns=['PassengerId', 'Survived', 'Name'])

# Target vector: the survival label.
y = data['Survived']

In [67]:
# Hyper-parameter grid. Keys follow the '<step>__<param>' convention and address
# the named steps of the `clf` pipeline.
# 2 * 3 * 2 * 7 = 84 candidates, each evaluated on 10 folds -> 840 pipeline fits
# (plus the final refit), with forests of up to 700 trees.
params = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'classifier__n_estimators': [200, 500, 700],
    'classifier__max_features': ['sqrt', None],
    'classifier__max_depth': [None, 7, 6, 5, 4, 3, 2],
}

# scoring is spelled out so the meaning of best_score_ is explicit (accuracy is
# also what the classifier's default scorer reports).
# n_jobs=-1 spreads the independent fits over all available cores; it changes
# how long the search takes, not which candidates are evaluated.
grid_search = GridSearchCV(clf, params, cv=10, scoring='accuracy', n_jobs=-1)
grid_search.fit(X, y)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                   

In [68]:
# Display the best score found by the grid search.
# NOTE(review): no `scoring` is passed to GridSearchCV, so this is presumably the
# mean cross-validated accuracy of the winning candidate — confirm.
grid_search.best_score_

0.8271604938271605

In [73]:
# Load the unlabeled data to predict on.
data_pred = pd.read_csv('test.csv')

# Build the prediction frame with the same non-feature columns removed as for the
# training frame (identifier and name). 'SibSp', 'Parch' and 'Cabin' are kept
# because the training frame keeps them too: the fitted ColumnTransformer then
# sees the same columns at predict time as at fit time. Dropping extra columns
# here handed the pipeline fewer columns than it was fitted on, which some
# scikit-learn releases reject.
# NOTE(review): assumes test.csv has no 'Survived' column (training also drops it) — confirm.
X_pred = data_pred.drop(['PassengerId', 'Name'], axis=1)

In [74]:
# Predict a label for every row of X_pred with the fitted grid search.
y_pred = grid_search.predict(X_pred)

In [75]:
# Build the submission table: one row per test passenger with its predicted label.

# Take the ids from the loaded test frame instead of hard-coding a numeric range,
# so each prediction stays paired with the id of the row it was computed from.
PassengerId = data_pred['PassengerId']

result_df = pd.DataFrame({
    'PassengerId': PassengerId,
    'Survived': y_pred,
}).astype(int)  # both columns are written as plain integers

# Write without the index so the file contains only the two named columns.
result_df.to_csv('submissions.csv', index=False)

# Show a short preview rather than the whole table.
result_df.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [72]:
# Display the pipeline configuration selected by the grid search.
grid_search.best_estimator_

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             