In [2]:
# Read data
import pandas as pd

train_data = pd.read_csv('train.csv')
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
from sklearn.model_selection import train_test_split

X = train_data.drop(['Survived'], axis=1)
X = X.drop(['Name', 'Ticket', 'Cabin'], axis=1)
y = train_data['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot_encoder', OneHotEncoder())
])

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())
])

preprocessing_pipeline = ColumnTransformer([
    ('cat_transformer', cat_pipeline, ['Sex', 'Embarked']),
], remainder=num_pipeline)

pipeline = Pipeline([
    ('preprocessing_pipeline', preprocessing_pipeline),
    ('model', None),
])


In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

param_grid = {
    'model': [RandomForestClassifier(), SVC(), LogisticRegression(), KNeighborsClassifier()],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END .....................model=RandomForestClassifier(); total time=   0.3s
[CV] END .....................model=RandomForestClassifier(); total time=   0.4s
[CV] END .....................model=RandomForestClassifier(); total time=   0.2s
[CV] END .....................model=RandomForestClassifier(); total time=   0.2s
[CV] END .....................model=RandomForestClassifier(); total time=   0.4s
[CV] END ........................................model=SVC(); total time=   0.0s
[CV] END ........................................model=SVC(); total time=   0.0s
[CV] END ........................................model=SVC(); total time=   0.0s
[CV] END ........................................model=SVC(); total time=   0.0s
[CV] END ........................................model=SVC(); total time=   0.1s
[CV] END .........................model=LogisticRegression(); total time=   0.0s
[CV] END .........................model=LogisticR

In [11]:
param_grid = {
    'model': [RandomForestClassifier()],
    'model__n_estimators': [10, 50, 100, 200],
    'model__max_depth': [5, 10, 20, 50],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

model = grid_search.best_estimator_

{'model': RandomForestClassifier(), 'model__max_depth': 10, 'model__n_estimators': 200}
0.8328277356446371


In [13]:
test_data = pd.read_csv('test.csv')
y_pred = model.predict(test_data)

submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': y_pred
})

submission.to_csv('submission.csv', index=False)