In [15]:
# This is a proof of concept of sklearn pipelines. It's a very useful way to make the code clear and
# understandable. It was inspired by the tutorial:
# https://medium.com/data-hackers/como-usar-pipelines-no-scikit-learn-1398a4cc6ae9

In [None]:
#Importing Libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

In [4]:
# Load the dataset from train.csv and discard the columns that are not relevant
# for the model (Name, Ticket, Cabin).
df = pd.read_csv("train.csv").drop(columns=["Name", "Ticket", "Cabin"])

# Split into train and test sets: 20% of the rows are held out for testing, and
# the fixed random_state keeps the split reproducible. 'Survived' is the target.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Survived']),
    df['Survived'],
    test_size=0.2,
    random_state=42,
)

In [6]:
# Build the pipeline. It chains three steps, applied in this order:
#   1. one-hot encoding,
#   2. replacing missing data with the column mean,
#   3. a decision tree classifier limited to depth 3 (seeded for reproducibility).
# The step names are referenced later when tuning (e.g. 'tree__max_depth').
model = Pipeline([
    ('one-hot encoder', OneHotEncoder()),
    ('imputer', SimpleImputer(strategy='mean')),
    ('tree', DecisionTreeClassifier(max_depth=3, random_state=0)),
])

In [8]:
# Run the pipeline: fit all of its steps on the training split
model.fit(X=X_train, y=y_train)

In [17]:
# Score the fitted pipeline on the data it was trained on and on the held-out data
train_score = model.score(X=X_train, y=y_train)
test_score = model.score(X=X_test, y=y_test)

# Report both scores, one per line
print(
    "Train score: {}".format(train_score),
    "Test score: {}".format(test_score),
    sep="\n",
)

Train score: 0.8342696629213483
Test score: 0.7988826815642458


In [18]:
# Tune the tree depth with 5-fold cross-validation over the whole pipeline.
# The search is fitted on the training split only: fitting it on the full
# DataFrame would let the rows held out in X_test / y_test influence the choice
# of hyperparameters, so they could no longer give an unbiased final evaluation.
parameters = {'tree__max_depth': [3, 4, 5]}
kfold = KFold(n_splits=5, shuffle=True, random_state=42)  # shuffled, seeded folds
grid = GridSearchCV(model, param_grid=parameters, cv=kfold, n_jobs=-1)
grid.fit(X=X_train, y=y_train)

# Best parameters found by the search
grid.best_params_

{'tree__max_depth': 3}