In [12]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [6]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [7]:
# factorisation de la colonne Sex
df['Sex'] = df['Sex'].factorize()[0]


# 1. Decision Tree avec train_test_split

In [11]:
# X, y:
y = df['Survived']
X = df.drop(['Survived', 'Name'], axis = 1)

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size= 0.75)

# modèle
modelDTC = DecisionTreeClassifier()
modelDTC.fit(X_train, y_train)

# score
print("Accuracy score on train set : ", modelDTC.score(X_train, y_train))
print("Accuracy score on test set", modelDTC.score(X_test, y_test))

Accuracy score on train set :  0.9834586466165414
Accuracy score on test set 0.7747747747747747


# 2. GridSearchCV

In [16]:
# dictionnaire de valeurs
dico = {'max_depth': range(1, 51),
        'min_samples_leaf': range(1,16),
        'min_samples_split': [2, 5, 7, 10, 15, 30]}

# modèle
grid = GridSearchCV(modelDTC, dico)
grid.fit(X,y)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': range(1, 51),
                         'min_samples_leaf': range(1, 16),
                         'min_samples_split': [2, 5, 7, 10, 15, 30]})

In [17]:
print('Best score:', grid.best_score_)
print('Best Params', grid.best_params_)

Best score: 0.830895702405891
Best Params {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 7}


Les meilleurs HP sont : max_depth = 5, min_samples_leaf = 1 et min_samples_split = 7. 

L'accuracy score avec ces params est de 0.83 

# 3. RandomizedSearch

In [18]:
randomz = RandomizedSearchCV(modelDTC, dico, n_iter= 200)
randomz.fit(X, y)

RandomizedSearchCV(estimator=DecisionTreeClassifier(), n_iter=200,
                   param_distributions={'max_depth': range(1, 51),
                                        'min_samples_leaf': range(1, 16),
                                        'min_samples_split': [2, 5, 7, 10, 15,
                                                              30]})

In [19]:
print('Best score:', randomz.best_score_)
print('Best Params', randomz.best_params_)

Best score: 0.8263759283945916
Best Params {'min_samples_split': 7, 'min_samples_leaf': 1, 'max_depth': 4}


Avec RandomizedSearch les meilleurs HP sont : max_depth = 4, min_samples_leaf = 1, et min_samples_split = 7

SEule la valeur de max_depth change mais le score ne varie pas beaucoup entre les deux tests.