In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [4]:
# Chargement des données
data = pd.read_csv('data/titanic.csv')
data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [5]:
# Sélection des caractéristiques et de la cible
features = ['Pclass', 'Sex', 'Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']
target = 'Survived'

# Conversion de la colonne 'Sex' en valeurs numériques
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})

# Gestion des valeurs manquantes (par exemple, remplacer les NaN dans 'Age' par la médiane)
data['Age'].fillna(data['Age'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(data['Age'].median(), inplace=True)


In [6]:
data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,0,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,1,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,1,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,1,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,0,35.0,0,0,8.05


In [7]:
# Séparation des données en ensembles d'entraînement et de test
X = data[features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Création et entraînement du modèle Decision Tree
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Prédiction sur l'ensemble de test
y_pred = clf.predict(X_test)

# Calcul de l'accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy score: {accuracy:.4f}")

Accuracy score: 0.7416


In [9]:
from sklearn.model_selection import GridSearchCV

# Définition des hyperparamètres à tester
param_grid = {
    'max_depth': list(range(1, 51)),
    'min_samples_leaf': list(range(1, 16)),
    'min_samples_split': [2, 5, 7, 10, 15, 30]
}

# Création de l'objet GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Lancement de la recherche
grid_search.fit(X_train, y_train)

# Meilleurs paramètres et meilleur score
best_params_grid = grid_search.best_params_
best_score_grid = grid_search.best_score_
print(f"Meilleurs paramètres (GridSearch): {best_params_grid}")
print(f"Meilleur score (GridSearch): {best_score_grid:.4f}")

Meilleurs paramètres (GridSearch): {'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 2}
Meilleur score (GridSearch): 0.8307


In [10]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Définition des hyperparamètres à tester
param_dist = {
    'max_depth': randint(1, 51),
    'min_samples_leaf': randint(1, 16),
    'min_samples_split': [2, 5, 7, 10, 15, 30]
}

# Création de l'objet RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=clf, param_distributions=param_dist, n_iter=200, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)

# Lancement de la recherche
random_search.fit(X_train, y_train)

# Meilleurs paramètres et meilleur score
best_params_random = random_search.best_params_
best_score_random = random_search.best_score_
print(f"Meilleurs paramètres (RandomSearch): {best_params_random}")
print(f"Meilleur score (RandomSearch): {best_score_random:.4f}")

Meilleurs paramètres (RandomSearch): {'max_depth': 24, 'min_samples_leaf': 4, 'min_samples_split': 7}
Meilleur score (RandomSearch): 0.8223
