In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [4]:
# Chargement des données
data = pd.read_csv('data/titanic.csv')
data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Survived                 887 non-null    int64  
 1   Pclass                   887 non-null    int64  
 2   Name                     887 non-null    object 
 3   Sex                      887 non-null    object 
 4   Age                      887 non-null    float64
 5   Siblings/Spouses Aboard  887 non-null    int64  
 6   Parents/Children Aboard  887 non-null    int64  
 7   Fare                     887 non-null    float64
dtypes: float64(2), int64(4), object(2)
memory usage: 55.6+ KB


In [7]:
# Préparation des données
# Suppression de la colonne 'Name'
data = data.drop(['Name'], axis=1)

# Encodage de la colonne 'Sex'
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})

# Création de la caractéristique 'FamilySize'
data['FamilySize'] = data['Siblings/Spouses Aboard'] + data['Parents/Children Aboard']

# Suppression des colonnes 'Siblings/Spouses Aboard' et 'Parents/Children Aboard'
data = data.drop(['Siblings/Spouses Aboard', 'Parents/Children Aboard'], axis=1)

In [10]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,FamilySize
0,0,3,0,22.0,7.25,1
1,1,1,1,38.0,71.2833,1
2,1,3,1,26.0,7.925,0
3,1,1,1,35.0,53.1,1
4,0,3,0,35.0,8.05,0


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    887 non-null    int64  
 1   Pclass      887 non-null    int64  
 2   Sex         887 non-null    int64  
 3   Age         887 non-null    float64
 4   Fare        887 non-null    float64
 5   FamilySize  887 non-null    int64  
dtypes: float64(2), int64(4)
memory usage: 41.7 KB


In [12]:
# Séparation des caractéristiques et de la cible
X = data.drop('Survived', axis=1)
y = data['Survived']

In [13]:
# Normalisation des données
scaler = StandardScaler()
X = scaler.fit_transform(X)


In [14]:
# Division des données en données d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [15]:
# Création du modèle
classifier = LogisticRegression(max_iter=1000)

# Entraînement du modèle
classifier.fit(X_train, y_train)

# Prédiction sur les jeux d'entraînement et de test
y_train_pred = classifier.predict(X_train)
y_test_pred = classifier.predict(X_test)

# Calcul des scores d'accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Accuracy score du jeu d'entraînement (premier essai): {train_accuracy}")
print(f"Accuracy score du jeu de test (premier essai): {test_accuracy}")

Accuracy score du jeu d'entraînement (premier essai): 0.8194640338504936
Accuracy score du jeu de test (premier essai): 0.7471910112359551


In [16]:
# Deuxième essai
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entraînement du modèle
classifier.fit(X_train, y_train)

# Prédiction sur les jeux d'entraînement et de test
y_train_pred = classifier.predict(X_train)
y_test_pred = classifier.predict(X_test)

# Calcul des scores d'accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Accuracy score du jeu d'entraînement (deuxième essai): {train_accuracy}")
print(f"Accuracy score du jeu de test (deuxième essai): {test_accuracy}")

Accuracy score du jeu d'entraînement (deuxième essai): 0.8194640338504936
Accuracy score du jeu de test (deuxième essai): 0.7471910112359551


In [17]:
# Cross-validation avec 6 parties
cv_scores = cross_val_score(classifier, X, y, cv=6)

print(f"Scores de la CrossValidation: {cv_scores}")
print(f"Moyenne des scores de la CrossValidation: {cv_scores.mean()}")
print(f"Écart-type des scores de la CrossValidation: {cv_scores.std()}")

Scores de la CrossValidation: [0.77027027 0.79054054 0.81081081 0.7972973  0.80405405 0.80952381]
Moyenne des scores de la CrossValidation: 0.7970827970827972
Écart-type des scores de la CrossValidation: 0.013865991371652486
