In [100]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import warnings

### Load data

In [101]:
# NOTE: this silences every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Read the labelled training split and the unlabelled competition split.
train_titanic_df, test_titanic_df = map(
    pd.read_csv, ('../data/train.csv', '../data/test.csv')
)

# Preview the first rows of the training data.
train_titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Show data info

In [102]:
# Print the dtype and non-null count of every column in the training frame.
train_titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### Show the number of missing values in each column

In [103]:
# Count the missing values per column and print the result as a Markdown table.
# NOTE(review): Series.to_markdown() needs the optional `tabulate` package.
print(train_titanic_df.isna().sum().to_markdown())

|             |   0 |
|:------------|----:|
| PassengerId |   0 |
| Survived    |   0 |
| Pclass      |   0 |
| Name        |   0 |
| Sex         |   0 |
| Age         | 177 |
| SibSp       |   0 |
| Parch       |   0 |
| Ticket      |   0 |
| Fare        |   0 |
| Cabin       | 687 |
| Embarked    |   2 |


In [104]:
def preprocessing(titanic_df):
    '''Clean a Titanic passenger frame and derive the features used by the model.

    The frame is modified in place and also returned, so callers can write
    ``df = preprocessing(df)``.

    Steps:

    - Extracts passenger titles from the frame's own "Name" column and stores
      them in "Name_Title" (e.g. "Braund, Mr. Owen Harris" -> "Mr").
    - Fills missing "Age" values with the mean age of passengers sharing the
      same "Pclass" and "Name_Title"; when that group has no observed age,
      falls back to the mean age of the "Pclass", then to the overall mean.
    - Creates "Family_Size" as "Parch" (parents/children) + "SibSp"
      (siblings/spouse).
    - Encodes "Sex" numerically (0 for male, 1 for female).
    - Fills missing "Embarked" values with 'S'.
    - Fills missing "Fare" values with the mean fare of the same "Pclass",
      falling back to the overall mean fare.

    Parameters
    ----------
    titanic_df : pandas.DataFrame
        Frame with at least the columns "Name", "Pclass", "Sex", "Age",
        "SibSp", "Parch", "Embarked" and "Fare".

    Returns
    -------
    pandas.DataFrame
        The same frame object, with "Name_Title" and "Family_Size" added and
        "Age", "Sex", "Embarked" and "Fare" updated.

    Notes
    -----
    Not idempotent: on a second call the already-encoded "Sex" values are not
    keys of the mapping, so they would be mapped to NaN.
    '''

    # Titles must come from the frame being processed, not from a global one;
    # otherwise the test set would receive titles of unrelated training rows.
    titanic_df['Name_Title'] = titanic_df['Name'].apply(
        lambda name: name.split(',')[1].split('.')[0].strip()
    )

    # All group means are computed from the observed ages before any filling.
    observed_age = titanic_df['Age']
    age_by_class_title = titanic_df.groupby(['Pclass', 'Name_Title'])['Age'].transform('mean')
    age_by_class = titanic_df.groupby('Pclass')['Age'].transform('mean')
    # A rare title can form a group without a single known age, so widen the
    # group step by step until a mean is available.
    titanic_df['Age'] = (
        observed_age
        .fillna(age_by_class_title)
        .fillna(age_by_class)
        .fillna(observed_age.mean())
    )

    titanic_df['Family_Size'] = titanic_df['Parch'] + titanic_df['SibSp']

    sex_mapping = {"male": 0, "female": 1}
    titanic_df['Sex'] = titanic_df['Sex'].map(sex_mapping)

    titanic_df['Embarked'] = titanic_df['Embarked'].fillna('S')

    observed_fare = titanic_df['Fare']
    fare_by_class = titanic_df.groupby('Pclass')['Fare'].transform('mean')
    titanic_df['Fare'] = observed_fare.fillna(fare_by_class).fillna(observed_fare.mean())

    return titanic_df

### Process train and test set

In [105]:
# preprocessing() edits each frame in place and returns it. Run this cell once
# per fresh load of the CSV files: a second pass would re-map the
# already-encoded "Sex" column.
train_titanic_df, test_titanic_df = (
    preprocessing(train_titanic_df),
    preprocessing(test_titanic_df),
)

### Show the missing values of the train set after preprocessing

In [106]:
# Re-count the missing values per column after preprocessing, printed as a Markdown table.
# NOTE(review): Series.to_markdown() needs the optional `tabulate` package.
print(train_titanic_df.isna().sum().to_markdown())

|             |   0 |
|:------------|----:|
| PassengerId |   0 |
| Survived    |   0 |
| Pclass      |   0 |
| Name        |   0 |
| Sex         |   0 |
| Age         |   0 |
| SibSp       |   0 |
| Parch       |   0 |
| Ticket      |   0 |
| Fare        |   0 |
| Cabin       | 687 |
| Embarked    |   0 |
| Name_Title  |   0 |
| Family_Size |   0 |


In [107]:
# --- Fare: quantile binning -------------------------------------------------
# The quintile edges are learned on the training fares only and then reused
# for the test fares, so a given Fare_enc code denotes the same fare range in
# both sets (binning the test set with its own quantiles would shift the edges).
train_titanic_df['FareBin'], fare_bin_edges = pd.qcut(
    train_titanic_df['Fare'], 5, retbins=True
)
# The categories are ordered intervals, so their codes are the ordinal bin index.
train_titanic_df['Fare_enc'] = train_titanic_df['FareBin'].cat.codes.astype(int)

# Clip test fares into the training range so that out-of-range values fall in
# the outermost bins instead of becoming NaN.
test_fare_clipped = test_titanic_df['Fare'].clip(
    lower=fare_bin_edges[0], upper=fare_bin_edges[-1]
)
test_titanic_df['FareBin'] = pd.cut(
    test_fare_clipped, bins=fare_bin_edges, include_lowest=True
)
test_titanic_df['Fare_enc'] = test_titanic_df['FareBin'].cat.codes.astype(int)


# --- Embarked: one-hot encoding ---------------------------------------------
# The dummy columns are defined by the training set (first level dropped).
cols = pd.get_dummies(train_titanic_df['Embarked'], prefix='Embarked', drop_first=True).columns

# Encode both sets with exactly those columns. reindex() adds an all-zero
# column when a port is absent from one set instead of raising KeyError.
X_train_embarked_df = (
    pd.get_dummies(train_titanic_df['Embarked'], prefix='Embarked')
    .reindex(columns=cols, fill_value=0)
    .astype(int)
)
X_test_embarked_df = (
    pd.get_dummies(test_titanic_df['Embarked'], prefix='Embarked')
    .reindex(columns=cols, fill_value=0)
    .astype(int)
)


In [108]:
# Target vector, and the test ids needed later for the submission file.
y = train_titanic_df['Survived']
passengerId = test_titanic_df['PassengerId']

# Feature columns shared by the train and test matrices.
feature_cols = ['Pclass', 'Sex', 'Age', 'Family_Size', 'Fare_enc']

# Append the one-hot encoded Embarked columns to the selected features.
X_train = pd.concat([train_titanic_df[feature_cols], X_train_embarked_df], axis=1)
X_test = pd.concat([test_titanic_df[feature_cols], X_test_embarked_df], axis=1)

X_train.head()

Unnamed: 0,Pclass,Sex,Age,Family_Size,Fare_enc,Embarked_Q,Embarked_S
0,3,0,22.0,1,0,0,1
1,1,1,38.0,1,4,0,0
2,3,1,26.0,0,1,0,1
3,1,1,35.0,1,4,0,1
4,3,0,35.0,0,1,0,1


In [109]:
# Learn the scaling statistics from the training features only, then apply the
# same transformation to both matrices.
std_scaler = StandardScaler().fit(X_train)
X = std_scaler.transform(X_train)
# X_test is overwritten with its scaled version: re-running this cell without
# re-running the previous one would scale it a second time.
X_test = std_scaler.transform(X_test)

In [110]:
# Hold out 20% of the labelled rows for a local evaluation; the fixed
# random_state keeps the split reproducible across runs.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

In [111]:
# Hyper-parameter grid searched exhaustively (3 x 3 x 3 = 27 combinations).
params = {
    'n_estimators': [10, 25, 40],
    'max_depth': [4, 7, 10],
    'min_samples_split': [2, 5, 10],
}

rfc = RandomForestClassifier(random_state=42)
# NOTE(review): candidates are ranked by ROC AUC under 5-fold CV, while the
# hold-out score reported below is accuracy.
grid_search = GridSearchCV(
    estimator=rfc,
    param_grid=params,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)

# fit() returns the fitted search object, whose best estimator is kept.
model = grid_search.fit(X_tr, y_tr).best_estimator_

# Accuracy of the selected model on the held-out rows.
y_p_test = model.predict(X_te)
score = accuracy_score(y_true=y_te, y_pred=y_p_test)
score

0.8212290502793296

In [112]:
# Predict survival for the competition test features with the selected model.
y_pred = model.predict(X_test)

In [113]:
# Two-column submission table: passenger id and predicted survival label.
submission_columns = {'PassengerId': passengerId, 'Survived': y_pred}
df = pd.DataFrame(submission_columns)

# Write without the index so the file holds only the two columns above.
df.to_csv("../data/gender_submission.csv", index=False)