In [335]:
#import libraries
import pandas as pd

# for visualisation
import matplotlib.pyplot as plt
import seaborn as sns

#import sci-kit
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

In [336]:
titanic_df = pd.read_csv("titanic.csv")
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [337]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### EDA Process

In [338]:
# To make changes create new subset of the original dataset
titanic_clean_dt = titanic_df.copy()

# Fill the NaN and blanks to 0 and convert the Age type to integer 
titanic_clean_dt['Age'] = titanic_clean_dt['Age'].fillna(0).astype(int)

# Input missing values in 'Embarked' with the most frequent value (mode)
mode_embarked = titanic_clean_dt['Embarked'].mode()[0]
titanic_clean_dt['Embarked'] = titanic_clean_dt['Embarked'].fillna(mode_embarked)

# Delete the Cabin, Ticket and Name column from the Dataset as not required
titanic_clean_dt.drop(["PassengerId","Cabin","Ticket","Name"], axis=1, inplace=True)

titanic_clean_dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    int32  
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
dtypes: float64(1), int32(1), int64(4), object(2)
memory usage: 52.3+ KB


### One-Hot Encoding
One-hot encoding is a technique used to ensure that categorical variables are better represented in the machine. Let's take a look at the "Sex" column

In [339]:
titanic_clean_dt["Sex"].unique()

array(['male', 'female'], dtype=object)

Machine Learning classifiers don't know how to handle strings. As a result, you need to convert it into a categorical representation. There are two main ways to go about this:

Label Encoding: Assigning, for example, 0 for "male" and 1 for "female". The problem here is it intrinsically makes one category "larger than" the other category.

One-hot encoding: Assigning, for example, [1, 0] for "male" and [0, 1] for female. In this case, you have an array of size (n_categories,) and you represent a 1 in the correct index, and 0 elsewhere. In Pandas, this would show as extra columns. For example, rather than having a "Sex" column, it would be a "Sex_male" and "Sex_female" column. Then, if the person is male, it would simply show as a 1 in the "Sex_male" column and a 0 in the "Sex_female" column.

There is a nice and easy method that does this in pandas: get_dummies()

In [340]:
titanic_clean_dt = pd.get_dummies(titanic_clean_dt, prefix="Sex", columns=["Sex"])
titanic_clean_dt.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_female,Sex_male
0,0,3,22,1,0,7.25,S,False,True
1,1,1,38,1,0,71.2833,C,True,False
2,1,3,26,0,0,7.925,S,True,False
3,1,1,35,1,0,53.1,S,True,False
4,0,3,35,0,0,8.05,S,False,True


Now, we do the same to the "Embarked" column.

In [341]:
titanic_clean_dt = pd.get_dummies(titanic_clean_dt, prefix="Embarked", columns=["Embarked"])
titanic_clean_dt.head()
titanic_clean_dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Age         891 non-null    int32  
 3   SibSp       891 non-null    int64  
 4   Parch       891 non-null    int64  
 5   Fare        891 non-null    float64
 6   Sex_female  891 non-null    bool   
 7   Sex_male    891 non-null    bool   
 8   Embarked_C  891 non-null    bool   
 9   Embarked_Q  891 non-null    bool   
 10  Embarked_S  891 non-null    bool   
dtypes: bool(5), float64(1), int32(1), int64(4)
memory usage: 42.8 KB


### Modeling

In [342]:
# Define features and target variable
X = titanic_clean_dt.drop("Survived", axis=1)
y = titanic_clean_dt["Survived"]
X.shape

(891, 10)

In [343]:
# Split the data into training (60%), development (20%), and test (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Standardize the features (optional but often useful)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_dev_scaled = scaler.transform(X_dev)
X_test_scaled = scaler.transform(X_test)

### Comparing models

In [344]:
# Train and evaluate models Function for implement to all models
def train_and_evaluate(model, X_train, X_dev, y_train, y_dev, name):
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_dev_pred = model.predict(X_dev)
    
    train_accuracy = accuracy_score(y_train, y_train_pred)
    dev_accuracy = accuracy_score(y_dev, y_dev_pred)
    train_report = classification_report(y_train, y_train_pred)
    dev_report = classification_report(y_dev, y_dev_pred)
    
    print(f"Training Set Accuracy for {name}: {train_accuracy}")
    print(f"Training Set Classification Report for {name}:")
    print(train_report)
    
    print(f"Development Set Accuracy for {name}: {dev_accuracy}")
    print(f"Development Set Classification Report for {name}:")
    print(dev_report)
    
    return


## Train models

In [345]:
# Train a single Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)
train_and_evaluate(dt_model, X_train_scaled, X_dev_scaled, y_train, y_dev, "Decision Tree")


Training Set Accuracy for Decision Tree: 0.9831460674157303
Training Set Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       333
           1       1.00      0.96      0.98       201

    accuracy                           0.98       534
   macro avg       0.99      0.98      0.98       534
weighted avg       0.98      0.98      0.98       534

Development Set Accuracy for Decision Tree: 0.7584269662921348
Development Set Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       0.79      0.84      0.82       113
           1       0.69      0.62      0.65        65

    accuracy                           0.76       178
   macro avg       0.74      0.73      0.73       178
weighted avg       0.75      0.76      0.76       178



In [346]:
# Bagging model
bagging_model = BaggingClassifier(estimator=dt_model, n_estimators=100, random_state=42)
train_and_evaluate(bagging_model, X_train_scaled, X_dev_scaled, y_train, y_dev, "Bagging Classifier")


Training Set Accuracy for Bagging Classifier: 0.9831460674157303
Training Set Classification Report for Bagging Classifier:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       333
           1       1.00      0.96      0.98       201

    accuracy                           0.98       534
   macro avg       0.99      0.98      0.98       534
weighted avg       0.98      0.98      0.98       534

Development Set Accuracy for Bagging Classifier: 0.7584269662921348
Development Set Classification Report for Bagging Classifier:
              precision    recall  f1-score   support

           0       0.80      0.83      0.81       113
           1       0.68      0.63      0.66        65

    accuracy                           0.76       178
   macro avg       0.74      0.73      0.73       178
weighted avg       0.76      0.76      0.76       178



In [347]:
# Random Forest model
rf_classifier = RandomForestClassifier(random_state=42)
train_and_evaluate(rf_classifier, X_train_scaled, X_dev_scaled, y_train, y_dev, "Random Forest")


Training Set Accuracy for Random Forest: 0.9831460674157303
Training Set Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       333
           1       1.00      0.96      0.98       201

    accuracy                           0.98       534
   macro avg       0.99      0.98      0.98       534
weighted avg       0.98      0.98      0.98       534

Development Set Accuracy for Random Forest: 0.7808988764044944
Development Set Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       113
           1       0.72      0.66      0.69        65

    accuracy                           0.78       178
   macro avg       0.77      0.76      0.76       178
weighted avg       0.78      0.78      0.78       178



In [348]:
# Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=42)
train_and_evaluate(gb_model, X_train_scaled, X_dev_scaled, y_train, y_dev, "Gradient Boosting")


Training Set Accuracy for Gradient Boosting: 0.9044943820224719
Training Set Classification Report for Gradient Boosting:
              precision    recall  f1-score   support

           0       0.88      0.98      0.93       333
           1       0.96      0.78      0.86       201

    accuracy                           0.90       534
   macro avg       0.92      0.88      0.89       534
weighted avg       0.91      0.90      0.90       534

Development Set Accuracy for Gradient Boosting: 0.8089887640449438
Development Set Classification Report for Gradient Boosting:
              precision    recall  f1-score   support

           0       0.82      0.89      0.86       113
           1       0.78      0.66      0.72        65

    accuracy                           0.81       178
   macro avg       0.80      0.78      0.79       178
weighted avg       0.81      0.81      0.81       178



In [349]:
# Perform hyperparameter tuning using GridSearchCV

param_grid = {
    'max_depth': [5,10,15,20],
    'min_samples_leaf': [4,5,6,10],
    'n_estimators': [50,100,105],
}
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Retrieve the best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)


Best parameters found:  {'max_depth': 10, 'min_samples_leaf': 5, 'n_estimators': 100}


In [350]:
# fitting the parameters to check the best tuning for Random Forest
best_rf = grid_search.best_estimator_
best_rf.fit(X_train, y_train)

# Predict on the Training set
y_train_pred_best = gb_model.predict(X_train_scaled)
y_dev_pred_best = gb_model.predict(X_dev_scaled)

# Evaluate the model on the development set
train_accuracy_best = accuracy_score(y_train, y_train_pred_best)
train_report_best = classification_report(y_train, y_train_pred_best)

dev_accuracy_best = accuracy_score(y_dev, y_dev_pred_best)
dev_report_best = classification_report(y_dev, y_dev_pred_best)

print(f"Training Set Accuracy for Radnom Forest best tuning: {train_accuracy_best}")
print(f"Training Set Classification Report for Radnom Forest best tuning:")
print(train_report_best)

print(f"Development Set Accuracy for Radnom Forest best tuning: {dev_accuracy_best}")
print(f"Development Set Classification Report for Radnom Forest best tuning")
print(dev_report_best)

Training Set Accuracy for Radnom Forest best tuning: 0.9044943820224719
Training Set Classification Report for Radnom Forest best tuning:
              precision    recall  f1-score   support

           0       0.88      0.98      0.93       333
           1       0.96      0.78      0.86       201

    accuracy                           0.90       534
   macro avg       0.92      0.88      0.89       534
weighted avg       0.91      0.90      0.90       534

Development Set Accuracy for Radnom Forest best tuning: 0.8089887640449438
Development Set Classification Report for Radnom Forest best tuning
              precision    recall  f1-score   support

           0       0.82      0.89      0.86       113
           1       0.78      0.66      0.72        65

    accuracy                           0.81       178
   macro avg       0.80      0.78      0.79       178
weighted avg       0.81      0.81      0.81       178



From the results we can summarise that the best model is Gradient Boosting and Radnom Forest with best tuning as they are having the hiest score of the Development data accuracy