In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
!pip install scikit-learn==1.5.2 imbalanced-learn==0.12.3


In [None]:
# Location of the Titanic CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/titanic/titanic.csv'

# Read the raw passenger table and preview its first rows.
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
data.shape

In [None]:
data.dtypes

The feature types are correct.

In [None]:
# Print the relative frequency of every value for each low-level column of interest,
# in the same order as before.
columns_to_inspect = ['Sex', 'Ticket', 'Cabin', 'Embarked', 'Survived', 'Parch', 'SibSp', 'Pclass']
for column in columns_to_inspect:
    print(data[column].value_counts(normalize=True))


There are no typos and no rare values, but there are useless features: **PassengerId**, **Name**, **Ticket** (too many unique values: 681 distinct tickets for 891 passengers) and, lastly, **Cabin**.

notice that the target is **imbalanced**

Categorical: Sex, Embarked, Pclass (treated as categorical even if numeric).

Numerical: Age, Fare, SibSp (siblings/spouses), Parch (parents/children).

In [None]:
data.duplicated().sum()

In [None]:
data.isnull().sum()

In [None]:
# Fraction of rows whose Cabin value is missing (mean of the boolean null mask).
data['Cabin'].isnull().mean()

Note that about 77% of the `Cabin` values are missing, so it is hard to impute the NaNs.

Dropping the unnecessary features.

In [None]:
# Remove the identifier-like / mostly-missing columns judged useless above.
unused_columns = ['Name', 'Cabin', 'Ticket', 'PassengerId']
data = data.drop(columns=unused_columns)


In [None]:
data

To impute the missing values of Age and Embarked, we first handle outliers in the numerical features, then encode the categorical features, and only then impute the NaNs.

In [None]:
# Treat the discrete count/class columns as categorical rather than numeric.
data = data.astype({'SibSp': 'category', 'Pclass': 'category', 'Parch': 'category'})

In [None]:
# Separate the target column from the feature columns.
target_column = 'Survived'
y = data[target_column]
X = data.drop(columns=target_column)

In [None]:
# Names of the columns of X with an integer or float dtype.
# SibSp, Pclass and Parch were cast to 'category' earlier, so they are not selected here.
numerical_features=X.select_dtypes(include=['int','float']).columns

In [None]:
numerical_features

In [None]:
# Names of the columns of X with object or category dtype
# (this includes SibSp, Pclass and Parch, which were cast to 'category' earlier).
categorical_features=X.select_dtypes(include=["object",'category']).columns

In [None]:
categorical_features

In [None]:
object_features=X.select_dtypes(include="object").columns
object_features

In [None]:
# One histogram per numerical feature, laid out on an (n_features x 2) grid.
n_numeric = len(numerical_features)
fig = plt.figure(figsize=(n_numeric * 4, 10))
for position, feature in enumerate(numerical_features, start=1):
    ax = fig.add_subplot(n_numeric, 2, position)
    sns.histplot(X[feature], ax=ax)
    ax.set_title(f'distribution of {feature}')
fig.tight_layout()
plt.show()


We will try to make the numerical features approximately normal so that the z-score can be used for outlier handling.

In [None]:
from scipy.stats import boxcox,normaltest,zscore

In [None]:
# D'Agostino-Pearson normality test on each raw numerical feature (missing values ignored).
for feature in numerical_features:
    _, p_value = normaltest(X[feature].dropna())
    verdict = 'Normal' if p_value > 0.05 else 'Not Normal'
    print(f"p of {feature}={p_value}----->{verdict}")

In [None]:
# Same normality test after a log(1 + x) transform of each numerical feature.
for feature in numerical_features:
    log_values = np.log1p(X[feature]).dropna()
    _, p_value = normaltest(log_values)
    verdict = 'Normal' if p_value > 0.05 else 'Not Normal'
    print(f"p of {feature}={p_value}----->{verdict}")

In [None]:
# Normality test after a square-root transform of each numerical feature.
# The previous version repeated np.log1p here, so this cell duplicated the log test
# instead of trying a different transform.
for feature in numerical_features:
    data_sqr=np.sqrt(X[feature])
    stat,p=normaltest(data_sqr.dropna())
    print(f"p of {feature}={p}----->{'Normal'if p>0.05 else 'Not Normal'}")

In [None]:
# Normality test after a Box-Cox transform of each numerical feature.
# Columns whose minimum is <= 0 are shifted so the minimum becomes 1 before the transform.
for feature in numerical_features:
    column = data[feature]
    lowest = column.min()
    shifted = (column - lowest + 1) if lowest <= 0 else column
    boxcox_values, _ = boxcox(shifted.dropna())
    _, p_value = normaltest(boxcox_values)
    verdict = 'Normal' if p_value > 0.05 else 'Not Normal'
    print(f"p of {feature}={p_value}----->{verdict}")

we succeed to normalize the age

In [None]:
# Box-Cox-transform the known ages in a copy of the Age column; missing ages stay NaN.
Age_copy = X['Age'].copy()
has_age = Age_copy.notna()
age_nonnull = Age_copy[has_age]
age_transformed, lam1 = boxcox(age_nonnull)
Age_copy.loc[age_nonnull.index] = age_transformed

In [None]:
# z-scores of the transformed, non-missing ages, indexed like the rows they came from.
age_present = Age_copy.dropna()
z_scores = pd.Series(zscore(age_present), index=age_present.index)

# Keep rows within three standard deviations; rows with a missing age are kept (fill_value=True).
within_three_sigma = z_scores.abs().le(3)
mask = within_three_sigma.reindex(Age_copy.index, fill_value=True)

cleaned_data = X.loc[mask]

In [None]:
cleaned_data

In [None]:
cleaned_data.shape

only one row dropped

In [None]:
plt.boxplot(X['Fare'])
plt.show()

In [None]:
# Shift Fare so its minimum becomes 1, Box-Cox-transform it and show the boxplot of the result.
raw_fare = X['Fare']
min_value = raw_fare.min()
fare_copy = raw_fare - min_value + 1
boxcox_result = boxcox(fare_copy)
transformed, lam2 = boxcox_result[0], boxcox_result[1]
plt.boxplot(transformed)
plt.show()


so much better

In [None]:
# Detect Fare outliers with the IQR rule on the Box-Cox-transformed fares (the distribution
# inspected in the boxplot above), not on the shifted raw fares as before.
# The transform is recomputed here so the cell does not depend on a reused variable name.
fare_boxcox = pd.Series(boxcox(fare_copy)[0], index=fare_copy.index)
Q1 = fare_boxcox.quantile(0.25)
Q3 = fare_boxcox.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# cleaned_data already lost rows in the Age step, so align the mask to its index explicitly
# instead of relying on pandas' implicit (and warned-about) reindexing of the boolean key.
fare_in_range = fare_boxcox.between(lower_bound, upper_bound)
cleaned_data = cleaned_data.loc[fare_in_range.reindex(cleaned_data.index)]
cleaned_data.shape

In [None]:
cleaned_data

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler,StandardScaler

In [None]:
# One-hot encode the two string columns and pass every other column through unchanged.
# NOTE(review): OneHotEncoder encodes a missing Embarked as its own category, so the KNN
# imputer used later would not fill Embarked — confirm this is intended.
one_hot_columns = ['Embarked', 'Sex']
ct = ColumnTransformer(
    transformers=[('one hot', OneHotEncoder(), one_hot_columns)],
    remainder='passthrough',
)

I didn't drop the first category because this encoding is preprocessing for KNN, which is distance-based: dropping the first category would remove a dimension, which would distort the distances and therefore the imputation.

In [None]:
# Fit the encoder on the cleaned rows and wrap the result in a frame that keeps
# the original row index and the transformer's output column names.
encoded_values = ct.fit_transform(cleaned_data)
encoded_columns = ct.get_feature_names_out()
encoded_data = pd.DataFrame(encoded_values, index=cleaned_data.index, columns=encoded_columns)
encoded_data


In [None]:
# Rescale every column with MinMaxScaler before the distance-based imputation.
# NOTE(review): the scaler is fitted on all rows, before train_test_split is called later in
# the notebook, so test rows influence the scaling — confirm this leakage is acceptable.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(encoded_data)
scaled_data = pd.DataFrame(scaled_values, index=encoded_data.index, columns=encoded_data.columns)
scaled_data

now we are ready to impute the missing values

In [None]:
from sklearn.impute import KNNImputer

In [None]:
# Fill the remaining NaNs from the 5 nearest neighbours in the scaled feature space.
# NOTE(review): like the scaler, the imputer is fitted on all rows before the train/test split.
imputer = KNNImputer(n_neighbors=5)
imputed_data = pd.DataFrame(
    imputer.fit_transform(scaled_data),
    index=scaled_data.index,
    columns=scaled_data.columns,
)
imputed_data

In [None]:
# Undo the min-max scaling so the imputed table is back in the original units.
unscaled_values = scaler.inverse_transform(imputed_data)
original_scale = pd.DataFrame(unscaled_values, index=imputed_data.index, columns=imputed_data.columns)
original_scale

In [None]:
original_scale.isnull().sum()

In [None]:
# Re-attach the target to the cleaned features (inner join on the row index),
# then draw the annotated correlation matrix of all columns.
df = original_scale.join(y, how='inner')
correlation_matrix = df.corr()
plt.figure(figsize=(16, 12))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()

I don't have to reduce the multicollinearity because we are going to use a decision tree model. Decision trees don't assume linearity between the target and the features: there are no coefficients, only thresholds used to split the data.

So I will keep all the features and then drop the less important ones after training the tree.

In [None]:
# The target is the last column of df; everything before it is a feature.
target_name = df.columns[-1]
feature_names = df.columns[:-1]
X = df[feature_names]
y = df[target_name]
# Separate copy of the features reserved for the logistic-regression preprocessing.
X_lr = X.copy()

In [None]:
X.columns

We will use logistic regression as a baseline for comparison.

I will handle the multicollinearity and the low-variance features manually, without PCA, for interpretability.

In [None]:
# Drop one column of the perfectly collinear Sex one-hot pair for logistic regression.
# Derived from X instead of dropping in place, so re-running this cell does not raise KeyError.
X_lr = X.drop(columns='one hot__Sex_female')

In [None]:
from sklearn.feature_selection import VarianceThreshold

In [None]:
X_lr.shape

In [None]:
X_lr.columns

In [None]:
# Keep only the features whose variance exceeds 0.05.
selector=VarianceThreshold(threshold=0.05)
# Pass the original index as well: without it the frame gets a fresh 0..n-1 index and no longer
# lines up with X_lr / y, whose index has gaps from the outlier removal.
selected=pd.DataFrame(
    selector.fit_transform(X_lr),
    columns=X_lr.columns[selector.get_support()],
    index=X_lr.index,
)
selected

In [None]:
selected.columns

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

In [None]:
from sklearn.preprocessing import PowerTransformer

In [None]:
# Columns that get the Yeo-Johnson power transform followed by standardisation;
# all remaining columns are passed through untouched.
scaled_columns = ['remainder__Pclass', 'remainder__Age', 'remainder__SibSp',
                  'remainder__Parch', 'remainder__Fare']
numeric_steps = Pipeline(steps=[
    ('yeo johnson', PowerTransformer(method='yeo-johnson')),
    ('std', StandardScaler()),
])
preprocessor = ColumnTransformer(
    transformers=[('pipe', numeric_steps, scaled_columns)],
    remainder='passthrough',
)

Normalize and scale only the numerical and multi-category features, not the **binary features**, because transforming those makes no sense.

In [None]:
# Baseline pipeline: preprocess -> oversample the training data with SMOTE -> logistic regression.
lr_steps = [
    ('preprocess', preprocessor),
    ('balancer', SMOTE(random_state=42)),
    ('classifier', LogisticRegression()),
]
lr_pipe = Pipeline(steps=lr_steps)

We have to scale and normalize the data because this is preprocessing for logistic regression, which assumes a linear relationship between the features and the log-odds of the target; scaling also prevents features with a large scale from dominating the others.

Yeo-Johnson is used because, unlike Box-Cox, it also works for features that contain zero or negative values.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as a test set; stratify on y so both splits keep the same
# class proportions, and fix the seed so the split is reproducible.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

In [None]:
# Logistic-regression versions of the splits: same rows, without the redundant Sex_female column.
lr_dropped_columns = ['one hot__Sex_female']
x_train_lr = X_train.drop(columns=lr_dropped_columns)
x_test_lr = X_test.drop(columns=lr_dropped_columns)

In [None]:
# Fit the baseline pipeline on the training split, then store the hard predictions
# and the class probabilities for the test split (used by the evaluation cells).
lr_pipe.fit(x_train_lr,y_train)
y_pred_lr=lr_pipe.predict(x_test_lr)
y_proba_lr=lr_pipe.predict_proba(x_test_lr)

In [None]:
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix,ConfusionMatrixDisplay, precision_score,recall_score,f1_score,roc_auc_score

In [None]:
# Confusion matrix of the logistic-regression baseline on the test set.
cm=confusion_matrix(y_true=y_test,y_pred=y_pred_lr)
disp=ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=['not survived','survived'])
# Create the axes explicitly and hand them to disp.plot: without `ax`, plot() opens its own
# figure, leaving the plt.figure(figsize=...) one empty and its size unused.
fig,ax=plt.subplots(figsize=(5,5))
disp.plot(cmap='Blues',ax=ax)
ax.set_title('Logistic Regression')
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Oversample the minority class of the training split with SMOTE (the test split is untouched).
balancer = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = balancer.fit_resample(X_train, y_train)

# Unconstrained decision tree trained on the balanced data.
dt = DecisionTreeClassifier(random_state=42).fit(X_train_resampled, y_train_resampled)

# Predictions on the test split, plus predictions on the original training split
# for the overfitting check.
y_pred_dt = dt.predict(X_test)
y_proba_dt = dt.predict_proba(X_test)
y_train_pred_dt = dt.predict(X_train)

In [None]:
# Confusion matrix of the decision tree on the test set.
cm=confusion_matrix(y_true=y_test,y_pred=y_pred_dt)
disp=ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=['not survived','survived'])
# Create the axes explicitly and hand them to disp.plot: without `ax`, plot() opens its own
# figure, leaving the plt.figure(figsize=...) one empty and its size unused.
fig,ax=plt.subplots(figsize=(5,5))
disp.plot(cmap='Blues',ax=ax)
ax.set_title('Decision Tree')
plt.show()

In [None]:
# Test vs. training metrics of the decision tree; a large gap indicates overfitting.
# The test accuracy is now labelled, matching the random-forest and XGBoost report cells.
print(f'accuracy of test: {accuracy_score(y_test,y_pred_dt)}')
print(classification_report(y_test,y_pred_dt,target_names=['Not Survive','Survive']))
print(f'accuracy of training: {accuracy_score(y_train,y_train_pred_dt)}')
print(classification_report(y_train,y_train_pred_dt))

In [None]:
# Table of the decision tree's feature importances, most important first.
importances = dt.feature_importances_
importance_table = pd.DataFrame({'features': X_train.columns, 'importances': importances})
feature_importance = importance_table.sort_values(by='importances', ascending=False)
feature_importance

In [None]:
# Horizontal bar chart of the importances; the y axis is inverted so the top feature comes first.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance['features'], feature_importance['importances'])
ax.set_xlabel("Importance")
ax.set_title("Feature Importance from Decision Tree")
ax.invert_yaxis()
fig.tight_layout()
plt.show()

As we can see, the model automatically ignored one feature of the highly correlated **gender** pair.

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from xgboost import XGBClassifier

In [None]:
# Random forest trained on the SMOTE-balanced training data.
rf = RandomForestClassifier(n_estimators=200, max_depth=11, random_state=42)
rf.fit(X_train_resampled, y_train_resampled)

# Test predictions/probabilities and training predictions for the overfitting check.
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)
y_train_pred_rf = rf.predict(X_train)

In [None]:
# Test vs. training metrics of the random forest.
test_accuracy_rf = accuracy_score(y_test, y_pred_rf)
test_report_rf = classification_report(y_test, y_pred_rf, target_names=['Not Survive', 'Survive'])
train_accuracy_rf = accuracy_score(y_train, y_train_pred_rf)
train_report_rf = classification_report(y_train, y_train_pred_rf)

print(f'accuracy of test: {test_accuracy_rf}')
print(test_report_rf)
print(f'accuracy of training: {train_accuracy_rf}')
print(train_report_rf)

In [None]:
# Table of the random forest's feature importances, most important first.
importances = rf.feature_importances_
importance_table = pd.DataFrame({'features': X_train.columns, 'importances': importances})
feature_importance = importance_table.sort_values(by='importances', ascending=False)
feature_importance

In [None]:
# Horizontal bar chart of the importances; the y axis is inverted so the top feature comes first.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance['features'], feature_importance['importances'])
ax.set_xlabel("Importance")
ax.set_title("Feature Importance from Random Forest")
ax.invert_yaxis()
fig.tight_layout()
plt.show()

In [None]:
# XGBoost classifier with default hyperparameters, trained on the SMOTE-balanced training data.
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train_resampled, y_train_resampled)

# Test predictions/probabilities and training predictions for the overfitting check.
y_pred_xgb = xgb.predict(X_test)
y_proba_xgb = xgb.predict_proba(X_test)
y_train_pred_xgb = xgb.predict(X_train)

In [None]:
# Test vs. training metrics of the XGBoost model.
test_accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
test_report_xgb = classification_report(y_test, y_pred_xgb, target_names=['Not Survive', 'Survive'])
train_accuracy_xgb = accuracy_score(y_train, y_train_pred_xgb)
train_report_xgb = classification_report(y_train, y_train_pred_xgb)

print(f'accuracy of test: {test_accuracy_xgb}')
print(test_report_xgb)
print(f'accuracy of training: {train_accuracy_xgb}')
print(train_report_xgb)

all the tree models have overfitted

In [None]:
# Table of the XGBoost model's feature importances, most important first.
importances = xgb.feature_importances_
importance_table = pd.DataFrame({'features': X_train.columns, 'importances': importances})
feature_importance = importance_table.sort_values(by='importances', ascending=False)
feature_importance

In [None]:
# Horizontal bar chart of the importances; the y axis is inverted so the top feature comes first.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance['features'], feature_importance['importances'])
ax.set_xlabel("Importance")
ax.set_title("Feature Importance from XGBoosting")
ax.invert_yaxis()
fig.tight_layout()
plt.show()

Note that removing the less important features would not change the accuracy, or would change it only slightly.

## Evaluation

In [None]:
# Bundle each model's display name with its test predictions and test probabilities,
# in a fixed order shared by the three lists.
models_names = ['LogisticRegression', 'DecisionTreeClassifier', 'RandomForestClassifier', 'XGboost']
predictions = [y_pred_lr, y_pred_dt, y_pred_rf, y_pred_xgb]
probas = [y_proba_lr, y_proba_dt, y_proba_rf, y_proba_xgb]
names_predictions_probas = [
    (model_name, model_predictions, model_probas)
    for model_name, model_predictions, model_probas in zip(models_names, predictions, probas)
]

In [None]:
# One row of test-set metrics per model; ROC AUC uses the probability of the positive class.
results = [
    {
        'model': name,
        'accuracy': accuracy_score(y_test, prediction),
        'precision': precision_score(y_test, prediction),
        'recall': recall_score(y_test, prediction),
        'roc auc': roc_auc_score(y_test, proba[:, 1]),
        'f1': f1_score(y_test, prediction),
    }
    for name, prediction, proba in names_predictions_probas
]

# Index by model name and rank by F1, best first.
results_df = pd.DataFrame(results).set_index('model').sort_values(by='f1', ascending=False)

display(results_df)


In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import label_binarize

In [None]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Overlay the precision-recall curve of every model on the same axes.
for model, _, probability in names_predictions_probas:
    # Use the positive-class column when there are several columns; otherwise flatten.
    has_class_columns = probability.ndim > 1 and probability.shape[1] > 1
    proba = probability[:, 1] if has_class_columns else probability.ravel()

    curve = precision_recall_curve(y_test, proba)
    precision, recall = curve[0], curve[1]
    plt.plot(recall, precision, label=model)

plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curves")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

plt.figure(figsize=(7, 7))

# Overlay the ROC curve of every model, with its AUC in the legend.
for model, _, probability in names_predictions_probas:
    # Reduce the probability array to one score per sample:
    # 1-D arrays are used as is, a single column is flattened,
    # otherwise the positive-class column is taken.
    if probability.ndim == 1:
        proba = probability
    else:
        single_column = probability.shape[1] == 1
        proba = probability.ravel() if single_column else probability[:, 1]

    roc_points = roc_curve(y_test, proba)
    fpr, tpr = roc_points[0], roc_points[1]
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, label=f"{model} (AUC = {roc_auc:.3f})")

# Diagonal reference line of a random classifier.
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves")
plt.legend()
plt.grid(True)
plt.show()


I don't treat ROC AUC as a good criterion under class imbalance, even after using SMOTE: SMOTE only rebalances the training set, so the test set still contains a minority class, and ROC AUC can look optimistic in that situation because the false positive rate is computed over the large majority (negative) class.

instead we can take the **f1 score** as reference

important features for **logistic regression**

In [None]:
# Read the coefficients from the classifier inside the already-fitted baseline pipeline.
# The previous version fitted a separate LogisticRegression on the unscaled features, so the
# coefficient magnitudes were not comparable across features and did not describe the model
# that was actually evaluated (which is trained after power-transform, scaling and SMOTE).
fitted_preprocess = lr_pipe.named_steps['preprocess']
lr = lr_pipe.named_steps['classifier']
feature_importance = pd.DataFrame({
    # Output column names of the preprocessor, in the order the classifier received them.
    "feature": fitted_preprocess.get_feature_names_out(),
    "coefficient": lr.coef_[0],
    "importance": np.abs(lr.coef_[0])
}).sort_values(by="importance", ascending=False)

print(feature_importance)


In [None]:
# Horizontal bar chart of the absolute coefficients; the y axis is inverted so the largest comes first.
fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(feature_importance["feature"], feature_importance["importance"])
ax.set_xlabel("Importance (|coefficient|)")
ax.set_ylabel("Feature")
ax.set_title("Logistic Regression Feature Importance")
ax.invert_yaxis()
plt.show()


Now we want to modify the tree models to reduce overfitting.

we will tune hyperparameters like **max depth**/**min_child_weight**/**gamma**

In [None]:
# Size of the unconstrained tree, used as a reference when choosing the limits below.
print("Max depth:", dt.get_depth())
print("Number of leaves:", dt.get_n_leaves())

good to know the depth of the tree and the total number of leaves to modify them later

In [None]:
# Regularised decision tree: limits on depth, leaf size and number of leaves.
dt2_params = {
    'random_state': 42,
    'max_depth': 8,
    'min_samples_split': 2,
    'min_samples_leaf': 5,
    'max_leaf_nodes': 10,
}
dt2 = DecisionTreeClassifier(**dt2_params)
dt2.fit(X_train_resampled, y_train_resampled)

# Test predictions/probabilities and training predictions for the overfitting check.
y_pred2_dt = dt2.predict(X_test)
y_proba2_dt = dt2.predict_proba(X_test)
y_train_pred2_dt = dt2.predict(X_train)

decreasing the depth can reduce overfitting

lower min_samples_split can lead to a deeper tree and it can cause overfitting

A small min_samples_leaf lets a leaf hold only a few samples, which the tree can memorize; this increases the variance and therefore the overfitting.

max_leaf_nodes is unlimited by default, but we can limit it to reduce the complexity.

We always want the more **general** model: these hyperparameters can noticeably lower the training metrics, but they reduce the gap between the training and test results, so it is a trade-off.

In [None]:
# Test vs. training metrics of the regularised decision tree.
# The test accuracy is now labelled, matching the random-forest and XGBoost report cells.
print(f'accuracy of test: {accuracy_score(y_test,y_pred2_dt)}')
print(classification_report(y_test,y_pred2_dt,target_names=['Not Survive','Survive']))
print(f'accuracy of training: {accuracy_score(y_train,y_train_pred2_dt)}')
print(classification_report(y_train,y_train_pred2_dt))

In [None]:
# Regularised random forest: shallower trees, larger leaves, more estimators.
rf2 = RandomForestClassifier(
    n_estimators=300,
    max_depth=8,
    min_samples_leaf=4,
    random_state=42,
)
rf2.fit(X_train_resampled, y_train_resampled)

# Test predictions/probabilities and training predictions for the overfitting check.
y_pred2_rf = rf2.predict(X_test)
y_proba2_rf = rf2.predict_proba(X_test)
y_train_pred2_rf = rf2.predict(X_train)

Increasing the number of estimators (trees trained in parallel) can help improve generalization.

Too many estimators mainly has the downside of slowing down training.

In [None]:
# Test vs. training metrics of the regularised random forest.
test_accuracy_rf2 = accuracy_score(y_test, y_pred2_rf)
test_report_rf2 = classification_report(y_test, y_pred2_rf, target_names=['Not Survive', 'Survive'])
train_accuracy_rf2 = accuracy_score(y_train, y_train_pred2_rf)
train_report_rf2 = classification_report(y_train, y_train_pred2_rf)

print(f'accuracy of test: {test_accuracy_rf2}')
print(test_report_rf2)
print(f'accuracy of training: {train_accuracy_rf2}')
print(train_report_rf2)

In [None]:
# Regularised XGBoost: depth-1 trees, row subsampling and a minimum child weight.
xgb2 = XGBClassifier(
    max_depth=1,
    subsample=0.3,
    min_child_weight=3,
    random_state=42,
)
xgb2.fit(X_train_resampled, y_train_resampled)

# Test predictions/probabilities and training predictions for the overfitting check.
y_pred2_xgb = xgb2.predict(X_test)
y_proba2_xgb = xgb2.predict_proba(X_test)
y_train_pred2_xgb = xgb2.predict(X_train)

min_child_weight is similar to min_samples_leaf: a smaller weight allows more complexity.

In [None]:
# Test vs. training metrics of the regularised XGBoost model.
test_accuracy_xgb2 = accuracy_score(y_test, y_pred2_xgb)
test_report_xgb2 = classification_report(y_test, y_pred2_xgb, target_names=['Not Survive', 'Survive'])
train_accuracy_xgb2 = accuracy_score(y_train, y_train_pred2_xgb)
train_report_xgb2 = classification_report(y_train, y_train_pred2_xgb)

print(f'accuracy of test: {test_accuracy_xgb2}')
print(test_report_xgb2)
print(f'accuracy of training: {train_accuracy_xgb2}')
print(train_report_xgb2)

In [None]:
# Predictions and probabilities of the tuned models (logistic regression is unchanged).
predictions2=[y_pred_lr,y_pred2_dt,y_pred2_rf,y_pred2_xgb]
# Use the tuned models' probabilities as well: zipping with the old `probas` list made the
# ROC AUC column of the second results table describe the untuned models.
probas2=[y_proba_lr,y_proba2_dt,y_proba2_rf,y_proba2_xgb]
names_predictions2_probas=list(zip(models_names,predictions2,probas2))

In [None]:
# One row of test-set metrics per entry of names_predictions2_probas;
# ROC AUC uses the probability of the positive class.
results = [
    {
        'model': name,
        'accuracy': accuracy_score(y_test, prediction),
        'precision': precision_score(y_test, prediction),
        'recall': recall_score(y_test, prediction),
        'roc auc': roc_auc_score(y_test, proba[:, 1]),
        'f1': f1_score(y_test, prediction),
    }
    for name, prediction, proba in names_predictions2_probas
]

# Index by model name and rank by F1, best first.
results_df = pd.DataFrame(results).set_index('model').sort_values(by='f1', ascending=False)

display(results_df)
