In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from category_encoders import MEstimateEncoder
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
# Location of the Titanic CSV. The TITANIC_CSV environment variable overrides
# the path so the notebook can run on another machine; the original local path
# is kept as the fallback, so existing behavior is unchanged when it is unset.
filepath = os.environ.get("TITANIC_CSV", "/Users/balqeesjabri/Downloads/titanic.csv")
titanic_data = pd.read_csv(filepath)

In [2]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

def evaluate_classification_model(y_valid, preds):
    """Print precision, recall, F1, ROC AUC and the confusion matrix.

    Every metric is computed by the matching ``sklearn.metrics`` function,
    called as ``metric(y_valid, preds)``. All values are computed first and
    then printed; nothing is returned.

    Args:
        y_valid: Ground-truth labels.
        preds: Predictions, passed unchanged to every metric.

    NOTE(review): ``roc_auc_score`` receives the same ``preds`` as the other
    metrics, so if callers pass hard class labels the "ROC" line is an AUC
    computed from labels rather than from scores/probabilities -- confirm
    that this is intended.
    """
    # (printed label, metric function) pairs, in reporting order.
    scorers = (
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1-score:", f1_score),
        ('ROC', roc_auc_score),
    )

    # Evaluate everything up front so no output appears if a metric fails.
    report = [(label, scorer(y_valid, preds)) for label, scorer in scorers]
    matrix = confusion_matrix(y_valid, preds)

    for label, value in report:
        print(label, value)
    print("Confusion Matrix:")
    print(matrix)




In [3]:
# Number of distinct values in each object-typed column.
(
    titanic_data
    .select_dtypes(include=["object"])
    .nunique()
)

Name        891
Sex           2
Ticket      681
Cabin       147
Embarked      3
dtype: int64

In [4]:
# Drop 'Name' and 'PassengerId'. 'Name' is unique for every sample and
# 'PassengerId' is presumably a per-row identifier, so neither is likely to
# help the model make accurate predictions.
# errors="ignore" keeps this cell re-runnable: titanic_data is reassigned
# here, so on a second run the columns are already gone and drop() would
# otherwise raise a KeyError.
titanic_data = titanic_data.drop(columns=['Name', 'PassengerId'], errors="ignore")

In [5]:
# Names of all columns whose dtype is object, in column order.
categorical_cols = [
    name
    for name, col_dtype in titanic_data.dtypes.items()
    if col_dtype == "object"
]
print(categorical_cols)

['Sex', 'Ticket', 'Cabin', 'Embarked']


In [6]:
from sklearn.model_selection import train_test_split
# Separate the target column from the feature columns.
y = titanic_data["Survived"]
X = titanic_data.drop(columns=["Survived"])

# Hold out a validation set using train_test_split's default proportions;
# the fixed seed makes the split repeatable.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=0,
)


In [7]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
categorical_features = [
    name
    for name, col_dtype in X_train_full.dtypes.items()
    if col_dtype == "object"
]
numerical_features = [
    name
    for name, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ['int64', 'float64']
]

# Restrict both splits to those columns (categorical first), as copies so
# later edits do not touch the *_full frames.
my_cols = categorical_features + numerical_features
X_train = X_train_full.loc[:, my_cols].copy()
X_valid = X_valid_full.loc[:, my_cols].copy()

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler


# Numerical columns: fill missing values with the column mean, then standardize.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# map each category to an integer code; categories not seen during fit are
# encoded as -1.
# Alternative that was tried instead of the ordinal encoder: a target-encoding
# step, ('encoder', MEstimateEncoder(m=5.0)).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'encoderOrdinal',
        OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
    ),
])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])





In [9]:
# Gradient-boosted classifier: 850 estimators with a 0.06 learning rate.
my_model = XGBClassifier(learning_rate=0.06, n_estimators=850)

# Chain preprocessing and the classifier so they are fitted as one estimator.
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', my_model),
])

# Balancing with RandomOverSampler(random_state=0) on X_train / y_train was
# tried at this point and is disabled, so the pipeline is fitted on the
# unbalanced training split.
my_pipeline.fit(X_train, y_train)

In [10]:
from sklearn.model_selection import cross_val_predict
# Out-of-fold predictions for every row of the full dataset (5-fold
# cross-validation), followed by the metric summary against the true labels.
preds_cv = cross_val_predict(estimator=my_pipeline, X=X, y=y, cv=5)

evaluate_classification_model(y, preds_cv)

Precision: 0.7002583979328165
Recall: 0.7923976608187134
F1-score: 0.7434842249657064
ROC 0.7905522001725626
Confusion Matrix:
[[433 116]
 [ 71 271]]


Target encoding only:
Precision: 0.685459940652819
Recall: 0.6754385964912281
F1-score: 0.6804123711340206
Confusion Matrix:
[[443 106]
 [111 231]]

Target encoding with ros:
Precision: 0.6744186046511628
Recall: 0.6783625730994152
F1-score: 0.6763848396501456
Confusion Matrix:
[[437 112]
 [110 232]]

Ordinal encoding with ros:
Precision: 0.7002583979328165
Recall: 0.7923976608187134
F1-score: 0.7434842249657064
Confusion Matrix:
[[433 116]
 [ 71 271]]

Ordinal encoding only:
Precision: 0.7002583979328165
Recall: 0.7923976608187134
F1-score: 0.7434842249657064
ROC 0.7905522001725626
Confusion Matrix:
[[433 116]
 [ 71 271]]

Since the ordinal-encoding scores are identical with and without dataset balancing (ROS), I will not balance the dataset.

In [11]:
from sklearn.metrics import classification_report
# classification_report returns one formatted multi-line string; print it so
# the table renders with real line breaks instead of as an escaped repr.
print(classification_report(y, preds_cv))

'              precision    recall  f1-score   support\n\n           0       0.86      0.79      0.82       549\n           1       0.70      0.79      0.74       342\n\n    accuracy                           0.79       891\n   macro avg       0.78      0.79      0.78       891\nweighted avg       0.80      0.79      0.79       891\n'