In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from category_encoders import MEstimateEncoder
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
# Location of the Titanic CSV file.
# NOTE(review): this is an absolute, user-specific path, so the notebook will not
# run on another machine as-is — consider a path relative to a configurable data dir.
filepath = "/Users/balqeesjabri/Downloads/titanic.csv"
# Load the dataset; later cells derive the features (X) and target (y) from it.
titanic_data = pd.read_csv(filepath)

In [2]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Assuming 'preds' contains the predicted labels for the validation set, and 'y_valid' contains the true labels
def evaluate_classification_model(y_valid, preds):
    """Print precision, recall, F1-score and the confusion matrix of predictions.

    Parameters
    ----------
    y_valid : array-like
        True labels of the validation set.
    preds : array-like
        Predicted labels for the same samples, in the same order.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # Compute every metric up front, then report them in a fixed order.
    scores = [
        ("Precision:", precision_score(y_valid, preds)),
        ("Recall:", recall_score(y_valid, preds)),
        ("F1-score:", f1_score(y_valid, preds)),
    ]
    matrix = confusion_matrix(y_valid, preds)

    for label, value in scores:
        print(label, value)
    print("Confusion Matrix:")
    print(matrix)




In [3]:
# Number of distinct values in each object-dtype column.
titanic_data.select_dtypes(include=["object"]).nunique()

Name        891
Sex           2
Ticket      681
Cabin       147
Embarked      3
dtype: int64

In [4]:
# Drop 'Name' and 'PassengerId'. 'Name' is unique for every sample (891 distinct
# values), so it is unlikely to help the model make accurate predictions;
# 'PassengerId' is presumably just a row identifier — TODO confirm.
# errors='ignore' keeps this cell idempotent: because it reassigns `titanic_data`,
# re-running it after the columns are already gone would otherwise raise KeyError.
titanic_data = titanic_data.drop(columns=['Name', 'PassengerId'], errors='ignore')

In [5]:
# Names of all object-dtype columns, in their original column order.
categorical_cols = [
    name
    for name, dtype in titanic_data.dtypes.items()
    if dtype == "object"
]
print(categorical_cols)

['Sex', 'Ticket', 'Cabin', 'Embarked']


In [6]:
# Inspect the columns that remain after dropping 'Name' and 'PassengerId'.
titanic_data.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')

In [7]:
# NOTE(review): this repeats the previous cell's column check verbatim;
# one of the two cells can likely be removed.
titanic_data.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')

In [8]:
from sklearn.model_selection import train_test_split

# Target is the 'Survived' column; every other column is a feature.
y = titanic_data["Survived"]
X = titanic_data.drop(columns=["Survived"])

# Hold out part of the rows with a fixed seed (default split size).
# Naming note: the held-out "encode" split is used by later cells both to fit the
# target encoder and as the validation set (X_valid / y_encode).
X_train_full, X_encode_full, y_train, y_encode = train_test_split(
    X,
    y,
    random_state=0,
)


In [9]:
from category_encoders import MEstimateEncoder

# M-estimate target encoder over the object-dtype columns, with m=5.0.
encoder = MEstimateEncoder(cols=categorical_cols, m=5.0)

# Learn the encoding from the held-out "encode" split only.
encoder.fit(X=X_encode_full, y=y_encode)

# Apply the fitted encoding to that same split.
# NOTE(review): X_encoded is not referenced by any later cell visible in this notebook.
X_encoded = encoder.transform(X=X_encode_full, y=y_encode)


In [10]:
# Apply the already-fitted encoder to the training split.
# NOTE(review): X_train_encoded is not referenced by any later cell visible in this
# notebook; the model pipeline is fitted on X_train, built from the raw X_train_full.
X_train_encoded = encoder.transform(X=X_train_full, y=y_train)


In [11]:
# Object-dtype columns are treated as categorical features.
categorical_features = [
    name
    for name, dtype in X_train_full.dtypes.items()
    if dtype == "object"
]

# int64 / float64 columns are treated as numerical features.
numerical_features = [
    name
    for name, dtype in X_train_full.dtypes.items()
    if dtype in ("int64", "float64")
]

# Restrict both splits to the selected columns (categorical first, then numerical).
# The held-out "encode" split serves as the validation set from here on.
my_cols = categorical_features + numerical_features
X_train = X_train_full.loc[:, my_cols].copy()
X_valid = X_encode_full.loc[:, my_cols].copy()

In [12]:
# import library
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

# Random oversampler with a fixed seed so the resampling is repeatable.
ros = RandomOverSampler(random_state=42)
# Fit on the predictors and target, and get back the resampled copies.
# NOTE(review): this resamples the full X / y (before the train/validation split),
# and X_ros / y_ros are not referenced by any later cell visible in this notebook.
# If they are ever used for training, resample the training split only; otherwise
# duplicated rows could end up on both sides of the split.
X_ros, y_ros = ros.fit_resample(X, y)

# Compare the class counts before and after resampling.
print('Original dataset shape', Counter(y))
print('Resample dataset shape', Counter(y_ros))

Original dataset shape Counter({0: 549, 1: 342})
Resample dataset shape Counter({0: 549, 1: 549})


In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from imblearn.under_sampling import TomekLinks
from xgboost import XGBClassifier
# Relies on numerical_features and categorical_features defined in an earlier cell.

# Step 1: Create transformers for numerical and categorical features.
# Numerical: fill missing values with the column mean, then standardize.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical: fill missing values with the most frequent category, then
# ordinal-encode; categories not seen during fit are encoded as -1.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Step 2: Create a ColumnTransformer to apply each transformer to its own column group.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])



# Step 3: Define the model.
my_model = XGBClassifier(n_estimators=900, learning_rate=0.07)

# Step 4: Create the pipeline: preprocessing followed by the model.
# NOTE(review): TomekLinks is imported above but is not a step of this pipeline,
# so no under-sampling happens here.
my_pipeline = Pipeline(steps=[

    ('preprocessor', preprocessor),
    ('model', my_model)
])


In [14]:
# Fit the preprocessing steps and the model on the training split.
my_pipeline.fit(X_train, y_train)

# Run the validation split through the fitted preprocessing and predict its labels.
preds = my_pipeline.predict(X_valid)

In [16]:
# Score the predictions against y_encode: X_valid was built from X_encode_full,
# so y_encode holds the matching true labels.
evaluate_classification_model(y_encode, preds)

Precision: 0.7752808988764045
Recall: 0.8214285714285714
F1-score: 0.7976878612716762
Confusion Matrix:
[[119  20]
 [ 15  69]]
