In [11]:
# Oversampling using SMOTE

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from imblearn.over_sampling import SMOTE

In [12]:
df = pd.read_csv('Datasets/titanic.csv')
df.head(2)

Unnamed: 0,survived,pclass,gender,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C


In [13]:
df = df[['survived', 'pclass', 'gender', 'age', 'sibsp', 'fare']].dropna()

In [14]:
le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])

In [15]:
X = df.drop('survived', axis=1)
y = df['survived']

In [16]:
# Check class distribution before SMOTE

print("Class distribution before SMOTE:")
print(y.value_counts())

Class distribution before SMOTE:
survived
0    424
1    290
Name: count, dtype: int64


In [17]:
# Split data
# stratify = y ... maintains class distribution in training and test sets
# e.g., If y (survived) has 60% as class 0 and 40% as class 1, then:
# With stratify = y, both y_train and y_test will maintain that 60-40 ratio
# Without stratify = y, the split could be random (unbalanced)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42, stratify = y)

In [18]:
# ------------------------ Before SMOTE ------------------------

model = LogisticRegression(max_iter = 500)
model.fit(X_train, y_train)
y_pred_before = model.predict(X_test)
print("\nBefore SMOTE:")
print("Accuracy:", accuracy_score(y_test, y_pred_before))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_before))


Before SMOTE:
Accuracy: 0.7953488372093023
Confusion Matrix:
 [[106  22]
 [ 22  65]]


In [19]:
# --------------------------- Apply SMOTE ------------------------------

smote = SMOTE(random_state = 42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [20]:
# Check class distribution after SMOTE

print("\nClass distribution after SMOTE:")
print(pd.Series(y_train_res).value_counts())


Class distribution after SMOTE:
survived
0    296
1    296
Name: count, dtype: int64


In [21]:
# ---------------------------- After SMOTE ----------------------------------

model_smote = LogisticRegression(max_iter = 500)
model_smote.fit(X_train_res, y_train_res)
y_pred_after = model_smote.predict(X_test)
print("\nAfter SMOTE:")
print("Accuracy:", accuracy_score(y_test, y_pred_after))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_after))


After SMOTE:
Accuracy: 0.7813953488372093
Confusion Matrix:
 [[101  27]
 [ 20  67]]
