In [2]:
# Resampling using bootstrapping and k-fold cross validation

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import seaborn as sns

In [3]:
df = pd.read_csv('Datasets/titanic.csv')

In [4]:
df.head(2)

Unnamed: 0,survived,pclass,gender,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C


In [5]:
df = df[['survived', 'pclass', 'gender', 'age', 'sibsp', 'fare']].dropna()

In [8]:
le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])
df['gender']

0      1
1      0
2      0
3      0
4      1
      ..
885    0
886    1
887    0
889    1
890    1
Name: gender, Length: 714, dtype: int64

In [10]:
X = df.drop('survived', axis = 1)
y = df['survived']

In [13]:
# Bootstrapping

print("\n--- Bootstrapping Example ---")
n_iterations = 100 # Number of bootstrap samples (i.e., how many times to repeat)
n_size = int(len(X) * 0.80) # Each bootstrap sample size (e.g., 80% of original data)
bootstrap_scores = [] # To store accuracy of each bootstrap iteration

for i in range(n_iterations):

    # Resample with replacement
    # Each sample may include repeated rows from the original dataset

    X_resample, y_resample = resample(X, y, n_samples = n_size, random_state = 1)

    # Train/Test split

    X_train, X_test, y_train, y_test = train_test_split(X_resample, y_resample, test_size = 0.20, random_state = i)

    model = LogisticRegression(max_iter = 500)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    bootstrap_scores.append(acc)

print(f"Average accuracy over {n_iterations} bootstrapped samples: {np.mean(bootstrap_scores):.3f}")    


--- Bootstrapping Example ---
Average accuracy over 100 bootstrapped samples: 0.789


In [18]:
# K-Fold Cross-validation

print("\n--- K-Fold Cross-Validation ---")
kf = KFold(n_splits = 5, shuffle = True, random_state = 1) # 5 Folds
model = LogisticRegression(max_iter = 500)

# Perform K-Fold CV
cv_scores = cross_val_score(model, X, y, cv = kf, scoring = 'accuracy')

print(f'Accuracy scores for each fold: {cv_scores}')
print(f'Mean CV accuracy: {np.mean(cv_scores):.3f}')


--- K-Fold Cross-Validation ---
Accuracy scores for each fold: [0.81118881 0.7972028  0.78321678 0.7972028  0.83098592]
Mean CV accuracy: 0.804
