In [1]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Load the classification dataset.
# The CSV location can be overridden with the TITANIC_TRAIN_CSV environment
# variable so the notebook is not tied to one machine; the original local path
# is kept as the fallback so existing runs behave exactly as before.
TRAIN_CSV_PATH = os.environ.get(
    "TITANIC_TRAIN_CSV",
    r"C:\Users\basde\OneDrive\Documenten\GitHub\Titanic\train.csv",
)
train = pd.read_csv(TRAIN_CSV_PATH)

# Create feature matrix (X) and target vector (y).
# Only numeric columns are kept; the target ('Survived') and the row identifier
# ('PassengerId') are removed from the features.
X = train.select_dtypes(include=np.number).drop(columns=['Survived', 'PassengerId'])
y = train['Survived']

print(f"Missing values in 'Age': {X['Age'].isnull().sum()}")

Missing values in 'Age': 177


In [4]:
# a. Candidate classifiers, keyed by the label shown in the results table
models = dict([
    ("Logistic Regression", LogisticRegression(random_state=42, max_iter=1000)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
])

# b. Preprocessing variants to compare: each one median-imputes missing values
#    and then applies a different scaler
transformation_pipelines = {
    label: Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', scaler),
    ])
    for label, scaler in (
        ('Standard Scaler', StandardScaler()),
        ('MinMax Scaler', MinMaxScaler()),
    )
}

# c. Scorers passed to cross_validate; the keys become the suffixes of the
#    'train_<key>' / 'test_<key>' entries in its result
scoring_metrics = dict(
    accuracy='accuracy',
    f1_score='f1_weighted',
    precision='precision_weighted',
    recall='recall_weighted',
)

# d. Shuffled 5-fold split with a fixed seed (plain KFold, i.e. not stratified)
cv_strategy = KFold(n_splits=5, random_state=42, shuffle=True)

In [5]:
def _summarize_cv_scores(cv_scores):
    """Average the per-fold metrics of a ``cross_validate`` result.

    Parameters
    ----------
    cv_scores : dict
        Result of ``cross_validate`` run with ``scoring=scoring_metrics`` and
        ``return_train_score=True``.

    Returns
    -------
    dict
        Mean train F1, mean CV (test-fold) F1 and mean CV accuracy.
    """
    return {
        'Train F1-Score': cv_scores['train_f1_score'].mean(),
        'CV F1-Score': cv_scores['test_f1_score'].mean(),
        'CV Accuracy': cv_scores['test_accuracy'].mean(),
    }


# Dropping 'Age' does not depend on the model, so do it once outside the loop.
X_dropped = X.drop(columns=['Age'])

# One record per (model, preprocessing technique); the results table is built
# from this list in a single step instead of growing a DataFrame in the loop.
result_rows = []

# --- Main Loop ---
for model_name, model in models.items():
    print(f"--- Evaluating Model: {model_name} ---")

    # Map each technique label to the (estimator, feature matrix) it is evaluated on.
    # cross_validate fits clones of the estimator it is given, so sharing the same
    # `model` / `preprocessor` instances between candidates is safe.
    candidates = {
        # a. "Feature Dropped": bare model on the features without 'Age'
        'Feature Dropped': (model, X_dropped),
        # b. "Imputation Only" baseline: median imputation, no scaling
        'Baseline (Imputation Only)': (
            Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('model', model),
            ]),
            X,
        ),
    }
    # c. Transformation pipelines: imputation + scaling, then the model
    for tech_name, preprocessor in transformation_pipelines.items():
        candidates[tech_name] = (
            Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('model', model),
            ]),
            X,
        )

    # d. Evaluate every candidate with the same CV splits and scorers
    for tech_name, (estimator, features) in candidates.items():
        scores = cross_validate(
            estimator, features, y, cv=cv_strategy,
            scoring=scoring_metrics, return_train_score=True
        )
        result_rows.append({
            'Preprocessing Technique': tech_name,
            **_summarize_cv_scores(scores),
            'Model': model_name,
        })

# e. Final processing for the results table
all_results = pd.DataFrame(result_rows)
# Ratio of CV to train F1: values near 1 mean train and CV scores are close,
# lower values mean the model scores much better on data it was fitted on.
all_results['Generalization'] = all_results['CV F1-Score'] / all_results['Train F1-Score']
all_results = all_results.sort_values(by='CV F1-Score', ascending=False)

--- Evaluating Model: Logistic Regression ---
--- Evaluating Model: Random Forest ---


In [6]:
# Put the identifying columns first, then the cross-validation metrics,
# then the training-side diagnostics
final_columns_order = [
    'Model', 'Preprocessing Technique',
    'CV F1-Score', 'CV Accuracy',
    'Train F1-Score', 'Generalization',
]
all_results = all_results.loc[:, final_columns_order]

all_results

Unnamed: 0,Model,Preprocessing Technique,CV F1-Score,CV Accuracy,Train F1-Score,Generalization
3,Logistic Regression,MinMax Scaler,0.690915,0.707024,0.692252,0.998068
7,Random Forest,MinMax Scaler,0.682298,0.68459,0.960595,0.710286
2,Logistic Regression,Standard Scaler,0.680763,0.698054,0.686705,0.991347
5,Random Forest,Baseline (Imputation Only),0.679824,0.682336,0.960595,0.707711
1,Logistic Regression,Baseline (Imputation Only),0.679347,0.696931,0.687389,0.9883
6,Random Forest,Standard Scaler,0.6764,0.678978,0.960595,0.704147
4,Random Forest,Feature Dropped,0.673472,0.678991,0.84542,0.796613
0,Logistic Regression,Feature Dropped,0.672074,0.686837,0.671734,1.000506
