In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv(r"C:\Users\basde\OneDrive\Documenten\GitHub\Titanic\train.csv")

In [3]:
train['Sex'] = train['Sex'].map({'male': 0, 'female': 1})

In [4]:
train['Fare'] = np.log1p(train['Fare'])


In [5]:
import re
def add_title_column(df):
    """
    Cleans the 'Name' column and adds a 'Title' column directly 
    to the input DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame to modify in-place.
    """
    # 1. Clean the Name column (remove text in parentheses and quotes)
    # This series is temporary and will be used to build the 'Title'
    cleaned_names = df['Name'].apply(lambda x: re.sub(r'\([^)]*\)', '', x).strip())
    cleaned_names = cleaned_names.str.replace(r'"[^"]*"', '', regex=True).str.strip()

    # 2. Extract the part of the name after the comma
    name_part = cleaned_names.str.split(',').str.get(1)

    # 3. Extract the Title from the remaining part of the name
    extracted_title = name_part.str.split('.').str.get(0).str.strip()
    
    # 4. Standardize the common titles
    title_mapping = {
        'Mlle': 'Miss',
        'Ms': 'Miss',
        'Mme': 'Mrs'
    }
    extracted_title = extracted_title.replace(title_mapping)

    # 5. Define a list of common titles
    common_titles = ['Mr', 'Miss', 'Mrs', 'Master']

    # 6. Create the 'Title' column directly on the DataFrame
    # Categorize any title not in common_titles as 'Rare'
    df['Titel'] = extracted_title.apply(lambda x: x if x in common_titles else 'Rare')
    df.drop('Name', axis=1, inplace=True)

In [6]:
add_title_column(train)

In [7]:
y = train['Survived']
X_train = train.drop(['Survived','Cabin','Embarked', 'Ticket'], axis=1)

In [8]:
import itertools
import warnings
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier

In [9]:
class GroupedMedianImputer(BaseEstimator, TransformerMixin):
    """
    A custom transformer to impute missing 'Age' values based on the
    median of specified groups.
    """
    def __init__(self, group_cols):
        # Initialize with the column(s) to group by (e.g., ['Titel', 'Pclass'])
        self.group_cols = group_cols
        # Dictionary to store the median 'Age' for each group
        self.medians = {}
        # A fallback median for the entire dataset
        self.global_median = 0

    def fit(self, X, y=None):
        """Learns the median 'Age' from the training data for each group."""
        X_ = X.copy()
        
        # Calculate and store the median 'Age' for each group
        if self.group_cols:
            self.medians = X_.groupby(self.group_cols)['Age'].median()
        
        # Calculate and store the overall median 'Age' as a fallback
        self.global_median = X_['Age'].median()
        
        # This is standard practice in scikit-learn
        return self

    def transform(self, X):
        """Applies the learned medians to fill missing 'Age' values."""
        X_ = X.copy()
        
        # Impute 'Age' using the median of each group
        if self.group_cols:
            X_['Age'] = X_.groupby(self.group_cols)['Age'].transform(lambda x: x.fillna(x.median()))
        
        # Fill any remaining missing 'Age' values with the global median
        # (This handles groups in the test set that weren't in the train set)
        X_['Age'] = X_['Age'].fillna(self.global_median)
        
        # Drop non-numeric columns to ensure the output is ready for a model
        X_ = X_.select_dtypes(include=np.number)
        
        return X_

In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X_train['Titel'] = le.fit_transform(X_train['Titel'])

In [11]:
# 2. Setup
# a. Define columns to process
columns_to_process = ['Age']

# b. Define cross-validation strategy
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# c. Define evaluation models
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000, random_state=42),
    "Ridge Classifier": RidgeClassifier(),
    "AdaBoost": AdaBoostClassifier()
}

scoring_metrics = {
    'accuracy': 'accuracy',
    'precision': 'precision_weighted', 
    'recall': 'recall_weighted',
    'f1_score': 'f1_weighted'
}

# b. Define cross-validation strategy
cv = KFold(n_splits=5, shuffle=True, random_state=42)
# d. Define techniques dictionary
techniques = {}
grouping_features = ['Pclass', 'Titel'] 
for i in range(1, len(grouping_features) + 1):
    for combo in itertools.combinations(grouping_features, i):
        combo_list = list(combo)
        name = f"Median Impute by {'_&_'.join(combo_list)}"
        techniques[name] = GroupedMedianImputer(group_cols=combo_list)


In [12]:
results_train = pd.DataFrame()
for model_name, model in models.items():
    print(f"--- Evaluating Model: {model_name} ---\n")
    results = {}
    

    
    # b. Baseline Evaluation
    # For a fair baseline, we'll impute 'Age' with the global median
    train_baseline = X_train.copy()
    train_baseline['Age'] = train_baseline['Age'].fillna(X_train['Age'].median())
    baseline_scores = cross_validate(
        model, 
        X=train_baseline.drop('Titel', axis=1), 
        y= y, 
        cv=cv, 
        scoring=scoring_metrics, 
        return_train_score=True
    )
    
    results['Baseline'] = {
        'Train accuracy Mean': baseline_scores['train_accuracy'].mean(),
        'CV accuracy Mean': baseline_scores['test_accuracy'].mean(),
        'train_f1_Mean': baseline_scores['train_f1_score'].mean(),
        'CV_f1_Mean': baseline_scores['test_f1_score'].mean()
    }

 
    for name, imputer in techniques.items():
        # The imputer is the first step
        pipeline = Pipeline(steps=[
            ('imputer', imputer),
            ('model', model)
        ])
        
        # Perform cross-validation on the pipeline
        # The full train is passed to the pipeline
        scores = cross_validate(pipeline, X_train, y, cv=cv, scoring=scoring_metrics, return_train_score=True)
        
        results[name] = {
        'Train accuracy Mean': scores['train_accuracy'].mean(),
        'CV accuracy Mean': scores['test_accuracy'].mean(),
        'train_f1_Mean': scores['train_f1_score'].mean(),
        'CV_f1_Mean': scores['test_f1_score'].mean()
        }

    # d. Feature Dropped Evaluation*
    train_dropped = X_train.drop(columns=columns_to_process)
    dropped_scores = cross_validate(model, train_dropped, y, cv=cv, scoring=scoring_metrics, return_train_score=True)
    results['Feature Dropped'] = {
        'Train accuracy Mean': dropped_scores['train_accuracy'].mean(),
        'CV accuracy Mean': dropped_scores['test_accuracy'].mean(),
        'train_f1_Mean': dropped_scores['train_f1_score'].mean(),
        'CV_f1_Mean': dropped_scores['test_f1_score'].mean()
        }
    
    model_results_train = pd.DataFrame.from_dict(results, orient='index')
    model_results_train.insert(loc=0, column='Model', value = 'a')
    model_results_train['Model'] = model_name
    results_train = pd.concat([results_train, model_results_train])
    results_train['Overfitting Score'] = results_train['train_f1_Mean'] - results_train['CV_f1_Mean']
    results_train['Generalization Ratio'] = results_train['CV_f1_Mean'] / results_train['train_f1_Mean']
    results_train = results_train.sort_values(by='CV_f1_Mean', ascending=False)

--- Evaluating Model: Logistic Regression ---

--- Evaluating Model: Ridge Classifier ---

--- Evaluating Model: AdaBoost ---



In [13]:
results_train

Unnamed: 0,Model,Train accuracy Mean,CV accuracy Mean,train_f1_Mean,CV_f1_Mean,Overfitting Score,Generalization Ratio
Median Impute by Pclass_&_Titel,AdaBoost,0.828843,0.820438,0.827523,0.818651,0.008872,0.989278
Median Impute by Titel,AdaBoost,0.82716,0.818191,0.825635,0.816315,0.00932,0.988712
Median Impute by Pclass,AdaBoost,0.824916,0.815944,0.823637,0.815027,0.00861,0.989546
Feature Dropped,AdaBoost,0.82772,0.808085,0.826484,0.806241,0.020243,0.975507
Baseline,AdaBoost,0.814534,0.805831,0.813776,0.804658,0.009117,0.988796
Median Impute by Pclass,Logistic Regression,0.807237,0.799084,0.805415,0.797176,0.008239,0.98977
Median Impute by Pclass,Ridge Classifier,0.801627,0.796874,0.799391,0.79472,0.004672,0.994156
Feature Dropped,Ridge Classifier,0.797138,0.796887,0.794723,0.794236,0.000487,0.999388
Median Impute by Pclass_&_Titel,Ridge Classifier,0.800786,0.795744,0.798787,0.793641,0.005146,0.993558
Median Impute by Pclass_&_Titel,Logistic Regression,0.806957,0.795706,0.804929,0.793166,0.011763,0.985386


NameError: name 'X_' is not defined