In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv(r"C:\Users\basde\OneDrive\Documenten\GitHub\Titanic\train.csv")
test = pd.read_csv(r"C:\Users\basde\OneDrive\Documenten\GitHub\Titanic\test.csv")

In [3]:
train['Sex'] = train['Sex'].map({'male': 0, 'female': 1})
test['Sex'] = test['Sex'].map({'male': 0, 'female': 1})

In [4]:
train.drop('PassengerId', axis=1, inplace=True)
test.drop('PassengerId', axis=1, inplace=True)

In [5]:
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    int64  
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    int64  
 3   Age       332 non-nu

In [6]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
imputer.fit(train[['Fare']])
test['Fare'] = imputer.transform(test[['Fare']])
train['Fare'] = np.log1p(train['Fare'])
test['Fare'] = np.log1p(test['Fare'])

In [7]:
import re
def add_title_column(df):
    """
    Cleans the 'Name' column and adds a 'Title' column directly 
    to the input DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame to modify in-place.
    """
    # 1. Clean the Name column (remove text in parentheses and quotes)
    # This series is temporary and will be used to build the 'Title'
    cleaned_names = df['Name'].apply(lambda x: re.sub(r'\([^)]*\)', '', x).strip())
    cleaned_names = cleaned_names.str.replace(r'"[^"]*"', '', regex=True).str.strip()

    # 2. Extract the part of the name after the comma
    name_part = cleaned_names.str.split(',').str.get(1)

    # 3. Extract the Title from the remaining part of the name
    extracted_title = name_part.str.split('.').str.get(0).str.strip()
    
    # 4. Standardize the common titles
    title_mapping = {
        'Mlle': 'Miss',
        'Ms': 'Miss',
        'Mme': 'Mrs'
    }
    extracted_title = extracted_title.replace(title_mapping)

    # 5. Define a list of common titles
    common_titles = ['Mr', 'Miss', 'Mrs', 'Master']

    # 6. Create the 'Title' column directly on the DataFrame
    # Categorize any title not in common_titles as 'Rare'
    df['Titel'] = extracted_title.apply(lambda x: x if x in common_titles else 'Rare')
    df.drop('Name', axis=1, inplace=True)

In [8]:

add_title_column(train)
add_title_column(test)



In [9]:
test['Titel'].value_counts()

Titel
Mr        240
Miss       79
Mrs        72
Master     21
Rare        6
Name: count, dtype: int64

In [16]:
y = train['Survived']
train.drop(['Survived','Cabin','Embarked', 'Ticket'], axis=1, inplace=True)

In [22]:
from sklearn.base import BaseEstimator, TransformerMixin
import itertools
import warnings

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
# Prevents wrapping by allowing unlimited horizontal width
 

In [23]:
class GroupedMedianImputer(BaseEstimator, TransformerMixin):
    """
    A custom transformer to impute missing values in 'Age' based on the
    median of groups. It expects 'Title' to be pre-calculated.
    """
    def __init__(self, group_cols):
        self.group_cols = group_cols
        self.medians = {}
        self.global_median = 0

    def fit(self, X, y=None):
        X_ = X.copy()
        # Calculate medians based on the specified grouping columns
        if self.group_cols:
            self.medians = X_.groupby(self.group_cols)['Age'].median()
        self.global_median = X_['Age'].median()
        return self

    def transform(self, X):
        X_ = X.copy()
        
        # Impute 'Age'
        if self.group_cols:
            X_['Age'] = X_.groupby(self.group_cols)['Age'].transform(lambda x: x.fillna(x.median()))
        
        # Fill any remaining NaNs (for groups that might not be in the test set)
        X_['Age'] = X_['Age'].fillna(self.global_median)
        
        # Drop non-numeric columns used for grouping to prepare data for the model
        X_ = X_.select_dtypes(include=np.number)
        return X_

# 1. Imports are handled above

# 2. Setup
# a. Define columns to process
columns_to_process = ['Age']

# b. Define cross-validation strategy
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# c. Define evaluation models
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42)
}

# d. Define techniques dictionary
techniques = {}
grouping_features = ['Pclass', 'Titel'] 
for i in range(1, len(grouping_features) + 1):
    for combo in itertools.combinations(grouping_features, i):
        combo_list = list(combo)
        name = f"Median Impute by {'_&_'.join(combo_list)}"
        techniques[name] = GroupedMedianImputer(group_cols=combo_list)

# 3. Execution and Evaluation
# This dictionary will hold all results for final comparison
all_results_train = pd.DataFrame()

for model_name, model in models.items():
    print(f"--- Evaluating Model: {model_name} ---\n")
    results = {}
    
    # Create a numeric-only version for baseline and feature-dropped scenarios
    train_numeric = train.select_dtypes(include=np.number)
    
    # b. Baseline Evaluation
    # For a fair baseline, we'll impute 'Age' with the global median
    train_baseline = train_numeric.copy()
    train_baseline['Age'] = train_baseline['Age'].fillna(train_numeric['Age'].median())
    
    baseline_scores = cross_validate(model, train_baseline, y, cv=cv, scoring='accuracy', return_train_score=True)
    results['Baseline (Global Median)'] = {
        'Train Score Mean': baseline_scores['train_score'].mean(),
        'Train Score Std': baseline_scores['train_score'].std(),
        'CV Score Mean': baseline_scores['test_score'].mean(),
        'CV Score Std': baseline_scores['test_score'].std()
    }

    # c. Technique Evaluation Loop
    for name, imputer in techniques.items():
        # The imputer is the first step, it receives the train with 'Title' and outputs a numeric train
        pipeline = Pipeline(steps=[
            ('imputer', imputer),
            ('model', model)
        ])
        
        # Perform cross-validation on the pipeline
        # The full train (with 'Title') is passed to the pipeline
        scores = cross_validate(pipeline, train, y, cv=cv, scoring='accuracy', return_train_score=True)
        
        results[name] = {
            'Train Score Mean': scores['train_score'].mean(),
            'Train Score Std': scores['train_score'].std(),
            'CV Score Mean': scores['test_score'].mean(),
            'CV Score Std': scores['test_score'].std()
        }

    # d. Feature Dropped Evaluation
    train_dropped = train_numeric.drop(columns=columns_to_process)
    dropped_scores = cross_validate(model, train_dropped, y, cv=cv, scoring='accuracy', return_train_score=True)
    results['Feature Dropped'] = {
        'Train Score Mean': dropped_scores['train_score'].mean(),
        'Train Score Std': dropped_scores['train_score'].std(),
        'CV Score Mean': dropped_scores['test_score'].mean(),
        'CV Score Std': dropped_scores['test_score'].std()
    }

    # 4. Conclusion for the current model
    # Convert results to a DataFrame for clear comparison
    model_results_train = pd.DataFrame.from_dict(results, orient='index')
    model_results_train['Model'] = model_name
    model_results_train = model_results_train.sort_values(by='CV Score Mean', ascending=False)
    all_results_train = pd.concat([all_results_train, model_results_train])


# Final Conclusion: Print all results
print("--- Final Comparison Across All Models ---\n")
# Format the output for better readability
pd.set_option('display.float_format', '{:.4f}'.format)
print(all_results_train[['Model', 'CV Score Mean', 'CV Score Std', 'Train Score Mean', 'Train Score Std']])

--- Evaluating Model: Logistic Regression ---

--- Evaluating Model: Random Forest ---

--- Final Comparison Across All Models ---

                                               Model  CV Score Mean  \
Median Impute by Pclass_&_Titel  Logistic Regression         0.8013   
Median Impute by Pclass          Logistic Regression         0.7991   
Median Impute by Titel           Logistic Regression         0.7968   
Baseline (Global Median)         Logistic Regression         0.7957   
Feature Dropped                  Logistic Regression         0.7946   
Baseline (Global Median)               Random Forest         0.8126   
Median Impute by Pclass                Random Forest         0.8002   
Median Impute by Pclass_&_Titel        Random Forest         0.8002   
Feature Dropped                        Random Forest         0.7991   
Median Impute by Titel                 Random Forest         0.7812   

                                 CV Score Std  Train Score Mean  \
Median Impute by Pc

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
column_to_encode = 'Titel' 
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# 2. Fit and transform the data
# The double brackets around the column name create a DataFrame slice, which is the expected input format.
one_hot_encoded = ohe.fit_transform(train[[column_to_encode]])

# 3. Create a new DataFrame with the one-hot encoded columns
# The `get_feature_names_out()` method provides meaningful names for the new columns.
one_hot_df = pd.DataFrame(one_hot_encoded, columns=ohe.get_feature_names_out([column_to_encode]))

# 4. Concatenate the new DataFrame with the original DataFrame
# We use `train.index` to ensure the rows align correctly.
train = pd.concat([train, one_hot_df], axis=1)

# 5. Drop the original column
train = train.drop(column_to_encode, axis=1)

In [None]:
one_hot_encoded = ohe.transform(test[[column_to_encode]])

# 3. Create a new DataFrame with the one-hot encoded columns
# The `get_feature_names_out()` method provides meaningful names for the new columns.
one_hot_df = pd.DataFrame(one_hot_encoded, columns=ohe.get_feature_names_out([column_to_encode]))

# 4. Concatenate the new DataFrame with the original DataFrame
# We use `test.index` to ensure the rows align correctly.
test = pd.concat([test, one_hot_df], axis=1)

# 5. Drop the original column
test = test.drop(column_to_encode, axis=1)

In [None]:
train.info()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import itertools
import warnings

In [None]:
train.columns