In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Folder that holds the Titanic train/test CSVs. It defaults to the original
# author's local checkout; set the TITANIC_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
DATA_DIR = Path(os.environ.get(
    "TITANIC_DATA_DIR",
    r"C:\Users\basde\OneDrive\Documenten\GitHub\Titanic",
))

train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")

In [3]:
# Encode Sex as 0/1 so it can be used as a numeric feature.
# The dtype guard makes this cell safe to re-run: once the column is numeric,
# mapping it again would turn every value into NaN, because the integer codes
# are not keys of the mapping.
SEX_CODES = {'male': 0, 'female': 1}
for _frame in (train, test):
    if not pd.api.types.is_numeric_dtype(_frame['Sex']):
        _frame['Sex'] = _frame['Sex'].map(SEX_CODES)

In [4]:
# Print the column overview (non-null counts and dtypes) of both frames.
for _frame in (train, test):
    _frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(6), object(4)
memory usage: 83.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass  

In [5]:
from sklearn.impute import SimpleImputer
# Learn the median fare from the training frame only, then use that value to
# fill the missing fares in the test frame.
imputer = SimpleImputer(strategy='median').fit(train[['Fare']])
test['Fare'] = imputer.transform(test[['Fare']])

# Replace Fare with log(1 + Fare) in both frames.
# NOTE: this overwrites the column, so re-running the cell applies the
# transform a second time.
for _frame in (train, test):
    _frame['Fare'] = np.log1p(_frame['Fare'])

In [6]:
import re 
def add_title_column(df):
    """
    Derive each passenger's title from 'Name' and store it in a 'Titel' column.

    The frame is modified in place: a 'Titel' column is added and the 'Name'
    column is dropped. Titles are reduced to 'Mr', 'Miss', 'Mrs', 'Master'
    (with 'Mlle'/'Ms' folded into 'Miss' and 'Mme' into 'Mrs'); every other
    title, and every name from which no title can be extracted (including
    missing names), becomes 'Rare'.

    Args:
        df (pd.DataFrame): Frame with a 'Name' column whose values are
            expected to look like "Surname, Title. Given names".
            Modified in place.

    Returns:
        None

    Raises:
        KeyError: If df has no 'Name' column (e.g. the function was already
            applied to this frame).
    """
    # 1. Remove parenthesised text and quoted nicknames. The vectorised .str
    #    methods propagate missing values instead of raising on them.
    cleaned_names = (
        df['Name']
        .str.replace(r'\([^)]*\)', '', regex=True)
        .str.strip()
        .str.replace(r'"[^"]*"', '', regex=True)
        .str.strip()
    )

    # 2. Keep the part after the comma, then the part before the first period.
    name_part = cleaned_names.str.split(',').str.get(1)
    extracted_title = name_part.str.split('.').str.get(0).str.strip()

    # 3. Fold equivalent spellings into the common titles.
    title_mapping = {
        'Mlle': 'Miss',
        'Ms': 'Miss',
        'Mme': 'Mrs'
    }
    extracted_title = extracted_title.replace(title_mapping)

    # 4. Anything that is not a common title (NaN included) becomes 'Rare'.
    common_titles = ['Mr', 'Miss', 'Mrs', 'Master']
    df['Titel'] = extracted_title.where(extracted_title.isin(common_titles), 'Rare')

    # 5. The raw name is no longer needed once the title has been extracted.
    df.drop('Name', axis=1, inplace=True)

In [7]:
# Add the 'Titel' column to (and drop 'Name' from) both the train and test frames.
for _frame in (train, test):
    add_title_column(_frame)



In [8]:
# Display the dtype of every column currently in the training frame.
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
Titel           object
dtype: object

In [9]:
from sklearn.model_selection import KFold, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [10]:
# Remove the columns that are not used as features.
train.drop(columns=['Ticket', 'Cabin', 'Embarked'], inplace=True)
# pop() returns the 'Survived' column and removes it from train in one step,
# leaving train as the feature frame and y as the target.
y = train.pop('Survived')


In [11]:
# Compare ways of handling the categorical 'Titel' feature (dropped, ordinal
# encoded, one-hot encoded) for each model, using 5-fold cross-validated accuracy.

models_to_evaluate = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000, solver='liblinear'),
    'Random Forest': RandomForestClassifier(random_state=42)
}

# --- Setup shared by every model (loop-invariant, so defined once) ---
# Categorical column whose encoding is being compared
columns_to_process = ['Titel']
# Identify numeric columns automatically to apply imputation
numeric_features = train.select_dtypes(include=np.number).columns.tolist()

# Define the cross-validation strategy; the fixed seed gives every candidate
# the same folds
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scoring_objective = 'accuracy'  # The primary metric for evaluation

# Define the encoding techniques to be tested
# Each technique is a pipeline to first handle NaNs and then apply the encoding.
techniques = {
    'Nominal Encoding (Ordinal)': Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ]),
    'One-Hot Encoding': Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])
}


def _numeric_only_pipeline(model):
    """Return a median-imputation + model pipeline for frames without the categorical column."""
    return Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('model', model)
    ])


def _summarize_cv_results(results):
    """Collapse cross_validate outputs into one row of mean/std metrics per candidate.

    Args:
        results (dict): Maps a candidate name to the dict returned by
            cross_validate (called with return_train_score=True).

    Returns:
        pd.DataFrame: One row per candidate, sorted by 'CV Score Mean' descending.
    """
    summary = {
        name: {
            'CV Score Mean': scores['test_score'].mean(),
            'CV Score Std': scores['test_score'].std(),
            'Train Score Mean': scores['train_score'].mean(),
            'Train Score Std': scores['train_score'].std(),
            'Fit Time Mean': scores['fit_time'].mean()
        }
        for name, scores in results.items()
    }
    summary_frame = pd.DataFrame.from_dict(summary, orient='index')
    return summary_frame.sort_values(by='CV Score Mean', ascending=False)


# Summary table of every evaluated model, keyed by model name, so the results
# stay available after the loop instead of being overwritten each iteration.
evaluation_summaries = {}

# --- Main Evaluation Loop ---
# This loop will run the entire evaluation process for each specified model.
for model_name, model_object in models_to_evaluate.items():

    print(f"--- Starting Evaluation for: {model_name} ---")

    # Define the model for this iteration
    model = model_object
    results = {}

    # a. Baseline Evaluation
    # For the baseline, we impute numeric columns and drop the categorical column.
    # This provides a reference point for model performance without the feature.
    print("Evaluating: Baseline (Feature Dropped)")
    baseline_train = train.drop(columns=columns_to_process)
    baseline_pipeline = _numeric_only_pipeline(model)
    baseline_scores = cross_validate(baseline_pipeline, baseline_train, y, cv=cv,
                                     scoring=scoring_objective, return_train_score=True)
    results['Baseline'] = baseline_scores

    # b. Technique Evaluation Loop
    # Iterate through each defined encoding technique.
    for name, transformer in techniques.items():
        print(f"Evaluating: {name}")

        # Create a preprocessor to handle numeric and categorical columns separately
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', SimpleImputer(strategy='median'), numeric_features),
                ('cat', transformer, columns_to_process)
            ],
            remainder='passthrough' # Keep other columns (if any)
        )

        # Create the full pipeline: preprocess data, then train the model
        main_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])

        # Perform cross-validation on the full pipeline
        scores = cross_validate(main_pipeline, train, y, cv=cv,
                                scoring=scoring_objective, return_train_score=True)
        results[name] = scores

    # c. Feature Dropped Evaluation
    # This step is explicitly requested. It is functionally identical to our baseline.
    print("Evaluating: Feature Dropped (Explicit Step)")
    train_dropped = train.drop(columns=columns_to_process)
    dropped_pipeline = _numeric_only_pipeline(model)
    dropped_scores = cross_validate(dropped_pipeline, train_dropped, y, cv=cv,
                                    scoring=scoring_objective, return_train_score=True)
    results['Feature Dropped'] = dropped_scores

    # 3. Conclusion
    # Process the results into a clean DataFrame for comparison.
    results_train = _summarize_cv_results(results)
    evaluation_summaries[model_name] = results_train

    print("\n--- Evaluation Summary ---")
    print(results_train)
    print("\n" + "="*50 + "\n")

--- Starting Evaluation for: Logistic Regression ---
Evaluating: Baseline (Feature Dropped)
Evaluating: Nominal Encoding (Ordinal)
Evaluating: One-Hot Encoding
Evaluating: Feature Dropped (Explicit Step)

--- Evaluation Summary ---
                            CV Score Mean  CV Score Std  Train Score Mean  \
One-Hot Encoding                 0.821568      0.025816          0.825199   
Baseline                         0.788990      0.024082          0.799103   
Feature Dropped                  0.788990      0.024082          0.799103   
Nominal Encoding (Ordinal)       0.784483      0.034348          0.801346   

                            Train Score Std  Fit Time Mean  
One-Hot Encoding                   0.012440       0.033135  
Baseline                           0.005552       0.015230  
Feature Dropped                    0.005552       0.015860  
Nominal Encoding (Ordinal)         0.008183       0.031222  


--- Starting Evaluation for: Random Forest ---
Evaluating: Baseline (Featur