In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder, LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
import category_encoders as ce
from sklearn import set_config
import catboost as cb
from catboost import CatBoostClassifier
# Import the classifier you want to use
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import accuracy_score # Good for final evaluation

# =====================================================================================
# PART 1: DEFINE THE COMPLETE PREPROCESSING PIPELINE (your existing code)
# =====================================================================================

def create_title_feature(df: pd.DataFrame) -> pd.DataFrame:
    """
    Derive a coarse honorific feature ('Titel') from the 'Name' column.

    Names are expected to look like ``"Surname, Title. Given names"``.
    Parenthesised text and double-quoted nicknames are removed first; the
    title is then the text between the first comma and the next period.
    'Mlle' and 'Ms' are folded into 'Miss', 'Mme' into 'Mrs', and every title
    other than 'Mr', 'Miss', 'Mrs' or 'Master' is replaced by 'Rare'.

    Missing names (NaN/None) and names without a comma also yield 'Rare'
    instead of raising.

    Args:
        df: Frame containing a 'Name' column. It is not modified.

    Returns:
        A single-column DataFrame named 'Titel' sharing ``df``'s index.
    """
    # Object dtype keeps the .str accessor usable even if every name is missing.
    names = df['Name'].astype(object)

    # Vectorized clean-up; missing entries simply propagate as NaN.
    cleaned = (
        names.str.replace(r'\(([^)]*)\)', '', regex=True)
        .str.strip()
        .str.replace(r'\"[^\"]*\"', '', regex=True)
    )

    # "Surname, Title. Given" -> " Title. Given" -> "Title"
    after_comma = cleaned.str.split(',').str.get(1).str.strip()
    title = after_comma.str.split('.').str.get(0).str.strip()

    # Fold French / alternative spellings into their common equivalents.
    title = title.replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

    # Everything outside the four common titles (including NaN) becomes 'Rare'.
    common_titles = ['Mr', 'Miss', 'Mrs', 'Master']
    title = title.where(title.isin(common_titles), 'Rare')
    return title.to_frame(name='Titel')

def extract_cabin_letter(df: pd.DataFrame) -> pd.DataFrame:
    """
    Reduce 'Cabin' to its first character, using 'n' for missing cabins.

    Missing values are filled explicitly rather than relying on
    ``astype(str)`` rendering NaN as the text "nan": newer pandas versions
    keep missing values missing under ``astype(str)``, which would silently
    drop the 'n' category.

    Args:
        df: Frame containing a 'Cabin' column. It is not modified.

    Returns:
        A single-column DataFrame named 'Cabin' (the name the downstream
        encoder expects) sharing ``df``'s index.
    """
    # Object dtype lets a string placeholder be written into an all-NaN column.
    cabin = df['Cabin'].astype(object)
    cabin = cabin.where(cabin.notna(), 'n')
    return cabin.astype(str).str[0].to_frame(name='Cabin')

def _unknown_safe_ordinal():
    """Build an OrdinalEncoder that encodes categories unseen during fit as -1."""
    return OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)


# 'Name' -> title string -> ordinal code
_title_branch = Pipeline([
    ('create', FunctionTransformer(
        create_title_feature,
        feature_names_out=lambda self, input_features: ['Titel'],
    )),
    ('encode', _unknown_safe_ordinal()),
])

# 'Embarked' -> fill gaps with the literal 'missing' -> ordinal code
_embarked_branch = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encode', _unknown_safe_ordinal()),
])

# 'Cabin' -> first letter -> ordinal code
_cabin_branch = Pipeline([
    ('extract', FunctionTransformer(
        extract_cabin_letter,
        feature_names_out=lambda self, input_features: ['Cabin'],
    )),
    ('encode', _unknown_safe_ordinal()),
])

# Column-wise preprocessing. Columns not listed here are passed through as-is.
preprocessor = ColumnTransformer(
    [
        ('sex_oe', OrdinalEncoder(categories=[['male', 'female']]), ['Sex']),
        ('title_creator', _title_branch, ['Name']),
        ('embarked_impute_encode', _embarked_branch, ['Embarked']),
        ('cabin_extract_encode', _cabin_branch, ['Cabin']),
        # 'Ticket' is target-encoded with a leave-one-out encoder.
        ('ticket_loo', ce.LeaveOneOutEncoder(), ['Ticket']),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# -------------------------------------------------
# Full pipeline: preprocessing -> imputation -> classifier
# -------------------------------------------------
# Keeping the model inside the pipeline means every cross-validation fold
# refits the preprocessing and imputation on that fold's training rows only.
titanic_final_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('imputer', IterativeImputer(min_value=1, random_state=0)),
    ('classifier', CatBoostClassifier(verbose=0, random_state=42)),
])

set_config(display='diagram')
print("✅ Pipeline defined successfully. Here is the structure:")
display(titanic_final_pipeline)

# =====================================================================================
# Step 2: Prepare Data and Define Hyperparameter Grid for Tuning
# =====================================================================================

# Load the raw training data; X and y stay unencoded and un-imputed because the
# pipeline performs all preprocessing itself.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
df = pd.read_csv(r"C:\Users\basde\OneDrive\Documenten\GitHub\Titanic\train.csv")
X = df.drop(["Survived", "PassengerId"], axis=1) # Target and row identifier are not features
y = df["Survived"]

# Hyperparameter grid. Keys use the pipeline syntax 'step_name__parameter_name',
# so 'classifier__*' addresses the 'classifier' step of titanic_final_pipeline.
# 3 x 3 x 3 = 27 candidate combinations.
param_grid = {
    'classifier__depth': [8, 10, 12],
    'classifier__learning_rate': [0.03, 0.05, 0.07],
    'classifier__iterations': [250, 300, 350]
}
# Preprocessing steps can be tuned through the same syntax, e.g.
# 'imputer__max_iter': [5, 10] for the IterativeImputer, or
# 'preprocessor__ticket_loo__sigma': [...] for the LeaveOneOutEncoder's `sigma`
# parameter. (A TargetEncoder in that slot would expose 'smoothing' instead.)

# =====================================================================================
# Step 3: Set up and Run GridSearchCV
# =====================================================================================

# Shuffled 5-fold split with a fixed seed so the fold assignment is repeatable.
cv_splitter = KFold(n_splits=5, shuffle=True, random_state=42)

# Grid search over the full pipeline (preprocessing + imputation + classifier).
grid_search = GridSearchCV(
    titanic_final_pipeline, # The pipeline that includes the classifier
    param_grid,
    cv=cv_splitter,
    scoring='accuracy', # Or 'f1', 'roc_auc', etc., as appropriate for the problem
    n_jobs=5,          # Up to 5 parallel workers (this is NOT "all cores"; -1 would be)
    verbose=2           # Progress messages during fitting
)

print("\nStarting hyperparameter tuning with the complete pipeline...")
grid_search.fit(X, y) # Fit on the original, raw X and y

# =====================================================================================
# Step 4: View the Best Results
# =====================================================================================
print("\nHyperparameter tuning complete.")
print("---")
print("Best parameters found: ")
print(grid_search.best_params_)
print(f"\nBest cross-validation accuracy: {grid_search.best_score_:.4f}")

# The best estimator is the pipeline configured with the best parameters found.
best_model = grid_search.best_estimator_

# To evaluate on a separate test set, the split has to be made BEFORE
# grid_search.fit and the search fitted on X_train / y_train only; splitting X, y
# here would score rows the model has already been trained on.
# (Also requires `from sklearn.model_selection import train_test_split`.)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# final_predictions = best_model.predict(X_test)
# final_accuracy = accuracy_score(y_test, final_predictions)
# print(f"Final Model Accuracy on (unseen) Test Set: {final_accuracy:.4f}")

# If you want to make predictions on truly new data after finding the best model
# new_data = pd.DataFrame(...) # Your new, raw data
# predictions_on_new_data = best_model.predict(new_data)

✅ Pipeline defined successfully. Here is the structure:


0,1,2
,steps,"[('preprocessor', ...), ('imputer', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('sex_oe', ...), ('title_creator', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['male', 'female']]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,func,<function cre...002362F8C0860>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function <la...002362F8C0A40>
,kw_args,
,inv_kw_args,

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,func,<function ext...002362F8C0B80>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function <la...002362F8C0E00>
,kw_args,
,inv_kw_args,

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_unknown,'value'
,handle_missing,'value'
,random_state,
,sigma,

0,1,2
,estimator,
,missing_values,
,sample_posterior,False
,max_iter,10
,tol,0.001
,n_nearest_features,
,initial_strategy,'mean'
,fill_value,
,imputation_order,'ascending'
,skip_complete,False



Starting hyperparameter tuning with the complete pipeline...
Fitting 5 folds for each of 27 candidates, totalling 135 fits

Hyperparameter tuning complete.
---
Best parameters found: 
{'classifier__depth': 8, 'classifier__iterations': 250, 'classifier__learning_rate': 0.03}

Best cross-validation accuracy: 0.8474


In [10]:
# Load the raw test set (no 'Survived' column) to be scored by the tuned pipeline.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
test = pd.read_csv(r"C:\Users\basde\OneDrive\Documenten\GitHub\Titanic\test.csv")

In [11]:
# Bare expression: as the last statement of a notebook cell it renders the frame.
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [12]:
# The submission needs PassengerId, but the model was trained without it, so it
# is split off before predicting. The membership check keeps this cell
# re-runnable: `test` is overwritten without the column, so an unguarded second
# run would raise KeyError on test['PassengerId']. On a re-run the
# `passenger_ids` captured the first time are reused.
if 'PassengerId' in test.columns:
    passenger_ids = test['PassengerId']
    test = test.drop('PassengerId', axis=1)

# Predict with best_model (the tuned pipeline). The pipeline applies all
# preprocessing (encoders, leave-one-out encoding, imputation) internally.
final_predictions = best_model.predict(test)

print("\nPredictions generated.")
print(f"Shape of predictions: {final_predictions.shape}")


# =====================================================================================
# PART 5: FORMATTING PREDICTIONS (e.g., for Kaggle Submission)
# =====================================================================================

# One row per passenger: the identifier plus the predicted class.
submission_df = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': final_predictions})
# Force integer 0/1 labels in the output file.
submission_df['Survived'] = submission_df['Survived'].astype(int)

# Save to CSV (relative path: written to the current working directory).
submission_file_path = 'titanic_submission_tuned_model.csv'
submission_df.to_csv(submission_file_path, index=False)

print(f"\nSubmission file '{submission_file_path}' created successfully.")
print("First 5 rows of submission file:")
print(submission_df.head())


Predictions generated.
Shape of predictions: (418,)

Submission file 'titanic_submission_tuned_model.csv' created successfully.
First 5 rows of submission file:
   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         1
