In [109]:
import os
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [110]:
# Install a catch-all "ignore" filter so library warnings do not clutter cell output.
warnings.simplefilter('ignore')

## --- Load Fully Processed Data ---

In [111]:
# Location of the fully processed dataset, relative to the notebook's working directory.
data_path = os.path.join('..', 'data', 'processed', 'titanic_fully_processed.csv')

# Load the final, fully processed dataset.
try:
    df = pd.read_csv(data_path)
except FileNotFoundError as e:
    print(f"Error loading file: {e}")
    print("Please run the first two scripts to create 'titanic_fully_processed.csv'.")
    # Re-raise rather than calling exit(): in a notebook exit() ends the kernel session,
    # while the exception just halts execution at this cell with a clear traceback.
    raise
else:
    print("Processed dataset loaded successfully.")
df.head()

Processed dataset loaded successfully.


Unnamed: 0,PassengerId,Survived,Pclass,CabinAssigned,FamilySize,Age_Class,Fare_per_Person,Sex_male,Embarked_Q,Embarked_S,...,Ticket_Prefix_SOTONOQ,Ticket_Prefix_SP,Ticket_Prefix_STONO,Ticket_Prefix_STONO2,Ticket_Prefix_STONOQ,Ticket_Prefix_SWPP,Ticket_Prefix_WC,Ticket_Prefix_WEP,FamilySize_Group_Large,FamilySize_Group_Medium
0,1,0.0,3,0,2,66.0,3.625,True,False,True,...,False,False,False,False,False,False,False,False,False,True
1,2,1.0,1,1,2,38.0,35.64165,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,3,1.0,3,0,1,78.0,7.925,False,False,True,...,False,False,False,True,False,False,False,False,False,False
3,4,1.0,1,1,2,35.0,26.55,False,False,True,...,False,False,False,False,False,False,False,False,False,True
4,5,0.0,3,0,1,105.0,8.05,True,False,True,...,False,False,False,False,False,False,False,False,False,False


## --- Final Data Preparation for Modeling ---

In [112]:
# Separate the combined frame back into train and test sets:
# rows with a missing 'Survived' value are the test set, all other rows are training data.
has_label = df['Survived'].notna()
train_df = df[has_label].copy()
test_df = df[~has_label].copy()
train_df['Survived'] = train_df['Survived'].astype(int)

# Keep the test PassengerIds for the submission file.
test_passenger_ids = test_df['PassengerId']

# Columns that must not be fed to the models: the target, the passenger identifier, and any
# 'Unnamed: N' column. pandas gives that name to a header-less CSV column; here it looks like
# a row index written out by an earlier step, i.e. just another row identifier.
non_feature_cols = ['Survived', 'PassengerId']
non_feature_cols += [col for col in df.columns if str(col).startswith('Unnamed:')]

# Final features (X) and target (y)
X = train_df.drop(columns=non_feature_cols)
y = train_df['Survived']
X_test = test_df.drop(columns=non_feature_cols)


print(f"shape of train_df is: {train_df.shape}")
print(f"shape of test_df is: {test_df.shape}")


shape of train_df is: (891, 61)
shape of test_df is: (418, 61)


In [113]:
# Align columns so train and test expose exactly the same features in the same order.
train_cols = X.columns
test_cols = X_test.columns

# A feature present in only one frame is added to the other as an all-zero column.
# Ordered lists (instead of sets) keep the order of added columns identical between runs.
missing_in_test = [c for c in train_cols if c not in test_cols]
for c in missing_in_test:
    X_test[c] = 0
missing_in_train = [c for c in test_cols if c not in train_cols]
for c in missing_in_train:
    X[c] = 0

# Reorder using X's columns *after* the additions above. Using the column list captured
# before them would drop the test-only columns from X_test while X keeps them, leaving
# the two frames with different widths.
X_test = X_test[X.columns]

In [114]:
# Fit the scaler on the training features only, then apply the same transform to both sets.
# Note: X and X_test are replaced by the scaler's output from here on.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)
print("Data prepared and scaled for modeling.")


Data prepared and scaled for modeling.


## --- Stacking Ensemble Modeling ---

In [115]:
# Define the Level 0 base models as (name, estimator) pairs.
# The names are the prefixes used to address each model's parameters in a search grid
# (e.g. 'SVC__C'), so they must stay in sync with any such grid.
base_models = [
    ('RandomForest', RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)),
    ('CatBoost', CatBoostClassifier(iterations=500, verbose=0, random_state=42)),
    # 'use_label_encoder' is intentionally not passed: it is deprecated in XGBoost and ignored
    # by current releases, where supplying it only produces an "unused parameter" warning.
    ('XGBoost', XGBClassifier(n_estimators=200, max_depth=3, eval_metric='logloss', random_state=42)),
    # probability=True enables predict_proba on the SVC.
    ('SVC', SVC(probability=True, random_state=42))
]

In [116]:
# Level 1 meta-model: a logistic regression that combines the base models' outputs.
meta_model = LogisticRegression(max_iter=1000)

# Assemble the stack from the base models and the meta-model, with cv=5 for the stack itself.
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)

In [117]:
# Outer cross-validation used to score every candidate: 5 shuffled, stratified folds.
cv_method = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Hyperparameter grid for the whole stack. Keys are '<estimator name>__<parameter>';
# 'final_estimator__' addresses the meta-model. 2 * 2 * 2 * 3 = 24 candidates.
params = {
    'RandomForest__n_estimators': [100, 200],
    'CatBoost__learning_rate': [0.05, 0.1],
    'SVC__C': [0.1, 1.0],
    'final_estimator__C': [0.1, 1.0, 10.0],
}

# Exhaustive search over the grid, scored by accuracy, using all available cores.
grid_search = GridSearchCV(
    stacking_model,
    params,
    scoring='accuracy',
    cv=cv_method,
    n_jobs=-1,
    verbose=1,
)

In [118]:
# Run the grid search over the stacking model, then report the outcome.
print("Starting GridSearchCV for the StackingClassifier...")
# Fits every candidate in `params` on each fold of `cv_method`; this is the slow step.
grid_search.fit(X, y)

# best_score_ is the mean cross-validated accuracy of the best candidate;
# best_params_ is the parameter combination that achieved it.
print("\n--- Tuning Complete ---")
print(f"Best Mean CV Accuracy: {grid_search.best_score_:.4f}")
print(f"Best Parameters: {grid_search.best_params_}")

Starting GridSearchCV for the StackingClassifier...
Fitting 5 folds for each of 24 candidates, totalling 120 fits

--- Tuning Complete ---
Best Mean CV Accuracy: 0.8485
Best Parameters: {'CatBoost__learning_rate': 0.1, 'RandomForest__n_estimators': 100, 'SVC__C': 1.0, 'final_estimator__C': 0.1}


## --- Final Prediction and Submission ---

In [119]:
# GridSearchCV's best_estimator_ is the winning configuration, already fitted during the
# search, so it can be used for prediction directly.
final_model = grid_search.best_estimator_

# Predict survival for the held-out test rows.
final_predictions = final_model.predict(X_test)

# Build the submission table: the test PassengerIds (keeping their original row index)
# next to the predicted labels, matched by position.
submission_df = test_passenger_ids.to_frame(name='PassengerId').assign(Survived=final_predictions)

In [120]:
# Write the submission to the working directory; index=False keeps the row index out of the CSV.
submission_path = 'submission_advanced.csv'
submission_df.to_csv(submission_path, index=False)

# Confirm the write and preview the first rows.
print("\nSubmission file 'submission_advanced.csv' created successfully!")
print("Sample of submission file:")
print(submission_df.head())


Submission file 'submission_advanced.csv' created successfully!
Sample of submission file:
     PassengerId  Survived
891          892         0
892          893         0
893          894         0
894          895         0
895          896         1
