# Titanic: End-to-End Pipeline and Submission

This notebook implements a complete pipeline for data processing, feature engineering, model training, and generating a Kaggle submission file.

## Import Libraries and Modules

Import all required libraries and custom modules for data processing, visualization, and modeling.

In [1]:
import joblib
import pandas as pd

from config import RESULTS_PATH, TEST_DATA_PATH, TRAIN_DATA_PATH, MODEL_PATH

from utils.visualisation import display_df, summarize_df
from utils.feature_engineering import fill_missing_values
from utils.feature_engineering import extract_title, create_family_features, simplify_deck, create_age_group, create_fare_band, create_ticket_prefix
from utils.feature_engineering import normalize_deck, normalize_ticket_prefix
from utils.feature_engineering import normalize_rare_categories

In [2]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

## Feature Engineering Function

Define a function that applies all feature engineering steps to the input DataFrame.

In [3]:
def extract_features(df):
    """Run every feature-engineering helper on a copy of ``df``.

    The helpers are applied one after another in the order listed in
    ``steps``; each one receives the frame returned by the previous helper.
    The order is kept exactly as written because later helpers may depend on
    columns produced by earlier ones (not verifiable from this notebook).

    NOTE(review): the logs recorded under the training and submission cells
    list different rare-category sets for Deck, TicketPrefix and Title. That
    suggests the normalisation helpers decide what is "rare" from whatever
    frame they receive rather than from the training data -- confirm, since
    train and test rows would then be encoded inconsistently.

    Args:
        df: Raw passenger data. The columns required by the helpers are not
            visible here -- presumably the Kaggle Titanic schema; TODO confirm.

    Returns:
        The frame returned by the final helper call.
    """
    # Helpers that take only the frame, in execution order.
    steps = (
        extract_title,
        fill_missing_values,
        create_family_features,
        simplify_deck,
        create_age_group,
        create_fare_band,
        create_ticket_prefix,
        normalize_deck,
        normalize_ticket_prefix,
    )

    engineered = df.copy()
    for step in steps:
        engineered = step(engineered)

    # Last step: titles occurring fewer than `min_count` times are collapsed
    # (per the helper's name and its logged output; implementation not shown).
    return normalize_rare_categories(engineered, col="Title", min_count=10)

## Preprocessing Pipeline

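Wrap the feature-engineering function in a `FunctionTransformer`, create transformers for ordinal, categorical, and numerical features, and assemble them into a single `ColumnTransformer`. The preprocessor is also saved to disk here, before it has been fitted.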

In [29]:
# --- Feature engineering step ---
# Wraps extract_features so it can run as the first stage of a Pipeline.
# validate=False: the input is handed to the function without array validation.
feature_engineering = FunctionTransformer(func=extract_features, validate=False)

# --- Ordinal column ---
# AgeGroup labels are mapped to integer ranks in the order listed below;
# a label outside this list is encoded as -1 instead of raising.
ordinal_features = ["AgeGroup"]
ordinal_categories = [
    ["baby", "kids<3", "kids<12", "teenager", "young", "adult", "senior"],
]
ordinal_transformer = OrdinalEncoder(
    categories=ordinal_categories,
    handle_unknown="use_encoded_value",
    unknown_value=-1,
    dtype=int,
)

# --- Nominal columns ---
# One-hot encoded into a dense array; categories unseen during fit are ignored.
categorical_ohe_features = ["Title", "Sex", "TicketPrefix", "Deck", "Embarked", "IsAlone"]
ohe_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# --- Numerical columns ---
# Standardised with StandardScaler.
numerical_features = ["Pclass", "Age", "SibSp", "Parch", "FamilySize", "FareBand"]
scaler = StandardScaler()

# --- Assemble ---
# No `remainder` is given, so columns outside the three groups are dropped
# (ColumnTransformer default).
preprocessor = ColumnTransformer(
    transformers=[
        ("ordinal", ordinal_transformer, ordinal_features),
        ("ohe", ohe_transformer, categorical_ohe_features),
        ("num", scaler, numerical_features),
    ]
)

display(preprocessor)
# NOTE: nothing has been fitted yet, so this file holds only the preprocessor's
# configuration, not learned categories or scaling statistics.
joblib.dump(preprocessor, MODEL_PATH + "preprocessor.pkl")

['./data/processed/preprocessor.pkl']

## Model Training and Pipeline Saving

Load the previously saved model (`best_model_xgb_optuna.pkl`) and the training data, fit the complete pipeline (feature engineering + preprocessing + model), and save the fitted pipeline for future use.

In [None]:
# --- Load the saved classifier ---
# XGBClassifier is not referenced by name below; presumably it is imported so
# that xgboost is loaded before the pickle is read -- TODO confirm.
from xgboost import XGBClassifier
best_model = joblib.load(MODEL_PATH + "best_model_xgb_optuna.pkl")

# --- Training data: target is "Survived", features are every other column ---
df_train = pd.read_csv(TRAIN_DATA_PATH)
y_train = df_train["Survived"]
X_train = df_train.drop(columns="Survived")

# --- Full pipeline: feature engineering -> preprocessing -> model ---
full_pipeline = Pipeline(
    steps=[
        ("feature_engineering", feature_engineering),
        ("preprocessing", preprocessor),
        ("model", best_model),
    ]
)

# Fits the preprocessor and (re)fits the loaded estimator on the training set.
full_pipeline.fit(X_train, y_train)

display(full_pipeline)
# NOTE: extract_features is defined in this notebook, so the pickle stores only
# a reference to it; loading this file elsewhere requires a function with the
# same name to be available in the loading session.
joblib.dump(full_pipeline, MODEL_PATH + "full_pipeline.pkl")

🔁 Deck: Replaced 3 rare categories with 'Other': ['F', 'G', 'T']
🔁 TicketPrefix: Replaced 39 rare categories with 'Other': ['A./5.', 'A.5.', 'A/4', 'A/4.', 'A/5.', 'A/S', 'A4.', 'C', 'C.A./SOTON', 'CA', 'CA.', 'F.C.', 'F.C.C.', 'Fa', 'LINE', 'P/PP', 'PP', 'S.C./A.4.', 'S.C./PARIS', 'S.O./P.P.', 'S.O.C.', 'S.O.P.', 'S.P.', 'S.W./PP', 'SC', 'SC/AH', 'SC/PARIS', 'SC/Paris', 'SCO/W', 'SO/C', 'SOTON/O.Q.', 'SOTON/O2', 'SOTON/OQ', 'STON/O2.', 'SW/PP', 'W./C.', 'W.E.P.', 'W/C', 'WE/P']
🔁 Title: Replaced 5 rare categories with 'Other': ['Countess', 'Dr', 'Officer', 'Rev', 'Sir']


['./data/processed/full_pipeline.pkl']

## Kaggle Submission Creation

Load the test data, apply the trained pipeline, and generate the submission file for Kaggle.

In [None]:
# Score the test set with the pipeline that was persisted to disk.
df_test = pd.read_csv(TEST_DATA_PATH)
full_pipeline = joblib.load(MODEL_PATH + "full_pipeline.pkl")
predictions = full_pipeline.predict(df_test)

# Submission frame: PassengerId plus the prediction cast to int.
submission_df = df_test[["PassengerId"]].assign(Survived=predictions.astype(int))

# Write the submission without the index column.
submission_df.to_csv(RESULTS_PATH + "predictions.csv", index=False)
print("✅ Submission file saved as predictions.csv")

🔁 Deck: Replaced 5 rare categories with 'Other': ['A', 'D', 'E', 'F', 'G']
🔁 TicketPrefix: Replaced 30 rare categories with 'Other': ['A.', 'A./5.', 'A.5.', 'A/4', 'A/5', 'A/5.', 'AQ/3.', 'AQ/4', 'C', 'CA', 'CA.', 'F.C.', 'F.C.C.', 'LP', 'PP', 'S.C./PARIS', 'S.O./P.P.', 'S.O.C.', 'SC', 'SC/A.3', 'SC/A4', 'SC/AH', 'SC/Paris', 'SOTON/O2', 'SOTON/OQ', 'STON/O', 'STON/O2.', 'STON/OQ.', 'W./C.', 'W.E.P.']
🔁 Title: Replaced 4 rare categories with 'Other': ['Dr', 'Officer', 'Other', 'Rev']
✅ Submission file saved as predictions.csv
