# Spaceship Titanic: Group-aware baseline with clean feature pipeline

**Model**: scikit-learn `HistGradientBoostingClassifier` with one-hot encoded categoricals

**Key ideas**:
- Parse `PassengerId` into `Group` and `MemberNum`.
- Split `Cabin` into `Deck`, `CabinNum`, `Side`.
- Spend features and totals: `RoomService`, `FoodCourt`, `ShoppingMall`, `Spa`, `VRDeck`, `TotalSpend`, `IsZeroSpend`, `SpendPerAge`.
- Group-aware CV using `StratifiedGroupKFold` over `Group`.
- Safe one-hot encode with `handle_unknown='ignore'`.

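This notebook trains and evaluates the models and writes `submission.csv` for Kaggle. Note that `submission.csv` is written twice: first by the HistGradientBoosting section, then by the CatBoost section at the end, which overwrites it — so after a full run the file contains the CatBoost predictions.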


In [1]:
from __future__ import annotations
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path
import numpy as np
import pandas as pd
from typing import Tuple

# One seed for NumPy's global RNG; the same constant is reused as the
# random_state / random_seed of the splitters and models in this notebook.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Environment detection: if /kaggle exists we read the competition input and
# write to the Kaggle working directory; otherwise everything is relative to
# the current directory.
KAGGLE_DIR = Path('/kaggle')
IN_KAGGLE = KAGGLE_DIR.exists()
if IN_KAGGLE:
    DATA_DIR = Path('/kaggle/input/spaceship-titanic')
    WORK_DIR = Path('/kaggle/working')
else:
    DATA_DIR = Path('.')
    WORK_DIR = Path('.')

print({'IN_KAGGLE': IN_KAGGLE, 'DATA_DIR': str(DATA_DIR), 'WORK_DIR': str(WORK_DIR)})


{'IN_KAGGLE': True, 'DATA_DIR': '/kaggle/input/spaceship-titanic', 'WORK_DIR': '/kaggle/working'}


## Load data

In [2]:
# Locate the competition CSVs under DATA_DIR and fail fast if either is absent.
# The hint tells the reader what to do on Kaggle: attach the competition
# dataset to the notebook (the previous wording had the two swapped).
train_path = DATA_DIR / 'train.csv'
test_path = DATA_DIR / 'test.csv'
assert train_path.exists(), f"Missing {train_path}. On Kaggle, add the competition dataset to this notebook."
assert test_path.exists(), f"Missing {test_path}. On Kaggle, add the competition dataset to this notebook."

# Raw frames; later cells derive features from these without modifying them.
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
train_df.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False


## Feature engineering
We create group and cabin features, spend totals, simple flags, and surname frequency bins.


In [3]:
def engineer_basic(df: pd.DataFrame) -> pd.DataFrame:
    """Derive row-level features from the raw passenger columns.

    The input frame is not modified; a copy with extra columns is returned.

    Added columns:
      - ``Group`` / ``MemberNum``: the two ``_``-separated parts of
        ``PassengerId``, converted to numbers (unparseable parts become NaN).
      - ``Deck`` / ``CabinNum`` / ``Side``: the three ``/``-separated parts of
        ``Cabin``. If the split does not yield exactly three columns, all
        three are set to ``pd.NA``.
      - ``Surname``: last whitespace-separated token of ``Name``; ``''`` when
        the name is missing, empty, or contains only whitespace.
      - ``TotalSpend``: row sum of the five spend columns, ignoring NaN.
      - ``IsZeroSpend``: 1 when that sum (NaN treated as 0) equals 0, else 0.
      - ``SpendPerAge``: ``TotalSpend / Age``, with an ``Age`` of 0 treated as
        missing to avoid dividing by zero.
      - ``IsChild``: 1 when ``Age < 12``, else 0. A missing ``Age`` compares
        as False and is therefore flagged 0.

    Any spend column absent from ``df`` is created and filled with 0.0.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing at least ``PassengerId``, ``Cabin``, ``Name`` and
        ``Age``.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the columns listed above appended.
    """
    df = df.copy()

    # PassengerId -> Group and MemberNum.
    # reindex guarantees columns 0 and 1 exist even when no id contains '_',
    # so the lookups below yield NaN instead of raising KeyError.
    pid = df['PassengerId'].str.split('_', expand=True).reindex(columns=[0, 1])
    df['Group'] = pd.to_numeric(pid[0], errors='coerce')
    df['MemberNum'] = pd.to_numeric(pid[1], errors='coerce')

    # Cabin -> Deck, CabinNum, Side
    cabin = df['Cabin'].astype('string').str.split('/', expand=True)
    if cabin.shape[1] == 3:
        df['Deck'] = cabin[0]
        df['CabinNum'] = pd.to_numeric(cabin[1], errors='coerce')
        df['Side'] = cabin[2]
    else:
        # Unexpected cabin layout (e.g. every Cabin missing): mark all unknown.
        df['Deck'] = pd.NA
        df['CabinNum'] = pd.NA
        df['Side'] = pd.NA

    # Surname (last token); some rows may be NaN
    def _surname(x):
        if not isinstance(x, str):
            return ''
        parts = x.split()
        # A whitespace-only name splits into an empty list; without this
        # guard parts[-1] would raise IndexError.
        return parts[-1] if parts else ''
    df['Surname'] = df['Name'].apply(_surname)

    # Spend features
    spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    for c in spend_cols:
        if c not in df.columns:
            df[c] = 0.0
    df['TotalSpend'] = df[spend_cols].sum(axis=1, skipna=True)
    df['IsZeroSpend'] = (df[spend_cols].fillna(0).sum(axis=1) == 0).astype(int)
    # Age 0 is replaced by NaN so the ratio is NaN rather than inf.
    df['SpendPerAge'] = df['TotalSpend'] / df['Age'].replace({0: np.nan})

    # Child flag
    df['IsChild'] = (df['Age'] < 12).astype('Int64')

    return df

def add_group_and_surname_agg(full: pd.DataFrame) -> pd.DataFrame:
    """Append group-size and surname-frequency aggregates to a copy of ``full``.

    Expects the columns ``Group``, ``PassengerId`` and ``Surname``. The
    aggregates use only those columns, never the target.

    Added columns:
      - ``GroupSize``: number of ``PassengerId`` values sharing the row's ``Group``.
      - ``IsAlone``: 1 when ``GroupSize`` equals 1, else 0.
      - ``SurnameFreq``: occurrences of the row's surname in ``full``; rows
        with an empty surname get 0.
      - ``SurnameFreqBin``: ``SurnameFreq`` cut into the intervals
        (-1, 1], (1, 3], (3, 10], (10, 1e9].
    """
    out = full.copy()

    # Travelling-group size and the derived solo-traveller flag.
    group_sizes = out.groupby('Group')['PassengerId'].transform('count')
    out['GroupSize'] = group_sizes
    out['IsAlone'] = out['GroupSize'].eq(1).astype(int)

    # Count surnames, treating the empty string as "unknown" so it is not counted.
    known_surnames = out['Surname'].replace('', np.nan)
    freq_by_surname = known_surnames.value_counts(dropna=True)
    out['SurnameFreq'] = out['Surname'].map(freq_by_surname).fillna(0).astype(int)

    bin_edges = [-1, 1, 3, 10, 10**9]
    bin_labels = ['1', '2-3', '4-10', '10+']
    out['SurnameFreqBin'] = pd.cut(out['SurnameFreq'], bins=bin_edges, labels=bin_labels)
    return out

# Stack train (without the target) on top of test so that the engineered
# features and the group/surname aggregates are computed over both sets at once.
full = pd.concat(
    (train_df.drop(columns=['Transported']), test_df),
    axis=0,
    ignore_index=True,
)

# Row-level features first, then the aggregates that depend on them.
full_eng = add_group_and_surname_agg(engineer_basic(full))
full_eng.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Side,Surname,TotalSpend,IsZeroSpend,SpendPerAge,IsChild,GroupSize,IsAlone,SurnameFreq,SurnameFreqBin
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,P,Ofracculy,0.0,1,0.0,0,1,1,3,2-3
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,S,Vines,736.0,0,30.666667,0,1,1,4,4-10
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,S,Susent,10383.0,0,179.017241,0,2,0,7,4-10


## Split back to train/test and define feature lists

In [4]:
# The first len(train_df) rows of full_eng are the training rows (train was
# concatenated first with ignore_index=True); everything after is test.
eng_train = full_eng.iloc[:len(train_df)].copy()
eng_test = full_eng.iloc[len(train_df):].copy()

# Target as 0/1 integers: 1 for True, 0 for False.
y = train_df['Transported'].astype(bool).astype(int)
# Group ids for group-aware CV; a missing Group is mapped to -1.
groups = eng_train['Group'].fillna(-1).astype(int)

numeric_features = [
    'Age',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'TotalSpend', 'SpendPerAge',
    'MemberNum', 'CabinNum',
    'GroupSize', 'IsAlone', 'IsZeroSpend', 'IsChild',
]
categorical_features = [
    'HomePlanet', 'Destination',
    'CryoSleep', 'VIP',
    'Deck', 'Side',
    'SurnameFreqBin',
]

# Keep keys for later
id_col = 'PassengerId'

# Model matrices: numeric columns first, then categoricals.
X_train = eng_train[[*numeric_features, *categorical_features]]
X_test = eng_test[[*numeric_features, *categorical_features]]
X_train.shape, X_test.shape

((8693, 21), (4277, 21))

## Pipeline: impute + one-hot + HistGradientBoostingClassifier

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedGroupKFold

def _make_dense_one_hot() -> OneHotEncoder:
    """Build a dense-output OneHotEncoder that ignores unseen categories.

    The keyword selecting dense output was renamed from ``sparse`` to
    ``sparse_output`` in scikit-learn 1.2, and the old name was removed in
    1.4. Try the current name first and fall back to the legacy one so the
    notebook runs on both older and newer releases.
    """
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        # Older scikit-learn: `sparse_output` is not a known keyword.
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


# Numeric branch: fill missing values with the column median.
num_imputer = Pipeline([
    ('imputer', SimpleImputer(strategy='median'))
])
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode (categories unseen during fit encode as all zeros).
cat_imputer_encoder = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', _make_dense_one_hot())
])

# Route each feature list to its branch; columns not listed are dropped
# (ColumnTransformer's default remainder).
preprocess = ColumnTransformer([
    ('num', num_imputer, numeric_features),
    ('cat', cat_imputer_encoder, categorical_features)
])

clf = HistGradientBoostingClassifier(
    learning_rate=0.05,
    max_leaf_nodes=31,
    min_samples_leaf=20,
    l2_regularization=0.0,
    max_depth=None,
    early_stopping=True,
    random_state=RANDOM_STATE
)

# Full model: preprocessing followed by the gradient-boosting classifier.
pipe = Pipeline([
    ('prep', preprocess),
    ('clf', clf)
])
pipe

## Group-aware CV accuracy (5 folds)

In [12]:
# ## Group-aware CV accuracy (5 folds) — hardened

# 1) Sanitize feature matrices so SimpleImputer never sees pd.NA
def _sanitize_frame(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with dtypes normalised for the sklearn pipeline.

    - Every column named in ``categorical_features`` that is present in ``X``
      is cast to plain ``object``.
    - ``pd.NA`` is replaced by ``np.nan`` across the whole frame.
    - Every column in ``numeric_features`` is coerced to ``float64``; values
      that cannot be parsed as numbers become NaN.
    """
    cleaned = X.copy()

    # Categorical block -> plain object columns.
    present_categoricals = [col for col in categorical_features if col in cleaned.columns]
    for col in present_categoricals:
        cleaned[col] = cleaned[col].astype('object')

    # pandas <NA> -> np.nan, so SimpleImputer never sees pd.NA.
    cleaned = cleaned.replace({pd.NA: np.nan})

    # Numeric block -> float64.
    numeric_block = cleaned[numeric_features].apply(pd.to_numeric, errors='coerce')
    cleaned[numeric_features] = numeric_block.astype('float64')
    return cleaned

# Sanitized model matrices, reused by the CV loop below.
X_train_s = _sanitize_frame(X_train)
X_test_s = _sanitize_frame(X_test)

# Sanity checks on the numeric block: every column is float64 and none is
# entirely missing. Categorical columns may still hold NaN; the categorical
# imputer in the pipeline fills those.
assert all(str(dt) == 'float64' for dt in X_train_s[numeric_features].dtypes), "Numeric dtypes not float64"
assert not any(X_train_s[numeric_features].isna().all()), "All-NaN numeric column detected"

# Group-aware, stratified 5-fold CV: rows sharing a Group never straddle the
# train/validation boundary of a fold.
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
fold_acc = []

for fold, (tr, va) in enumerate(cv.split(X_train_s, y, groups=groups), start=1):
    # Refit the whole pipeline (imputers, encoder, classifier) on this fold's
    # training rows, then score on its held-out rows.
    pipe.fit(X_train_s.iloc[tr], y.iloc[tr])
    acc = accuracy_score(y.iloc[va], pipe.predict(X_train_s.iloc[va]))
    fold_acc.append(acc)
    print(f"Fold {fold}: accuracy={acc:.5f}")

print(f"CV mean={np.mean(fold_acc):.5f} ± {np.std(fold_acc):.5f}")


Fold 1: accuracy=0.79375
Fold 2: accuracy=0.81357
Fold 3: accuracy=0.81325
Fold 4: accuracy=0.81481
Fold 5: accuracy=0.80112
CV mean=0.80730 ± 0.00840


## Fit on full training data and predict test

In [14]:
# ## Fit on full training data and predict test — sanitized

# Reuse sanitized frames from CV; if missing, rebuild them.
# Use the sanitized frames if they already exist in the kernel. A NameError
# means they were never built, so define the sanitizer and build them here.
try:
    X_train_s, X_test_s
except NameError:
    def _sanitize_frame(X: pd.DataFrame) -> pd.DataFrame:
        """Copy ``X``; categoricals to object, pd.NA to np.nan, numerics to float64."""
        cleaned = X.copy()
        for col in categorical_features:
            if col in cleaned.columns:
                cleaned[col] = cleaned[col].astype('object')
        cleaned = cleaned.replace({pd.NA: np.nan})
        numeric_block = cleaned[numeric_features].apply(pd.to_numeric, errors='coerce')
        cleaned[numeric_features] = numeric_block.astype('float64')
        return cleaned

    X_train_s = _sanitize_frame(X_train)
    X_test_s = _sanitize_frame(X_test)

# Final fit on every training row.
pipe.fit(X_train_s, y)

# Class predictions for the test rows, converted to booleans for the
# Transported column, paired with the test PassengerIds.
test_pred_class = pipe.predict(X_test_s).astype(bool)
sub = pd.DataFrame(
    {
        'PassengerId': eng_test['PassengerId'].values,
        'Transported': test_pred_class,
    }
)

if IN_KAGGLE:
    sub_path = WORK_DIR / 'submission.csv'
else:
    sub_path = Path('submission.csv')
sub.to_csv(sub_path, index=False)
print(f"Wrote {sub_path.resolve()}")
sub.head()


Wrote /kaggle/working/submission.csv


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False


## Save trained model for reuse

In [15]:
import joblib
# Serialize the pipeline object (preprocessing + classifier) with joblib:
# into the Kaggle working directory when on Kaggle, else the current directory.
if IN_KAGGLE:
    model_path = WORK_DIR / 'model_histgb.pkl'
else:
    model_path = Path('model_histgb.pkl')
joblib.dump(pipe, model_path)
print(f"Saved model to {model_path.resolve()}")

Saved model to /kaggle/working/model_histgb.pkl


## Notes and next steps
- Try `CatBoostClassifier` with raw categoricals to capture interactions without full one-hot.
- Hyperparameter search: `skopt` or `Optuna` with small budgets.
- Blend two diverse models (HistGB + CatBoost) by simple average of probabilities.
- Add interaction features: spend ratios, Deck×Side, Destination×HomePlanet.
- Consider Platt scaling or isotonic calibration if you need calibrated probabilities.

In [16]:
# ## CatBoost baseline — group-aware CV

# 0) Install/import
# Import CatBoost; if that fails for any reason, install it with pip into the
# running interpreter's environment and import again. check=True makes a
# failed install raise instead of continuing silently.
try:
    from catboost import CatBoostClassifier, Pool
except Exception:
    import subprocess
    import sys
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", "catboost>=1.2.5"],
        check=True,
    )
    from catboost import CatBoostClassifier, Pool

# Column order used for CatBoost (numeric first, then categorical) and the
# positions of the categorical columns within that order.
features = numeric_features + categorical_features
cat_idx = list(map(features.index, categorical_features))

def prep_cb(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` prepared for CatBoost, with columns in ``features`` order.

    - ``pd.NA`` is replaced by ``np.nan`` across the frame.
    - Columns in ``numeric_features`` are coerced to ``float64`` (unparseable
      values become NaN).
    - Columns in ``categorical_features`` are cast to ``object`` and their
      missing entries are replaced by the sentinel string ``'NA'``.
    """
    frame = X.copy().replace({pd.NA: np.nan})

    # Numeric block -> float64.
    numeric_block = frame[numeric_features].apply(pd.to_numeric, errors='coerce')
    frame[numeric_features] = numeric_block.astype('float64')

    # Categorical block -> object with an explicit 'NA' marker for missing values.
    for col in categorical_features:
        as_object = frame[col].astype('object')
        frame[col] = as_object.where(as_object.notna(), 'NA')

    return frame[features]

# CatBoost-ready copies of the engineered train/test features.
X_train_cb = prep_cb(eng_train[features])
X_test_cb = prep_cb(eng_test[features])

# CV-time settings: up to 2000 iterations, with the iteration-based
# overfitting detector (od_type="Iter") waiting 100 iterations.
params = {
    "loss_function": "Logloss",
    "eval_metric": "Accuracy",
    "learning_rate": 0.05,
    "depth": 6,
    "l2_leaf_reg": 3.0,
    "iterations": 2000,
    "random_seed": RANDOM_STATE,
    "od_type": "Iter",
    "od_wait": 100,
    "verbose": False,
}

# Same group-aware, stratified 5-fold scheme as the HistGB evaluation.
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
fold_acc = []

for fold, (tr, va) in enumerate(cv.split(X_train_cb, y, groups=groups), start=1):
    train_pool = Pool(X_train_cb.iloc[tr], y.iloc[tr], cat_features=cat_idx)
    valid_pool = Pool(X_train_cb.iloc[va], y.iloc[va], cat_features=cat_idx)

    # The validation fold doubles as the eval set; use_best_model keeps the
    # best iteration found on it.
    model = CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=valid_pool, use_best_model=True, verbose=False)

    pred = model.predict(valid_pool)  # 0/1
    acc = accuracy_score(y.iloc[va], pred.astype(int))
    fold_acc.append(acc)
    print(f"Fold {fold}: accuracy={acc:.5f}")

print(f"CV mean={np.mean(fold_acc):.5f} ± {np.std(fold_acc):.5f}")


Fold 1: accuracy=0.78491
Fold 2: accuracy=0.82440
Fold 3: accuracy=0.82182
Fold 4: accuracy=0.82128
Fold 5: accuracy=0.81341
CV mean=0.81316 ± 0.01460


In [17]:
# ## CatBoost — fit full training and predict test

# All training rows in one pool; no eval set is passed, so no early stopping
# applies to this final fit.
full_pool = Pool(X_train_cb, y, cat_features=cat_idx)

final_params = {
    "loss_function": "Logloss",
    "eval_metric": "Accuracy",
    "learning_rate": 0.05,
    "depth": 6,
    "l2_leaf_reg": 3.0,
    # Fixed budget: 1.1 x the mean of [2000], which evaluates to 2200.
    "iterations": int(1.1 * np.mean([2000])),
    "random_seed": RANDOM_STATE,
    "verbose": False,
}
final_model = CatBoostClassifier(**final_params)
final_model.fit(full_pool, verbose=False)

# Predicted labels for the test rows, converted to booleans for the
# Transported column.
test_pool = Pool(X_test_cb, cat_features=cat_idx)
test_pred = final_model.predict(test_pool).astype(int).astype(bool)

sub = pd.DataFrame(
    {
        'PassengerId': eng_test['PassengerId'].values,
        'Transported': test_pred,
    }
)

# Written to the same submission.csv path as the HistGB submission cell.
if IN_KAGGLE:
    sub_path = WORK_DIR / 'submission.csv'
else:
    sub_path = Path('submission.csv')
sub.to_csv(sub_path, index=False)
print(f"Wrote {sub_path.resolve()}")
sub.head()

Wrote /kaggle/working/submission.csv


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
