In [11]:
import pandas as pd
import numpy as np
from pathlib import Path

In [12]:
def make_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # ---------- Group ----------
    # PassengerId format: gggg_pp
    group_str = df["PassengerId"].astype("string").str.split("_").str[0]
    df["GroupId"] = pd.to_numeric(group_str, errors="coerce").astype("Int64")

    # Group size (computed within the provided df; do this on train+test concatenated if you want full info)
    group_size = df.groupby("GroupId")["PassengerId"].transform("size")
    df["GroupSizeCapped"] = group_size.clip(upper=6)
    df["GroupSizeCapped"] = df["GroupSizeCapped"].astype("Int64").fillna(-1)
    df["IsGroupSize6Plus"] = df["GroupSizeCapped"].gt(5)

    # ---------- Cabin ----------
    cabin_parts = df["Cabin"].astype("string").str.split("/", expand=True)

    df["CabinDeck"] = cabin_parts.get(0).astype("string").fillna("Missing")
    df["CabinSide"] = cabin_parts.get(2).astype("string").fillna("Missing")

    cabin_num = pd.to_numeric(cabin_parts.get(1), errors="coerce")
    df["CabinNumMissing"] = cabin_num.isna().astype(int)
    df["CabinNum"] = cabin_num.fillna(-1).astype(int)

    # Bin CabinNum (keep -1 as its own bin label)
    bin_size = 50
    max_num = int(df.loc[df["CabinNum"] >= 0, "CabinNum"].max()) if (df["CabinNum"] >= 0).any() else 0
    bins = np.arange(0, max_num + bin_size, bin_size)
    df["CabinNum_bin"] = pd.cut(
        df["CabinNum"].where(df["CabinNum"] >= 0, np.nan),  # exclude -1 from numeric bins
        bins=bins,
        right=False,
        include_lowest=True
    ).astype("string").fillna("Missing")

    # ---------- Categoricals ----------
    df["HomePlanet"] = df["HomePlanet"].astype("string").fillna("Missing")
    df["Destination"] = df["Destination"].astype("string").fillna("Missing")

    # ---------- Booleans (treat as categorical with Missing) ----------
    # Keep as strings to one-hot later; avoids int conversion issues
    df["CryoSleep"] = df["CryoSleep"].astype("string").fillna("Missing")
    df["VIP"] = df["VIP"].astype("string").fillna("Missing")

    # ---------- Age ----------
    df["AgeMissing"] = df["Age"].isna().astype(int)
    df["Age"] = df["Age"].fillna(-1)  # or median, depending on model

    # ---------- Expenses ----------
    expense_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

    df["SpendMissingCount"] = df[expense_cols].isna().sum(axis=1).astype(int)

    filled = df[expense_cols].fillna(0)
    df["NumAmenitiesUsed"] = filled.gt(0).sum(axis=1).astype(int)
    df["TotalExpenses"] = filled.sum(axis=1)

    df["NoSpend"] = (df["TotalExpenses"] == 0).astype(int)
    df["TotalExpenses_log"] = np.log1p(df["TotalExpenses"])

    # Return only model features + GroupId for GroupKFold
    keep = [
        "CryoSleep", "HomePlanet", "Destination",
        "CabinDeck", "CabinSide", "CabinNum_bin",
        "Age", "AgeMissing",
        "VIP",
        "GroupSizeCapped", "IsGroupSize6Plus", "GroupId",
        "NoSpend", "SpendMissingCount", "NumAmenitiesUsed", "TotalExpenses_log",
    ]
    return df[keep]

In [13]:
def find_project_root(start: Path | None = None) -> Path:
    """Walk upward until we find pyproject.toml (repo root)."""
    start = start or Path.cwd()
    for p in [start, *start.parents]:
        if (p / "pyproject.toml").exists():
            return p
    return start

# Locate the repository and its data folder.
PROJECT_DIR = find_project_root()
DATA_DIR = PROJECT_DIR.joinpath("data")

# Input CSVs live directly under the data folder.
train_path, test_path = (DATA_DIR.joinpath(name) for name in ("train.csv", "test.csv"))

# Raw frames, read in train -> test order.
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

# Each split is featurised on its own (group sizes and cabin bins are derived
# from the frame passed in).
train_df_features, test_df_features = (
    make_features(raw_frame) for raw_frame in (train_df, test_df)
)

# Cell output: column dtypes of the engineered training features.
train_df_features.dtypes

CryoSleep            string[python]
HomePlanet           string[python]
Destination          string[python]
CabinDeck            string[python]
CabinSide            string[python]
CabinNum_bin         string[python]
Age                         float64
AgeMissing                    int64
VIP                  string[python]
GroupSizeCapped               Int64
IsGroupSize6Plus            boolean
GroupId                       Int64
NoSpend                       int64
SpendMissingCount             int64
NumAmenitiesUsed              int64
TotalExpenses_log           float64
dtype: object

In [None]:
# --- YDF CV (GroupKFold) + final training + submission ---

# YDF setup
# TF-DF and TensorFlow can be picky about platform/version; YDF is the successor library.

import sys

try:
    import ydf
except ModuleNotFoundError as exc:
    raise RuntimeError(
        "YDF is not installed. In WSL: poetry install --with tfdf (which brings ydf)"
    ) from exc

from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score

RANDOM_SEED = 42
N_SPLITS = 5


def _new_learner():
    """Return an untrained GBT classification learner with the shared settings."""
    return ydf.GradientBoostedTreesLearner(
        label="Transported",
        task=ydf.Task.CLASSIFICATION,
        random_seed=RANDOM_SEED,
    )


# Target encoded as 0/1 integers.
y = train_df["Transported"].astype(int)

# Group ids are used for fold assignment only; they are dropped before training.
groups = train_df_features["GroupId"]

# Full feature frame; GroupId stays in until each fold removes it.
X_all = train_df_features

fold_accs: list[float] = []

gkf = GroupKFold(n_splits=N_SPLITS)
for fold, (train_idx, val_idx) in enumerate(gkf.split(X_all, y, groups=groups), start=1):
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    X_train = X_all.iloc[train_idx].drop(columns=["GroupId"])
    X_val = X_all.iloc[val_idx].drop(columns=["GroupId"])

    # YDF trains from a single frame that carries the label column.
    train_fold_df = X_train.assign(Transported=y_train)
    val_fold_df = X_val.assign(Transported=y_val)

    model = _new_learner().train(train_fold_df)

    # YDF outputs probabilities for the positive class in binary classification.
    # X_val is exactly val_fold_df without the label column.
    pred = model.predict(X_val)
    pred_label = (pred >= 0.5).astype(int)

    acc = accuracy_score(y_val.to_numpy(), pred_label)
    fold_accs.append(acc)
    print(f"Fold {fold}/{N_SPLITS} accuracy: {acc:.5f}")

print(f"Mean CV accuracy: {np.mean(fold_accs):.5f} ± {np.std(fold_accs):.5f}")

# --- Final model: fit on every training row ---
X_train_full = train_df_features.drop(columns=["GroupId"]).copy()
train_full_df = X_train_full.assign(Transported=y)

final_model = _new_learner().train(train_full_df)

# --- Score the test split and write the submission file ---
X_test = test_df_features.drop(columns=["GroupId"]).copy()

test_pred = final_model.predict(X_test)
# Kept as booleans (not 0/1) for the submission column.
test_label = (test_pred >= 0.5)

submission = pd.DataFrame(
    {
        "PassengerId": test_df["PassengerId"],
        "Transported": test_label,
    }
)

out_path = PROJECT_DIR / "submission.csv"
submission.to_csv(out_path, index=False)
print(f"Wrote: {out_path}")
submission.head()


sys.version_info(major=3, minor=12, micro=3, releaselevel='final', serial=0)
Train model on 6954 examples
Model trained in 0:00:00.576070
Fold 1/5 accuracy: 0.73260
Train model on 6954 examples
Model trained in 0:00:00.257271
Fold 2/5 accuracy: 0.76366
Train model on 6954 examples
Model trained in 0:00:00.252833
Fold 3/5 accuracy: 0.76136
Train model on 6955 examples
Model trained in 0:00:00.244784
Fold 4/5 accuracy: 0.73475
Train model on 6955 examples
Model trained in 0:00:00.256195
Fold 5/5 accuracy: 0.75316
Mean CV accuracy: 0.74911 ± 0.01309
Train model on 8693 examples
Model trained in 0:00:00.359397
Wrote: /mnt/c/Users/atric/GitHub/SpaceTitanic/submission.csv


Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,True
3,0021_01,False
4,0023_01,False
