In [6]:
# Shell escape: runs `uv sync` (presumably to bring the kernel's environment in line with the project's declared dependencies — confirm against the uv docs).
!uv sync

[2mResolved [1m143 packages[0m [2min 12ms[0m[0m
[2K[2mPrepared [1m1 package[0m [2min 283ms[0m[0m                                              
[2mUninstalled [1m1 package[0m [2min 0.66ms[0m[0m
[2K[2mInstalled [1m1 package[0m [2min 0.90ms[0m[0m1.0 (from file:///Volumes/MacHo[0m
 [33m~[39m [1mkaggle-submissions[0m[2m==0.1.0 (from file:///Volumes/MacHome/alexulanch/Developer/personal/github/kaggle-submissions)[0m


In [49]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier

In [50]:
# Read the train and test CSVs from the relative data/ folder in one pass.
train, test = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

# Quick sanity check of (rows, columns) for both frames.
print(train.shape, test.shape)

(8693, 14) (4277, 13)


In [51]:
# Seed NumPy's global RNG; SEED is also passed as random_state to the split below.
SEED = 42
np.random.seed(SEED)

# Column dtypes exactly as read from the CSV.
print(train.dtypes)

# Fraction of missing values per column, highest first.
print(
    "\nnull rates:",
    train.isna().mean().sort_values(ascending=False),
    sep="\n",
)

# Share of each target class, rounded to three decimals.
print(
    "\nclass balance:",
    train["Transported"].value_counts(normalize=True).round(3),
    sep="\n",
)

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

null rates:
CryoSleep       0.024963
ShoppingMall    0.023927
VIP             0.023352
HomePlanet      0.023122
Name            0.023007
Cabin           0.022892
VRDeck          0.021627
FoodCourt       0.021051
Spa             0.021051
Destination     0.020936
RoomService     0.020821
Age             0.020591
PassengerId     0.000000
Transported     0.000000
dtype: float64

class balance:
Transported
True     0.504
False    0.496
Name: proportion, dtype: float64


In [52]:
# Recast the boolean target to pandas' nullable Int64 (True/False become 1/0).
train["Transported"] = train["Transported"].astype("Int64")

In [53]:
# Derive id-based features and integer flags on both frames.
# The frames are mutated in place on purpose: rebinding the loop variable
# would not update the notebook-level `train` / `test` names.
for df in (train, test):
    # Split the id on "_" once and reuse both halves (previously the same
    # expand=True split was computed twice per frame).
    id_parts = df["PassengerId"].astype(str).str.split("_", expand=True)
    df["_Group"] = id_parts[0].astype("Int64")   # part before the underscore
    df["SeatNum"] = id_parts[1].astype("Int64")  # part after the underscore

    # Missing flags are imputed as 0 before casting to nullable integers.
    df["VIP"] = df["VIP"].fillna(0).astype("Int64")
    df["CryoSleep"] = df["CryoSleep"].fillna(0).astype("Int64")

    # The raw id has been decomposed above; Name is dropped unused.
    # NOTE: this makes the cell non-re-runnable without reloading the CSVs.
    df.drop(columns=["PassengerId", "Name"], inplace=True)

In [54]:
# Preview the first ten rows to eyeball the new _Group / SeatNum columns and the 0/1 flags.
train.head(10)

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,_Group,SeatNum
0,Europa,0,B/0/P,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,0,1,1
1,Earth,0,F/0/S,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,1,2,1
2,Europa,0,A/0/S,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,3,1
3,Europa,0,A/0/S,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,3,2
4,Earth,0,F/1/S,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,1,4,1
5,Earth,0,F/0/P,PSO J318.5-22,44.0,0,0.0,483.0,0.0,291.0,0.0,1,5,1
6,Earth,0,F/2/S,TRAPPIST-1e,26.0,0,42.0,1539.0,3.0,0.0,0.0,1,6,1
7,Earth,1,G/0/S,TRAPPIST-1e,28.0,0,0.0,0.0,0.0,0.0,,1,6,2
8,Earth,0,F/3/S,TRAPPIST-1e,35.0,0,0.0,785.0,17.0,216.0,0.0,1,7,1
9,Europa,1,B/1/P,55 Cancri e,14.0,0,0.0,0.0,0.0,0.0,0.0,1,8,1


In [55]:
# Hold-out split keyed on _Group, so rows sharing a group id are passed to
# GroupShuffleSplit as one unit rather than split row by row.
# NOTE(review): this split is taken before process_spend_features is applied
# to `train` further down, so `trn` / `val` do not carry TotalSpend /
# LogTotalSpend — confirm that later cells re-derive them or re-split.
groups = train["_Group"].values
y = train["Transported"].values

# Single shuffle split with test_size=0.2, seeded with SEED for repeatability.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
train_idx, val_idx = next(gss.split(train, y, groups))

# Positional selection, then a fresh 0..n-1 index on each part.
trn = train.iloc[train_idx].reset_index(drop=True)
val = train.iloc[val_idx].reset_index(drop=True)

# Report sizes and target means of both parts to spot an unbalanced split.
print("Train shape:", trn.shape, " Valid shape:", val.shape)
print("Target mean (train):", trn["Transported"].mean().round(3))
print("Target mean (valid):", val["Transported"].mean().round(3))

Train shape: (6930, 14)  Valid shape: (1763, 14)
Target mean (train): 0.5
Target mean (valid): 0.518


In [56]:
# Amenity columns treated as per-passenger spend.
spend_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]


def process_spend_features(df: pd.DataFrame, cols: "list[str] | None" = None) -> pd.DataFrame:
    """Return a copy of ``df`` with cleaned spend columns and spend totals.

    Steps, in order:

    1. Missing values in the spend columns are replaced with 0.
    2. Rows where ``CryoSleep == 1`` get every spend column set to 0.
    3. ``TotalSpend`` (row-wise sum of the spend columns) and
       ``LogTotalSpend`` (``log1p`` of that sum) are added.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``CryoSleep`` and every spend column. It is not
        modified; all work happens on a copy.
    cols : list[str] | None
        Spend columns to use. ``None`` (the default) uses the module-level
        ``spend_cols``, which reproduces the original behaviour.

    Returns
    -------
    pd.DataFrame
        New frame with the spend columns cleaned and the two total columns
        appended.

    Raises
    ------
    KeyError
        If ``CryoSleep`` or any requested spend column is absent.
    """
    cols = list(spend_cols if cols is None else cols)

    # Fail early with a message that names what is missing.
    missing = [c for c in ["CryoSleep", *cols] if c not in df.columns]
    if missing:
        raise KeyError(f"process_spend_features: missing column(s) {missing}")

    out = df.copy()
    out[cols] = out[cols].fillna(0)

    # A missing CryoSleep flag is treated as "not asleep"; casting to a plain
    # bool mask keeps .loc from ever seeing NA in the indexer.
    asleep = (out["CryoSleep"] == 1).fillna(False).astype(bool)
    out.loc[asleep, cols] = 0

    out["TotalSpend"] = out[cols].sum(axis=1)
    # log1p keeps zero spenders at exactly 0.
    out["LogTotalSpend"] = np.log1p(out["TotalSpend"])

    return out

In [57]:
# Run the spend clean-up on both frames and rebind the notebook-level names.
train, test = map(process_spend_features, (train, test))

In [59]:
# Preview the first rows to check the new TotalSpend / LogTotalSpend columns.
train.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,_Group,SeatNum,TotalSpend,LogTotalSpend
0,Europa,0,B/0/P,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,0,1,1,0.0,0.0
1,Earth,0,F/0/S,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,1,2,1,736.0,6.602588
2,Europa,0,A/0/S,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,3,1,10383.0,9.248021
3,Europa,0,A/0/S,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,3,2,5176.0,8.551981
4,Earth,0,F/1/S,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,1,4,1,1091.0,6.995766
