Learning [resource](https://www.kaggle.com/code/robikscube/cross-validation-visualized-youtube-tutorial/input)

### imports

In [19]:
%pip install --quiet pandas numpy lightgbm scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
)
from sklearn.model_selection import (
    train_test_split,
    TimeSeriesSplit,
    KFold,
    StratifiedKFold,
    GroupKFold,
    StratifiedGroupKFold,
)

### utils

In [2]:
def preprocessing(df):
    # fill missing values with Missing
    cols = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
    df[cols] = df[cols].fillna("Missing")
    
    # split cabin to deck, num, slide
    df[["Cabin.Deck", "Cabin.Num", "Cabin.Slide"]] = df["Cabin"].str.split("/", n=2, expand=True)
    df.drop(["Cabin", "Cabin.Num"], axis=1, inplace=True)
    
    cols = ["Cabin.Deck", "Cabin.Slide"]
    df[cols] = df[cols].fillna("Missing")
    
    # fill missing values with mean
    df["Age"] = df["Age"].fillna(df["Age"].mean())
    
    # fill missing values with 0
    cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    df[cols] = df[cols].fillna(0)
    
    # drop name due to high cardinality
    df.drop("Name", axis=1, inplace=True) 
    
    df["Transported"] = df["Transported"].astype(int)

In [17]:
def get_prep_data(df):
    # apply preprocessing
    preprocessing(df)
    
    # factorize
    for colname in df.select_dtypes("object"):
        df[colname], _ = df[colname].factorize()
    
    # sample  
    holdout_ids = df.sample(n=800, random_state=66).index
    
    # split
    train = (
        df.loc[~df.index.isin(holdout_ids)]
        .sample(frac=1, random_state=66)
        .sort_values("HomePlanet")
        .reset_index(drop=True)
    )
    holdout = (
        df.loc[df.index.isin(holdout_ids)]
        .sample(frac=1, random_state=529)
        .sort_values("HomePlanet")
        .reset_index(drop=True)
    )

    return train, holdout

In [18]:
def get_X_y(df):
    FEATURES = [
        "CryoSleep",
        "Cabin.Deck", 
        "Cabin.Slide",
        "Destination",
        "Age",
        "VIP",
        "RoomService",
        "FoodCourt",
        "ShoppingMall",
        "Spa",
        "VRDeck"
    ]
    GROUPS = "HomePlanet"
    TARGET = "Transported"

    X = df[FEATURES]
    y = df[TARGET]
    groups = df[GROUPS]
    
    return X, y, groups

### read data

In [6]:
# read data
df = pd.read_csv("dataset/spaceship-titanic-train.csv")
train, holdout = get_prep_data(df)

print(f"train: {train.shape}")
print(f"holdout: {holdout.shape}")

train: (7893, 14)
holdout: (800, 14)


### get-started (no split)

In [7]:
# train
X, y, groups = get_X_y(train)
clf = lgb.LGBMClassifier(n_estimators=100)
clf.fit(X, y)

[LightGBM] [Info] Number of positive: 3969, number of negative: 3924
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000582 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1377
[LightGBM] [Info] Number of data points in the train set: 7893, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502851 -> initscore=0.011403
[LightGBM] [Info] Start training from score 0.011403


In [8]:
# predict on training set
pred = clf.predict(X)
pred_prob = clf.predict_proba(X)[:, 1]

acc_score = accuracy_score(y, pred)
auc_score = roc_auc_score(y, pred_prob)

print(f'The score on the training set is accuracy: {acc_score:0.4f} and AUC of {auc_score:0.4f}')

The score on the training set is accuracy: 0.8571 and AUC of 0.9444


In [9]:
# evaluate on holdout
X_holdout, y_holdout, groups_holdout = get_X_y(holdout)

pred = clf.predict(X_holdout)
pred_prob = clf.predict_proba(X_holdout)[:, 1]

acc_score = accuracy_score(y_holdout, pred)
auc_score = roc_auc_score(y_holdout, pred_prob)

print(f"Our accuracy on the holdout set is {acc_score:0.4f} and AUC is {auc_score:0.4f}")

Our accuracy on the holdout set is 0.7812 and AUC is 0.8807


In [10]:
# baseline
acc_score = accuracy_score(y_holdout, np.zeros_like(y_holdout))
auc_score = roc_auc_score(y_holdout, np.zeros_like(y_holdout))

print(f"Our baseline on the holdout set is {acc_score:0.4f} and AUC is {auc_score:0.4f}")

Our baseline on the holdout set is 0.4888 and AUC is 0.5000


### train-test-split

In [11]:
# get data
X, y, groups = get_X_y(train)

X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.1)

In [12]:
# train
clf = lgb.LGBMClassifier(n_estimators=100, max_depth=3)
clf.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 3590, number of negative: 3513
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000540 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1377
[LightGBM] [Info] Number of data points in the train set: 7103, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505420 -> initscore=0.021682
[LightGBM] [Info] Start training from score 0.021682


In [13]:
# evaluate on valid split
pred = clf.predict(X_val)
pred_prob = clf.predict_proba(X_val)[:, 1]

acc_score = accuracy_score(y_val, pred)
auc_score = roc_auc_score(y_val, pred_prob)

print(f"Our accuracy on the validation set is {acc_score:0.4f} and AUC is {auc_score:0.4f}")

Our accuracy on the validation set is 0.8051 and AUC is 0.8909


In [14]:
# evaluate on holdout set
X_holdout, y_holdout, groups_holdout = get_X_y(holdout)

pred = clf.predict(X_holdout)
pred_prob = clf.predict_proba(X_holdout)[:, 1]

acc_score = accuracy_score(y_holdout, pred)
auc_score = roc_auc_score(y_holdout, pred_prob)

print(f"Our accuracy on the holdout set is {acc_score:0.4f} and AUC is {auc_score:0.4f}")

Our accuracy on the holdout set is 0.8013 and AUC is 0.8816


### stratified-k-fold

In [15]:
# read data
X, y, groups = get_X_y(train)

In [16]:
# train
sgk = StratifiedGroupKFold(n_splits=4, shuffle=True, random_state=529)

fold = 0
aucs = []
for train_idx, val_idx in sgk.split(X, y, groups):
    print(f"\n======= Fold {fold} ========")
    # get data
    X_tr, y_tr = X.loc[train_idx], y.loc[train_idx]
    X_val, y_val = X.loc[val_idx], y.loc[val_idx]
    
    # fit model
    clf = lgb.LGBMClassifier(n_estimators=100)
    clf.fit(X_tr, y_tr)
    
    # preditct
    pred = clf.predict(X_val)
    pred_prob = clf.predict_proba(X_val)[:, 1]
    
    # evaluate
    acc_score = accuracy_score(y_val, pred)
    auc_score = roc_auc_score(y_val, pred_prob)
    
    print(f"-"*25)
    print(f"Our accuracy on the validation set is {acc_score:0.4f} and AUC is {auc_score:0.4f}")
    print(f"-"*25)
    fold += 1
    aucs.append(auc_score)

oof_auc = np.mean(aucs)
print(f'Our out of fold AUC score is {oof_auc:0.4f}')


[LightGBM] [Info] Number of positive: 3876, number of negative: 3835
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000543 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1377
[LightGBM] [Info] Number of data points in the train set: 7711, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502659 -> initscore=0.010634
[LightGBM] [Info] Start training from score 0.010634
-------------------------
Our accuracy on the validation set is 0.7802 and AUC is 0.8489
-------------------------

[LightGBM] [Info] Number of positive: 2692, number of negative: 3271
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000442 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1374
[