In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# options
TRAINING = False      # fit the individual estimators on the train split and evaluate them
TRAINING_HS = False   # run the halving hyper-parameter searches on the train split
SUBMISSION = True     # refit on all training rows and write submission.csv
RANDOM_STATE = 42     # seed for numpy's global RNG; set to None to leave it unseeded

# Compare against None instead of relying on truthiness: 0 is a valid seed and
# would otherwise be skipped silently.
if RANDOM_STATE is not None:
    # The legacy global seed is used on purpose: the estimators and searches in
    # this notebook are created without an explicit random_state and therefore
    # draw from numpy's global RNG.
    np.random.seed(RANDOM_STATE)

In [None]:
import sys
# Print the version string of the interpreter running this kernel.
print(sys.version)

In [None]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
github_pat = user_secrets.get_secret("github")
url = f"https://acciochris:{github_pat}@github.com/acciochris/kaggle.git"
del github_pat
!git clone -q {url} --depth=1
del url

In [None]:
# Install the `spaceship` package from the cloned repository checkout.
%pip install ./kaggle/spaceship

In [None]:
from sklearn.utils import parallel_backend as parallel

def backend(name: str) -> str:
    """Return the parallel backend name to use while fitting estimator ``name``.

    Every estimator currently gets the same backend, so ``name`` is ignored.
    """
    chosen = "loky"
    return chosen

In [None]:
from spaceship import *
import polars as pl

# Path of the competition's training CSV inside the Kaggle input mount.
TRAIN = "/kaggle/input/spaceship-titanic/train.csv"
# Raw training table as a polars DataFrame; later cells parse it further.
df = pl.read_csv(TRAIN)
# Show the first rows as the cell output.
df.head()

In [None]:
def polars_preprocess(df):
    """Split the compound string columns and cast the integer-like columns.

    ``Cabin`` ("deck/num/side"), ``Name`` ("first last") and ``PassengerId``
    ("group_id") are each split on their separator and replaced by one column
    per part. ``CabinNum``, ``GroupId``, ``IdInGroup``, ``CryoSleep`` and
    ``VIP`` are then cast to ``Int32``. ``Transported`` is left untouched.

    Returns a new polars DataFrame; ``df`` itself is not modified.
    """
    # source column -> (separator, names of the resulting parts)
    compound = {
        "Cabin": ("/", ["CabinDeck", "CabinNum", "CabinSide"]),
        "Name": (" ", ["FirstName", "LastName"]),
        "PassengerId": ("_", ["GroupId", "IdInGroup"]),
    }
    as_structs = [
        pl.col(column).str.split(separator).list.to_struct(fields=parts)
        for column, (separator, parts) in compound.items()
    ]
    expanded = df.with_columns(*as_structs).unnest(*compound)

    int_columns = ("CabinNum", "GroupId", "IdInGroup", "CryoSleep", "VIP")
    as_ints = [pl.col(column).cast(pl.Int32) for column in int_columns]
    return expanded.with_columns(*as_ints)

parsed = polars_preprocess(df)

In [None]:
# Hand the parsed table over to pandas for the scikit-learn part of the notebook.
data = parsed.to_pandas()
# Column names of the parsed polars frame, kept for inspection.
cols = parsed.columns
data.head()

In [None]:
# List the parsed column names on one line, space separated.
print(*cols)

In [None]:
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
)
from sklearn.impute import SimpleImputer
import sklearn

# Show which scikit-learn version is installed in this kernel.
print(sklearn.__version__)
# Make transformers return pandas DataFrames; later cells read `.columns`
# and select columns by name from the transformed output.
sklearn.set_config(transform_output="pandas")

In [None]:
# Columns routed to each branch of the imputation step.
imputer_cols = {
    "id": ["GroupId", "IdInGroup"],
    # first and last names dropped
    "cat": [
        "HomePlanet", "CryoSleep", "CabinDeck", "CabinNum", "CabinSide",
        "Destination", "VIP",
    ],
    "num": ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"],
}

# Id columns pass through untouched, categorical columns take the most
# frequent value and numeric columns the median.
imputer = ColumnTransformer(
    transformers=[
        ("id", "passthrough", imputer_cols["id"]),
        ("cat", SimpleImputer(strategy="most_frequent"), imputer_cols["cat"]),
        ("num", SimpleImputer(strategy="median"), imputer_cols["num"]),
    ],
    # every column not listed above is discarded,
    # so the output column Transported is also dropped
    remainder="drop",
    verbose=True,
    verbose_feature_names_out=False,
)

# String-valued categorical columns that get one indicator column per category.
encoder_cols = {
    "one_hot": ["HomePlanet", "CabinDeck", "CabinSide", "Destination"],
}

encoder = ColumnTransformer(
    transformers=[
        (
            "one_hot",
            # sparse_output=False is required for pandas dataframes
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            encoder_cols["one_hot"],
        ),
    ],
    remainder="passthrough",
    verbose=True,
    verbose_feature_names_out=False,
)

# Numeric columns that are standardised; everything else passes through as is.
scaler_cols = {
    "standard": [
        "GroupId", "CabinNum", "Age", "RoomService",
        "FoodCourt", "ShoppingMall", "Spa", "VRDeck",
    ],
}

scaler = ColumnTransformer(
    transformers=[("standard", StandardScaler(), scaler_cols["standard"])],
    remainder="passthrough",
    verbose=True,
    verbose_feature_names_out=False,
)

# Full preprocessing chain: impute, then one-hot encode, then scale.
preprocessor = Pipeline(
    steps=list(
        zip(
            ("imputer", "encoder", "scaler"),
            (imputer, encoder, scaler),
        )
    ),
    verbose=True,
)

In [None]:
# Fit the preprocessing pipeline on the full training table and transform it.
# NOTE(review): this fit sees every row, including those that the later
# train/test split holds out -- confirm that this is acceptable for the
# hold-out metrics.
X = preprocessor.fit_transform(data)
# Target column, taken from the un-transformed table.
y = data.Transported
# Feature names after preprocessing, printed space separated.
columns = X.columns
print(*columns)

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score

# Hold out 15% of the rows, stratified on the target. Passing RANDOM_STATE
# explicitly keeps the split identical no matter how much of numpy's global
# RNG earlier cells have consumed (None falls back to the global RNG).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=RANDOM_STATE
)

In [None]:
### Feature extraction
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Groups of preprocessed columns that are each compressed on their own.
pca_lda_cols = {
    "billing": ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"],
    "home": [
        f"HomePlanet_{planet}"
        for planet in ("Earth", "Europa", "Mars", "None")
    ],
    "cabin": (
        [
            f"CabinDeck_{deck}"
            for deck in ("A", "B", "C", "D", "E", "F", "G", "T", "None")
        ]
        + [f"CabinSide_{side}" for side in ("P", "S", "None")]
    ),
    # these category names contain spaces
    "destination": [
        f"Destination_{place}"
        for place in ("55 Cancri e", "PSO J318.5-22", "TRAPPIST-1e", "None")
    ],
}

# FIXME: for some reason LDA does not work on the full training dataset
# so needs more investigation, temporarily using PCA instead.
# The intended setup was LDA(n_components=1) for "home" and "destination".
pca_lda = ColumnTransformer(
    transformers=[
        (group, PCA(n_components=width), pca_lda_cols[group])
        for group, width in (
            ("billing", 3),
            ("home", 1),
            ("cabin", 3),
            ("destination", 1),
        )
    ],
    remainder="passthrough",
    verbose=False,
    # prefix output names with the group so that the names do not collide
    verbose_feature_names_out=True,
)

In [None]:
# Fit the feature-extraction transformer on the training split only, then
# apply the fitted transformer to the held-out split.
X_train_pca_lda = pca_lda.fit_transform(X_train, y_train)
X_test_pca_lda = pca_lda.transform(X_test)

In [None]:
# Preview the extracted features of the training split.
X_train_pca_lda.head()

In [None]:
# Print the explained-variance ratios of each fitted PCA.
# The last entry of transformers_ is skipped -- presumably the "remainder"
# passthrough, which has no explained_variance_ratio_ (confirm).
for _, estimator, _ in pca_lda.transformers_[:-1]:
    print(estimator.explained_variance_ratio_)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# The three base classifiers, all created with their default hyper-parameters.
logistic = LogisticRegression()
forest = RandomForestClassifier()
knn = KNeighborsClassifier()

In [None]:
import sklearn.base

# Logistic regression and k-NN are each chained after their own clone of the
# pca_lda transformer; the random forest is used without this step.
logistic_col = make_pipeline(sklearn.base.clone(pca_lda), logistic)
knn_col = make_pipeline(sklearn.base.clone(pca_lda), knn)

In [None]:
from pprint import pprint

# Pretty-print the tunable parameters of each model, with two blank lines
# between consecutive dumps.
for position, model in enumerate((logistic_col, knn_col, forest)):
    if position:
        print(); print()
    pprint(model.get_params())

In [None]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV
import scipy.stats

# Search spaces for the randomized halving searches, keyed by model name.
# Keys prefixed with "<step>__" address a step inside a pipeline; the forest
# is not wrapped in a pipeline, so its keys are plain parameter names.
# NOTE(review): scipy's randint samples from [low, high), so the upper bounds
# below should be exclusive -- confirm against the scipy docs.
param_grid = {
    # Two alternative spaces: the default penalty/solver, or L1 with liblinear.
    "logistic": [
        {'logisticregression__C': scipy.stats.loguniform(0.001, 100)},
        {
            'logisticregression__penalty': ['l1'],
            'logisticregression__solver': ['liblinear'],
            'logisticregression__C': scipy.stats.loguniform(0.001, 100),
        }
    ],
    "forest": {
        "n_estimators": scipy.stats.randint(100, 1000),
        "max_depth": scipy.stats.randint(1, 5),
    },
    "knn": {
        'kneighborsclassifier__n_neighbors': scipy.stats.randint(1, 10),
        'kneighborsclassifier__weights': ["uniform", "distance"]
    }
}

# Settings shared by all three searches.
hs_kwargs = {"factor": 3, "n_jobs": -1, "verbose": 0}

# One search per model, each over its own entry of param_grid.
hs_log, hs_forest, hs_knn = (
    HalvingRandomSearchCV(model, param_grid[key], **hs_kwargs)
    for key, model in (
        ("logistic", logistic_col),
        ("forest", forest),
        ("knn", knn_col),
    )
)

In [None]:
# Clones of the tuned estimators handed out by estimators(), keyed by label.
# Each entry is (best_estimator_ it was cloned from, clone) so that refitting a
# search invalidates the cached clone.
_tuned_estimators = {}


def estimators(hs: bool = False):
    """Yield ``(name, estimator)`` pairs for the three models.

    Parameters
    ----------
    hs : bool, default False
        If true, yield the halving-search objects themselves. Otherwise yield
        the estimators to train: clones of each search's ``best_estimator_``
        when all searches have been fitted, or the untuned module-level
        estimators if any search is still unfitted.

    Returns
    -------
    iterator of (str, estimator)
        Pairs in the fixed order "logistic", "forest", "knn".

    Notes
    -----
    The clones are cached per ``best_estimator_`` object, so repeated calls
    return the *same* estimator objects. An estimator fitted through one call
    can therefore be used for prediction through a later call. Previously every
    call produced brand-new unfitted clones, which made fit-then-predict across
    cells fail once the searches had been run. A refitted search has a new
    ``best_estimator_`` and therefore gets a fresh clone.
    """
    labels = ["logistic", "forest", "knn"]
    searches = [hs_log, hs_forest, hs_knn]

    if hs:
        return zip(labels, searches)

    try:
        winners = [search.best_estimator_ for search in searches]
    except AttributeError:
        # fall back to the untrained version if necessary
        return zip(labels, [logistic_col, forest, knn_col])

    tuned = []
    for label, winner in zip(labels, winners):
        source, copy = _tuned_estimators.get(label, (None, None))
        if source is not winner:
            copy = sklearn.base.clone(winner)
            _tuned_estimators[label] = (winner, copy)
        tuned.append(copy)
    return zip(labels, tuned)

# Run the three hyper-parameter searches on the training split (optional).
if TRAINING_HS:
    for name, hs in estimators(hs=True):
        print(f"Fitting {name}")
        # fit inside the parallel backend selected for this estimator
        with parallel(backend(name)):
            hs.fit(X_train, y_train)


In [None]:
# Report the best score and parameters of every search, followed by a blank line.
if TRAINING_HS:
    for name, hs in estimators(hs=True):
        print(f"Result for {name}:", hs.best_score_, hs.best_params_, "", sep="\n")

In [None]:
# according to one run

"""
Result for logistic:
0.7362721850562073
{'logisticregression__C': 1.1365231628723127}

Result for forest:
0.7521007071725319
{'max_depth': 4, 'n_estimators': 417}

Result for knn:
0.733344406769306
{'kneighborsclassifier__n_neighbors': 6, 'kneighborsclassifier__weights': 'distance'}
"""

# commented out since we recover best_estimator_ from hs itself
# logistic_col.set_params(**{'logisticregression__C': 1.1365231628723127})
# forest.set_params(**{'max_depth': 4, 'n_estimators': 417})
# knn_col.set_params(**{'kneighborsclassifier__n_neighbors': 6, 'kneighborsclassifier__weights': 'distance'})

# however we reassign them, just in case:
# rebind the module-level names to whatever estimators() currently yields --
# estimators derived from the searches once those are fitted, otherwise the
# untuned originals (which makes this a no-op).
logistic_col, forest, knn_col = [est for _, est in estimators()]

In [None]:
# Fit each estimator yielded by estimators() on the training split (optional).
if TRAINING:
    for _, estimator in estimators():
        estimator.fit(X_train, y_train)

In [None]:
# Collect hard predictions and class probabilities on the held-out split,
# keyed by estimator name.
if TRAINING:
    y_preds = {}
    for name, estimator in estimators():
        y_preds[name] = estimator.predict(X_test)

    y_preds_proba = {}
    for name, estimator in estimators():
        y_preds_proba[name] = estimator.predict_proba(X_test)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    ConfusionMatrixDisplay,
)

%matplotlib inline

def evaluate(name, y_test, y_pred):
    """Print classification metrics and draw one confusion matrix.

    Parameters
    ----------
    name : str
        Label printed in the header and used as the figure title.
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Prints precision, recall, F1 and the Matthews correlation coefficient,
    then shows a single confusion-matrix figure. Returns ``None``.
    """
    print(f"Result for {name}:")
    print(f"precision: {precision_score(y_test, y_pred)}")
    print(f"recall: {recall_score(y_test, y_pred)}")
    print(f"f1: {f1_score(y_test, y_pred)}")
    print(f"matthews: {matthews_corrcoef(y_test, y_pred)}")

    # from_predictions already draws the matrix; calling .plot() on its result
    # would create a second, duplicate figure. Draw once on explicit axes.
    fig, ax = plt.subplots()
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
    ax.set_title(name)
    # Show the figure right below the metrics it belongs to.
    plt.show()

if TRAINING:
    for name, y_pred in y_preds.items():
        evaluate(name, y_test, y_pred)

In [None]:
# Stack the per-estimator class probabilities and print summary statistics.
if TRAINING:
    # presumably shaped (n_estimators, n_samples, n_classes) -- TODO confirm
    probas = np.array(list(y_preds_proba.values()))
    print(probas)
    # NOTE(review): axis=1 reduces over the second axis of the stack, i.e. it
    # summarises each estimator separately; if the spread *between* estimators
    # was intended, axis=0 would be needed -- confirm.
    print("Mean:")
    print(np.mean(probas, axis=1))
    print("Std:")
    print(np.std(probas, axis=1))

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over the (name, estimator) pairs from estimators().
voter = VotingClassifier(
    list(estimators()),
    voting="soft",
    # weights=[0.36, 0.36, 0.28],  # commented out since forest and logistic overfit
)

# Optionally fit the ensemble on the training split and evaluate it on the
# held-out split with the same report as the individual estimators.
if TRAINING:
    voter.fit(X_train, y_train)
    y_pred_voter = voter.predict(X_test)
    evaluate("voter", y_test, y_pred_voter)

In [None]:
# submission

if SUBMISSION:
    TEST = "/kaggle/input/spaceship-titanic/test.csv"
    submit_df = pl.read_csv(TEST)
    submit_data = polars_preprocess(submit_df).to_pandas()
    # reuse the preprocessor that was already fitted on the training table
    X_submit = preprocessor.transform(submit_data)

    # first figure out best hyperparameters, this time on all training rows
    for name, hs in estimators(hs=True):
        print(f"Fitting {name}")
        with parallel(backend(name)):
            hs.fit(X, y)

    # soft-voting ensemble over the estimators derived from the fitted searches
    final = VotingClassifier(
        list(estimators()),
        voting="soft",
    )

    final.fit(X, y)
    y_submit = final.predict(X_submit)
    submission = pd.DataFrame({
        # ids are taken from the raw test table, before PassengerId was split
        "PassengerId": submit_df.get_column("PassengerId").to_numpy(),
        # vectorised cast to a real boolean column instead of handing pandas
        # a lazy one-shot `map` iterator
        "Transported": np.asarray(y_submit, dtype=bool),
    })
    submission.to_csv("submission.csv", index=False)