In [1]:
import crunch

# Rebinds the name `crunch` from the imported module to whatever
# load_notebook() returns; every later `crunch.` reference in this notebook
# (e.g. the crunch.test(...) call) therefore goes through that returned object.
crunch = crunch.load_notebook()

loaded inline runner with module: <module '__main__'>

cli version: 7.5.0
available ram: 31.71 gb
available cpu: 28 core
----


In [2]:
import numpy as np # == 2.1.2
import pandas as pd # == 2.3.2
import polars as pl # == 1.2.1
import os
import typing
import joblib # == 1.5.2
import sklearn # == 1.6.1
import lightgbm as lgb # == 4.6.0
from tabpfn import TabPFNClassifier # == 2.1.3
import shap # == 0.48.0
import scipy # == 1.16.1

from scipy.stats import f, ks_2samp, levene

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator, ClassifierMixin, clone

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class FirstFeatureGenerator:
    """Per-id summary statistics of the raw ``value`` column."""

    def generate(self, X, suffix):
        """Aggregate ``value`` per ``id`` and return one feature row per id.

        For every id the mean, median, max, min, standard deviation and skew of
        ``value`` are computed, plus the mean and the median divided by the
        standard deviation. Rows come back sorted by ``id`` with the ``id``
        column removed.

        Note: despite its name, ``suffix`` is *prepended* to every column name.
        """
        value = pl.col('value')
        stat_names = ('mean', 'median', 'max', 'min', 'std', 'skew')

        # One aggregation per statistic, aliased as val_<stat>.
        aggregations = [
            getattr(value, stat)().alias(f'val_{stat}')
            for stat in stat_names
        ]
        per_id = X.group_by('id').agg(aggregations)

        spread = pl.col('val_std')
        with_ratios = per_id.with_columns(
            (pl.col('val_mean') / spread).alias('mean_norm'),
            (pl.col('val_median') / spread).alias('median_norm'),
        )

        # Sorting before dropping the key fixes the row order the caller relies on.
        ordered = with_ratios.sort('id').drop('id')
        return ordered.rename(lambda name: suffix + name)

In [4]:
class SecondFeatureGenerator:
    """Per-id summary statistics over seven series derived from ``value``."""

    def generate(self, X, suffix):
        """Derive seven per-id series and summarise each of them per id.

        Derived series (all window functions are evaluated within each ``id``):
          value_1: the raw value
          value_2: value minus the per-id mean, divided by the per-id std
          value_3: running cumulative sum
          value_4: dense rank divided by the per-id count
          value_5: absolute value
          value_6: rolling mean over 16 rows
          value_7: rolling std over 16 rows

        Each series is reduced to mean/median/max/min/std/skew, and the mean and
        median are additionally divided by the std. Rows come back sorted by
        ``id`` with the ``id`` column removed.

        Note: despite its name, ``suffix`` is *prepended* to every column name.
        """
        value = pl.col('value')
        series_ids = range(1, 8)
        stat_names = ('mean', 'median', 'max', 'min', 'std', 'skew')

        derived = X.with_columns(
            value.alias('value_1'),
            ((value - value.mean().over('id')) / value.std().over('id')).alias('value_2'),
            value.cum_sum().over('id').alias('value_3'),
            (value.rank('dense').over('id') / value.count().over('id')).alias('value_4'),
            value.abs().alias('value_5'),
            value.rolling_mean(16).over('id').alias('value_6'),
            value.rolling_std(16).over('id').alias('value_7'),
        )

        # Series-major, statistic-minor: val_<stat>_<k>.
        aggregations = [
            getattr(pl.col(f'value_{k}'), stat)().alias(f'val_{stat}_{k}')
            for k in series_ids
            for stat in stat_names
        ]

        # mean_norm_<k> and median_norm_<k>: location scaled by the series' std.
        ratios = [
            (pl.col(f'val_{stat}_{k}') / pl.col(f'val_std_{k}')).alias(f'{stat}_norm_{k}')
            for k in series_ids
            for stat in ('mean', 'median')
        ]

        summary = derived.group_by('id').agg(aggregations).with_columns(ratios)
        ordered = summary.sort('id').drop('id')
        return ordered.rename(lambda name: suffix + name)

In [5]:
class ThirdFeatureGenerator:
    """Windowed statistics around the ``period`` 0 -> 1 boundary, per id."""

    def generate(self, X, suffix):
        """Compare windows taken on either side of the period boundary.

        The same seven derived series as in ``SecondFeatureGenerator`` are built,
        together with a one-row lag of each (``val_shift_<j>``). For every window
        size ``i`` four slices are stacked per id:

          period 0: last ``i`` rows with ``period == 0``
          period 2: last ``2 * i`` rows with ``period == 0`` (relabelled)
          period 3: last ``3 * i`` rows with ``period == 0`` (relabelled)
          period 1: first ``i`` rows with ``period == 1``

        Each slice is summarised per (id, period), pivoted so that every period
        becomes its own set of columns, and the period-1 statistics are
        subtracted from those of the other three slices. The per-window frames
        are concatenated horizontally; rows are sorted by ``id`` and the ``id``
        column is dropped.

        Note: despite its name, ``suffix`` is *prepended* to every column name.
        """
        series_ids = [1, 2, 3, 4, 5, 6, 7]
        window_sizes = [20, 60, 120, 500, 1000]
        # Statistics that get a "<slice> minus period 1" difference column.
        level_stats = ('mean', 'std', 'min', 'max', 'q25', 'q50', 'q75')
        # Column order handed to pivot() for each series.
        pivot_stats = ('mean', 'std', 'min', 'max', 'skew', 'q25', 'q50', 'q75', 'corr')
        value = pl.col('value')

        X = (
            X
            .with_columns(
                value.alias('value_1'),
                ((value - value.mean().over('id')) / value.std().over('id')).alias('value_2'),
                value.cum_sum().over('id').alias('value_3'),
                (value.rank('dense').over('id') / value.count().over('id')).alias('value_4'),
                value.abs().alias('value_5'),
                value.rolling_mean(16).over('id').alias('value_6'),
                value.rolling_std(16).over('id').alias('value_7'),
            )
            # The lag is taken over the full per-id series, before any slicing.
            .with_columns([
                pl.col(f'value_{j}').shift(1).over('id').alias(f'val_shift_{j}')
                for j in series_ids
            ])
        )

        before = X.filter(pl.col('period').eq(0))
        after = X.filter(pl.col('period').eq(1))

        def relabel(frame, label):
            # Give a widened period-0 slice its own pseudo-period so pivot() keeps it apart.
            return frame.with_columns(pl.lit(label).cast(pl.Int64).alias('period'))

        def window_block(i):
            def summaries(j):
                source = f'value_{j}'
                tag = f'{j}_{i}'
                return [
                    # Correlation between the series and its one-row lag.
                    pl.corr(source, f'val_shift_{j}').alias(f'val_corr_{tag}'),
                    pl.col(source).quantile(0.25).alias(f'val_q25_{tag}'),
                    pl.col(source).quantile(0.50).alias(f'val_q50_{tag}'),
                    pl.col(source).quantile(0.75).alias(f'val_q75_{tag}'),
                    pl.mean(source).alias(f'val_mean_{tag}'),
                    pl.std(source).alias(f'val_std_{tag}'),
                    pl.min(source).alias(f'val_min_{tag}'),
                    pl.max(source).alias(f'val_max_{tag}'),
                    pl.col(source).skew().alias(f'val_skew_{tag}'),
                ]

            def gap(stat, j, other, label):
                # "<other slice> minus period 1" for one statistic of one series.
                stem = f'val_{stat}_{j}_{i}'
                return (pl.col(f'{stem}_{other}') - pl.col(f'{stem}_1')).alias(f'{stem}_{label}_diff')

            stacked = pl.concat([
                before.group_by('id').tail(i),
                relabel(before.group_by('id').tail(i * 2), 2),
                relabel(before.group_by('id').tail(i * 3), 3),
                after.group_by('id').head(i),
            ])

            aggregations = [expr for j in series_ids for expr in summaries(j)]
            pivot_values = [
                f'val_{stat}_{j}_{i}'
                for j in series_ids
                for stat in pivot_stats
            ]

            differences = []
            for j in series_ids:
                # The period-0 difference is labelled "_1_diff" for the level
                # statistics but "_0_diff" for the correlation; both names are kept.
                for other, label in ((0, 1), (2, 2), (3, 3)):
                    for stat in level_stats:
                        differences.append(gap(stat, j, other, label))
                for other in (0, 2, 3):
                    differences.append(gap('corr', j, other, other))

            return (
                stacked
                .group_by('id', 'period')
                .agg(aggregations)
                .pivot(index=['id'], on=['period'], values=pivot_values)
                .with_columns(differences)
                .sort('id')
                .drop('id')
                .rename(lambda name: suffix + name)
            )

        return pl.concat([window_block(i) for i in window_sizes], how='horizontal')

In [6]:
class FourthFeatureGenerator:
    """Two-sample tests comparing ``period == 0`` rows with ``period == 1`` rows, per id."""

    @staticmethod
    def _two_sided_f_test(a, b):
        """Return ``(F, p)`` for the two-sided variance-ratio F-test of ``a`` against ``b``.

        ``F`` is the ratio of the sample variances (``ddof=1``). The p-value is
        twice the smaller tail probability. The upper tail uses the survival
        function rather than ``1 - cdf``: once the cdf rounds to 1.0 the
        subtraction returns exactly 0.0, discarding the magnitude of very small
        p-values.
        """
        dfn, dfd = len(a) - 1, len(b) - 1
        f_statistic = np.var(a, ddof=1) / np.var(b, ddof=1)

        lower_tail = f.cdf(f_statistic, dfn, dfd)
        upper_tail = f.sf(f_statistic, dfn, dfd)
        return f_statistic, 2 * min(lower_tail, upper_tail)

    def perform_hypothesis_test(self, df_group: "pl.DataFrame") -> "pl.DataFrame":
        """Run the F, Levene and two-sample KS tests on one id's rows.

        Args:
            df_group: rows of a single id with ``id``, ``period`` and ``value``
                columns; the id is read from the first row.

        Returns:
            A one-row frame holding the id plus the statistic and p-value of
            each of the three tests.
        """
        group_id = df_group['id'].to_numpy()[0]
        a = df_group.filter(pl.col('period') == 0)['value'].to_numpy()
        b = df_group.filter(pl.col('period') == 1)['value'].to_numpy()

        f_statistic, f_p_value = self._two_sided_f_test(a, b)
        levene_statistic, levene_p_value = levene(a, b)
        ks_statistic, ks_p_value = ks_2samp(a, b)

        return pl.DataFrame({
            'id': [group_id],
            'f_statistic': [f_statistic],
            'f_p_value': [f_p_value],
            'levene_statistic': [levene_statistic],
            'levene_p_value': [levene_p_value],
            'ks_statistic': [ks_statistic],
            'ks_p_value': [ks_p_value],
        })

    def generate(self, X, suffix):
        """Apply the hypothesis tests to every id and return one feature row per id.

        ``value`` is replaced by its absolute value before testing. Rows come
        back sorted by ``id`` with the ``id`` column removed.

        Note: despite its name, ``suffix`` is *prepended* to every column name.
        """
        return (
            X
            .with_columns(pl.col('value').abs())
            .group_by('id').map_groups(self.perform_hypothesis_test)
            .sort('id')
            .drop('id')
            .rename(lambda col: suffix + col)
        )

In [7]:
def generate_features(X):
    """Build the two feature tables used by the models.

    Args:
        X: pandas frame with a two-level index whose first level is the series
            id, plus the ``value`` and ``period`` columns the generators read.
            # assumes the second index level is the time axis — TODO confirm

    Returns:
        ``(X1, XS)``: two pandas frames indexed by id, one row per id.
        ``X1`` holds the ``FirstFeatureGenerator`` features; ``XS`` holds the
        features of the other three generators side by side.
    """
    gen1 = FirstFeatureGenerator()
    gen2 = SecondFeatureGenerator()
    gen3 = ThirdFeatureGenerator()
    gen4 = FourthFeatureGenerator()

    X = X.reset_index(level=1)
    # Every generator returns its rows sorted by id and then drops the id
    # column, so the labels re-attached below must be in sorted order too.
    # Taking them in first-appearance order silently mislabels rows whenever
    # the input is not already sorted by id.
    ids = sorted(X.index.unique().tolist())
    X = pl.from_pandas(X.reset_index()).with_columns(pl.col('value').cast(pl.Float32))

    # The prefixes number the output tables, not the generators: the 'step2'
    # table comes from gen3 and the 'step3' table from gen2.
    X1 = gen1.generate(X, 'step1')
    X2 = gen3.generate(X, 'step2')
    X3 = gen2.generate(X, 'step3')
    X4 = gen4.generate(X, 'step4')

    # Alphabetical column order keeps the layout independent of generation order.
    X1 = X1[sorted(X1.columns)]
    X2 = X2[sorted(X2.columns)]
    X3 = X3[sorted(X3.columns)]
    X4 = X4[sorted(X4.columns)]

    XS = pl.concat([X2, X3, X4], how='horizontal')

    XS = XS.to_pandas()
    X1 = X1.to_pandas()

    XS.index = ids
    X1.index = ids

    return X1, XS

In [8]:
def get_feature_importance(X, y):
    """Rank the columns of ``X`` by SHAP and by gain importance.

    A LightGBM classifier is fitted on ``(X, y)`` and two rankings are derived
    from it.

    Args:
        X: pandas feature frame.
        y: target labels aligned with the rows of ``X``.

    Returns:
        ``(shap_importance, gain_importance)``: two lists of column names, each
        sorted in ascending order of importance (most important name last).
    """
    model = lgb.LGBMClassifier(
        n_estimators=750,
        learning_rate=0.01,
        colsample_bytree=0.3,
        max_depth=8,
        random_state=42,
        verbosity=-1,
    )
    model.fit(X, y)

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)
    # Legacy layout: a list with one array per class; keep the positive class.
    if isinstance(shap_values, list) and len(shap_values) == 2:
        shap_values = shap_values[1]
    shap_values = np.asarray(shap_values)
    # Stacked layout: (n_samples, n_features, 2); keep the positive class so the
    # mean below stays one value per feature.
    # NOTE(review): SHAP may or may not emit this layout for LightGBM binary
    # models depending on its version — the branch is defensive.
    if shap_values.ndim == 3 and shap_values.shape[-1] == 2:
        shap_values = shap_values[..., 1]

    shap_importance = np.abs(shap_values).mean(axis=0)
    shap_importance = pd.Series(shap_importance, index=X.columns).sort_values()
    shap_importance = shap_importance.index.tolist()

    gain_importance = model.booster_.feature_importance(importance_type='gain')
    gain_importance = pd.Series(gain_importance, index=X.columns).sort_values()
    gain_importance = gain_importance.index.tolist()

    return shap_importance, gain_importance

In [9]:
def build_gbdt(seed):
    """Return an unfitted LightGBM classifier with the shared settings and ``seed`` as its random state."""
    hyper_params = {
        'n_estimators': 5000,
        'learning_rate': 0.01,
        'colsample_bytree': 0.2,
        'bagging_freq': 4,
        'bagging_fraction': 0.8,
        'max_depth': 8,
        'random_state': seed,
        'verbosity': -1,
    }
    return lgb.LGBMClassifier(**hyper_params)

In [10]:
class MultiInputClassifier(BaseEstimator, ClassifierMixin):
    """Ensemble of four LightGBM classifiers trained on different column subsets.

    ``columns1`` and ``columns2`` are column-name lists whose most relevant
    entries come last. Two members use the last 200 names of each list and two
    use the last 500 names of each list.
    """

    def __init__(self, columns1, columns2):
        # Unfitted templates, one seed per member; fit() clones them.
        self.model1 = build_gbdt(12)
        self.model2 = build_gbdt(22)
        self.model3 = build_gbdt(32)
        self.model4 = build_gbdt(42)

        self.columns1 = columns1
        self.columns2 = columns2

    def _members(self):
        """Pair each fitted member with the columns it consumes (only valid after fit)."""
        return (
            (self.model1_, self.columns1[-200:]),
            (self.model2_, self.columns2[-200:]),
            (self.model3_, self.columns1[-500:]),
            (self.model4_, self.columns2[-500:]),
        )

    def fit(self, X, y):
        """Fit fresh clones of the four templates on their column subsets and return ``self``."""
        self.model1_ = clone(self.model1)
        self.model2_ = clone(self.model2)
        self.model3_ = clone(self.model3)
        self.model4_ = clone(self.model4)

        for model, columns in self._members():
            model.fit(X[columns], y)

        # All members see the same y, so they share one label set.
        self.classes_ = self.model1_.classes_

        return self

    def predict(self, X):
        """Return the majority-vote class label for each row of ``X``.

        Votes are counted on class *indices* and mapped back to the labels in
        ``classes_``, so the result is a real label even when the labels are not
        0..k-1. Ties go to the class with the lowest index.
        """
        # Read from the member rather than self.classes_ so instances pickled
        # before classes_ existed keep working.
        classes = self.model1_.classes_
        votes = np.vstack([
            np.searchsorted(classes, model.predict(X[columns]))
            for model, columns in self._members()
        ])
        winners = np.apply_along_axis(
            lambda ballot: np.bincount(ballot, minlength=len(classes)).argmax(),
            axis=0,
            arr=votes,
        )
        return classes[winners]

    def predict_proba(self, X):
        """Return the unweighted mean of the members' class probabilities."""
        members = self._members()
        total = sum(model.predict_proba(X[columns]) for model, columns in members)
        return total / len(members)

In [11]:
def train(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    model_directory_path: str,
):
    """Fit the two-stage model and persist it under ``model_directory_path``.

    Stage 1: five TabPFN models are loaded from ``model_tabpfn_<k>.joblib``
    (so those files must already exist), each is fitted on one training fold,
    written back to the same file and used to score its held-out fold.
    Stage 2: the out-of-fold scores are appended to the second feature table
    as ``col_0``, features are ranked, and a ``MultiInputClassifier`` is fitted
    and saved as ``model_gbdt.joblib``.
    """
    X1, X2 = generate_features(X_train)

    def fold_path(k):
        return os.path.join(model_directory_path, f'model_tabpfn_{k}.joblib')

    model_tabpfn = [joblib.load(fold_path(k)) for k in range(5)]

    splitter = KFold(5, shuffle=True, random_state=42)
    oof_parts = []
    for fold, (train_pos, valid_pos) in enumerate(splitter.split(X1, y_train)):
        estimator = model_tabpfn[fold]
        estimator.fit(X1.iloc[train_pos], y_train.iloc[train_pos])
        joblib.dump(estimator, fold_path(fold))

        # Positive-class probability for the rows this fold did not train on.
        fold_scores = estimator.predict_proba(X1.iloc[valid_pos])[:, 1]
        fold_frame = pd.DataFrame([fold_scores]).T
        fold_frame.index = y_train.iloc[valid_pos].index
        oof_parts.append(fold_frame)

    # Folds are shuffled, so restore index order before joining on it.
    oof = pd.concat(oof_parts, axis=0).sort_index().add_prefix('col_')
    stacked = pd.concat([X2, oof], axis=1)

    shap_imp, gain_imp = get_feature_importance(stacked, y_train)

    model_gbdt = MultiInputClassifier(columns1=shap_imp, columns2=gain_imp)
    model_gbdt.fit(stacked, y_train)
    joblib.dump(model_gbdt, os.path.join(model_directory_path, 'model_gbdt.joblib'))

In [12]:
def infer(
    X_test: typing.Iterable[pd.DataFrame],
    model_directory_path: str,
):
    """Generator that scores each dataset of ``X_test`` with the saved models.

    The first ``yield`` carries no value and only signals that the models are
    loaded. After that, one array of positive-class probabilities is yielded
    per dataset, with one entry per feature row of that dataset.
    """
    model_tabpfn = [
        joblib.load(os.path.join(model_directory_path, f'model_tabpfn_{k}.joblib'))
        for k in range(5)
    ]
    model_gbdt = joblib.load(os.path.join(model_directory_path, 'model_gbdt.joblib'))

    yield  # Mark as ready

    for dataset in X_test:
        X1, X2 = generate_features(dataset)

        fold_scores = [model.predict_proba(X1)[:, 1] for model in model_tabpfn]
        # Average across the fold models only (axis=0). Without an axis,
        # np.mean also averages across rows and returns a single scalar, which
        # cannot be indexed by X1.index once a dataset has more than one row.
        X = pd.DataFrame([np.mean(fold_scores, axis=0)]).T
        X.index = X1.index
        X = X.sort_index()
        X = X.add_prefix('col_')
        X = pd.concat([X2, X], axis=1)

        prediction = model_gbdt.predict_proba(X)[:, 1]
        yield prediction  # Send the prediction for the current dataset

In [13]:
# Run the local test harness on the train/infer functions defined above.
crunch.test(
    # Active here (the original note read "Uncomment to disable the train").
    # NOTE(review): the recorded run below still logs `command=train`, so
    # confirm what this flag actually controls.
    force_first_train=False,

    # Uncomment to disable the determinism check
    # no_determinism_check=True,
)

[32m23:13:11[0m [33mno forbidden library found[0m
[32m23:13:11[0m [33m[0m
[32m23:13:12[0m started
[32m23:13:12[0m running local test
[32m23:13:12[0m [33minternet access isn't restricted, no check will be done[0m
[32m23:13:12[0m 
[32m23:13:13[0m starting unstructured loop...
[32m23:13:13[0m executing - command=train


data\X_train.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/X_train.parquet (204327238 bytes)
data\X_train.parquet: already exists, file length match
data\X_test.reduced.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/X_test.reduced.parquet (2380918 bytes)
data\X_test.reduced.parquet: already exists, file length match
data\y_train.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/y_train.parquet (61003 bytes)
data\y_train.parquet: already exists, file length match
data\y_test.reduced.parquet: download from https:crunchdao--competition--production.s3-accelerate.amazonaws.com/data-releases/146/y_test.reduced.parquet (2655 bytes)
data\y_test.reduced.parquet: already exists, file length match


[32m23:17:50[0m executing - command=infer
[32m23:19:03[0m checking determinism by executing the inference again with 30% of the data (tolerance: 1e-08)
[32m23:19:03[0m executing - command=infer
[32m23:19:25[0m determinism check: passed
[32m23:19:25[0m [33msave prediction - path=data\prediction.parquet[0m
[32m23:19:25[0m ended
[32m23:19:25[0m [33mduration - time=00:06:13[0m
[32m23:19:25[0m [33mmemory - before="713.8 MB" after="10.11 GB" consumed="9.39 GB"[0m
