In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import (
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

from src.load import load_data
from src.utils import save

%load_ext autoreload
%autoreload 2

In [28]:
"""
Inspired by Franck Zibi's model:
Level 0: Extract features from the series of data
Level 1: Base classification with GradientBoostingClassifier
Level 2: Residual estimation of the probabilities with an ensemble of models
    - For each category
        - GradientBoostingRegressor
        - RandomForestRegressor
        - KNeighborsRegressor
        - MLPRegressor
        - SVR
Level 3: Stacking of the residual estimations
    - For each category
        - StackingRegressor with a GradientBoostingRegressor as final estimator
Level 4: Nothing for now, but calibration on the test set could be done
"""

# Number of target categories: one residual model is built per category.
# Kept in a single constant so the two lists below cannot drift apart.
N_CLASSES = 24

# Fixed seed for every estimator that accepts one, so that re-running the
# notebook on a fresh kernel rebuilds the same models.
SEED = 42

# Level 1: base classifier producing per-class probabilities.
base_classifier = GradientBoostingClassifier(random_state=SEED)

# Level 2: for each category, a fresh list of (name, estimator) pairs.
# A new list with new estimator instances is created per category so that no
# estimator object is shared between two stacks.
# NOTE(review): KNN, MLP and SVR are sensitive to feature scale and are given
# the features as-is here -- confirm the features are scaled upstream.
regressors = [
    [
        ("GradientBoostingRegressor", GradientBoostingRegressor(random_state=SEED)),
        ("RandomForestRegressor", RandomForestRegressor(random_state=SEED)),
        ("KNeighborsRegressor", KNeighborsRegressor()),
        ("MLPRegressor", MLPRegressor(random_state=SEED)),
        ("SVR", SVR()),
    ]
    for _ in range(N_CLASSES)
]

# Level 3: one StackingRegressor per category, combining that category's
# level-2 estimators through a GradientBoostingRegressor final estimator.
stacking_regressors = [
    StackingRegressor(
        estimators=estimators,
        final_estimator=GradientBoostingRegressor(random_state=SEED),
    )
    for estimators in regressors
]

In [15]:
def load_features(model_idx):
    """Read the feature tables of one feature model plus the training labels.

    Parameters
    ----------
    model_idx
        Identifier interpolated into the ``features/model {model_idx}/``
        directory name.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test)`` as NumPy arrays. The first column of
        both feature tables is dropped, and only the second column of the
        label table is kept.
        NOTE(review): the dropped first column is presumably an identifier --
        confirm against the parquet schema.
    """
    train_frame = pd.read_parquet(f"features/model {model_idx}/features_train.parquet")
    label_frame = pd.read_parquet("data/y_train.parquet")
    test_frame = pd.read_parquet(f"features/model {model_idx}/features_test.parquet")

    return (
        train_frame.values[:, 1:],
        label_frame.values[:, 1],
        test_frame.values[:, 1:],
    )

In [26]:
# Which feature set (features/model {model_idx}/) to train on.
model_idx = 1

# Use model_idx rather than a hard-coded literal, so changing the line above
# actually changes which features are loaded. The unsplit arrays keep their
# own names instead of being overwritten by the split below.
X_full, y_full, X_test = load_features(model_idx)

# Hold out 10% of the training data for validation. The fixed random_state
# makes the split (and every number derived from it) reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full, test_size=0.1, random_state=42
)

print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(144720, 44) (16080, 44) (144720,) (16080,)


In [29]:
# There is one stacking regressor per class, so the class count is taken from
# the model definition rather than from the largest label that happens to be
# present in a given split (which can differ between train and validation).
n_classes = len(stacking_regressors)

# One-hot encode the integer labels; both splits get the same width.
one_hot_y_val = np.eye(n_classes)[y_val]
one_hot_y_train = np.eye(n_classes)[y_train]

print("Training base classifier...")
base_classifier.fit(X_train, y_train)

# predict_proba orders its columns by base_classifier.classes_. The residuals
# below subtract it column-by-column from the one-hot matrix, which is only
# meaningful when column k corresponds to label k.
if not np.array_equal(base_classifier.classes_, np.arange(n_classes)):
    raise ValueError(
        f"Expected training labels 0..{n_classes - 1}, "
        f"got classes {base_classifier.classes_!r}"
    )

# Residual of the base classifier on the training set, one column per class.
# NOTE(review): these are in-sample predictions of the base classifier;
# consider out-of-fold predictions if the residuals turn out too optimistic.
y_train_residuals = one_hot_y_train - base_classifier.predict_proba(X_train)

# Fit regressor k on the residual of class k.
for k, stacking_regressor in enumerate(stacking_regressors):
    print(f"Training regressor {k}...")
    stacking_regressor.fit(X_train, y_train_residuals[:, k])

Training base classifier...


KeyboardInterrupt: 

In [None]:
# Persist the fitted models so they can be reloaded without retraining.
run_dir = Path("runs/simple")
# open(..., "wb") does not create missing parent directories, so make sure the
# output directory exists (no-op when it is already there).
run_dir.mkdir(parents=True, exist_ok=True)

with open(run_dir / "base_classifier.pkl", "wb") as f:
    pickle.dump(base_classifier, f)

# One file per class-specific stacking regressor, indexed by class.
for k, stacking_regressor in enumerate(stacking_regressors):
    with open(run_dir / f"stacking_regressor_{k}.pkl", "wb") as f:
        pickle.dump(stacking_regressor, f)

In [None]:
def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Restore the fitted base classifier, then each per-class stacking regressor
# into its slot of the existing list.
# NOTE: pickle executes code on load -- only use files written by this notebook.
base_classifier = _unpickle("runs/simple/base_classifier.pkl")

for k in range(24):
    stacking_regressors[k] = _unpickle(f"runs/simple/stacking_regressor_{k}.pkl")

In [None]:
# Level 1: per-class probabilities from the base classifier.
y_val_base_pred = base_classifier.predict_proba(X_val)

# Column k holds the residual predicted by stacking regressor k.
y_val_residuals = np.column_stack(
    [stacking_regressors[k].predict(X_val) for k in range(24)]
)

# Corrected scores, reduced to the index of the highest-scoring column per row.
y_val_pred = np.argmax(y_val_base_pred + y_val_residuals, axis=1)

print("Validation accuracy:", np.mean(y_val_pred == y_val))

In [None]:
# Level 1: per-class probabilities from the base classifier.
y_test_base_pred = base_classifier.predict_proba(X_test)

# Column k holds the residual predicted by stacking regressor k.
y_test_residuals = np.column_stack(
    [stacking_regressors[k].predict(X_test) for k in range(24)]
)

# Corrected scores, reduced to the index of the highest-scoring column per row.
y_test_pred = np.argmax(y_test_base_pred + y_test_residuals, axis=1)

# Hand the predicted class indices to the project's save helper.
save(y_test_pred)