In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules
# before each cell executes, so edits to project code (e.g. under src/)
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Change the current working directory to the root of the project so that
# relative data paths ("data/...") and project imports ("src....") resolve.
import os

# Only step up one level when the project's "src" package is not already
# visible from the working directory. The previous unconditional chdir was
# not idempotent: every re-run of this cell climbed one more directory.
# NOTE(review): assumes the notebook lives exactly one level below the
# project root and that the root contains "src" — confirm the repo layout.
if not os.path.isdir("src"):
    os.chdir(os.path.dirname(os.getcwd()))

# Purpose
- The purpose of this notebook is to build an sklearn pipeline that preprocesses the data, and to tune the model's hyper-parameters with Optuna.

In [2]:
import warnings
import gc
from IPython.display import display

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from sklearn.metrics import brier_score_loss

from src.pipeline_transformers import ColumnDropperTransformer


# Load Data

In [6]:
# Row cap forwarded to every read_csv call below; None reads each file in full.
nrows = None

# Feature matrices for the training and validation splits.
X_train = pd.read_csv("data/transformed/X_train.csv", nrows=nrows)
X_val = pd.read_csv("data/transformed/X_val.csv", nrows=nrows)

# Targets: load each file and rename the column headed "0" to "activity".
y_train = pd.read_csv("data/transformed/y_train.csv", nrows=nrows).rename(
    columns={"0": "activity"}
)
y_val = pd.read_csv("data/transformed/y_val.csv", nrows=nrows).rename(
    columns={"0": "activity"}
)

# Preview the first rows of the validation features and targets.
display(X_val.head(), y_val.head())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,551,552,553,554,555,556,557,558,559,560
0,0.283203,-0.047024,-0.168986,0.384949,0.176898,-0.310332,0.381757,0.122611,-0.332984,0.465563,...,0.454181,-0.730593,-0.932568,-0.034924,0.558036,0.258975,-0.854858,-0.78433,0.22296,-0.066506
1,0.256904,-0.036623,-0.133856,0.201409,-0.154142,0.344183,0.092771,-0.182114,0.291899,0.586004,...,0.190745,-0.032958,-0.338095,0.017986,-0.475545,0.942947,-0.567147,-0.665156,0.178975,0.243362
2,0.291316,-0.001065,-0.072461,-0.336609,-0.279162,-0.303323,-0.381421,-0.253026,-0.337381,0.07443,...,0.299767,-0.351997,-0.698254,-0.044663,-0.551806,-0.680774,0.161405,-0.857202,0.192929,0.046716
3,0.276116,-0.010909,-0.102886,-0.992196,-0.982169,-0.981127,-0.992838,-0.981366,-0.979864,-0.931924,...,0.335848,-0.623358,-0.906098,-0.177275,-0.309304,0.176515,-0.143818,-0.625215,-0.106678,-0.147469
4,0.256382,0.000428,-0.113664,0.075014,0.046502,-0.369482,-0.024794,-0.085337,-0.385018,0.56223,...,-0.125979,0.02063,-0.316113,0.347143,0.590475,0.831084,0.465453,-0.860617,0.150598,-0.067109


Unnamed: 0,activity
0,3
1,3
2,1
3,4
4,3


In [8]:
# Tally the training feature columns by dtype (shown as the cell output)
X_train.dtypes.value_counts()

float64    561
dtype: int64

# Build pipeline

In [9]:
# Preprocessing pipeline with a single step: standardise every feature column.
pipeline = make_pipeline(StandardScaler())

# Hyper-parameter Tuning

In [None]:
# Define hyper-parameters & objective function
def _fold_brier_score(y_true, proba, classes) -> float:
    '''Brier score of one validation fold, for binary or multi-class targets.

    Args:
        y_true: true labels of the fold (1-D, or a single-column frame).
        proba: class probabilities as returned by ``predict_proba``,
            one column per class.
        classes: class labels in the same order as the columns of ``proba``.

    Returns:
        For two classes, ``brier_score_loss`` on the second probability
        column (the original computation). Otherwise the mean over samples
        of the squared distance between the probability row and the one-hot
        encoding of the true label.
    '''
    proba = np.asarray(proba)
    y_true = np.asarray(y_true).ravel()

    if proba.ndim == 2 and proba.shape[1] == 2:
        # Binary case: keep the exact original scoring.
        return brier_score_loss(y_true, proba[:, 1])

    # Multi-class case: one-hot encode the labels against the class order.
    one_hot = (y_true[:, None] == np.asarray(classes)[None, :]).astype(float)
    return float(np.mean(np.sum((proba - one_hot) ** 2, axis=1)))


# define objective function
def objective(trial: optuna.trial.Trial) -> float:
    '''Cross-validated Brier score for one set of sampled hyper-parameters.

    Samples hyper-parameters from ``trial``, fits a fresh model on each of
    5 KFold splits of the notebook-level ``X_train`` / ``y_train`` and
    returns the mean validation Brier score (lower is better).

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean Brier score over the 5 validation folds.
    '''

    # define hyperparameters
    # NOTE(review): these names look like LightGBM parameters — confirm
    # against whatever custom_pipeline builds.
    params = {
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),
    }

    k_fold = KFold(n_splits=5)
    fold_scores = []
    for fit_index, holdout_index in k_fold.split(X_train):
        # split data; fold-local names avoid shadowing the notebook-level
        # X_val / y_val hold-out set and the notebook-level `pipeline`
        X_fit, X_holdout = X_train.iloc[fit_index], X_train.iloc[holdout_index]
        y_fit, y_holdout = y_train.iloc[fit_index], y_train.iloc[holdout_index]

        # create model
        # NOTE(review): custom_pipeline is not defined or imported in the
        # visible notebook cells — it must be provided before this runs.
        model = custom_pipeline(**params)

        # fit model
        model.fit(X_fit, y_fit)

        # compute validation error on the full probability matrix so that
        # multi-class targets are scored too (previously only column 1 was
        # used, which is only meaningful for a binary target)
        proba = model.predict_proba(X_holdout)
        classes = getattr(model, "classes_", None)
        if classes is None:
            # Fallback: sorted unique training labels of this fold.
            classes = np.unique(y_fit)

        fold_scores.append(_fold_brier_score(y_holdout, proba, classes))

    return float(np.mean(fold_scores))

In [None]:
# Number of Optuna trials and the seed that makes the search reproducible.
N_TRIALS = 5
SEED = 42

# optuna study
# TPE is Optuna's default single-objective sampler; passing it explicitly
# only pins the seed so repeated runs sample the same hyper-parameters.
study = optuna.create_study(
    direction='minimize',
    study_name='calibrated_lgbm',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)

# Silence warnings only while the trials run, instead of disabling them
# for the rest of the kernel session.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    study.optimize(objective, n_trials=N_TRIALS)

In [None]:
# Retrieve the parameter set of the study's best trial and print it
# (the "=" specifier in the f-string prints the variable name with its value)
best_params = study.best_trial.params
print(f'{best_params=}')