In [75]:
import sys
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
import xgboost as xgb
import lightgbm as lgbm
import statistics
from sklearn.linear_model import Ridge
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer
from functools import partial
from openfe import OpenFE, transform
import warnings

warnings.filterwarnings('ignore')


In [76]:
# Make the local utility package directory importable (presumably where
# train_tabular_utils, cv_split_utils, enums and data_utils live — confirm).
# The location can be overridden with the ML_UTILS_PATH environment variable so
# the notebook is not tied to one machine; the default is the original checkout.
ML_UTILS_PATH = os.path.abspath(
    os.environ.get("ML_UTILS_PATH", "/home/bk_anupam/code/ML/ML_UTILS/")
)
# Re-running this cell must not stack duplicate entries on sys.path.
if ML_UTILS_PATH not in sys.path:
    sys.path.append(ML_UTILS_PATH)

In [77]:
import train_tabular_utils as tt
import cv_split_utils
import enums
import data_utils

In [78]:
class Config:
    """Central switchboard for this notebook's run.

    Every value is a plain class attribute, so cells read them as
    ``Config.<NAME>`` without instantiating the class.
    """

    # --- environment / reproducibility -------------------------------------
    # "KAGGLE" switches the data paths configured below this class; any other
    # value keeps the local defaults.
    RUN_MODE = "LOCAL"
    RANDOM_SEED = 42

    # --- problem definition -------------------------------------------------
    TARGET_COL_NAME = "FloodProbability"
    METRIC = enums.Metrics.R2
    NUM_FOLDS = 5
    # Not referenced in the visible cells; presumably consumed by the training
    # helpers — confirm.
    SCALER = enums.Scaler.StandardScaler

    # --- experiment knobs (changed often between runs) ----------------------
    MODEL_TYPE = enums.ModelName.Ridge
    NUM_TUNING_TRIALS = 25
    TUNE_ON_SINGLE_FOLD = True
    TRAIN_SINGLE_FOLD = False
    # When True, OpenFE features are generated and written to disk in this run.
    GENERATE_AUTO_FEATURES = False
    PERSIST_MODEL = False
    TRANSFORM_TARGET = False

# Columns of the training frame that are never used as model inputs.
COLS_TO_LEAVE = ["FloodProbability", "kfold"]
# os.cpu_count() returns None when the count cannot be determined; fall back to
# a single worker so that n_jobs always receives an int.
CPU_COUNT = os.cpu_count() or 1

# Local defaults; overridden below when running on Kaggle.
DATA_READPATH = "./data/"
DATA_WRITEPATH = "./output/"
SUBMISSION_FILEPATH = DATA_READPATH
if Config.RUN_MODE == "KAGGLE":
    if Config.GENERATE_AUTO_FEATURES:
        # Features are generated in this run, so read the raw competition data.
        # The FloodProbability target belongs to Playground Series S4E5
        # (the previous "s4e4" path pointed at the abalone competition).
        DATA_READPATH = "/kaggle/input/playground-series-s4e5/"
        SUBMISSION_FILEPATH = DATA_READPATH
    else:
        # If we are not generating features, we are using already generated features.
        # NOTE(review): "abalone-openfe" looks like a leftover from the abalone
        # notebook — point this at the dataset holding the flood OpenFE features.
        DATA_READPATH = "/kaggle/input/abalone-openfe/"
        SUBMISSION_FILEPATH = "/kaggle/input/playground-series-s4e5/"
    DATA_WRITEPATH = "/kaggle/working/"

In [79]:
# Holder for tuned hyperparameters. None means nothing has been tuned or
# supplied yet; the (currently commented-out) tuning cell only runs when this
# is None and stores its result here.
tuned_model_params = None

In [80]:
# Read the train/test files from the configured input directory instead of a
# hard-coded "./data/" so that Config.RUN_MODE actually controls where data is
# loaded from (DATA_READPATH is "./data/" in LOCAL mode, so local behaviour is
# unchanged). The 'id' column becomes the index of both frames.
df_train = pd.read_csv(DATA_READPATH + "train.csv", index_col='id')
df_test = pd.read_csv(DATA_READPATH + "test.csv", index_col='id')
# keep a copy of original train and test data for later use
df_train_orig = df_train.copy()
df_test_orig = df_test.copy()

In [81]:
# Columns handed to OpenFE as base features: every column of the test frame
# (presumably the test file carries no target column — confirm).
feature_cols_for_fe = df_test.columns.to_list()

In [82]:
def generate_new_features(df_train, df_test, feature_cols, NUM_NEW_FEATURES=10):
    """Augment train and test features with OpenFE-generated features.

    Args:
        df_train: training frame; must contain ``feature_cols`` and the target
            column named by ``Config.TARGET_COL_NAME``.
        df_test: test frame; must contain ``feature_cols``.
        feature_cols: names of the base feature columns given to OpenFE.
        NUM_NEW_FEATURES: how many entries to take from the front of OpenFE's
            ``new_features_list``.

    Returns:
        The ``(train_X, test_X)`` pair produced by ``openfe.transform``.
    """
    base_train = df_train[feature_cols]
    base_test = df_test[feature_cols]
    target = df_train[Config.TARGET_COL_NAME]

    feature_generator = OpenFE()
    # Fitting populates feature_generator.new_features_list, which is read
    # below; the return value of fit() itself is not needed.
    feature_generator.fit(data=base_train, label=target, n_jobs=CPU_COUNT, verbose=False)

    # Only the leading NUM_NEW_FEATURES candidates are materialised, to see
    # how they influence the model performance.
    selected_features = feature_generator.new_features_list[:NUM_NEW_FEATURES]
    augmented_train, augmented_test = transform(
        base_train, base_test, selected_features, n_jobs=CPU_COUNT
    )
    return augmented_train, augmented_test

In [83]:
# Optionally replace the raw frames with OpenFE-augmented ones and persist them.
if Config.GENERATE_AUTO_FEATURES:
    df_train, df_test = generate_new_features(df_train, df_test, feature_cols_for_fe)
    df_train_labels = df_train_orig[[Config.TARGET_COL_NAME]]
    # Add the label data to the dataframe (generate_new_features returns
    # features only); concat along axis=1 aligns rows on the index.
    df_train = pd.concat([df_train, df_train_labels], axis=1)
    # to_csv does not create missing directories; make sure the output folder
    # exists so the expensive feature generation is not lost to an OSError.
    os.makedirs(DATA_WRITEPATH, exist_ok=True)
    # save the new train and test data with openfe features to csv files for later use
    # NOTE(review): index=False drops the index, so if it still holds 'id' these
    # files cannot be re-read with index_col='id' — confirm this is intended.
    df_train.to_csv(DATA_WRITEPATH + "train_openfe.csv", index=False)
    df_test.to_csv(DATA_WRITEPATH + "test_openfe.csv", index=False)

In [84]:
# Feature columns after the optional OpenFE step: every column of the test
# frame. This list drives the one-hot encoding in the next cell.
feature_cols = df_test.columns.to_list()

In [85]:
# Assign cross-validation folds using the configured seed and fold count.
# Presumably this adds the "kfold" column listed in COLS_TO_LEAVE — confirm in
# cv_split_utils.
df_train = cv_split_utils.kfold_dataframe(df_train, random_state=Config.RANDOM_SEED, num_folds=Config.NUM_FOLDS)

In [86]:
# One-hot encode every feature column. Columns not listed in feature_cols
# (target, kfold) pass through get_dummies unchanged.
df_train = pd.get_dummies(df_train, columns=feature_cols)
df_test = pd.get_dummies(df_test, columns=feature_cols)
# Encoding the two frames independently produces different dummy columns
# whenever a feature value occurs in only one of them. Align the test frame to
# the train feature columns: levels missing from test are added as all-zero
# columns, levels unseen in train are dropped, and the column order matches.
encoded_feature_cols = [col for col in df_train.columns if col not in COLS_TO_LEAVE]
df_test = df_test.reindex(columns=encoded_feature_cols, fill_value=0)
# Give the filled-in columns the same dtype as their train counterparts so the
# frame does not end up with mixed bool/int columns.
df_test = df_test.astype(df_train[encoded_feature_cols].dtypes.to_dict())

In [87]:
# After encoding, the model inputs are all training columns except the ones
# listed in COLS_TO_LEAVE (target and fold id).
feature_cols = list(
    filter(lambda col: col not in COLS_TO_LEAVE, df_train.columns.to_list())
)
print(f"len(feature_cols)={len(feature_cols)}")

len(feature_cols)=355


In [88]:
def get_model_tuning_params(trial, model_name):
    """Sample one set of hyperparameters for ``model_name`` from ``trial``.

    Args:
        trial: an Optuna trial (anything exposing ``suggest_float``,
            ``suggest_int`` and ``suggest_categorical``).
        model_name: one of ``enums.ModelName.Ridge``, ``Lasso`` or
            ``RandomForest``.

    Returns:
        dict mapping hyperparameter name to the value suggested by the trial.

    Raises:
        ValueError: if no search space is defined for ``model_name``.
            Previously the function fell through and returned ``None``, so
            every trial silently ran with identical parameters.
    """
    # Ridge and Lasso share the same search space: alpha, log-uniform over
    # eight orders of magnitude.
    if model_name in (enums.ModelName.Ridge, enums.ModelName.Lasso):
        return {
            "alpha": trial.suggest_float("alpha", 1e-4, 1e4, log=True)
        }
    if model_name == enums.ModelName.RandomForest:
        return {
            "n_estimators": trial.suggest_int("n_estimators", 400, 3000, step=100),
            "max_depth": trial.suggest_int("max_depth", 10, 30),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 2, 16),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 16),
            "max_features": trial.suggest_categorical("max_features", ["log2", "sqrt", None])
        }
    raise ValueError(f"No tuning search space defined for model: {model_name!r}")

In [89]:
def hyperparams_tuning_objective(trial, model_name, df_train,
                                 feature_cols, metric, target_col_name, single_fold=False,
                                 num_folds=5, val_preds_col="val_preds"):
    """Optuna objective: train with trial-sampled parameters and score the run.

    Hyperparameters come from ``get_model_tuning_params``; training is
    delegated to ``tt.run_training`` with printing suppressed and
    ``transform_target`` taken from ``Config.TRANSFORM_TARGET``.

    Returns:
        The mean of the first element of every per-fold result returned by
        ``tt.run_training`` (presumably the fold's metric value — confirm in
        train_tabular_utils).
    """
    sampled_params = get_model_tuning_params(trial, model_name)
    training_kwargs = dict(
        model_name=model_name,
        df_train=df_train,
        target_col_name=target_col_name,
        feature_col_names=feature_cols,
        metric=metric,
        num_folds=num_folds,
        model_params=sampled_params,
        val_preds_col=val_preds_col,
        single_fold=single_fold,
        suppress_print=True,
        transform_target=Config.TRANSFORM_TARGET,
    )
    # The second return value (validation predictions) is not needed here.
    per_fold_results, _ = tt.run_training(**training_kwargs)
    per_fold_scores = [fold_result[0] for fold_result in per_fold_results]
    return statistics.mean(per_fold_scores)

In [90]:
def tune_model_params(study_name, study_direction, num_trials, model_name,
                      df_train, feature_cols, metric, target_col_name,
                      single_fold=False, num_folds=5, val_preds_col="val_preds",
                      seed=None):
    """Run an Optuna study and return the best trial's hyperparameters.

    Args:
        study_name: name given to the Optuna study.
        study_direction: "maximize" or "minimize", forwarded to Optuna.
        num_trials: number of trials to run.
        model_name, df_train, feature_cols, metric, target_col_name,
        single_fold, num_folds, val_preds_col: forwarded unchanged to
            ``hyperparams_tuning_objective``.
        seed: optional integer. When given, the study uses a ``TPESampler``
            seeded with it so repeated runs propose the same trials. The
            default (None) keeps Optuna's default, unseeded sampler.

    Returns:
        dict of the best trial's parameters (``study.best_trial.params``).
    """
    # Bind everything except the trial, which Optuna supplies on each call.
    objective = partial(
        hyperparams_tuning_objective,
        model_name=model_name,
        df_train=df_train,
        feature_cols=feature_cols,
        metric=metric,
        target_col_name=target_col_name,
        single_fold=single_fold,
        num_folds=num_folds,
        val_preds_col=val_preds_col
    )
    # sampler=None lets Optuna pick its default sampler, as before.
    sampler = optuna.samplers.TPESampler(seed=seed) if seed is not None else None
    study = optuna.create_study(direction=study_direction, study_name=study_name, sampler=sampler)
    study.optimize(objective, n_trials=num_trials)
    best_trial = study.best_trial
    print(f"Best trial: number = {best_trial.number}, value = {best_trial.value}, params = {best_trial.params}")
    return best_trial.params

In [91]:
# if tuned_model_params is None:
#     tuned_model_params = tune_model_params(
#                             study_name=Config.MODEL_TYPE + "_ModelTuning", 
#                             study_direction="maximize",
#                             num_trials=Config.NUM_TUNING_TRIALS,
#                             model_name=Config.MODEL_TYPE,
#                             df_train=df_train,
#                             feature_cols=feature_cols,
#                             metric=Config.METRIC,
#                             target_col_name=Config.TARGET_COL_NAME,
#                             single_fold=Config.TUNE_ON_SINGLE_FOLD,
#                             num_folds=Config.NUM_FOLDS
#                     )

In [92]:
# Train with the tuned hyperparameters when they are available. While tuning is
# disabled tuned_model_params stays None, so the value passed to training is
# None exactly as before; once tuned (or pasted-in) parameters exist they are
# no longer silently ignored.
model_params = tuned_model_params

In [93]:
# Training run on the fold-annotated training frame, driven by Config.
# The result is kept in fold_metrics_model; its exact structure is defined by
# train_tabular_utils.train_model (presumably one entry per fold — confirm).
fold_metrics_model = tt.train_model(
    df=df_train,
    model_name=Config.MODEL_TYPE,
    model_params=model_params,
    feature_col_names=feature_cols,
    target_col_name=Config.TARGET_COL_NAME,
    metric=Config.METRIC,
    num_folds=Config.NUM_FOLDS,
    single_fold=Config.TRAIN_SINGLE_FOLD,
    persist_model=Config.PERSIST_MODEL,
    output_path=DATA_WRITEPATH,
    transform_target=Config.TRANSFORM_TARGET,
)

training Lasso
Fold 0 - Lasso - R2 : -2.88051436703185e-06
Fold 1 - Lasso - R2 : -2.0433274774411814e-06
Fold 2 - Lasso - R2 : -5.270450151861894e-07
Fold 3 - Lasso - R2 : -6.601710500486746e-07
Fold 4 - Lasso - R2 : -9.280133167699489e-06


KeyboardInterrupt: 