In [1]:
!pip install -q openfe

In [2]:
import sys
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
import xgboost as xgb
import lightgbm as lgbm
import statistics
from sklearn.linear_model import Ridge
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, SplineTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.base import clone
from functools import partial
from joblib import dump
from scipy.stats import skew, kurtosis
from openfe import OpenFE, transform
import warnings

warnings.filterwarnings('ignore')


In [3]:
# sys.path.append(os.path.abspath("/home/bk_anupam/code/ML/ML_UTILS/"))

In [6]:
import train_tabular_utils as tt
import cv_split_utils
import enums
import data_utils

In [7]:
class Config:
    # Run-level switches for this notebook; read as class attributes.
    # "KAGGLE" makes the path setup below use the Kaggle input/working
    # directories; any other value keeps the local ./data/ and ./output/ paths.
    RUN_MODE = "KAGGLE"
    # Seed forwarded to the static model parameters and to the k-fold split.
    RANDOM_SEED = 1
    # Number of CV folds, passed to the fold split, training and test prediction.
    NUM_FOLDS = 5
    # Name of the column being predicted.
    TARGET_COL_NAME = "FloodProbability"
    # NOTE(review): not referenced in the visible cells (the preprocessor
    # instantiates sklearn's StandardScaler directly) - confirm it is still needed.
    SCALER = enums.Scaler.StandardScaler
    # Metric handed to the training and scoring utilities.
    METRIC = enums.Metrics.R2
    # The settings below are the ones typically changed from run to run.
    MODEL_TYPE = enums.ModelName.Ridge
    # Number of Optuna trials run by the tuning cell.
    NUM_TUNING_TRIALS = 2
    # Passed as `single_fold` to the tuning run.
    TUNE_ON_SINGLE_FOLD = True
    # Passed as `single_fold` to the final training run.
    TRAIN_SINGLE_FOLD = False
    # Only consulted by the path setup below, where both outcomes currently
    # resolve to the same directories.
    GENERATE_AUTO_FEATURES = False
    # Forwarded to tt.persist as `persist_model`.
    PERSIST_MODEL = False
    # NOTE(review): not referenced in the visible cells - confirm usage.
    TRANSFORM_TARGET = False

# Columns excluded when the model feature list is built: the target and the
# `kfold` column (presumably the fold assignment - confirm in cv_split_utils).
COLS_TO_LEAVE = ["FloodProbability", "kfold"]
CPU_COUNT = os.cpu_count()

# Input/output locations. On Kaggle the competition dataset is read regardless
# of Config.GENERATE_AUTO_FEATURES (no separate pre-generated feature dataset
# is wired in), and the sample submission lives next to the input data in
# both modes.
if Config.RUN_MODE == "KAGGLE":
    DATA_READPATH = "/kaggle/input/playground-series-s4e5/"
    DATA_WRITEPATH = "/kaggle/working/"
else:
    DATA_READPATH = "./data/"
    DATA_WRITEPATH = "./output/"
SUBMISSION_FILEPATH = DATA_READPATH

In [8]:
# Fixed (non-tuned) parameters per model type. They are merged with the
# Optuna-suggested / tuned parameters, which take precedence on key clashes.
model_static_params = {
    enums.ModelName.XGBoost: {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "seed": Config.RANDOM_SEED,
        "verbosity": 0,
    },
    enums.ModelName.LGBM: {
        "objective": "root_mean_squared_error",
        "metric": 'rmse',
        "verbosity": -1,    # <0: fatal, =0: error (warn), =1: info, >1: debug
        "boosting_type": "gbdt",
        # Honor the configured seed like every other stochastic model here;
        # without it LightGBM falls back to its built-in default seeds and
        # changing Config.RANDOM_SEED has no effect on this learner.
        "random_state": Config.RANDOM_SEED,
    },
    enums.ModelName.CatBoost: {
        "objective": "RMSE",
        "verbose": 0,
        "random_seed": Config.RANDOM_SEED,
        "eval_metric": "RMSE",
        'grow_policy':  'Lossguide',
        'bootstrap_type': 'Poisson',
        # Hard-coded GPU training: this entry requires a GPU runtime.
        'task_type': 'GPU'
    },
    enums.ModelName.RandomForest: {
        "random_state": Config.RANDOM_SEED,
        "n_jobs": -1
    },
    enums.ModelName.Ridge: {
        "random_state": Config.RANDOM_SEED
    }
}

In [9]:
# No pre-tuned hyperparameters are supplied here, so the tuning cell further
# down (guarded by `tuned_model_params is None`) runs an Optuna search.
# Assign a parameter dict instead of None to skip that search.
tuned_model_params = None

In [10]:
# Load the train and test tables from DATA_READPATH (local folder or Kaggle
# input directory, depending on Config.RUN_MODE), indexed by the `id` column.
df_train = pd.read_csv(f"{DATA_READPATH}train.csv", index_col="id")
df_test = pd.read_csv(f"{DATA_READPATH}test.csv", index_col="id")

In [11]:
# Raw input columns used as the basis for the row-wise engineered features.
# Taken from the test frame, which is read from test.csv without the
# engineered columns at this point.
feature_cols_for_fe = list(df_test.columns)

In [12]:
def compute_skew_kurtosis(matrix):
    """Compute per-row skewness and kurtosis of a 2-D array.

    Both statistics are computed with scipy's default settings.

    Args:
        matrix: 2-D array-like with one sample per row.

    Returns:
        Tuple ``(skewness, kurt)`` of 1-D arrays, one value per row.
    """
    skewness = skew(matrix, axis=1)
    kurt = kurtosis(matrix, axis=1)
    return skewness, kurt


def create_features(df, feature_cols):
    """Return a copy of ``df`` extended with row-wise summary features.

    Added columns, in order: ``f_sum``, ``f_mean``, ``f_median``, ``f_std``,
    ``f_min``, ``f_max``, ``f_skew``, ``f_kurtosis``, ``f_quantile_25``,
    ``f_quantile_50``, ``f_quantile_75`` and ``sort_0`` .. ``sort_{k-1}``
    (the values of ``feature_cols`` sorted ascending within each row).

    The input frame is left untouched, so re-running the calling cell or
    reusing the raw frame elsewhere cannot pick up half-engineered state.

    Args:
        df: DataFrame containing at least the columns in ``feature_cols``.
        feature_cols: List of numeric column names to aggregate per row.

    Returns:
        A new DataFrame with the original columns followed by the new ones.
    """
    # Slice the source columns once; every statistic is derived from this view
    # instead of re-selecting the columns for each feature.
    base = df[feature_cols]
    values = base.to_numpy()

    out = df.copy()
    out["f_sum"] = base.sum(axis=1)
    out["f_mean"] = base.mean(axis=1)
    out["f_median"] = base.median(axis=1)
    out["f_std"] = base.std(axis=1)
    out["f_min"] = base.min(axis=1)
    out["f_max"] = base.max(axis=1)

    skewness, kurt = compute_skew_kurtosis(values)
    out["f_skew"] = skewness
    out["f_kurtosis"] = kurt

    # Row-wise quartiles (the 50% quantile duplicates f_median; kept so the
    # feature set stays unchanged).
    for q in (0.25, 0.5, 0.75):
        out[f"f_quantile_{int(q*100)}"] = base.quantile(q=q, axis=1)

    # Order statistics: the row's values sorted ascending.
    sorted_features = [f"sort_{i}" for i in range(len(feature_cols))]
    out[sorted_features] = np.sort(values, axis=1)
    return out

In [13]:
# Add the same engineered columns to both splits, always derived from the
# raw input columns captured in feature_cols_for_fe.
df_train, df_test = (
    create_features(frame, feature_cols_for_fe) for frame in (df_train, df_test)
)

In [14]:
# Preview the first rows of the training frame, including the engineered columns.
df_train.head()

Unnamed: 0_level_0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,...,sort_10,sort_11,sort_12,sort_13,sort_14,sort_15,sort_16,sort_17,sort_18,sort_19
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,5,8,5,8,6,4,4,3,3,4,...,5,5,5,5,5,6,7,7,8,8
1,6,7,4,4,8,8,3,5,4,6,...,4,5,5,6,6,7,7,8,8,9
2,6,5,6,7,3,7,1,5,4,5,...,5,6,6,6,6,7,7,7,7,8
3,3,4,6,5,4,8,4,7,6,8,...,5,5,6,6,6,7,7,7,8,8
4,5,3,2,6,4,4,3,3,3,3,...,3,4,4,4,5,5,5,6,6,6


In [15]:
# Model inputs: every column of the training frame except those listed in
# COLS_TO_LEAVE.
feature_cols = [col for col in df_train.columns if col not in COLS_TO_LEAVE]
print(f"len(feature_cols)={len(feature_cols)}")

len(feature_cols)=51


In [16]:
scaler = StandardScaler()
# Collect every distinct f_sum value seen in either split so the one-hot
# encoding has the same columns for train and test.
# scikit-learn requires numeric `categories` to be sorted and raises a
# ValueError at fit time otherwise; a Python set has no guaranteed iteration
# order, so sort explicitly instead of relying on it.
f_sum_categories = sorted(
    set(df_test['f_sum'].unique().tolist() + df_train['f_sum'].unique().tolist())
)
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore", categories=[f_sum_categories])
# Alternative transformers kept available for experiments; not part of the
# ColumnTransformer below.
polynomial_features = PolynomialFeatures(degree=2, include_bias=False)
spline_transformer = SplineTransformer(n_knots=5, degree=3, include_bias=False)
# Standardize all feature columns and, in addition, one-hot encode f_sum
# (f_sum is therefore present both scaled and as indicator columns).
# Columns not listed are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        #("poly", polynomial_features, feature_cols),
        ("scaler", scaler, feature_cols),
        ("onehot", onehot_encoder, ['f_sum']),
    ], remainder="passthrough"
)

In [17]:
def get_lgbm_tuning_params(trial):
    """Sample one LightGBM hyperparameter set from ``trial``.

    Args:
        trial: Optuna trial used to draw each parameter value.

    Returns:
        Dict made of the static LGBM parameters overlaid with the sampled
        values (sampled values win on duplicate keys).
    """
    search_space = {}
    search_space['learning_rate'] = trial.suggest_float('learning_rate', 1e-3, 1.0, log=True)
    search_space['n_estimators'] = trial.suggest_int('n_estimators', 100, 2000, step=25)
    search_space['max_depth'] = trial.suggest_int('max_depth', 4, 20)
    search_space['min_child_weight'] = trial.suggest_int('min_child_weight', 1, 10)
    # NOTE(review): LightGBM only applies `subsample` when a bagging frequency
    # (> 0) is also set; none is configured in this notebook - confirm whether
    # the training utility sets it.
    search_space['subsample'] = trial.suggest_float('subsample', 0.5, 1)
    search_space['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.5, 1)
    search_space['num_leaves'] = trial.suggest_int('num_leaves', 4, 256, step=4)
    search_space['reg_alpha'] = trial.suggest_float('reg_alpha', 0, 1)
    search_space['reg_lambda'] = trial.suggest_float('reg_lambda', 1, 100)
    search_space['early_stopping_rounds'] = trial.suggest_int('early_stopping_rounds', 50, 500, step=20)

    combined = dict(model_static_params[enums.ModelName.LGBM])
    combined.update(search_space)
    return combined

In [18]:
def get_catboost_tuning_params(trial):
    """Sample one CatBoost hyperparameter set from ``trial``.

    Args:
        trial: Optuna trial used to draw each parameter value.

    Returns:
        Dict made of the static CatBoost parameters overlaid with the sampled
        values (sampled values win on duplicate keys).
    """
    params_dynamic = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 5000, step=50),
        'max_depth': trial.suggest_int('max_depth', 4, 16),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        # comment colsample_bylevel for GPU training
        #'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1),
        'num_leaves': trial.suggest_int('num_leaves', 4, 256, step=4),
        'reg_lambda': trial.suggest_float('reg_lambda', 1, 100),
        # Same log-uniform range as before, expressed with suggest_float:
        # suggest_loguniform is deprecated in Optuna, and the other samplers
        # in this notebook already use suggest_float(..., log=True).
        'random_strength': trial.suggest_float('random_strength', 0.01, 10, log=True),
        'early_stopping_rounds': trial.suggest_int('early_stopping_rounds', 50, 500, step=20),
        'max_bin': trial.suggest_int('max_bin', 32, 255)
    }
    return {**model_static_params[enums.ModelName.CatBoost], **params_dynamic}

In [19]:
def get_xgb_tuning_params(trial):
    """Sample one XGBoost hyperparameter set from ``trial``.

    Args:
        trial: Optuna trial used to draw each parameter value.

    Returns:
        Dict made of the static XGBoost parameters overlaid with the sampled
        values (sampled values win on duplicate keys).
    """
    # Keyword arguments are evaluated left to right, so the parameters are
    # suggested in the order written here.
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 5000, step=100),
        learning_rate=trial.suggest_float('learning_rate', 1e-3, 1.0, log=True),
        max_depth=trial.suggest_int('max_depth', 4, 32),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
        gamma=trial.suggest_float('gamma', 0, 1),
        subsample=trial.suggest_float('subsample', 0.5, 1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1),
        reg_alpha=trial.suggest_float('reg_alpha', 0, 1),
        reg_lambda=trial.suggest_float('reg_lambda', 1, 100),
        early_stopping_rounds=trial.suggest_int('early_stopping_rounds', 100, 500, step=20),
    )
    result = dict(model_static_params[enums.ModelName.XGBoost])
    result.update(sampled)
    return result

In [20]:
def get_model_tuning_params(trial, model_name):
    """Return the Optuna-sampled parameter dict for ``model_name``.

    Args:
        trial: Optuna trial used to draw each parameter value.
        model_name: One of the ``enums.ModelName`` values.

    Returns:
        Parameter dict for the requested model, or ``None`` when the model
        name is not one of the handled types.
    """
    # Ridge and Lasso share the same single-parameter search space.
    if model_name == enums.ModelName.Ridge or model_name == enums.ModelName.Lasso:
        return {"alpha": trial.suggest_float("alpha", 1e-4, 1e4, log=True)}

    if model_name == enums.ModelName.RandomForest:
        forest_space = {}
        forest_space["n_estimators"] = trial.suggest_int("n_estimators", 400, 3000, step=100)
        forest_space["max_depth"] = trial.suggest_int("max_depth", 10, 30)
        forest_space["min_samples_leaf"] = trial.suggest_int("min_samples_leaf", 2, 16)
        forest_space["min_samples_split"] = trial.suggest_int("min_samples_split", 2, 16)
        forest_space["max_features"] = trial.suggest_categorical("max_features", ["log2", "sqrt", None])
        return forest_space

    # Gradient-boosting models delegate to their dedicated samplers, which
    # also merge in the static parameters.
    if model_name == enums.ModelName.CatBoost:
        return get_catboost_tuning_params(trial)
    elif model_name == enums.ModelName.LGBM:
        return get_lgbm_tuning_params(trial)
    elif model_name == enums.ModelName.XGBoost:
        return get_xgb_tuning_params(trial)
    return None

In [21]:
def hyperparams_tuning_objective(trial, model_name, preprocessor, df,
                                 feature_cols, metric, target_col_name, single_fold=False, num_folds=5):
    """Optuna objective: cross-validate one sampled parameter set.

    Samples parameters for ``model_name`` from ``trial``, runs
    ``tt.train_and_validate`` with them and averages the per-fold scores.

    Args:
        trial: Optuna trial supplying the parameter values.
        model_name: Model type to tune.
        preprocessor: Preprocessing transformer forwarded to training.
        df: Training frame forwarded to training.
        feature_cols: Names of the model input columns.
        metric: Metric forwarded to training.
        target_col_name: Name of the target column.
        single_fold: Forwarded to training as ``single_fold``.
        num_folds: Forwarded to training as ``num_folds``.

    Returns:
        Mean of the first element of each per-fold result entry
        (presumably the fold's metric value - confirm in train_tabular_utils).
    """
    sampled_params = get_model_tuning_params(trial, model_name)
    # Only the per-fold results are needed here; the other two return values
    # are ignored.
    per_fold_results, _, _ = tt.train_and_validate(
        model_name=model_name,
        model_params=sampled_params,
        preprocessor=preprocessor,
        df=df,
        feature_cols=feature_cols,
        target_col_name=target_col_name,
        metric=metric,
        single_fold=single_fold,
        num_folds=num_folds,
    )
    return statistics.mean([entry[0] for entry in per_fold_results])

In [22]:
def tune_model_params(study_name, study_direction, num_trials, model_name,
                      preprocessor, df,  feature_cols, metric, target_col_name,
                      single_fold=False, num_folds=5):
    """Run an Optuna study and return the best trial's sampled parameters.

    Args:
        study_name: Name given to the Optuna study.
        study_direction: Optimization direction passed to ``create_study``.
        num_trials: Number of trials to run.
        model_name: Model type to tune.
        preprocessor: Preprocessing transformer forwarded to the objective.
        df: Training frame forwarded to the objective.
        feature_cols: Names of the model input columns.
        metric: Metric forwarded to the objective.
        target_col_name: Name of the target column.
        single_fold: Forwarded to the objective as ``single_fold``.
        num_folds: Forwarded to the objective as ``num_folds``.

    Returns:
        ``params`` of the best trial, i.e. only the values suggested through
        the trial (static parameters are not included).
    """
    def objective(trial):
        # Bind everything except the trial, which Optuna supplies per call.
        return hyperparams_tuning_objective(
            trial,
            model_name=model_name,
            preprocessor=preprocessor,
            df=df,
            feature_cols=feature_cols,
            metric=metric,
            target_col_name=target_col_name,
            single_fold=single_fold,
            num_folds=num_folds,
        )

    study = optuna.create_study(direction=study_direction, study_name=study_name)
    study.optimize(objective, n_trials=num_trials)

    best_trial = study.best_trial
    print(f"Best trial: number = {best_trial.number}, value = {best_trial.value}, params = {best_trial.params}")
    return best_trial.params

In [23]:
# Rebind df_train to the frame returned by the project's fold-splitting helper,
# using the configured seed and number of folds.
# presumably this adds the `kfold` column that COLS_TO_LEAVE excludes from the
# features - confirm in cv_split_utils.
df_train = cv_split_utils.kfold_dataframe(df_train, random_state=Config.RANDOM_SEED, num_folds=Config.NUM_FOLDS)

In [24]:
# Run the Optuna search only when no tuned parameters were supplied earlier.
# The study direction is hard-coded to "maximize", which matches the
# configured R2 metric.
# NOTE(review): an error metric such as RMSE would need "minimize" here.
if tuned_model_params is None:
    # Optional: tune on a 10% sample of the training data to speed up the search.
    #df = df_train.sample(frac=0.1, random_state=Config.RANDOM_SEED)
    tuned_model_params = tune_model_params(
                            study_name=Config.MODEL_TYPE + "_ModelTuning", 
                            study_direction="maximize",
                            num_trials=Config.NUM_TUNING_TRIALS,
                            model_name=Config.MODEL_TYPE,
                            preprocessor=preprocessor,
                            df=df_train,
                            feature_cols=feature_cols,
                            metric=Config.METRIC,
                            target_col_name=Config.TARGET_COL_NAME,
                            single_fold=Config.TUNE_ON_SINGLE_FOLD,
                            num_folds=Config.NUM_FOLDS
                    )

[I 2024-06-02 05:58:40,075] A new study created in memory with name: Ridge_ModelTuning
[I 2024-06-02 05:58:45,429] Trial 0 finished with value: 0.8658584538557496 and parameters: {'alpha': 0.011437201267372486}. Best is trial 0 with value: 0.8658584538557496.
[I 2024-06-02 05:58:50,711] Trial 1 finished with value: 0.8658584385237204 and parameters: {'alpha': 0.00042004487049621456}. Best is trial 0 with value: 0.8658584538557496.


Best trial: number = 0, value = 0.8658584538557496, params = {'alpha': 0.011437201267372486}


In [25]:
# Final estimator parameters: the fixed per-model defaults overlaid with the
# tuned values (tuned values win on duplicate keys).
params_static = model_static_params.get(Config.MODEL_TYPE)
if params_static is None:
    # No fixed defaults are registered for this model type.
    model_params = tuned_model_params
elif tuned_model_params is None:
    # Keep the static defaults (e.g. the random seed) even when nothing was
    # tuned, instead of passing None to the trainer.
    model_params = dict(params_static)
else:
    model_params = {**params_static, **tuned_model_params}

In [26]:
# Final training/validation run with the merged parameters.
# Returns the per-fold results, the out-of-fold predictions and a
# preprocessor; `preprocessor` is rebound to the returned object, which is
# later handed to tt.get_test_preds (presumably the fitted instance - confirm
# in train_tabular_utils).
fold_metrics_model, df_oof_preds, preprocessor = tt.train_and_validate(
        model_name=Config.MODEL_TYPE,
        model_params=model_params,
        preprocessor=preprocessor,
        df=df_train,
        feature_cols=feature_cols,
        target_col_name=Config.TARGET_COL_NAME,
        metric=Config.METRIC,
        single_fold=Config.TRAIN_SINGLE_FOLD,
        num_folds=Config.NUM_FOLDS
)

In [27]:
# Report the cross-validation score from the per-fold results and the
# out-of-fold predictions (per-fold values, overall CV score, mean and std
# are printed, as shown in the cell output).
tt.get_cv_score(
        fold_metrics_model, 
        model_name=Config.MODEL_TYPE, 
        metric=Config.METRIC, 
        df_oof_preds=df_oof_preds, 
        target_col_name=Config.TARGET_COL_NAME
)

# Write the validation predictions under DATA_WRITEPATH; the fitted models
# are presumably only saved when Config.PERSIST_MODEL is True - confirm in
# train_tabular_utils.
tt.persist(
        model_name=Config.MODEL_TYPE, 
        fold_metrics_model=fold_metrics_model, 
        df_oof_preds=df_oof_preds, 
        persist_model=Config.PERSIST_MODEL, 
        output_path=DATA_WRITEPATH
)

Fold 0 - Ridge - R2 : 0.8658584538557496
Fold 1 - Ridge - R2 : 0.8661769049731769
Fold 2 - Ridge - R2 : 0.8656843690168684
Fold 3 - Ridge - R2 : 0.8658438044767938
Fold 4 - Ridge - R2 : 0.8662382146220116
Ridge metric=R2 CV score = 0.8659606673579698
Ridge Mean R2 = 0.8659603493889201, std = 0.0002117715707143955
Saved validation data predictions to df_val_preds_Ridge.csv


In [28]:
# Predict on the test frame with the per-fold results and the preprocessor
# returned by training. The result is a frame with a `test_preds` column
# (used for the submission below); presumably it also holds the per-fold
# predictions - confirm in train_tabular_utils.
df_fold_test_preds = tt.get_test_preds(fold_metrics_model, df_test, feature_cols, preprocessor=preprocessor, num_folds=Config.NUM_FOLDS)
print(f"Completed prediction for {len(df_test)} test rows")

Completed prediction for 745305 test rows


In [29]:
# Build the submission from the sample file so the id column and row order
# match what the competition expects.
df_submission = pd.read_csv(SUBMISSION_FILEPATH + 'sample_submission.csv')
# Assign by position, not by index label: the sample submission has a default
# 0..n-1 index, so a prediction frame indexed by test id would silently align
# to all-NaN. Assumes predictions are in the same row order as test.csv and
# the sample submission.
test_preds = df_fold_test_preds["test_preds"].to_numpy()
if len(test_preds) != len(df_submission):
    raise ValueError(
        f"Expected {len(df_submission)} predictions for the submission, got {len(test_preds)}"
    )
df_submission[Config.TARGET_COL_NAME] = test_preds
df_submission.to_csv(DATA_WRITEPATH + f'submission_{Config.MODEL_TYPE}.csv',index=False)
# Also keep the full test-prediction frame for later inspection or blending.
df_fold_test_preds.to_csv(DATA_WRITEPATH + f'{Config.MODEL_TYPE}_test_preds.csv',index=False)
df_submission.head()

Unnamed: 0,id,FloodProbability
0,1117957,0.577687
1,1117958,0.45141
2,1117959,0.45177
3,1117960,0.472084
4,1117961,0.472725
