In [545]:
# !pip install -q openfe

In [546]:
import sys
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
import xgboost as xgb
import lightgbm as lgbm
import statistics
from sklearn.linear_model import Ridge
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, SplineTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.base import clone
from functools import partial
from joblib import dump
from scipy.stats import skew, kurtosis
from openfe import OpenFE, transform
import warnings

warnings.filterwarnings('ignore')


In [547]:
# Directory holding the project-local helper modules imported in the next cell
# (train_tabular_utils, cv_split_utils, enums, data_utils). It can be overridden
# with the ML_UTILS_PATH environment variable so the notebook is not tied to a
# single machine; when the variable is unset the original location is used.
ML_UTILS_PATH = os.path.abspath(
    os.environ.get("ML_UTILS_PATH", "/home/bk_anupam/code/ML/ML_UTILS/")
)
# Avoid piling up duplicate sys.path entries when this cell is re-executed.
if ML_UTILS_PATH not in sys.path:
    sys.path.append(ML_UTILS_PATH)

In [548]:
import train_tabular_utils as tt
import cv_split_utils
import enums
import data_utils

In [549]:
class Config:
    """Central run configuration for this notebook.

    The attributes are read as class-level constants (``Config.X``) by the
    cells below; the class is never instantiated in this notebook.
    """
    # Compared against "KAGGLE" further down to choose the data read/write paths.
    RUN_MODE = "LOCAL"
    # Seed forwarded to the fold split and to the models' static parameters.
    RANDOM_SEED = 1
    # Number of cross-validation folds.
    NUM_FOLDS = 5
    # Name of the label column in the training data.
    TARGET_COL_NAME = "FloodProbability"        
    # Scaler choice expressed through the project-local enums module.
    SCALER = enums.Scaler.StandardScaler
    # Evaluation metric passed to the training / evaluation helpers.
    METRIC = enums.Metrics.R2
    # These values are more dynamic   
    # Model to build; also the key used to look up model_static_params.
    MODEL_TYPE = enums.ModelName.Ridge    
    # Number of Optuna trials for the (currently commented-out) tuning cell.
    NUM_TUNING_TRIALS = 25
    # When True, hyper-parameter tuning is meant to evaluate a single fold only.
    TUNE_ON_SINGLE_FOLD = True
    # When True, training stops after the first fold.
    TRAIN_SINGLE_FOLD = False
    # When True, OpenFE features are generated and written to disk.
    GENERATE_AUTO_FEATURES = False
    # When True, persist() dumps every fold model with joblib.
    PERSIST_MODEL = False
    # Forwarded to tt.run_training as transform_target.
    TRANSFORM_TARGET = False

# Columns that must never be used as model inputs: the label and the fold id.
COLS_TO_LEAVE = ["FloodProbability", "kfold"]
# Number of CPUs, used as n_jobs for OpenFE.
CPU_COUNT = os.cpu_count()

# Resolve input/output locations for the current run mode.
# NOTE(review): on Kaggle both settings of Config.GENERATE_AUTO_FEATURES resolved
# to the same competition input directory, so that flag needs no separate branch
# here. If pre-generated features are ever stored in a different dataset, this is
# the place to point DATA_READPATH at it.
if Config.RUN_MODE == "KAGGLE":
    DATA_READPATH = "/kaggle/input/playground-series-s4e5/"
    DATA_WRITEPATH = "/kaggle/working/"
else:
    DATA_READPATH = "./data/"
    DATA_WRITEPATH = "./output/"
# The submission file location always coincides with the data read path.
SUBMISSION_FILEPATH = DATA_READPATH

In [550]:
# Fixed (non-tuned) parameters per model type. They are merged with the tuned
# parameters before the model is built. Every stochastic model is seeded from
# Config.RANDOM_SEED so that runs are reproducible.
model_static_params = {
    enums.ModelName.XGBoost: {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "seed": Config.RANDOM_SEED,
        "verbosity": 0,
    },
    enums.ModelName.LGBM: {
        "objective": "root_mean_squared_error",
        "metric": 'rmse',
        "verbosity": -1,    # <0: fatal, =0: error (warn), =1: info, >1: debug
        "boosting_type": "gbdt",
        # Previously the only model entry without a seed; `random_state` is
        # accepted by LightGBM as an alias of `seed`.
        "random_state": Config.RANDOM_SEED
    },
    enums.ModelName.CatBoost: {
        "objective": "RMSE",
        "verbose": 0,
        "random_seed": Config.RANDOM_SEED,
        "eval_metric": "RMSE",
        'grow_policy':  'Lossguide'
    },
    enums.ModelName.RandomForest: {
        "random_state": Config.RANDOM_SEED,
        "n_jobs": -1
    },
    enums.ModelName.Ridge: {
        "random_state": Config.RANDOM_SEED
    }
}

In [551]:
# Tuned hyper-parameters for Config.MODEL_TYPE. None means "not tuned yet": the
# (currently commented-out) tuning cell only runs when this is None, and the
# parameter-merging cell below reads it. Paste a dict of tuned values here to
# reuse a previous tuning result.
tuned_model_params = None

In [552]:
# Load the train and test CSVs from DATA_READPATH, indexed by their 'id' column.
df_train, df_test = (
    pd.read_csv(f"{DATA_READPATH}{csv_name}", index_col="id")
    for csv_name in ("train.csv", "test.csv")
)
# Keep untouched copies of both frames; later cells overwrite df_train/df_test.
df_train_orig, df_test_orig = df_train.copy(), df_test.copy()

In [553]:
# Raw input feature names used for feature engineering. They are taken from the
# test frame, which has no target column.
feature_cols_for_fe = df_test.columns.to_list()

In [554]:
def generate_new_features(df_train, df_test, feature_cols, NUM_NEW_FEATURES=10):
    """Generate OpenFE features and apply them to the train and test data.

    Args:
        df_train: training frame; must contain ``feature_cols`` and the column
            named by ``Config.TARGET_COL_NAME``.
        df_test: test frame; must contain ``feature_cols``.
        feature_cols: names of the columns OpenFE may build new features from.
        NUM_NEW_FEATURES: how many entries from the head of OpenFE's
            recommended feature list to keep.

    Returns:
        Tuple ``(train_X, test_X)`` as returned by ``openfe.transform``. The
        target column is not part of ``train_X``.
    """
    train_X = df_train[feature_cols]
    test_X = df_test[feature_cols]
    train_y = df_train[Config.TARGET_COL_NAME]
    ofe = OpenFE()
    # fit() is called for its effect on `ofe`; the recommended features are read
    # from ofe.new_features_list below, so the return value is not kept.
    ofe.fit(data=train_X, label=train_y, n_jobs=CPU_COUNT, verbose=False)
    # OpenFE recommends a list of new features. Only the first NUM_NEW_FEATURES
    # of them are applied, to see how they influence model performance.
    selected_features = ofe.new_features_list[:NUM_NEW_FEATURES]
    train_X, test_X = transform(train_X, test_X, selected_features, n_jobs=CPU_COUNT)
    return train_X, test_X

In [555]:
if Config.GENERATE_AUTO_FEATURES:
    # Replace both frames with their OpenFE-augmented versions (features only).
    df_train, df_test = generate_new_features(df_train, df_test, feature_cols_for_fe)
    # The generated training frame has no label, so re-attach it from the
    # untouched copy of the original training data.
    df_train_labels = df_train_orig.loc[:, [Config.TARGET_COL_NAME]]
    df_train = pd.concat([df_train, df_train_labels], axis=1)
    # Persist both frames for later runs. index=False means the 'id' index is
    # not written to the files.
    df_train.to_csv(f"{DATA_WRITEPATH}train_openfe.csv", index=False)
    df_test.to_csv(f"{DATA_WRITEPATH}test_openfe.csv", index=False)

In [556]:
# Function to compute skewness and kurtosis for each row
def compute_skew_kurtosis(matrix):
    """Return ``(skewness, kurtosis)`` computed along axis 1 of ``matrix``.

    Both values come from ``scipy.stats`` with default settings, giving one
    entry per row of a 2-D input.
    """
    return skew(matrix, axis=1), kurtosis(matrix, axis=1)

def create_features(df, feature_cols):
    """Add row-wise summary statistics of ``feature_cols`` as new columns.

    The columns ``f_sum``, ``f_mean``, ``f_std``, ``f_min``, ``f_max``,
    ``f_skew``, ``f_kurtosis`` and ``f_quantile_25/50/75`` are written to
    ``df`` itself (the frame is modified in place) and ``df`` is returned.

    Args:
        df: frame containing at least the columns in ``feature_cols``.
        feature_cols: names of the columns the statistics are computed over.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    # Select the source columns once. Every df[feature_cols] makes a copy of
    # the whole block, so re-selecting it per statistic is wasted work on a
    # large frame.
    base = df[feature_cols]
    # Sum and mean of all features
    df["f_sum"] = base.sum(axis=1)
    df["f_mean"] = base.mean(axis=1)
    # standard deviation
    df['f_std'] = base.std(axis=1)
    # min and max
    df['f_min'] = base.min(axis=1)
    df['f_max'] = base.max(axis=1)
    # Row-wise skewness and kurtosis (scipy.stats defaults)
    values = base.values
    df['f_skew'] = skew(values, axis=1)
    df['f_kurtosis'] = kurtosis(values, axis=1)
    # Quantiles
    quantiles = [0.25, 0.5, 0.75]
    for q in quantiles:
        df[f'f_quantile_{int(q*100)}'] = base.quantile(q=q, axis=1)

    return df

In [557]:
# Add the row-wise summary features (f_sum, f_mean, f_std, ...) to the training
# frame. Only df_train is extended here; df_test is not passed through
# create_features in this part of the notebook.
df_train = create_features(df_train, feature_cols_for_fe)

In [559]:
# def preprocess_data(preprocessor, df_train, df_test):
#     preprocessor.fit(df_train)
#     col_names = preprocessor.get_feature_names_out()
#     X_train = preprocessor.transform(df_train)
#     #X_test = preprocessor.transform(df_test)
#     df_train_fold_target = df_train[COLS_TO_LEAVE]
#     df_train_processed = pd.concat([df_train_fold_target, pd.DataFrame(X_train, columns=col_names)], axis=1) 
#     #df_test_processed = pd.DataFrame(X_test, columns=col_names)
#     return df_train_processed

In [560]:
# df_train_processed = preprocess_data(preprocessor=preprocessor, df_train=df_train, df_test=df_test)

In [561]:
def get_model_tuning_params(trial, model_name):
    """Sample one set of hyper-parameters for ``model_name`` from an Optuna trial.

    Args:
        trial: Optuna trial the values are suggested from.
        model_name: member of ``enums.ModelName`` selecting the search space.

    Returns:
        A dict of sampled parameters for Ridge, Lasso and RandomForest, or
        ``None`` for any other model name (no search space is defined for it).
    """
    # Ridge and Lasso share the same one-dimensional search space.
    if model_name == enums.ModelName.Ridge or model_name == enums.ModelName.Lasso:
        return {"alpha": trial.suggest_float("alpha", 1e-4, 1e4, log=True)}

    if model_name == enums.ModelName.RandomForest:
        n_estimators = trial.suggest_int("n_estimators", 400, 3000, step=100)
        max_depth = trial.suggest_int("max_depth", 10, 30)
        min_samples_leaf = trial.suggest_int("min_samples_leaf", 2, 16)
        min_samples_split = trial.suggest_int("min_samples_split", 2, 16)
        max_features = trial.suggest_categorical("max_features", ["log2", "sqrt", None])
        return {
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "min_samples_leaf": min_samples_leaf,
            "min_samples_split": min_samples_split,
            "max_features": max_features,
        }

    return None

In [562]:
def hyperparams_tuning_objective(trial, model_name, df_train,  
                                 feature_cols, metric, target_col_name, single_fold=False,
                                 num_folds=5, val_preds_col="val_preds"):
    """Optuna objective: train with sampled parameters and score the result.

    Samples a parameter set for ``model_name`` from ``trial``, runs
    ``tt.run_training`` with it, and returns the mean of the per-fold metric
    values (the first element of each item in the returned fold list).
    """
    training_kwargs = dict(
        model_name=model_name,
        df_train=df_train,
        target_col_name=target_col_name,
        feature_col_names=feature_cols,
        metric=metric,
        num_folds=num_folds,
        model_params=get_model_tuning_params(trial, model_name),
        val_preds_col=val_preds_col,
        single_fold=single_fold,
        suppress_print=True,
        transform_target=Config.TRANSFORM_TARGET,
    )
    # The validation-prediction frame is not needed for tuning.
    fold_results, _ = tt.run_training(**training_kwargs)
    return statistics.mean(result[0] for result in fold_results)

In [563]:
def tune_model_params(study_name, study_direction, num_trials, model_name, 
                      df_train,  feature_cols, metric, target_col_name, 
                      single_fold=False, num_folds=5, val_preds_col="val_preds"):
    """Run an Optuna study for ``model_name`` and return the best parameters.

    Creates a study named ``study_name`` with direction ``study_direction``,
    evaluates ``num_trials`` trials of ``hyperparams_tuning_objective`` and
    prints a summary of the best one.

    Returns:
        The parameter dict of the best trial.
    """
    def objective(trial):
        # Bind everything except the trial itself.
        return hyperparams_tuning_objective(
            trial,
            model_name=model_name,
            df_train=df_train,
            feature_cols=feature_cols,
            metric=metric,
            target_col_name=target_col_name,
            single_fold=single_fold,
            num_folds=num_folds,
            val_preds_col=val_preds_col,
        )

    study = optuna.create_study(study_name=study_name, direction=study_direction)
    study.optimize(objective, n_trials=num_trials)
    winner = study.best_trial
    print(f"Best trial: number = {winner.number}, value = {winner.value}, params = {winner.params}")
    return winner.params

In [564]:
# if tuned_model_params is None:
#     tuned_model_params = tune_model_params(
#                             study_name=Config.MODEL_TYPE + "_ModelTuning", 
#                             study_direction="maximize",
#                             num_trials=Config.NUM_TUNING_TRIALS,
#                             model_name=Config.MODEL_TYPE,
#                             df_train=df_train,
#                             feature_cols=feature_cols,
#                             metric=Config.METRIC,
#                             target_col_name=Config.TARGET_COL_NAME,
#                             single_fold=Config.TUNE_ON_SINGLE_FOLD,
#                             num_folds=Config.NUM_FOLDS
#                     )

In [565]:
# Final model parameters = static parameters overlaid with tuned parameters
# (tuned values win when a key appears in both).
# Previously the static parameters (seed, objective, ...) were dropped whenever
# tuned_model_params was None, so an untuned model was built without its seed.
# Now each side is used when present; model_params stays None only when neither
# exists.
params_static = model_static_params.get(Config.MODEL_TYPE)
if params_static is None and tuned_model_params is None:
    model_params = None
else:
    model_params = {**(params_static or {}), **(tuned_model_params or {})}

In [566]:
# # Create a new feature by summing all features
# df_train["f_sum"] = df_train[feature_cols_for_fe].sum(axis=1)
# df_train['special1'] = df_train['f_sum'].isin(np.arange(72, 76)) # for linear models
# # Create a new feature by taking mean of all features
# df_train["f_mean"] = df_train[feature_cols_for_fe].mean(axis=1)
# # standard deviation
# df_train['f_std'] = df_train[feature_cols_for_fe].std(axis=1)
# # min and max
# df_train['f_min'] = df_train[feature_cols_for_fe].min(axis=1)
# df_train['f_max'] = df_train[feature_cols_for_fe].max(axis=1)

In [567]:
# Model inputs: every column of df_train except the label and the fold id.
feature_cols = [col for col in df_train.columns if col not in COLS_TO_LEAVE]
print(f"len(feature_cols)={len(feature_cols)}")

In [568]:
# feature_cols = feature_cols_for_fe + ['f_sum','special1', 'f_mean', 'f_std', 'f_min', 'f_max']

In [569]:
# Building blocks for the preprocessing step of the model pipeline.
scaler = StandardScaler()
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
# The next two transformers are instantiated but not part of the
# ColumnTransformer below (the "poly" entry is commented out).
polynomial_features = PolynomialFeatures(degree=2, include_bias=False)
spline_transformer = SplineTransformer(n_knots=5, degree=3, include_bias=False)
# feature_cols already contains 'f_sum', so that column is fed to the model
# twice: once standard-scaled and once one-hot encoded. Columns not named in
# any transformer are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[        
        #("poly", polynomial_features, feature_cols),
        ("scaler", scaler, feature_cols),
        ("onehot", onehot_encoder, ['f_sum']),        
    ], remainder="passthrough"
)

In [570]:
# Build the estimator for the configured model type and chain it behind the
# preprocessor, so preprocessing is fitted together with the model.
model = tt.get_model(model_name=Config.MODEL_TYPE, params=model_params, metric=Config.METRIC)
model_pipeline = make_pipeline(preprocessor, model)

In [571]:
def train_and_validate(model_name, model, df, feature_cols, target_col_name, metric, n_repeat=1, single_fold=False, num_folds=5):    
    """Cross-validate ``model`` and collect the out-of-fold predictions.

    A fresh clone of ``model`` is fitted on each fold's training part and
    scored on its validation part. Per-fold metrics, the metric over all
    out-of-fold predictions, and the mean/std of the fold metrics are printed.

    Args:
        model_name: label used in the printed messages.
        model: unfitted estimator or pipeline; cloned for every fold.
        df: training frame containing ``feature_cols`` and ``target_col_name``.
        feature_cols: names of the model input columns.
        target_col_name: name of the label column.
        metric: metric identifier passed to ``tt.get_eval_metric``.
        n_repeat: currently unused; kept for interface compatibility.
        single_fold: when True, stop after the first fold.
        num_folds: number of folds to create and iterate over.

    Returns:
        Tuple ``(fold_metrics_model, df_oof_preds)``: a list of
        ``(fold_metric, fitted_model)`` tuples, and a frame with the columns
        'kfold', ``target_col_name`` and 'oof_preds' for the validated rows.
    """
    # Assign folds with the same num_folds the loop below iterates over. This
    # used Config.NUM_FOLDS before, which could disagree with the argument.
    df = cv_split_utils.kfold_dataframe(df, random_state=Config.RANDOM_SEED, num_folds=num_folds)
    oof_frames = []
    fold_metrics_model = []
    for fold in range(num_folds):
        fold_model = clone(model)
        df_train_fold, df_val_fold = tt.get_fold_df(df, fold)
        train_X, train_y, val_X, val_y = tt.get_train_val_nparray(df_train_fold, df_val_fold, feature_cols, target_col_name)
        fold_model.fit(train_X, train_y)
        val_y_pred = fold_model.predict(val_X)
        fold_val_metric = tt.get_eval_metric(metric, val_y, val_y_pred)
        print(f"Fold {fold} - {model_name} - {metric} : {fold_val_metric}")
        # Explicit copy: the new column must not be assigned on a slice of
        # df_val_fold (chained assignment).
        df_fold_val_preds = df_val_fold[['kfold', target_col_name]].copy()
        df_fold_val_preds['oof_preds'] = val_y_pred
        oof_frames.append(df_fold_val_preds)
        fold_metrics_model.append((fold_val_metric, fold_model))
        if single_fold:
            break
    # Concatenate once instead of growing a DataFrame inside the loop.
    df_oof_preds = pd.concat(oof_frames, axis=0) if oof_frames else pd.DataFrame()
    cv = tt.get_eval_metric(metric, df_oof_preds[target_col_name], df_oof_preds['oof_preds'] )
    print(f"{model_name} metric={metric} CV score = {cv}")
    metrics = [item[0] for item in fold_metrics_model]
    mean_metric, std_metric = tt.get_metric_stats(metrics)    
    print(f"{model_name} Mean {metric} = {mean_metric}, std = {std_metric}")        
    return fold_metrics_model, df_oof_preds

In [572]:
def persist(model_name, fold_metrics_model, df_oof_preds, persist_model=False, output_path=""):    
    """Write the out-of-fold predictions, and optionally the fold models, to disk.

    Args:
        model_name: used to build the output file names.
        fold_metrics_model: list of ``(fold_metric, fitted_model)`` tuples.
        df_oof_preds: frame of validation predictions, saved as CSV.
        persist_model: when True, each fold model is dumped with joblib as
            ``<output_path><model_name>_<fold index>.joblib``.
        output_path: prefix prepended to every file name.
    """
    trained_models = [entry[1] for entry in fold_metrics_model]
    if persist_model:
        fold_idx = 0
        for trained in trained_models:
            target_file = output_path + f"{model_name}_{fold_idx}.joblib"
            dump(trained, target_file)
            print(f"saved {target_file}")
            fold_idx += 1
    preds_file = f"df_val_preds_{model_name}.csv"
    df_oof_preds.to_csv(output_path + preds_file)
    print(f"Saved validation data predictions to {preds_file}")

In [573]:
# Cross-validate the preprocessing + model pipeline on the engineered training
# frame, using the fold count and single-fold switch from Config.
fold_metrics_model, df_oof_preds = train_and_validate(
                                        model_name=Config.MODEL_TYPE,
                                        model=model_pipeline,
                                        df=df_train,
                                        feature_cols=feature_cols,
                                        target_col_name=Config.TARGET_COL_NAME,
                                        metric=Config.METRIC,
                                        single_fold=Config.TRAIN_SINGLE_FOLD,
                                        num_folds=Config.NUM_FOLDS
                                    )

# Always writes the out-of-fold predictions to DATA_WRITEPATH; the fold models
# are dumped as well only when Config.PERSIST_MODEL is True.
persist(
        model_name=Config.MODEL_TYPE, 
        fold_metrics_model=fold_metrics_model, 
        df_oof_preds=df_oof_preds, 
        persist_model=Config.PERSIST_MODEL, 
        output_path=DATA_WRITEPATH
)

Fold 0 - Ridge - R2 : 0.8657955048767867
Fold 1 - Ridge - R2 : 0.8660801615163919
Fold 2 - Ridge - R2 : 0.865600674467472
Fold 3 - Ridge - R2 : 0.8658030180796936
Fold 4 - Ridge - R2 : 0.8661479357837484
Ridge metric=R2 CV score = 0.8658857435280796
Ridge Mean R2 = 0.8658854589448184, std = 0.00020139196184924065
Saved validation data predictions to df_val_preds_Ridge.csv


In [574]:
import datetime
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
from colorama import Fore, Style


# Fold splitter used by cross_validate(). It is driven by Config rather than by
# literal numbers so it cannot drift from the settings used elsewhere.
kf = KFold(n_splits=Config.NUM_FOLDS, shuffle=True, random_state=Config.RANDOM_SEED)

# Mirrors Config.TRAIN_SINGLE_FOLD, the switch that actually makes
# cross_validate() stop after the first fold. It was hardcoded to True before,
# which made the printed summary claim a single-fold run that did not happen.
SINGLE_FOLD = Config.TRAIN_SINGLE_FOLD

def cross_validate(model, train, label, features=feature_cols, n_repeats=1):
    """Cross-validate ``model`` with the module-level ``kf`` splitter.

    For every fold a clone of ``model`` is fitted on the training part and used
    to predict the validation part; the fold R2 score is printed, followed by
    a summary line with the mean score and the elapsed time.

    If n_repeats > 1, the model is trained several times per fold with
    ``random_state`` set to 0..n_repeats-1 and the predictions are averaged.

    Args:
        model: unfitted estimator or Pipeline (for a Pipeline the seed is set
            on its last step).
        train: training frame containing ``features`` and the column named by
            ``Config.TARGET_COL_NAME``.
        label: free text appended to the summary line.
        features: names of the model input columns.
        n_repeats: number of differently seeded fits per fold (>= 1).

    Returns:
        Array with one out-of-fold prediction per row of ``train``. Rows of
        folds that were not evaluated (single-fold mode) stay NaN.

    Raises:
        ValueError: if ``n_repeats`` is smaller than 1.
    """
    if n_repeats < 1:
        raise ValueError(f"n_repeats must be >= 1, got {n_repeats}")
    start_time = datetime.datetime.now()
    # One switch decides both whether to stop early and what the summary line
    # says. The label was previously read from a separate SINGLE_FOLD flag and
    # could contradict what was actually executed.
    single_fold = Config.TRAIN_SINGLE_FOLD
    target = train[Config.TARGET_COL_NAME]
    scores = []
    oof_preds = np.full(len(train), np.nan, dtype=float)
    for fold, (idx_tr, idx_va) in enumerate(kf.split(train)):
        X_tr = train.iloc[idx_tr][features]
        X_va = train.iloc[idx_va][features]
        y_tr = target.iloc[idx_tr]
        y_va = target.iloc[idx_va]

        y_pred = np.zeros(len(idx_va), dtype=float)
        for i in range(n_repeats):
            m = clone(model)
            if n_repeats > 1:
                # Re-seed the final estimator so the repeats differ.
                mm = m
                if isinstance(mm, Pipeline):
                    mm = mm[-1]
                mm.set_params(random_state=i)
            m.fit(X_tr, y_tr)
            y_pred += m.predict(X_va)
        y_pred /= n_repeats
        score = r2_score(y_va, y_pred)
        print(f"# Fold {fold}: R2={score:.5f}")
        scores.append(score)
        oof_preds[idx_va] = y_pred
        if single_fold:
            break

    elapsed_time = datetime.datetime.now() - start_time
    print(f"{Fore.GREEN}# Overall: {np.array(scores).mean():.5f} {label}"
          f"{' single fold' if single_fold else ''}"
          f"   {int(np.round(elapsed_time.total_seconds() / 60))} min{Style.RESET_ALL}")
    # Hand the predictions back to the caller; they were discarded before.
    return oof_preds

In [575]:
#cross_validate(model_pipeline, df_train, "Ridge", features=feature_cols)

In [576]:
# fold_metrics_model = tt.train_model(
#                             df = df_train_processed,
#                             model_name=Config.MODEL_TYPE,
#                             model_params = model_params,
#                             feature_col_names = feature_cols,
#                             target_col_name = Config.TARGET_COL_NAME,
#                             metric = Config.METRIC,
#                             num_folds = Config.NUM_FOLDS,
#                             single_fold = Config.TRAIN_SINGLE_FOLD,
#                             persist_model = Config.PERSIST_MODEL,
#                             output_path = DATA_WRITEPATH,
#                             transform_target = Config.TRANSFORM_TARGET
#                         )

In [577]:
# from sklearn.metrics import r2_score

# r2_score(df_train.FloodProbability, (df_train[feature_cols_for_fe].sum(axis=1) * 0.0056) - 0.05)