In [83]:
import sys
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
import statistics
from sklearn.linear_model import Ridge
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer
from sklearn.metrics import root_mean_squared_log_error
from functools import partial
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation and SettingWithCopy
# warnings raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')


In [84]:
# Put the local helper directory on sys.path (presumably where train_tabular,
# cv_split_utils and enums live — verify). The location can be overridden with
# the ML_UTILS_DIR environment variable; the default is the original path.
ML_UTILS_DIR = os.path.abspath(
    os.environ.get("ML_UTILS_DIR", "/home/bk_anupam/code/ML/ML_UTILS/")
)
# Guard against adding the same entry again each time the cell is re-run.
if ML_UTILS_DIR not in sys.path:
    sys.path.append(ML_UTILS_DIR)

In [85]:
import train_tabular as tt
import cv_split_utils
import enums

In [86]:
class Config:
    """Experiment settings for this notebook, read as ``Config.<NAME>`` class attributes."""

    # --- cross-validation / target ---
    RANDOM_SEED = 42             # passed as random_state when the folds are created
    NUM_FOLDS = 5                # number of CV folds
    TARGET_COL_NAME = "Rings"    # column to predict

    # --- model and metric ---
    MODEL_TYPE = enums.ModelName.RandomForest   # model family that gets tuned
    METRIC = enums.Metrics.RMSLE                # metric handed to the training helper
    EARLY_STOPPING = 500                        # not referenced in the visible cells
    RESULTS_FILE = "model_execution_results.pkl"  # not referenced in the visible cells

    # --- preprocessing switches ---
    REMOVE_OUTLIERS = True       # drop rows outside the 1.5*IQR fences of each float column
    POWER_TRANSFORM = False      # when True, power_transform is applied per float column
    SKEW_THRESHOLD = 0.5         # |skew| above which power_transform actually transforms
    NORMALIZE_DATA = True        # when True, float features are scaled with SCALER
    SCALER = enums.Scaler.StandardScaler

# Folder that holds the input CSV files.
DATA_PATH = "./data/"
# Columns that are not model features (target and fold id).
COLS_TO_LEAVE = ["Rings", "kfold"]

In [87]:
# Load the train and test CSVs from DATA_PATH (instead of repeating the folder
# literal) and drop the "id" column from both frames.
df_train = pd.read_csv(os.path.join(DATA_PATH, "train.csv")).drop(columns="id")
df_test = pd.read_csv(os.path.join(DATA_PATH, "test.csv")).drop(columns="id")

In [88]:
# Assign every training row to a CV fold; the result carries the extra `kfold`
# column shown in the preview below.
df_train = cv_split_utils.strat_kfold_dataframe(
    df=df_train,
    target_col_name=Config.TARGET_COL_NAME,
    num_folds=Config.NUM_FOLDS,
    random_state=Config.RANDOM_SEED,
)
df_train.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings,kfold
0,I,0.49,0.38,0.125,0.529,0.2165,0.1375,0.155,7,3
1,I,0.42,0.345,0.1,0.3705,0.1625,0.0795,0.1025,7,3
2,M,0.555,0.44,0.135,0.739,0.3515,0.1575,0.235,9,0
3,F,0.535,0.41,0.14,0.709,0.2505,0.17,0.19,9,4
4,F,0.605,0.455,0.15,1.059,0.4275,0.221,0.31,10,2


In [89]:
# Column names grouped by dtype: float features, int64 columns, object (string) columns.
cols_float, cols_int, cols_str = (
    df_train.select_dtypes(include=[dtype_name]).columns.to_list()
    for dtype_name in ("float", "int64", "object")
)

In [90]:
def process_outliers_iqr(df, col_name, remove_outliers=True):
    """Summarise, and optionally drop, the IQR outliers of one column.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``.

    Args:
        df: DataFrame containing ``col_name``.
        col_name: Name of the numeric column to inspect.
        remove_outliers: When True, the returned frame keeps only rows whose
            value lies inside the fences (inclusive).

    Returns:
        Tuple ``(frame, summary)``: ``frame`` is the filtered frame (or ``df``
        itself when nothing is removed) and ``summary`` is a one-row DataFrame
        with the quartiles, IQR, fences and outlier count.
    """
    values = df[col_name]
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile
    low_fence = lower_quartile - 1.5 * spread
    high_fence = upper_quartile + 1.5 * spread

    # Count the rows strictly outside the fences.
    beyond_fences = (values < low_fence) | (values > high_fence)
    n_outliers = int(beyond_fences.sum())

    kept = df
    if remove_outliers:
        inside_fences = (values >= low_fence) & (values <= high_fence)
        kept = df[inside_fences]

    summary = pd.DataFrame({
        'col_name': [col_name],
        'Q1': [lower_quartile],
        'Q3': [upper_quartile],
        'IQR': [spread],
        'min_val': [low_fence],
        'max_val': [high_fence],
        'outlier_count': [n_outliers],
    })
    return kept, summary

In [91]:
def power_transform(df, col_name, skew_threshold=0.5):
    """Apply a standardised Yeo-Johnson transform to a column if it is skewed.

    The column's skewness is printed; the transform is applied only when its
    absolute value exceeds ``skew_threshold``.

    Args:
        df: DataFrame containing ``col_name``.
        col_name: Name of the numeric column to inspect.
        skew_threshold: Absolute skewness above which the column is transformed.

    Returns:
        Tuple ``(frame, transformed)``. When no transform is needed ``frame`` is
        ``df`` itself; otherwise it is a copy of ``df`` with ``col_name``
        replaced, so the caller's frame is never modified in place.
    """
    skew = df[col_name].skew()
    print(f"{col_name} has skewness of {skew}")
    # `not (... > ...)` keeps a NaN skewness on the "leave untouched" path.
    if not abs(skew) > skew_threshold:
        return df, False

    print("Will apply power transform.")
    # Build the transformer only when it is actually used.
    power_transformer = PowerTransformer(method='yeo-johnson', standardize=True)
    col_transformed = power_transformer.fit_transform(df[[col_name]])
    # Write into a copy: `df` may be a filtered slice of another frame, and an
    # in-place .loc assignment on it is ambiguous (SettingWithCopy).
    df = df.copy()
    # fit_transform returns an (n, 1) array; flatten it to a single column.
    df[col_name] = col_transformed.ravel()
    return df, True

In [92]:
# Build the per-column IQR outlier summary for the float columns.
# DataFrame.append was removed in pandas 2.0, so the one-row summaries are
# collected in a list and concatenated once after the loop.
# Note: df_train is filtered column by column, so each column's quartiles are
# computed on the rows that survived the previous columns.
outlier_summary_frames = []
for col_name in cols_float:
    df_train, df_col_outliers = process_outliers_iqr(df_train, col_name, Config.REMOVE_OUTLIERS)
    outlier_summary_frames.append(df_col_outliers)
    if Config.POWER_TRANSFORM:
        df_train, transformed = power_transform(df_train, col_name, Config.SKEW_THRESHOLD)

if outlier_summary_frames:
    df_float_outliers = pd.concat(outlier_summary_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep the expected columns instead.
    df_float_outliers = pd.DataFrame(
        columns=['col_name', 'Q1', 'Q3', 'IQR', 'min_val', 'max_val', 'outlier_count']
    )
df_float_outliers

Unnamed: 0,col_name,Q1,Q3,IQR,min_val,max_val,outlier_count
0,Length,0.445,0.6,0.155,0.2125,0.8325,1460
1,Diameter,0.35,0.47,0.12,0.17,0.65,372
2,Height,0.11,0.16,0.05,0.035,0.235,73
3,Whole weight,0.4405,1.073,0.6325,-0.50825,2.02175,621
4,Whole weight.1,0.1865,0.4625,0.276,-0.2275,0.8765,600
5,Whole weight.2,0.0905,0.231,0.1405,-0.12025,0.44175,130
6,Shell weight,0.126,0.3005,0.1745,-0.13575,0.56225,593


In [93]:
# One-hot encode the string (object-dtype) columns collected in cols_str.
df_train_onehot = pd.get_dummies(data=df_train, columns=cols_str)

In [94]:
# Every column except the target and the fold id is a model feature; reuse the
# COLS_TO_LEAVE constant rather than repeating its contents here.
feature_cols = df_train_onehot.columns.drop(COLS_TO_LEAVE).to_list()
# Only the float columns are scaled.
feature_cols_to_normalize = cols_float

In [95]:
# Scale the float feature columns with the configured scaler.
# NOTE(review): this runs on the whole training frame before the CV split is
# used — confirm whether tt.normalize_features fits per fold.
if Config.NORMALIZE_DATA:
    df_train_onehot = tt.normalize_features(
        df_train_onehot,
        scaler=Config.SCALER,
        features_to_normalize=feature_cols_to_normalize,
    )

In [96]:
# # Disabled reference run (kept commented out): one training run with a fixed Ridge alpha
# params_ridge = {"alpha": 1963.746}
# val_preds_col = "val_preds"
# model = tt.get_model(Config.MODEL_TYPE, params_ridge)        
# fold_metrics_model, df_val_preds = tt.run_training(
#             model=model,
#             df_train=df_train_onehot,
#             target_col_name=Config.TARGET_COL_NAME,
#             feature_col_names=feature_cols,
#             metric=enums.Metrics.RMSLE,            
#             num_folds=Config.NUM_FOLDS,
#             gb_params=None,
#             val_preds_col=val_preds_col,
#             single_fold=False
#         )       

In [97]:
def get_model_tuning_params(trial, model_name):
    """Sample one hyperparameter set for ``model_name`` from an Optuna trial.

    Args:
        trial: Object exposing ``suggest_float``, ``suggest_int`` and
            ``suggest_categorical`` (an Optuna trial).
        model_name: Member of ``enums.ModelName`` selecting the search space.

    Returns:
        Dict mapping hyperparameter names to the values suggested by ``trial``.

    Raises:
        ValueError: If no search space is defined for ``model_name``. The
            function previously fell through and returned ``None``, which
            only surfaced as a confusing failure further downstream.
    """
    if model_name in (enums.ModelName.Ridge, enums.ModelName.Lasso):
        # Ridge and Lasso share one log-scaled regularisation strength.
        return {
            "alpha": trial.suggest_float("alpha", 1e-4, 1e4, log=True)
        }
    if model_name == enums.ModelName.RandomForest:
        return {
            "n_estimators": trial.suggest_int("n_estimators", 400, 3000, step=100),
            "max_depth": trial.suggest_int("max_depth", 10, 30),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 2, 16),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 16),
            "max_features": trial.suggest_categorical("max_features", ["log2", "sqrt", None])
        }
    raise ValueError(f"No tuning search space defined for model: {model_name!r}")

In [98]:
def hyperparams_tuning_objective(trial, model_name, df_train,
                                 feature_cols, metric, target_col_name, single_fold=False,
                                 num_folds=5, val_preds_col="val_preds"):
    """Optuna objective: train with one sampled parameter set and score it.

    Samples hyperparameters for ``model_name`` from ``trial``, builds the model
    through ``tt.get_model``, trains it with ``tt.run_training`` (printing
    suppressed) and returns the mean of the first element of every per-fold
    result (treated here as the fold metric).

    Args:
        trial: Optuna trial used to sample hyperparameters.
        model_name: Model identifier passed to the sampler and ``tt.get_model``.
        df_train: Training frame passed to ``tt.run_training``.
        feature_cols: Feature column names.
        metric: Metric identifier passed to ``tt.run_training``.
        target_col_name: Name of the target column.
        single_fold: Passed through to ``tt.run_training``.
        num_folds: Passed through to ``tt.run_training``.
        val_preds_col: Passed through to ``tt.run_training``.

    Returns:
        Mean of the per-fold metric values.
    """
    sampled_params = get_model_tuning_params(trial, model_name)
    training_kwargs = dict(
        model=tt.get_model(model_name, sampled_params),
        df_train=df_train,
        target_col_name=target_col_name,
        feature_col_names=feature_cols,
        metric=metric,
        num_folds=num_folds,
        gb_params=None,
        val_preds_col=val_preds_col,
        single_fold=single_fold,
        suppress_print=True,
    )
    # The validation predictions returned alongside the fold results are not needed here.
    per_fold_results, _ = tt.run_training(**training_kwargs)
    return statistics.mean(fold_entry[0] for fold_entry in per_fold_results)

In [99]:
def tune_model_params(study_name, study_direction, num_trials, model_name,
                      df_train,  feature_cols, metric, target_col_name,
                      single_fold=False, num_folds=5, val_preds_col="val_preds",
                      seed=None):
    """Run an Optuna study over ``hyperparams_tuning_objective``.

    Args:
        study_name: Name given to the Optuna study.
        study_direction: ``"minimize"`` or ``"maximize"``.
        num_trials: Number of trials to run.
        model_name: Model identifier forwarded to the objective.
        df_train: Training frame forwarded to the objective.
        feature_cols: Feature column names forwarded to the objective.
        metric: Metric identifier forwarded to the objective.
        target_col_name: Target column name forwarded to the objective.
        single_fold: Forwarded to the objective.
        num_folds: Forwarded to the objective.
        val_preds_col: Forwarded to the objective.
        seed: Optional seed for the sampler. When given, the study uses
            ``optuna.samplers.TPESampler(seed=seed)`` so the sequence of
            suggested parameters is repeatable. When ``None`` (default) the
            study is created exactly as before, with Optuna's default sampler.

    Returns:
        The parameter dict of the best trial.
    """
    model_params_tuning_obj_partial = partial(
        hyperparams_tuning_objective,
        model_name=model_name,
        df_train=df_train,
        feature_cols=feature_cols,
        metric=metric,
        target_col_name=target_col_name,
        single_fold=single_fold,
        num_folds=num_folds,
        val_preds_col=val_preds_col
    )
    study_kwargs = {"direction": study_direction, "study_name": study_name}
    if seed is not None:
        # Only override the sampler when a seed was requested.
        study_kwargs["sampler"] = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(**study_kwargs)
    study.optimize(model_params_tuning_obj_partial, n_trials=num_trials)
    best_trial = study.best_trial
    print(f"Best trial: number = {best_trial.number}, value = {best_trial.value}, params = {best_trial.params}")
    return best_trial.params

In [100]:
# Name the study after the configured model, so the Optuna log is not labelled
# "Ridge" while a different model (e.g. RandomForest) is being tuned.
# Falls back to the raw value if MODEL_TYPE has no `.name` attribute.
model_label = getattr(Config.MODEL_TYPE, "name", Config.MODEL_TYPE)
# NOTE(review): single_fold=True presumably scores each trial on one fold only —
# confirm in tt.run_training.
tuned_model_params = tune_model_params(
    study_name=f"{model_label}ModelTuning",
    study_direction="minimize",
    num_trials=20,
    model_name=Config.MODEL_TYPE,
    df_train=df_train_onehot,
    feature_cols=feature_cols,
    metric=Config.METRIC,
    target_col_name=Config.TARGET_COL_NAME,
    single_fold=True,
    num_folds=Config.NUM_FOLDS,
)

[32m[I 2024-05-06 11:52:32,520][0m A new study created in memory with name: RidgeModelTuning[0m
[32m[I 2024-05-06 11:53:02,601][0m Trial 0 finished with value: 0.1514273889881829 and parameters: {'n_estimators': 2100, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 16, 'max_features': 'log2'}. Best is trial 0 with value: 0.1514273889881829.[0m
[32m[I 2024-05-06 11:55:18,071][0m Trial 1 finished with value: 0.150206176933329 and parameters: {'n_estimators': 1900, 'max_depth': 27, 'min_samples_leaf': 6, 'min_samples_split': 9, 'max_features': None}. Best is trial 1 with value: 0.150206176933329.[0m
[32m[I 2024-05-06 11:56:06,013][0m Trial 2 finished with value: 0.1496112366662128 and parameters: {'n_estimators': 2200, 'max_depth': 29, 'min_samples_leaf': 10, 'min_samples_split': 6, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.1496112366662128.[0m
[32m[I 2024-05-06 11:56:32,506][0m Trial 3 finished with value: 0.14967316825108046 and parameters: {'n_es

Best trial: number = 11, value = 0.14948122240295672, params = {'n_estimators': 1200, 'max_depth': 22, 'min_samples_leaf': 7, 'min_samples_split': 2, 'max_features': 'sqrt'}


In [101]:
# Ridge model (remove outliers, normalize data)
# Best trial: number = 18, value = 0.16229338860497053, params = {'alpha': 2120.468857440699}

# Lasso (remove outliers, normalize data)
# Best trial: number = 10, value = 0.1627903459839038, params = {'alpha': 0.033288635201287185}