In [2]:
import sys
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
import statistics
from sklearn.linear_model import Ridge
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer
from sklearn.metrics import root_mean_squared_log_error
from functools import partial
import warnings

warnings.filterwarnings('ignore')


In [3]:
# Make the local helper modules imported in the next cell discoverable
# (presumably train_tabular, cv_split_utils and enums live in this directory).
# The location can be overridden with the ML_UTILS_PATH environment variable so
# the notebook also runs on machines other than the author's; the default is
# the original hard-coded path.
ML_UTILS_DIR = os.path.abspath(
    os.environ.get("ML_UTILS_PATH", "/home/bk_anupam/code/ML/ML_UTILS/")
)
# Re-running this cell in the same kernel must not pile up duplicate entries.
if ML_UTILS_DIR not in sys.path:
    sys.path.append(ML_UTILS_DIR)

In [6]:
import train_tabular as tt
import cv_split_utils
import enums

In [7]:
class Config:
    """Central switches and constants read by the cells of this notebook."""

    # --- cross-validation / reproducibility ---
    NUM_FOLDS = 5            # number of CV folds created and iterated over
    RANDOM_SEED = 42         # passed as random_state when assigning folds

    # --- data ---
    TARGET_COL_NAME = "Rings"

    # --- preprocessing toggles ---
    REMOVE_OUTLIERS = True   # drop rows outside the IQR fences per float column
    POWER_TRANSFORM = False  # apply the power transform to skewed columns
    SKEW_THRESHOLD = 0.5     # |skew| above this triggers the power transform
    NORMALIZE_DATA = True    # scale the float feature columns before training
    SCALER = enums.Scaler.StandardScaler

    # --- model ---
    MODEL_TYPE = enums.ModelName.Ridge

    # NOTE(review): the two settings below are not referenced by any cell
    # visible in this notebook - confirm whether they are still needed.
    EARLY_STOPPING = 500
    RESULTS_FILE = "model_execution_results.pkl"

# Directory that holds the train/test CSV files.
DATA_PATH = "./data/"
# Columns that are never used as model features: the target and the column
# holding the CV fold assignment. The target name is taken from Config so the
# two definitions cannot drift apart.
COLS_TO_LEAVE = [Config.TARGET_COL_NAME, "kfold"]

In [8]:
# Load the train and test datasets from the configured data folder
# (DATA_PATH) instead of repeating the directory literal in every call.
df_train = pd.read_csv(os.path.join(DATA_PATH, "train.csv"))
df_test = pd.read_csv(os.path.join(DATA_PATH, "test.csv"))
# The "id" column is a row identifier, not a feature, so drop it from both.
df_train = df_train.drop(columns="id")
df_test = df_test.drop(columns="id")

In [9]:
# Assign every training row to a cross-validation fold; the preview below
# shows the resulting "kfold" column next to the features.
df_train = cv_split_utils.strat_kfold_dataframe(
    df=df_train,
    target_col_name=Config.TARGET_COL_NAME,
    num_folds=Config.NUM_FOLDS,
    random_state=Config.RANDOM_SEED,
)
df_train.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings,kfold
0,I,0.49,0.38,0.125,0.529,0.2165,0.1375,0.155,7,3
1,I,0.42,0.345,0.1,0.3705,0.1625,0.0795,0.1025,7,3
2,M,0.555,0.44,0.135,0.739,0.3515,0.1575,0.235,9,0
3,F,0.535,0.41,0.14,0.709,0.2505,0.17,0.19,9,4
4,F,0.605,0.455,0.15,1.059,0.4275,0.221,0.31,10,2


In [10]:
# Column names grouped by storage dtype: floats, 64-bit integers and
# object (string) columns, in that order.
cols_float, cols_int, cols_str = (
    df_train.select_dtypes(include=[dtype_name]).columns.to_list()
    for dtype_name in ("float", "int64", "object")
)

In [11]:
def process_outliers_iqr(df, col_name, remove_outliers=True, iqr_factor=1.5):
    """Report, and optionally drop, IQR-rule outliers of one column.

    The fences are ``Q1 - iqr_factor * IQR`` and ``Q3 + iqr_factor * IQR``;
    a value strictly outside the fences counts as an outlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col_name``. It is not modified.
    col_name : str
        Name of the numeric column to inspect.
    remove_outliers : bool, default True
        When True the returned frame keeps only rows whose value lies within
        the fences (inclusive). Rows where the value is NaN fail that
        comparison and are therefore dropped too, although they are not
        included in ``outlier_count``.
    iqr_factor : float, default 1.5
        Multiplier applied to the IQR to position the fences.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The (possibly filtered) frame and a one-row summary with the columns
        ``col_name, Q1, Q3, IQR, min_val, max_val, outlier_count``.
    """
    values = df[col_name]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    min_val = Q1 - iqr_factor * IQR
    max_val = Q3 + iqr_factor * IQR
    # Build each mask once and reuse it for both counting and filtering.
    is_outlier = (values < min_val) | (values > max_val)
    within_fences = (values >= min_val) & (values <= max_val)
    outlier_count = int(is_outlier.sum())
    if remove_outliers:
        df = df[within_fences]
    # One-row summary so callers can stack the results of several columns.
    result = pd.DataFrame({
        'col_name': [col_name],
        'Q1': [Q1],
        'Q3': [Q3],
        'IQR': [IQR],
        'min_val': [min_val],
        'max_val': [max_val],
        'outlier_count': [outlier_count]
    })
    return df, result

In [12]:
def power_transform(df, col_name, skew_threshold=0.5):
    """Apply a Yeo-Johnson power transform to a column if it is skewed enough.

    The column's skewness is printed. When its absolute value exceeds
    ``skew_threshold`` the column is replaced by the standardized Yeo-Johnson
    transform of its values; otherwise the frame is returned as received.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col_name``. The input frame is never modified: the
        transform is written to a copy, so passing a filtered slice of
        another frame is safe.
    col_name : str
        Name of the numeric column to inspect.
    skew_threshold : float, default 0.5
        Absolute skewness above which the transform is applied.

    Returns
    -------
    tuple of (pandas.DataFrame, bool)
        The resulting frame and a flag telling whether the column was
        transformed.
    """
    skew = df[col_name].skew()
    print(f"{col_name} has skewness of {skew}")
    # A NaN skewness compares False and therefore leaves the column untouched.
    if not abs(skew) > skew_threshold:
        return df, False
    print("Will apply power transform.")
    # Only build the transformer when it is actually going to be used.
    power_transformer = PowerTransformer(method='yeo-johnson', standardize=True)
    col_transformed = power_transformer.fit_transform(df[[col_name]])
    df = df.copy()
    # fit_transform returns an (n, 1) array; flatten it for a single column.
    df[col_name] = col_transformed.ravel()
    return df, True

In [13]:
# Process every float column: record its IQR outlier summary, optionally drop
# the outlier rows and optionally power-transform the column.
# The per-column summaries are collected in a list and concatenated once,
# because DataFrame.append no longer exists in pandas >= 2.0.
outlier_summaries = []
for col_name in cols_float:
    df_train, df_col_outliers = process_outliers_iqr(df_train, col_name, Config.REMOVE_OUTLIERS)
    outlier_summaries.append(df_col_outliers)
    if Config.POWER_TRANSFORM:
        df_train, transformed = power_transform(df_train, col_name, Config.SKEW_THRESHOLD)
if outlier_summaries:
    df_float_outliers = pd.concat(outlier_summaries, ignore_index=True)
else:
    # No float columns: keep an empty report with the expected layout.
    df_float_outliers = pd.DataFrame(
        columns=['col_name', 'Q1', 'Q3', 'IQR', 'min_val', 'max_val', 'outlier_count']
    )
df_float_outliers

Unnamed: 0,col_name,Q1,Q3,IQR,min_val,max_val,outlier_count
0,Length,0.445,0.6,0.155,0.2125,0.8325,1460
1,Diameter,0.35,0.47,0.12,0.17,0.65,372
2,Height,0.11,0.16,0.05,0.035,0.235,73
3,Whole weight,0.4405,1.073,0.6325,-0.50825,2.02175,621
4,Whole weight.1,0.1865,0.4625,0.276,-0.2275,0.8765,600
5,Whole weight.2,0.0905,0.231,0.1405,-0.12025,0.44175,130
6,Shell weight,0.126,0.3005,0.1745,-0.13575,0.56225,593


In [14]:
# Expand each string-typed (categorical) column into one-hot indicator columns.
df_train_onehot = pd.get_dummies(data=df_train, columns=cols_str)

In [15]:
# Every column except the target and the fold marker is a model feature.
# COLS_TO_LEAVE already names those two columns, so reuse it here rather than
# repeating the literals.
feature_cols = df_train_onehot.columns.drop(COLS_TO_LEAVE).to_list()
# Only the continuous (float) columns are scaled; one-hot columns stay as-is.
feature_cols_to_normalize = cols_float

In [16]:
# Scale the selected feature columns with the configured scaler, when enabled.
if Config.NORMALIZE_DATA:
    df_train_onehot = tt.normalize_features(
        df_train_onehot,
        scaler=Config.SCALER,
        features_to_normalize=feature_cols_to_normalize,
    )

In [18]:
# Train the configured model with a fixed regularisation strength and score
# it with RMSLE on every cross-validation fold.
val_preds_col = "val_preds"
params_ridge = {"alpha": 1963.746}
model = tt.get_model(Config.MODEL_TYPE, params_ridge)
fold_metrics_model, df_val_preds = tt.run_training(
    model=model,
    df_train=df_train_onehot,
    target_col_name=Config.TARGET_COL_NAME,
    feature_col_names=feature_cols,
    metric=enums.Metrics.RMSLE,
    num_folds=Config.NUM_FOLDS,
    gb_params=None,
    val_preds_col=val_preds_col,
    single_fold=False,
)

fold 0 metric = 0.16256651522836177
fold 1 metric = 0.16217663077542896
fold 2 metric = 0.16194515077777039
fold 3 metric = 0.16317401451107638
fold 4 metric = 0.16121846056797234


In [None]:
def run_training(model, train_X, train_y, val_X, val_y):
    """Fit ``model`` on one fold and score it on that fold's validation split.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    train_X, train_y : array-like
        Training features and target. The target is flattened to 1-D before
        fitting, whatever container it arrives in (ndarray, Series or
        single-column DataFrame).
    val_X, val_y : array-like
        Validation features and target.

    Returns
    -------
    tuple
        ``(rmsle, model)`` - the root mean squared log error on the
        validation split and the fitted model.
    """
    # np.ravel accepts any array-like, unlike the container-specific .ravel().
    model.fit(train_X, np.ravel(train_y))
    val_y_pred = np.asarray(model.predict(val_X))
    # The log-based metric rejects negative values, so floor predictions at 0.
    val_y_pred = np.where(val_y_pred > 0, val_y_pred, 0)
    rmsle = root_mean_squared_log_error(val_y, val_y_pred)
    return rmsle, model

In [None]:
def hyperparams_tuning_objective(trial, df_train, cols_float, cols_to_leave):
    """Optuna objective: mean validation RMSLE over all CV folds.

    A model is built from the hyper-parameters suggested for ``trial`` and
    then fitted and scored once per fold; the mean of the fold scores is
    returned.

    NOTE(review): get_model_tuning_params, create_model and get_fold_data are
    not defined in the cells visible here - confirm where they come from so
    the notebook survives a fresh "Restart & Run All".
    """
    tuning_params = get_model_tuning_params(trial, Config.MODEL_TYPE)
    estimator = create_model(tuning_params, Config.MODEL_TYPE)
    fold_scores = []
    for fold_idx in range(Config.NUM_FOLDS):
        fold_train_X, fold_train_y, fold_val_X, fold_val_y = get_fold_data(
            fold=fold_idx,
            df=df_train,
            cont_col_names=cols_float,
            cols_to_leave=cols_to_leave,
            target_col_name=Config.TARGET_COL_NAME,
        )
        # Only the score is needed here; the fitted model is discarded.
        fold_rmsle, _fitted = run_training(
            estimator, fold_train_X, fold_train_y, fold_val_X, fold_val_y
        )
        fold_scores.append(fold_rmsle)
    return statistics.mean(fold_scores)

In [None]:
# Bind the fixed data arguments so Optuna only has to supply the trial.
hyperparams_tuning_obj_partial = partial(
    hyperparams_tuning_objective,
    df_train=df_train_onehot,
    cols_float=cols_float,
    cols_to_leave=COLS_TO_LEAVE,
)
# Seed the sampler so the sequence of suggested hyper-parameters (and hence
# the reported best trial) is the same on every run of the notebook.
study = optuna.create_study(
    direction="minimize",
    study_name="RidgeModelTuning",
    sampler=optuna.samplers.TPESampler(seed=Config.RANDOM_SEED),
)
study.optimize(hyperparams_tuning_obj_partial, n_trials=20)
best_trial = study.best_trial
print(f"Best trial: number = {best_trial.number}, value = {best_trial.value}, params = {best_trial.params}")

[32m[I 2024-05-03 18:05:09,474][0m A new study created in memory with name: RidgeModelTuning[0m
[32m[I 2024-05-03 18:05:09,659][0m Trial 0 finished with value: 0.17128226671096497 and parameters: {'alpha': 2.624856892088123}. Best is trial 0 with value: 0.17128226671096497.[0m
[32m[I 2024-05-03 18:05:09,835][0m Trial 1 finished with value: 0.17128835315031413 and parameters: {'alpha': 0.10753852580046656}. Best is trial 0 with value: 0.17128226671096497.[0m
[32m[I 2024-05-03 18:05:10,003][0m Trial 2 finished with value: 0.17128827728436977 and parameters: {'alpha': 0.13884167072026338}. Best is trial 0 with value: 0.17128226671096497.[0m
[32m[I 2024-05-03 18:05:10,182][0m Trial 3 finished with value: 0.17128860837684906 and parameters: {'alpha': 0.002242959574595568}. Best is trial 0 with value: 0.17128226671096497.[0m
[32m[I 2024-05-03 18:05:10,360][0m Trial 4 finished with value: 0.17128782227692854 and parameters: {'alpha': 0.32662228898601225}. Best is trial 0 with

Best trial: number = 19, value = 0.17024493049544798, params = {'alpha': 1068.7684217249885}
