In [54]:
import sys
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
import statistics
from sklearn.linear_model import Ridge
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer
from sklearn.metrics import root_mean_squared_log_error
from functools import partial
import warnings

warnings.filterwarnings('ignore')


In [55]:
sys.path.append(os.path.abspath("/home/bk_anupam/code/ML/ML_UTILS/"))

In [56]:
import train_tabular as tt
import cv_split_utils

In [57]:
class ModelType:
    LGBM = "LGBM"
    XGB = "XGB"
    RF = "RF"
    RIDGE = "Ridge"
    CATBOOST = "CATBOOST"

In [58]:
class Config:
    RANDOM_SEED = 42
    NUM_FOLDS = 5
    TARGET_COL_NAME = "Rings"    
    SKEW_THRESHOLD = 0.5
    EARLY_STOPPING = 500
    RESULTS_FILE = "model_execution_results.pkl"
    MODEL_TYPE = ModelType.RIDGE
    REMOVE_OUTLIERS = True
    POWER_TRANSFORM = False
    NORMALIZE_DATA = True
    SCALER = "StandardScaler"    

DATA_PATH = "./data/"
COLS_TO_LEAVE = ["Rings", "kfold"]

In [59]:
# import train dataset locally from data folder
df_train = pd.read_csv("./data/train.csv")
# import test dataset locally from data folder
df_test = pd.read_csv("./data/test.csv")
# drop id column
df_train = df_train.drop("id", axis=1)
df_test = df_test.drop("id", axis=1)

In [60]:
df_train = cv_split_utils.strat_kfold_dataframe(
                                    df=df_train, 
                                    target_col_name=Config.TARGET_COL_NAME, 
                                    num_folds=Config.NUM_FOLDS,
                                    random_state=Config.RANDOM_SEED
                                )
df_train.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings,kfold
0,I,0.49,0.38,0.125,0.529,0.2165,0.1375,0.155,7,3
1,I,0.42,0.345,0.1,0.3705,0.1625,0.0795,0.1025,7,3
2,M,0.555,0.44,0.135,0.739,0.3515,0.1575,0.235,9,0
3,F,0.535,0.41,0.14,0.709,0.2505,0.17,0.19,9,4
4,F,0.605,0.455,0.15,1.059,0.4275,0.221,0.31,10,2


In [61]:
cols_float = df_train.select_dtypes(include=["float"]).columns.to_list()
cols_int = df_train.select_dtypes(include=["int64"]).columns.to_list()
cols_str = df_train.select_dtypes(include=["object"]).columns.to_list()

In [62]:
def process_outliers_iqr(df, col_name, remove_outliers=True):
    Q1 = df[col_name].quantile(0.25)
    Q3 = df[col_name].quantile(0.75)
    IQR = Q3 - Q1    
    min_val = Q1 - 1.5 * IQR
    max_val = Q3 + 1.5 * IQR    
    outlier_count = df[(df[col_name] < min_val) | (df[col_name] > max_val)].shape[0]
    if remove_outliers:
        df = df[(df[col_name] >= min_val) & (df[col_name] <= max_val)]
    # Create a DataFrame for the results
    result = pd.DataFrame({
        'col_name': [col_name],
        'Q1': [Q1],
        'Q3': [Q3],
        'IQR': [IQR],
        'min_val': [min_val],
        'max_val': [max_val],
        'outlier_count': [outlier_count]
    })    
    return df, result

In [63]:
def power_transform(df, col_name, skew_threshold=0.5):    
    transformed = False
    skew = df[col_name].skew()
    print(f"{col_name} has skewness of {skew}")
    power_transformer = PowerTransformer(method='yeo-johnson', standardize=True)    
    if abs(skew) > skew_threshold:
        transformed = True
        print("Will apply power transform.")
        col_transformed = power_transformer.fit_transform(df[[col_name]])
        df.loc[:, col_name] = col_transformed
    return df, transformed

In [64]:
# Create an empty DataFrame to store the results
df_float_outliers = pd.DataFrame(columns=['col_name', 'Q1', 'Q3', 'IQR', 'min_val', 'max_val', 'outlier_count'])
for col_name in cols_float:
    df_train, df_col_ouliers = process_outliers_iqr(df_train, col_name, Config.REMOVE_OUTLIERS)
    df_float_outliers = df_float_outliers.append(df_col_ouliers)
    if Config.POWER_TRANSFORM:
        df_train, transformed = power_transform(df_train, col_name, Config.SKEW_THRESHOLD)
df_float_outliers = df_float_outliers.reset_index(drop=True)
df_float_outliers

Unnamed: 0,col_name,Q1,Q3,IQR,min_val,max_val,outlier_count
0,Length,0.445,0.6,0.155,0.2125,0.8325,1460
1,Diameter,0.35,0.47,0.12,0.17,0.65,372
2,Height,0.11,0.16,0.05,0.035,0.235,73
3,Whole weight,0.4405,1.073,0.6325,-0.50825,2.02175,621
4,Whole weight.1,0.1865,0.4625,0.276,-0.2275,0.8765,600
5,Whole weight.2,0.0905,0.231,0.1405,-0.12025,0.44175,130
6,Shell weight,0.126,0.3005,0.1745,-0.13575,0.56225,593


In [65]:
# one hot encoding of categorical variables
df_train_onehot = pd.get_dummies(df_train, columns=cols_str)

In [66]:
feature_cols = df_train_onehot.columns.drop(["Rings", "kfold"]).to_list()

In [67]:
def extract_features(df, cont_col_names, cols_to_leave):
    # normalize continuous features
    scaler = None
    if Config.SCALER == "StandardScaler":
        scaler = StandardScaler()
    elif Config.SCALER == "RobustScaler":
        scaler = RobustScaler()
    else:
        scaler = MinMaxScaler()
    X_cont = df[cont_col_names]
    if Config.NORMALIZE_DATA:    
        X_cont = scaler.fit_transform(X_cont)     
    # get the columns other than continuous features
    other_col_names = [item for item in df.columns.values.tolist() if item not in cont_col_names + cols_to_leave]
    # combine the normalized continuous features with others
    X_processed = np.concatenate([X_cont, df[other_col_names]], axis=1)    
    return X_processed

In [68]:
def get_fold_data(fold, df, cont_col_names, cols_to_leave, target_col_name):
    df_train = df[df.kfold != fold]
    df_val = df[df.kfold == fold]        
    X_train = extract_features(df_train, cont_col_names, cols_to_leave)
    X_val = extract_features(df_val, cont_col_names, cols_to_leave)
    y_train = df_train[target_col_name]
    y_val = df_val[target_col_name]
    return X_train, y_train, X_val, y_val 

In [69]:
def create_model(model_params, model_type):
    model = None
    if model_type == ModelType.RIDGE:
        model = Ridge(
            random_state=Config.RANDOM_SEED,
            alpha=model_params["alpha"]
        )
    # elif model_type == ModelType.RF:
    #     model = RandomForestClassifier(
    #                 n_estimators=model_params["n_estimators"],                 
    #                 max_depth=model_params["max_depth"],
    #                 min_samples_leaf=model_params["min_samples_leaf"],
    #                 min_samples_split=model_params["min_samples_split"],
    #                 max_features=model_params["max_features"],
    #                 random_state=Config.RANDOM_SEED,
    #                 n_jobs=-1
    #             )     
    return model

In [70]:
def get_model_tuning_params(trial, model_type):
    if model_type == ModelType.RIDGE:
        return {
            "alpha": trial.suggest_float("alpha", 1e-4, 1e4, log=True)
        }

In [71]:
def run_training(model, train_X, train_y, val_X, val_y):    
    model.fit(train_X, train_y.ravel())
    val_y_pred = model.predict(val_X)    
    val_y_pred = [item if item > 0 else 0 for item in val_y_pred]
    rmsle = root_mean_squared_log_error(val_y, val_y_pred)
    return rmsle, model

In [72]:
def hyperparams_tuning_objective(trial, df_train, cols_float, cols_to_leave):       
    params = get_model_tuning_params(trial, Config.MODEL_TYPE)
    model = create_model(params, Config.MODEL_TYPE)
    fold_metric = []
    for fold in range(Config.NUM_FOLDS):
        train_X, train_y, val_X, val_y = get_fold_data(
                                            fold=fold, 
                                            df=df_train, 
                                            cont_col_names=cols_float, 
                                            cols_to_leave=cols_to_leave,
                                            target_col_name=Config.TARGET_COL_NAME
                                        )
        rmsle, _, = run_training(model, train_X, train_y, val_X, val_y)
        fold_metric.append(rmsle)
    mean_metric = statistics.mean(fold_metric)                
    return mean_metric

In [73]:
hyperparams_tuning_obj_partial = partial(
                                        hyperparams_tuning_objective, 
                                         df_train=df_train_onehot, 
                                         cols_float=cols_float, 
                                         cols_to_leave=COLS_TO_LEAVE
                                    )
study = optuna.create_study(direction="minimize", study_name="RidgeModelTuning")    
study.optimize(hyperparams_tuning_obj_partial, n_trials=20,)
best_trial = study.best_trial
print(f"Best trial: number = {best_trial.number}, value = {best_trial.value}, params = {best_trial.params}")

[32m[I 2024-05-04 21:53:19,980][0m A new study created in memory with name: RidgeModelTuning[0m
[32m[I 2024-05-04 21:53:20,298][0m Trial 0 finished with value: 0.1630465201179936 and parameters: {'alpha': 0.0011500035478797823}. Best is trial 0 with value: 0.1630465201179936.[0m
[32m[I 2024-05-04 21:53:20,540][0m Trial 1 finished with value: 0.16301092138584422 and parameters: {'alpha': 18.326228903718363}. Best is trial 1 with value: 0.16301092138584422.[0m
[32m[I 2024-05-04 21:53:20,799][0m Trial 2 finished with value: 0.16304044352881442 and parameters: {'alpha': 3.0242801756049498}. Best is trial 1 with value: 0.16301092138584422.[0m
[32m[I 2024-05-04 21:53:21,110][0m Trial 3 finished with value: 0.16304650701222795 and parameters: {'alpha': 0.007625847226755735}. Best is trial 1 with value: 0.16301092138584422.[0m
[32m[I 2024-05-04 21:53:21,294][0m Trial 4 finished with value: 0.16304648619384832 and parameters: {'alpha': 0.017913052068439322}. Best is trial 1 wit

Best trial: number = 19, value = 0.1622034958985353, params = {'alpha': 1963.7467180572462}


In [74]:
scaler = RobustScaler()
df_float = df_train_onehot[cols_float]
other_col_names = [item for item in df_train_onehot.columns.values.tolist() if item not in cols_float]
cat_cols = [item for item in other_col_names if item not in COLS_TO_LEAVE]
df_float_scaled = pd.DataFrame(scaler.fit_transform(df_float), columns = df_float.columns)
df_train_other = df_train_onehot[other_col_names]
df_train_oh_scaled = pd.concat([df_float_scaled, df_train_other], axis=1)
df_train_oh_scaled.head(5)

Unnamed: 0,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings,kfold,Sex_F,Sex_I,Sex_M
0,-0.344828,-0.36,-0.3,-0.431562,-0.414679,-0.206522,-0.4,7.0,3.0,0.0,1.0,0.0
1,-0.827586,-0.64,-0.8,-0.686795,-0.612844,-0.626812,-0.7,7.0,3.0,0.0,1.0,0.0
2,0.103448,0.12,-0.1,-0.093398,0.080734,-0.061594,0.057143,9.0,0.0,0.0,0.0,1.0
3,-0.034483,-0.12,0.0,-0.141707,-0.289908,0.028986,-0.2,9.0,4.0,1.0,0.0,0.0
4,0.448276,0.24,0.2,0.4219,0.359633,0.398551,0.485714,10.0,2.0,1.0,0.0,0.0


In [75]:
df_train_0 = df_train_oh_scaled[df_train_oh_scaled.kfold != 0]
df_val_0 = df_train_oh_scaled[df_train_oh_scaled.kfold == 0]
dummy_model = Ridge()
# fit dummy_model on cols_float of df_train_oh_scaled
dummy_model = dummy_model.fit(X=df_train_0[cols_float + cat_cols], y=df_train_0["Rings"])
y_val_0_pred = dummy_model.predict(df_val_0[cols_float + cat_cols])
y_val_0_pred = np.where(y_val_0_pred < 0, 0, y_val_0_pred)
y_val_0_pred = [round(item) for item in y_val_0_pred]
df_val_0.loc[:, "Rings_pred"] = y_val_0_pred

ValueError: Input X contains NaN.
Ridge does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values