In [1]:
import sys
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
import statistics
import re
import torch
import wandb
import gc
import pytorch_lightning as pl
import torch.multiprocessing as mp
from lightning.pytorch.tuner import Tuner
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, LabelEncoder, TargetEncoder
import category_encoders as ce
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel, SequentialFeatureSelector
from functools import partial
from pytorch_tabnet.tab_model import TabNetRegressor
import warnings

warnings.filterwarnings('ignore')


In [2]:
# Make the shared ML utility modules (train_tabular_utils, cv_split_utils, ...) importable.
# The directory can be overridden with the ML_UTILS_PATH environment variable so the
# notebook is not tied to one machine; the original location remains the fallback.
ML_UTILS_PATH = os.path.abspath(os.environ.get("ML_UTILS_PATH", "/home/bk_anupam/code/ML/ML_UTILS/"))
# Re-running this cell in the same kernel must not keep appending duplicate entries.
if ML_UTILS_PATH not in sys.path:
    sys.path.append(ML_UTILS_PATH)

In [3]:
import train_tabular_utils as tt
import cv_split_utils
import enums
from enums import ModelName
import data_utils
import param_tuning_utils as ptu
import dl_utils

In [64]:
class Config:
    """Global experiment switches read by the cells below."""
    # --- toggles that change from experiment to experiment ---
    MODEL_TO_USE = ModelName.TabNetRegressor
    TRAIN_SINGLE_FOLD = False
    PERSIST_MODEL = False
    # together these two flags select which train/test CSV pair gets loaded
    USE_MANUAL_FEATURES = True
    USE_ORIGINAL_DATA = False
    # --- settings that stay fixed for the whole study ---
    # "KAGGLE" switches the data paths to the Kaggle directories
    RUNTIME = "LOCAL"
    RANDOM_SEED = 42
    NUM_FOLDS = 5
    TARGET_COL_NAME = "price"
    METRIC = enums.Metrics.RMSE

# Columns that are never treated as model features.
COLS_TO_LEAVE = ["id", "price", "kfold", "transmission_speed", "target_grp"]
CPU_COUNT = os.cpu_count()

# Resolve the input / output locations for the selected runtime.
if Config.RUNTIME == "KAGGLE":
    SUBMISSION_FILEPATH = "/kaggle/input/playground-series-s4e9/"
    DATA_WRITEPATH = "/kaggle/working/"
    # the manually engineered features live in a separate Kaggle dataset
    if Config.USE_MANUAL_FEATURES:
        DATA_READPATH = "/kaggle/input/ps4e9-fe/"
    else:
        DATA_READPATH = "/kaggle/input/playground-series-s4e9/"
else:
    DATA_READPATH = "./data/"
    DATA_WRITEPATH = "./output/"
    SUBMISSION_FILEPATH = DATA_READPATH

In [65]:
# parameters for tabnet
class TabNetConfig:
    """Training hyper-parameters shared by the TabNet training cells."""
    # epochs without improvement before early stopping kicks in
    PATIENCE = 10
    # passed to the Adam optimizer as weight_decay
    WEIGHT_DECAY = 1e-6
    # precision setting handed to the Lightning Trainer
    PRECISION = "16-mixed"
    BATCH_SIZE = 4096 * 8
    # number of DataLoader worker processes
    NUM_WORKERS = mp.cpu_count()
    NUM_EPOCHS = 2
    # Logging interval handed to the Lightning Trainer by run_training(). It was
    # referenced there but never defined, which raised AttributeError.
    # Kept small because the large batch size leaves few steps per epoch.
    LOG_EVERY_N_STEPS = 5
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class SchedulerConfig:
    """Learning-rate scheduler settings, grouped by the scheduler they belong to."""
    SCHEDULER = "ReduceLROnPlateau"

    # ReduceLROnPlateau
    # epochs with no improvement before the learning rate is reduced
    SCHEDULER_PATIENCE = 5
    # multiplier applied to the learning rate on each reduction
    FACTOR = 0.9

    # CosineAnnealingWarmRestarts
    # epochs before the first restart
    T_0 = 10
    # growth factor of the number of epochs between two restarts
    T_mult = 1
    # minimum learning rate
    MIN_LR = 5e-7

    # CosineAnnealing: initial learning rate
    MAX_LR = 1e-2

    # OneCycleLR
    STEPS_PER_EPOCH = 13

In [66]:
class WandbConfig:
    """Weights & Biases logging settings."""
    # Credentials must never be committed with the notebook. The key is read from
    # the WANDB_API_KEY environment variable and is None when that is not set.
    # NOTE(review): the key that used to be hard-coded here should be revoked.
    WANDB_KEY = os.environ.get("WANDB_API_KEY")
    WANDB_RUN_NAME = "tabnet_cv_5folds"
    WANDB_PROJECT = "ps4e9_nn"
    # master switch: run_training() only builds a wandb logger when this is True
    USE_WANDB = False

In [67]:
def config_to_dict(cfg):
    """Return the attributes of ``cfg`` as a ``{name: value}`` dict.

    Every name reported by ``dir(cfg)`` is included except those starting with
    a double underscore.
    """
    public_names = [attr for attr in dir(cfg) if not attr.startswith('__')]
    return {attr: getattr(cfg, attr) for attr in public_names}

In [68]:
# Snapshot each config class as a dict ...
config_dict, tabnet_config_dict, schd_config_dict, wandb_config_dict = (
    config_to_dict(cfg) for cfg in (Config, TabNetConfig, SchedulerConfig, WandbConfig)
)
# ... and flatten them into a single dict; on a name clash the later section wins.
# NOTE(review): this includes WandbConfig.WANDB_KEY, and the dict is handed to
# dl_utils.get_wandb_logger — confirm the key is not uploaded as run config.
merged_config_dict = dict(config_dict)
merged_config_dict.update(tabnet_config_dict)
merged_config_dict.update(schd_config_dict)
merged_config_dict.update(wandb_config_dict)

In [69]:
# Pick the train/test CSV pair that matches the feature flags.
if not Config.USE_MANUAL_FEATURES:
    train_file, test_file = "train.csv", "test.csv"
elif Config.USE_ORIGINAL_DATA:
    train_file, test_file = "train_withorig_preprocessed.csv", "test_withorig_preprocessed.csv"
else:
    # (a price <= 2000000 outlier filter was tried on this variant and is currently disabled)
    train_file, test_file = "train_preprocessed.csv", "test_preprocessed.csv"

df_train = pd.read_csv(DATA_READPATH + train_file)
df_test = pd.read_csv(DATA_READPATH + test_file)

# Preserve untouched copies (including "id") before any column is dropped.
df_train_orig = df_train.copy()
df_test_orig = df_test.copy()

# "id" is only an identifier, so it is removed from the working frames.
df_train = df_train.drop(columns="id")
df_test = df_test.drop(columns="id")

In [70]:
# Assign cross-validation folds to the training rows.
# NOTE(review): presumably this adds the "kfold" column that get_fold_dataloaders
# filters on, stratified over 25 bins of the target — confirm in cv_split_utils.
fold_split_args = dict(
    random_state=Config.RANDOM_SEED,
    num_folds=Config.NUM_FOLDS,
    target_col_name=Config.TARGET_COL_NAME,
    n_bins=25,
)
df_train = cv_split_utils.strat_kfold_dataframe(df_train, **fold_split_args)

In [71]:
def _columns_with_dtype(frame, dtype_name, skip=()):
    """List the columns of ``frame`` whose dtype equals ``dtype_name``, minus those in ``skip``."""
    return [name for name in frame.columns if frame[name].dtypes == dtype_name and name not in skip]


# Bucket the training columns by dtype. Only the int and object buckets
# exclude the bookkeeping columns listed in COLS_TO_LEAVE.
int_cols = _columns_with_dtype(df_train, 'int64', skip=COLS_TO_LEAVE)
float_cols = _columns_with_dtype(df_train, 'float64')
bool_cols = _columns_with_dtype(df_train, 'bool')
cat_cols = _columns_with_dtype(df_train, 'object', skip=COLS_TO_LEAVE)

# Model inputs, in dataframe column order, and the positions of the categorical ones.
feature_cols = [name for name in df_train.columns if name not in COLS_TO_LEAVE]
cat_idxs = [pos for pos, name in enumerate(feature_cols) if name in cat_cols]

print(f"feature_cols = {feature_cols}")
print(f"cat_cols = {cat_cols}")
print(f"cat_idxs = {cat_idxs}")
print(f"int_cols = {int_cols}")
print(f"float_cols = {float_cols}")

feature_cols = ['brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title', 'horsepower', 'capacity', 'cylinders', 'fuel', 'turbo', 'hybrid', 'transmission_type', 'age']
cat_cols = ['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title', 'fuel', 'transmission_type']
cat_idxs = [0, 1, 4, 5, 6, 7, 8, 9, 10, 14, 17]
int_cols = ['model_year', 'milage', 'age']
float_cols = ['horsepower', 'capacity', 'cylinders', 'transmission_speed']


In [72]:
# One median imputer per numeric column that gets imputed.
median_imputed_cols = ['horsepower', 'capacity', 'cylinders', 'transmission_speed']
imputation_config = {name: SimpleImputer(strategy="median") for name in median_imputed_cols}

for column, imputer in imputation_config.items():
    df_train[column] = imputer.fit_transform(df_train[[column]])
    if column == 'horsepower':
        # horsepower stays a float
        continue
    # every other imputed column is rounded and stored as an integer
    df_train[column] = df_train[column].round().astype(int)

In [73]:
# Label-encode every object-typed column and remember how many classes each has.
cat_cols = df_train.select_dtypes(include=['object']).columns
cat_cols_dims = {}
for col in cat_cols:
    as_text = df_train[col].astype(str)
    le = LabelEncoder().fit(as_text)
    df_train[col] = le.transform(as_text)
    # the class count is later passed to TabNet as the dimension of this column
    cat_cols_dims[col] = len(le.classes_)

In [74]:
# Standardise the numeric columns (int and float buckets together).
num_cols = int_cols + float_cols
scaler = StandardScaler().fit(df_train[num_cols])
df_train[num_cols] = scaler.transform(df_train[num_cols])

In [75]:
# Cast the two flag columns to integers.
for flag_col in ("turbo", "hybrid"):
    df_train[flag_col] = df_train[flag_col].astype(int)

In [76]:
def get_fold_dataloaders(fold, df, feature_cols, target_col_name, batch_size=None, num_workers=None):
    """Build the training and validation DataLoaders for one cross-validation fold.

    Rows whose ``kfold`` value equals ``fold`` form the validation set; all
    other rows form the training set.

    Args:
        fold: fold id to hold out for validation.
        df: dataframe containing ``feature_cols``, ``target_col_name`` and a ``kfold`` column.
        feature_cols: names of the model input columns (must be numeric/bool).
        target_col_name: name of the regression target column.
        batch_size: batch size for both loaders; defaults to ``TabNetConfig.BATCH_SIZE``.
        num_workers: DataLoader worker count; defaults to ``TabNetConfig.NUM_WORKERS``.

    Returns:
        ``(dl_train, dl_val)``; the training loader shuffles, the validation loader does not.
        Both yield ``(features, target)`` float32 tensors.

    Raises:
        ValueError: if the requested fold leaves the training or validation split empty.
    """
    if batch_size is None:
        batch_size = TabNetConfig.BATCH_SIZE
    if num_workers is None:
        num_workers = TabNetConfig.NUM_WORKERS

    val_mask = df["kfold"] == fold
    if not val_mask.any() or val_mask.all():
        raise ValueError(f"fold {fold!r} yields an empty training or validation split")

    def _to_dataset(rows):
        # Convert explicitly to float32 so mixed int/bool/float columns never
        # produce an object array that torch cannot ingest.
        features = torch.tensor(rows.loc[:, feature_cols].to_numpy(dtype="float32"))
        target = torch.tensor(rows.loc[:, target_col_name].to_numpy(dtype="float32"))
        return TensorDataset(features, target)

    ds_train = _to_dataset(df[~val_mask])
    ds_val = _to_dataset(df[val_mask])
    dl_train = DataLoader(ds_train, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    dl_val = DataLoader(ds_val, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    return dl_train, dl_val

In [77]:
# Define the PyTorch Lightning Module
class TabNetLitModel(pl.LightningModule):
    """LightningModule that wraps a pytorch-tabnet ``TabNetRegressor`` and trains it with an MSE loss.

    NOTE(review): ``TabNetRegressor`` looks like the library's scikit-learn style
    wrapper rather than a ``torch.nn.Module``. If so, ``self.model(x)``,
    ``self.model.parameters()`` and a differentiable ``self.model.predict(...)``
    are not available and this class cannot train as written — confirm against
    the pytorch-tabnet API before re-enabling the Lightning training cells.
    """

    def __init__(self, cat_idxs, cat_dims, device):
        """Create the wrapped regressor and the loss.

        Args:
            cat_idxs: positions of the categorical features in the input.
            cat_dims: number of categories per categorical feature, forwarded as ``cat_dims``.
            device: forwarded to the regressor as ``device_name``.
        """
        super(TabNetLitModel, self).__init__()
        # NOTE(review): the optimizer/scheduler settings passed here duplicate the
        # ones hard-coded in configure_optimizers(); only one of the two sets is
        # likely to take effect under Lightning — verify.
        self.model = TabNetRegressor(
            n_d=64, n_a=64, n_steps=5,
            gamma=1.5, n_independent=2, n_shared=2,
            cat_idxs=cat_idxs, cat_dims=cat_dims,
            lambda_sparse=1e-3, momentum=0.3, clip_value=2.,
            optimizer_fn=torch.optim.Adam,
            optimizer_params=dict(lr=5e-2),
            scheduler_params=dict(mode="min",
                                patience=5,
                                min_lr=1e-5,
                                factor=0.9),
            scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
            mask_type='sparsemax',
            device_name=device        
        )
        self.criterion = torch.nn.MSELoss()

    def forward(self, x):
        """Delegate to the wrapped model by calling it on ``x``."""
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute the MSE loss for one training batch, log loss and RMSE, and return the loss."""
        X, y = batch
        # NOTE(review): predict() is presumably an inference API (no gradient) —
        # confirm; also check that y_pred and y have matching shapes, otherwise
        # MSELoss may broadcast them.
        y_pred = self.model.predict(X)
        loss = self.criterion(y_pred, y)
        rmse = torch.sqrt(loss)        
        # Log loss and RMSE to Weights & Biases
        self.log("train_loss", loss, on_step=True, on_epoch=True, logger=True)
        self.log("train_rmse", rmse, on_step=True, on_epoch=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute MSE/RMSE for one validation batch and log them together with the current learning rate.

        Returns:
            dict with keys ``"val_loss"`` and ``"val_rmse"``.
        """
        X, y = batch
        y_pred = self.model.predict(X)
        loss = self.criterion(y_pred, y)
        rmse = torch.sqrt(loss)
        # learning rate of the first parameter group of the first optimizer
        current_lr = self.trainer.optimizers[0].param_groups[0]['lr']        
        # Log validation metrics
        self.log("val_loss", loss, on_step=True, on_epoch=True, logger=True, prog_bar=True)
        self.log("val_rmse", rmse, on_step=True, on_epoch=True, logger=True, prog_bar=True)
        self.log("cur_lr", current_lr, prog_bar=True, on_step=True, on_epoch=True, logger=True)
        return {"val_loss": loss, "val_rmse": rmse}

    def configure_optimizers(self):
        """Return an Adam optimizer (lr=5e-2) with a ReduceLROnPlateau scheduler monitoring ``val_loss``."""
         # We extract the model's parameters to provide them to the optimizer
        optimizer = torch.optim.Adam(self.model.parameters(), lr=5e-2)
        #optimizer = torch.optim.Adam(self.parameters(), lr=5e-2)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", patience=5, factor=0.9, min_lr=1e-5)
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}

In [78]:
def run_training(fold, dl_train, dl_val, find_lr=True):
    """Train a ``TabNetLitModel`` on one fold with a PyTorch Lightning Trainer.

    Args:
        fold: fold identifier; used in log messages and as checkpoint file prefix.
        dl_train: training DataLoader.
        dl_val: validation DataLoader.
        find_lr: when True, run the Lightning learning-rate finder first and
            print/plot its suggestion (the suggestion is reported only, the
            model keeps the learning rate set in ``configure_optimizers``).

    Returns:
        ``(loss, rmse)``: the best monitored ``val_loss`` recorded by the
        checkpoint callback and the best ``val_rmse`` tracked by
        ``dl_utils.MetricsAggCallback``.

    Raises:
        KeyboardInterrupt: re-raised after the wandb run has been marked as
            failed, so callers that unpack the result never receive ``None``.
    """
    # pl.Trainer comes from the `pytorch_lightning` package, so the Tuner is taken
    # from that same package instead of the `lightning.pytorch` one imported at
    # the top of the notebook; the two namespaces should not be mixed.
    from pytorch_lightning.tuner import Tuner

    fold_str = f"fold{fold}"
    try:
        print(f"Running training for {fold_str}")
        logger = None
        if WandbConfig.USE_WANDB:
            logger = dl_utils.get_wandb_logger(fold, merged_config_dict)
            # only announce the logger when one was actually created
            print("Instantiated wandb logger")

        chkpt_file_name = "best_model_{epoch}_{val_loss:.4f}"
        if fold is not None:
            chkpt_file_name = fold_str + "_" + chkpt_file_name

        tabnet_model = TabNetLitModel(
            cat_idxs=cat_idxs,
            cat_dims=list(cat_cols_dims.values()),
            device=TabNetConfig.DEVICE
        )
        early_stopping_callback = EarlyStopping(monitor="val_loss", patience=TabNetConfig.PATIENCE, mode="min", verbose=True)
        loss_chkpt_callback = ModelCheckpoint(dirpath="./model", verbose=True, monitor="val_loss", mode="min", filename=chkpt_file_name)
        rmse_chkpt_callback = dl_utils.MetricsAggCallback(metric_to_monitor="val_rmse", mode="min")
        trainer = pl.Trainer(
            devices="auto",
            # follow the hardware that is actually present instead of assuming a GPU
            accelerator="gpu" if torch.cuda.is_available() else "cpu",
            # For results reproducibility
            deterministic=True,
            strategy="auto",
            # fall back to Lightning's default interval when the config does not define one
            log_every_n_steps=getattr(TabNetConfig, "LOG_EVERY_N_STEPS", 50),
            max_epochs=TabNetConfig.NUM_EPOCHS,
            precision=TabNetConfig.PRECISION,
            enable_model_summary=True,
            enable_progress_bar=True,
            logger=logger,
            fast_dev_run=False,
            callbacks=[loss_chkpt_callback, rmse_chkpt_callback, early_stopping_callback]
        )

        tuner = None
        if find_lr:
            tuner = Tuner(trainer)
            # update_attr=False: the model exposes no `lr` / `learning_rate`
            # attribute for the finder to overwrite, so only report the suggestion.
            lr_finder = tuner.lr_find(model=tabnet_model, train_dataloaders=dl_train, update_attr=False)
            # Raw results of the learning-rate sweep
            print(lr_finder.results)
            # Plot the sweep with the suggested learning rate marked
            fig = lr_finder.plot(suggest=True)
            fig.show()
            new_lr = lr_finder.suggestion()
            print(f"new_lr = {new_lr}")

        trainer.fit(tabnet_model, train_dataloaders=dl_train, val_dataloaders=dl_val)
        loss = loss_chkpt_callback.best_model_score.detach().cpu().item()
        rmse = rmse_chkpt_callback.best_metric
        print(f"Loss for {fold_str} = {loss}, rmse = {rmse}")
        # Drop the references before the `finally` block runs so that
        # gc.collect() / empty_cache() can actually release the memory.
        del trainer, tuner, tabnet_model, early_stopping_callback, rmse_chkpt_callback, loss_chkpt_callback
        return loss, rmse
    except KeyboardInterrupt:
        wandb.finish(exit_code=-1, quiet=True)
        print("Marked the wandb run as failed")
        # propagate the interrupt instead of silently returning None
        raise
    finally:
        gc.collect()
        torch.cuda.empty_cache()
        wandb.finish()

In [None]:
# fold_loss = []
# fold_rmse = []

# for fold in range(Config.NUM_FOLDS):
#     dl_train, dl_val = get_fold_dataloaders(fold, df_train, feature_cols, Config.TARGET_COL_NAME)
#     loss, rmse = run_training(fold, dl_train, dl_val, find_lr=False)
#     fold_loss.append(loss)
#     fold_rmse.append(rmse)
#     break     

In [None]:
# # initialize wandb
# wandb.init(project=WandbConfig.WANDB_PROJECT, name=WandbConfig.WANDB_RUN_NAME)
# wandb.login(key=WandbConfig.WANDB_KEY)

In [21]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# 2. Cross-validation setup
# Fold count and seed come from Config so this loop stays in sync with the rest
# of the notebook instead of duplicating the literals.
# NOTE(review): the "kfold" column on df_train is not used here; plain KFold
# re-splits the data — confirm this is intended.
kf = KFold(n_splits=Config.NUM_FOLDS, shuffle=True, random_state=Config.RANDOM_SEED)
X = df_train[feature_cols].values
# pytorch-tabnet regressors are fitted on a 2-D target, hence the reshape to one column
y = df_train[Config.TARGET_COL_NAME].values.reshape(-1, 1)

# 3. Model training and evaluation
rmse_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
    print(f"Fold {fold}")

    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # NOTE(review): cat_emb_dim is not passed, so the library default applies.
    # The pytorch-tabnet README appears to list that default as 1 — verify
    # before relying on larger embeddings.
    model = TabNetRegressor(
        n_d=64, n_a=64, n_steps=5,
        gamma=1.5, n_independent=2, n_shared=2,
        cat_idxs=cat_idxs, cat_dims=list(cat_cols_dims.values()),
        lambda_sparse=1e-3, momentum=0.3, clip_value=2.,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=SchedulerConfig.MAX_LR, weight_decay=TabNetConfig.WEIGHT_DECAY),
        # Cosine annealing with warm restarts, driven by SchedulerConfig
        scheduler_fn=torch.optim.lr_scheduler.CosineAnnealingWarmRestarts,
        scheduler_params=dict(T_0=SchedulerConfig.T_0,
                              T_mult=SchedulerConfig.T_mult,
                              eta_min=SchedulerConfig.MIN_LR),
        mask_type='sparsemax',
        device_name=TabNetConfig.DEVICE
    )

    # Train the model; the held-out fold is used for early stopping on RMSE.
    # (Weights & Biases logging is currently disabled for this loop.)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        max_epochs=TabNetConfig.NUM_EPOCHS,
        patience=TabNetConfig.PATIENCE,
        batch_size=TabNetConfig.BATCH_SIZE,
        virtual_batch_size=128,
        num_workers=TabNetConfig.NUM_WORKERS,
        drop_last=False,
        eval_metric=["rmse"],
    )

    # Make predictions on the held-out fold
    y_pred = model.predict(X_val)

    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)
    print(f"RMSE: {rmse:.4f}")
    print()

# Print average scores across folds
avg_rmse = np.mean(rmse_scores)
std_rmse = np.std(rmse_scores)
print(f"Average RMSE: {avg_rmse:.4f} (+/- {std_rmse:.4f})")

Fold 1


AttributeError: 'WandbCallback' object has no attribute 'on_train_begin'