In [39]:
import sys
import os
import numpy as np
import pandas as pd
import torch
import re
import torch.nn as nn
import torch.optim as optim
import wandb
import torch.multiprocessing as mp
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder, TargetEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from functools import partial
from pytorch_tabnet.tab_model import TabNetRegressor
import warnings

warnings.filterwarnings('ignore')


In [40]:
# Directory holding the shared helper modules imported in the next cell.
# Set the ML_UTILS_PATH environment variable to run this notebook on another machine
# without editing it; the original location stays the default.
ML_UTILS_PATH = os.path.abspath(os.environ.get("ML_UTILS_PATH", "/home/bk_anupam/code/ML/ML_UTILS/"))
# guard so that re-running the cell does not keep growing sys.path
if ML_UTILS_PATH not in sys.path:
    sys.path.append(ML_UTILS_PATH)

In [41]:
import train_tabular_utils as tt
import cv_split_utils
import enums
from enums import ModelName
import data_utils
import param_tuning_utils as ptu

In [42]:
class Config:
    """Run-level switches for this notebook: runtime, seed, CV folds, target column, metric and model choice."""
    # "KAGGLE" selects the /kaggle/... paths in the path setup that follows this class;
    # any other value keeps the local ./data/ and ./output/ paths
    RUNTIME = "LOCAL"
    # seed handed to the fold splitters
    RANDOM_SEED = 42
    # number of cross-validation folds
    NUM_FOLDS = 5
    # name of the column the models predict
    TARGET_COL_NAME = "price"            
    METRIC = enums.Metrics.RMSE
    # These values are more dynamic   
    # NOTE(review): the saved output of this cell reports
    # "AttributeError: type object 'ModelName' has no attribute 'NeuralNet'" - confirm the member name in enums.ModelName
    MODEL_TO_USE = ModelName.NeuralNet    
    # when True the training loops stop after the first fold and the CV-score / submission cells are skipped
    TRAIN_SINGLE_FOLD = True    
    PERSIST_MODEL = False    
    # on Kaggle, read the data from /kaggle/input/ps4e9-fe/ instead of the competition input directory
    USE_MANUAL_FEATURES = False
    # append the rows of used_cars.csv to the competition training data
    USE_ORIGINAL_DATA = True

# Columns that are never offered to the models as features
COLS_TO_LEAVE = ["id", "price", "kfold", "transmission_speed", "target_grp"]
CPU_COUNT = os.cpu_count()

# Resolve where data is read from and where artefacts are written, based on the configured runtime
if Config.RUNTIME == "KAGGLE":
    SUBMISSION_FILEPATH = "/kaggle/input/playground-series-s4e9/"
    DATA_READPATH = "/kaggle/input/ps4e9-fe/" if Config.USE_MANUAL_FEATURES else SUBMISSION_FILEPATH
    DATA_WRITEPATH = "/kaggle/working/"
else:
    DATA_READPATH = "./data/"
    DATA_WRITEPATH = "./output/"
    SUBMISSION_FILEPATH = DATA_READPATH

AttributeError: type object 'ModelName' has no attribute 'NeuralNet'

In [19]:
# parameters for tabnet
class TabNetConfig:
    """Training hyper-parameters shared by the TabNet and the embedding-network training cells."""
    # forwarded as `patience` to TabNetRegressor.fit
    PATIENCE = 10
    # weight decay of the Adam optimizer used by TabNet
    WEIGHT_DECAY = 1e-6    
    # NOTE(review): not read anywhere in the visible cells - confirm it is still needed
    PRECISION = "16-mixed"
    # mini-batch size for TabNet fitting and for the train/validation DataLoaders
    BATCH_SIZE = 64
    # forwarded as `num_workers` to TabNetRegressor.fit
    NUM_WORKERS = mp.cpu_count()
    # number of epochs for both TabNet (`max_epochs`) and the embedding network
    NUM_EPOCHS = 2    
    # run on the GPU when one is available, otherwise on the CPU
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')    

class SchedulerConfig:
    """Learning-rate scheduler settings; only MAX_LR, STEP_SIZE and GAMMA are read by the visible training cells."""
    # for ReduceLROnPlateau (number of epochs with no improvement after which the learning rate will be reduced)
    SCHEDULER_PATIENCE = 5  
    # for ReduceLROnPlateau (factor by which the learning rate will be reduced)
    FACTOR = 0.5 
    # NOTE(review): the training cells hard-wire StepLR and never read this name - confirm it is still needed
    SCHEDULER = "ReduceLROnPlateau"
    T_0 = 10 # for CosineAnnealingWarmRestarts (Number of epochs before the first restart)
    MIN_LR = 5e-7 # for CosineAnnealingWarmRestarts (Minimum learning rate)
    T_mult = 1 # for CosineAnnealingWarmRestarts (Factor by which Ti(number of epochs between two restarts) increases)
    # also used as the Adam learning rate of the TabNet model
    MAX_LR = 1e-2 # for CosineAnnealing (Initial learning rate)
    STEPS_PER_EPOCH = 13 # for OneCycleLR
    STEP_SIZE = 3 # for StepLR
    GAMMA = 0.1 # for StepLR

In [20]:
class WandbConfig:
    """Weights & Biases logging settings."""
    # SECURITY: the API key is read from the WANDB_API_KEY environment variable and is empty
    # when the variable is unset. A literal key used to be committed on this line; that key
    # must be treated as leaked and revoked in the W&B account settings.
    WANDB_KEY = os.environ.get("WANDB_API_KEY", "")
    # display name of the run
    WANDB_RUN_NAME = "tabnet_cv_5folds"
    # project the run is filed under
    WANDB_PROJECT = "ps4e9_nn"
    # master switch for W&B logging
    USE_WANDB = False

In [21]:
def config_to_dict(cfg):
    """Return the attributes of ``cfg`` as a plain dict.

    Every name reported by ``dir(cfg)`` is included (methods too), except names
    that start with a double underscore.
    """
    return {attr: getattr(cfg, attr) for attr in dir(cfg) if not attr.startswith('__')}

In [22]:
# Flatten each config class into a dict, then merge them; on a name clash the later dict wins
config_dict = config_to_dict(Config)
tabnet_config_dict = config_to_dict(TabNetConfig)
schd_config_dict = config_to_dict(SchedulerConfig)
wandb_config_dict = config_to_dict(WandbConfig)
merged_config_dict = {}
for _cfg_dict in (config_dict, tabnet_config_dict, schd_config_dict, wandb_config_dict):
    merged_config_dict.update(_cfg_dict)

In [None]:
def get_train_data():
    """Load the training data, optionally extended with the original used-cars dataset.

    Reads ``train.csv`` from ``DATA_READPATH``. When ``Config.USE_ORIGINAL_DATA`` is set,
    ``used_cars.csv`` is read as well, its text-formatted ``milage`` and ``price`` columns
    are reduced to their digits and cast to ``int64``, and its rows are appended.

    Returns:
        pd.DataFrame: the (possibly extended) training frame with a fresh 0..n-1 index.
    """
    df_train = pd.read_csv(DATA_READPATH + "train.csv")
    if Config.USE_ORIGINAL_DATA:
        df_train_orig = pd.read_csv(DATA_READPATH + "used_cars.csv")
        # vectorised digit extraction; replaces the deprecated DataFrame.applymap + per-cell re.sub
        for col in ("milage", "price"):
            df_train_orig[col] = (
                df_train_orig[col].str.replace(r"[^0-9]", "", regex=True).astype("int64")
            )
        # add df_train_orig rows to df_train
        df_train = pd.concat([df_train, df_train_orig], axis=0, ignore_index=True)
    return df_train

In [23]:
# Load train and test; test gets a placeholder target so both frames share the same columns
df_train = get_train_data()
print(f"df_train.shape: {df_train.shape}")
df_test = pd.read_csv(DATA_READPATH + "test.csv").assign(price=0)
print(f"df_test.shape: {df_test.shape}")
# stack train on top of test so that encoders and scalers see every value
df_combined = pd.concat((df_train, df_test), axis=0, ignore_index=True)
print("df_combined shape:", df_combined.shape )
# keep a copy of original train and test data for later use
df_train_orig, df_test_orig = df_train.copy(), df_test.copy()
# # drop id column
# df_train = df_train.drop("id", axis=1)
# df_test = df_test.drop("id", axis=1)

df_train.shape: (188533, 13)
df_test.shape: (125690, 13)
df_combined shape: (314223, 13)


In [None]:
def extract_engine_info(engine_desc: str, brand: str) -> dict:
    """Parse a free-text engine description into structured attributes.

    Args:
        engine_desc: engine description; anything that is not a ``str`` yields ``None``.
        brand: brand name, used only to default the fuel of a Tesla to "electric".

    Returns:
        ``None`` for a non-string description, otherwise a dict with the keys
        ``horsepower``, ``capacity`` (floats or None), ``cylinders`` (int or None),
        ``fuel`` (str or None), ``turbo`` and ``hybrid`` (bools).
    """
    if not isinstance(engine_desc, str):
        return None
    text = engine_desc.lower()
    make = brand.lower()

    def first_number(pattern):
        # value of the first capture group as a float, or None when the pattern is absent
        found = re.search(pattern, text)
        return float(found.group(1)) if found else None

    horsepower = first_number(r'(\d+(\.\d+)?\s*)hp')
    capacity = first_number(r'(\d+(\.\d+)?\s*)l')

    # cylinder count is written either as "<n> cylinder" or as "v<n>"
    cylinders = None
    cyl_found = re.search(r'(\d+)\s*cylinder|v(\d+)', text)
    if cyl_found:
        cylinders = int(cyl_found.group(1) or cyl_found.group(2))

    fuel_found = re.search(r'(gasoline|diesel|flex|electric|dohc|ohv)', text)
    if fuel_found:
        fuel = fuel_found.group(0)
        # valve-train keywords are mapped to gasoline
        if fuel in ('dohc', 'ohv'):
            fuel = "gasoline"
    elif make == "tesla":
        # no fuel keyword at all, but the brand is Tesla
        fuel = "electric"
    else:
        fuel = None

    return {
        "horsepower": horsepower,
        "capacity": capacity,
        "cylinders": cylinders,
        "fuel": fuel,
        "turbo": 'turbo' in text,
        "hybrid": 'hybrid' in text,
    }

In [None]:
def extract_transmission_info(transmission_desc: str) -> dict:
    """Parse a free-text transmission description.

    Args:
        transmission_desc: transmission description; anything that is not a ``str`` yields ``None``.

    Returns:
        ``None`` for a non-string description, otherwise a dict with
        ``transmission_speed`` (int or None) and ``transmission_type``
        ("automatic", "manual", "single_speed" or "Unknown").
    """
    if not isinstance(transmission_desc, str):
        return None
    desc = transmission_desc.lower()

    speed_found = re.search(r"(\d+)[\s-]speed", desc)
    info = {"transmission_speed": int(speed_found.group(1)) if speed_found else None}

    # checked in this order; the first pattern that matches decides the type
    type_patterns = (
        ("automatic", r"automatic|cvt|a/t|\sat|transmission overdrive switch"),
        ("manual", r"manual|m/t|\smt"),
        ("single_speed", r"single-speed"),
    )
    kind = next((label for label, pattern in type_patterns if re.search(pattern, desc)), None)
    if kind is None:
        # the bare description "f" is treated as automatic, everything else is unknown
        kind = "automatic" if desc == "f" else "Unknown"
    info["transmission_type"] = kind
    return info

In [None]:
# Brands flagged by the `is_high_resale_price_brand` feature.
# NOTE(review): entries such as 'Aston', 'Land' and 'Alfa' presumably mirror how the
# `brand` column spells these makes - confirm against the data.
high_resale_price_brands = [
    'Mercedes-Benz', 'Bentley', 'Aston', 'Jaguar', 'Tesla',
    'Lamborghini', 'Land', 'RAM', 'Cadillac', 'Alfa',
    'Ferrari', 'Porsche', 'Bugatti', 'McLaren', 'Rolls-Royce',
    'Lucid', 'Maserati', 'Rivian', 'Genesis',
]

In [None]:
def create_features_from_text_cols(df, reference_year=2024):
    """Add engine, transmission, age and brand features to ``df``.

    The frame is modified in place and also returned.

    Args:
        df: frame with the columns ``engine``, ``brand``, ``transmission`` and ``model_year``.
        reference_year: year from which ``model_year`` is subtracted to obtain ``age``
            (defaults to 2024, the value that used to be hard-coded).

    Returns:
        pd.DataFrame: ``df`` with the columns ``horsepower``, ``capacity``, ``cylinders``,
        ``fuel``, ``turbo``, ``hybrid``, ``transmission_speed``, ``transmission_type``,
        ``age`` and ``is_high_resale_price_brand`` added.
    """
    # Build each feature frame in one go from a list of dicts. This replaces the row-wise
    # `.apply(pd.Series)` expansion, which creates one Series per row and is very slow on
    # large frames. A None result (unparsable text) becomes an empty dict, i.e. a row of
    # missing values, and naming the columns keeps the assignment valid even when no row parses.
    engine_cols = ['horsepower', 'capacity', 'cylinders', 'fuel', 'turbo', 'hybrid']
    engine_records = [
        extract_engine_info(engine, brand) or {}
        for engine, brand in zip(df['engine'], df['brand'])
    ]
    df[engine_cols] = pd.DataFrame(engine_records, index=df.index, columns=engine_cols)

    transmission_cols = ['transmission_speed', 'transmission_type']
    transmission_records = [extract_transmission_info(desc) or {} for desc in df['transmission']]
    df[transmission_cols] = pd.DataFrame(transmission_records, index=df.index, columns=transmission_cols)

    # age of the car relative to the reference year
    df['age'] = reference_year - df['model_year']
    # flag brands listed in high_resale_price_brands
    df['is_high_resale_price_brand'] = df['brand'].isin(high_resale_price_brands)
    return df

In [None]:
# Derive the text-based features on both frames (each frame is modified in place and returned).
# NOTE(review): df_combined was assembled before this cell, so these columns do not appear
# to reach the frames the models are trained on - confirm this is intended.
df_train, df_test = (create_features_from_text_cols(frame) for frame in (df_train, df_test))

In [24]:
# Every column that is not explicitly excluded is a feature; all of them except `milage` are categorical
feature_cols = [col for col in df_combined.columns if col not in COLS_TO_LEAVE]
num_cols = ['milage']
cat_cols = [col for col in feature_cols if col not in num_cols]
# positions of the categorical columns inside feature_cols (needed by TabNet)
cat_idxs = [pos for pos, col in enumerate(feature_cols) if col in cat_cols]
print("Feature columns:", feature_cols)
print("Categorical features:", cat_cols )
print("Numerical features:", num_cols)

Feature columns: ['brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']
Categorical features: ['brand', 'model', 'model_year', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']
Numerical features: ['milage']


In [25]:
# Per-column summary of the combined frame; the rendered table lists dtype, null count and unique categories
data_utils.get_col_stats(df_combined)

Unnamed: 0,col_name,datatype,null_count,unique_categories
0,id,int64,0,0
1,brand,object,0,57
2,model,object,0,1898
3,model_year,int64,0,0
4,milage,int64,0,0
5,fuel_type,object,8466,8
6,engine,object,0,1118
7,transmission,object,0,52
8,ext_col,object,0,319
9,int_col,object,0,156


In [26]:
# Standardize numerical features
print("STANDARDIZING: ",end="")
for col in num_cols:
    print(col, ", ", end="")
    scaler = StandardScaler()
    df_combined[[col]] = scaler.fit_transform(df_combined[[col]])
    # after standardization the mean of the numerical column is 0, so filling missing values with 0 instead of mean.
    # fillna returns a new Series, so the result has to be assigned back (it used to be discarded).
    df_combined[col] = df_combined[col].fillna(0)

STANDARDIZING: milage , 

In [27]:
# Peek at the first row to check the standardized `milage` value
df_combined.head(1)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,2.945022,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200


In [28]:
# Label encode categorical features
print("LABEL ENCODING: ")
# categories seen fewer than this many times (train and test combined) are pooled into code 0
RARE_CATEGORY_MIN_COUNT = 40
# number of unique categories in each categorical column
cat_cols_dims = {}
# categorical embedding size for each categorical column
cat_cols_emb = {}
val_cnt = None
# per column: LabelEncoder codes (before the +1 shift) of the rare categories
col_rare_cat = {}
for col in cat_cols:    
    le = LabelEncoder()
    df_combined[col] = le.fit_transform(df_combined[col])
    col_num_cat = len(le.classes_)
    col_min = df_combined[col].min()
    col_max = df_combined[col].max()
    val_cnt = df_combined[col].value_counts()
    col_rare_cat[col] = val_cnt.loc[val_cnt < RARE_CATEGORY_MIN_COUNT].index.values
    # Increment the encoding by 1 as 0 will be used for rare categories
    df_combined[col] += 1
    # Replace rare categories in the column with 0.
    # The rare codes were collected BEFORE the shift above, so they are shifted by one here too;
    # matching the unshifted codes would zero the neighbouring category of every rare one.
    df_combined.loc[df_combined[col].isin(col_rare_cat[col] + 1), col] = 0
    # add one for rare categories: codes now span 0..col_max+1
    col_num_cat = (col_max+1)+1    
    cat_cols_dims[col] = col_num_cat
    # embedding width: ceil(sqrt(number of codes))
    cat_cols_emb[col] = int(np.ceil(np.sqrt(col_num_cat)))
    print(f'{col}: col_num_cat={col_num_cat}, min={col_min}, max={col_max}, num_rare_cat={len(col_rare_cat[col])},'
          f' emb_size={cat_cols_emb[col]}')    

LABEL ENCODING: 
brand: col_num_cat=58, min=0, max=56, num_rare_cat=8, emb_size=8
model: col_num_cat=1899, min=0, max=1897, num_rare_cat=551, emb_size=44
model_year: col_num_cat=37, min=0, max=35, num_rare_cat=4, emb_size=7
fuel_type: col_num_cat=9, min=0, max=7, num_rare_cat=1, emb_size=3
engine: col_num_cat=1119, min=0, max=1117, num_rare_cat=308, emb_size=34
transmission: col_num_cat=53, min=0, max=51, num_rare_cat=8, emb_size=8
ext_col: col_num_cat=320, min=0, max=318, num_rare_cat=99, emb_size=18
int_col: col_num_cat=157, min=0, max=155, num_rare_cat=48, emb_size=13
accident: col_num_cat=4, min=0, max=2, num_rare_cat=0, emb_size=2
clean_title: col_num_cat=3, min=0, max=1, num_rare_cat=0, emb_size=2


In [29]:
# Split the combined frame back into its train and test parts (train rows come first).
# Both parts are written to in later cells, so take independent copies rather than
# slices of df_combined; assigning into a slice raises SettingWithCopyWarning and
# may or may not write through to the parent frame.
train = df_combined.iloc[:len(df_train)].copy()
test = df_combined.iloc[len(df_train):].copy()
train.shape, test.shape

((188533, 13), (125690, 13))

In [30]:
for col in cat_cols:
    # codes that occur in test but never in train
    train_codes = train[col].unique()
    test_codes = test[col].unique()
    unseen_codes = np.setdiff1d(test_codes, train_codes)
    print(f"{col}: Test has label encodes = {unseen_codes} which are not in train.")
    unseen_mask = test[col].isin(unseen_codes)
    if len(unseen_codes) > 0:
        print(f" => {unseen_mask.sum()} rows" )

    # unseen test codes are sent to 0, the code shared by the rare categories
    test.loc[unseen_mask, col] = 0

brand: Test has label encodes = [] which are not in train.
model: Test has label encodes = [] which are not in train.
model_year: Test has label encodes = [] which are not in train.
fuel_type: Test has label encodes = [] which are not in train.
engine: Test has label encodes = [] which are not in train.
transmission: Test has label encodes = [] which are not in train.
ext_col: Test has label encodes = [] which are not in train.
int_col: Test has label encodes = [] which are not in train.
accident: Test has label encodes = [] which are not in train.
clean_title: Test has label encodes = [] which are not in train.


In [31]:
# Assign cross-validation folds to `train` through the project helper.
# NOTE(review): presumably adds a "kfold" column, stratified on the target binned into
# `n_bins` groups - confirm in cv_split_utils. The training cells split with `kf`
# (plain KFold) rather than with this column.
train = cv_split_utils.strat_kfold_dataframe(train, 
                                            random_state=Config.RANDOM_SEED, 
                                            num_folds=Config.NUM_FOLDS,
                                            target_col_name=Config.TARGET_COL_NAME, 
                                            n_bins=25)      

In [32]:
# # do not include 'id' column in the list of int columns
# int_cols = [col for col in df_train.columns if df_train[col].dtypes == 'int64' and col not in COLS_TO_LEAVE]
# float_cols = [col for col in df_train.columns if df_train[col].dtypes == 'float64']
# bool_cols = [col for col in df_train.columns if df_train[col].dtypes == 'bool']
# cat_cols = [col for col in df_train.columns if df_train[col].dtypes == 'object' and col not in COLS_TO_LEAVE]
# feature_cols = [x for x in df_train.columns if x not in COLS_TO_LEAVE]
# cat_idxs = [ i for i, f in enumerate(feature_cols) if f in cat_cols]
# print(f"feature_cols = {feature_cols}")
# print(f"cat_cols = {cat_cols}")
# print(f"cat_idxs = {cat_idxs}")
# print(f"int_cols = {int_cols}")
# print(f"float_cols = {float_cols}")

In [33]:
# imputation_config = {
#         'horsepower': SimpleImputer(strategy="median"),
#         'capacity': SimpleImputer(strategy="median"),
#         'cylinders': SimpleImputer(strategy="median"),
#         'transmission_speed': SimpleImputer(strategy="median"),
#     }
# for column, imputer in imputation_config.items():
#     imputer.fit(df_train[[column]])
#     df_train[column] = imputer.transform(df_train[[column]])
#     if column != 'horsepower':
#         # convert column datatype to int
#         df_train[column] = df_train[column].round().astype(int)

In [34]:
# # Handle categorical columns
# cat_cols = df_train.select_dtypes(include=['object']).columns
# cat_cols_dims = {}
# for col in cat_cols:
#     le = LabelEncoder()
#     df_train[col] = le.fit_transform(df_train[col].astype(str))
#     cat_cols_dims[col] = len(le.classes_)

In [35]:
# # Scale numerical features
# scaler = StandardScaler()
# num_cols = int_cols + float_cols
# df_train[num_cols] = scaler.fit_transform(df_train[num_cols])

In [36]:
# df_train["turbo"] = df_train["turbo"].astype(int)
# df_train["hybrid"] = df_train["hybrid"].astype(int)

In [37]:
# Seed the NumPy and PyTorch generators: deterministic algorithms alone do not make
# weight initialisation or DataLoader shuffling repeatable across kernel restarts.
np.random.seed(Config.RANDOM_SEED)
torch.manual_seed(Config.RANDOM_SEED)
torch.use_deterministic_algorithms(True, warn_only=True)
# out-of-fold predictions for train and accumulated predictions for test
oof = np.zeros(len(train))
pred = np.zeros(len(test))
kf = KFold(n_splits=Config.NUM_FOLDS, random_state=Config.RANDOM_SEED, shuffle=True)

In [38]:
if Config.MODEL_TO_USE == ModelName.TabNetRegressor:
    # 2. Cross-validation setup
    X = train[feature_cols].values
    # TabNetRegressor expects a 2-D target
    y = train[Config.TARGET_COL_NAME].values.reshape(-1, 1)
    # 3. Model training and evaluation
    rmse_scores = []
    print("Training TabNetRegressor")
    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        print("#"*25)
        print(f"### Fold {fold+1} ###")
        print("#"*25)
        
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        # By default, PyTorch TabNet uses:cat_emb_dim = min(50, (cat_dim + 1) // 2) if cat_emb_dim is not specified.    
        model = TabNetRegressor(
            n_d=64, n_a=64, n_steps=5,
            gamma=1.5, n_independent=2, n_shared=2,
            cat_idxs=cat_idxs, cat_dims=list(cat_cols_dims.values()),
            lambda_sparse=1e-3, momentum=0.3, clip_value=2.,
            optimizer_fn=torch.optim.Adam,
            optimizer_params=dict(lr=SchedulerConfig.MAX_LR, weight_decay=TabNetConfig.WEIGHT_DECAY),
            scheduler_fn=torch.optim.lr_scheduler.StepLR,
            scheduler_params=dict(step_size=SchedulerConfig.STEP_SIZE, gamma=SchedulerConfig.GAMMA),
            # scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
            # scheduler_params=dict(mode="min",
            #                       patience=SchedulerConfig.SCHEDULER_PATIENCE,
            #                       min_lr=SchedulerConfig.MIN_LR,
            #                       factor=SchedulerConfig.FACTOR),
            # scheduler_fn=torch.optim.lr_scheduler.CosineAnnealingWarmRestarts,
            # scheduler_params=dict(T_0=SchedulerConfig.T_0,
            #                       T_mult=SchedulerConfig.T_mult,
            #                       eta_min=SchedulerConfig.MIN_LR),
            mask_type='sparsemax',
            device_name=TabNetConfig.DEVICE        
        )
        
        # Train the model
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            max_epochs=TabNetConfig.NUM_EPOCHS,
            patience=TabNetConfig.PATIENCE,
            batch_size=TabNetConfig.BATCH_SIZE,
            virtual_batch_size=128,
            num_workers=TabNetConfig.NUM_WORKERS,
            drop_last=False,
            eval_metric=["rmse"]
        )
        
        # Make predictions. predict() returns an (n, 1) array; flatten it so it can be
        # stored in the 1-D `oof` vector (assigning (n, 1) into an (n,) selection raises).
        y_pred = model.predict(X_val).flatten()
        
        # Calculate metrics
        rmse = np.sqrt(mean_squared_error(y_val.ravel(), y_pred))        
        rmse_scores.append(rmse)        
        print(f"RMSE: {rmse:.4f}")    
        oof[val_idx] = y_pred
        print()
        if Config.TRAIN_SINGLE_FOLD:
            break
        
        # TEST PREDS    
        test_preds = model.predict(test[feature_cols].values)
        test_preds = test_preds.flatten()
        
        # accumulate the test predictions of every fold
        if fold == 0:
            pred = test_preds
        else:
            pred += test_preds

    # average the accumulated test predictions over the folds
    pred /= Config.NUM_FOLDS
    # Print and log average scores
    avg_rmse = np.mean(rmse_scores)
    std_rmse = np.std(rmse_scores)
    print(f"Average RMSE: {avg_rmse:.4f} (+/- {std_rmse:.4f})")

#########################
### Fold 2 ###
#########################
epoch 0  | loss: 6021955065.74834| val_0_rmse: 77510.62401|  0:03:00s


KeyboardInterrupt: 

In [None]:
class EmbeddingNetwork(nn.Module):
    """Feed-forward regressor over embedded categorical columns plus raw numeric columns.

    Each categorical column gets its own embedding table; the embeddings are concatenated
    with the numeric inputs and passed through three 256-unit ReLU layers and a final
    linear layer with a single output.
    """

    def __init__(self, cat_sizes, cat_emb_sizes, num_features):
        """
        Args:
            cat_sizes: number of codes of each categorical column.
            cat_emb_sizes: embedding width of each categorical column (same order).
            num_features: number of numeric input columns.
        """
        super().__init__()
        embedding_tables = []
        for n_codes, emb_dim in zip(cat_sizes, cat_emb_sizes):
            embedding_tables.append(nn.Embedding(n_codes, emb_dim))
        self.embeddings = nn.ModuleList(embedding_tables)

        hidden = 256
        widths = [sum(cat_emb_sizes) + num_features, hidden, hidden, hidden]
        stack = []
        for in_width, out_width in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(in_width, out_width))
            stack.append(nn.ReLU())
        stack.append(nn.Linear(hidden, 1))
        self.fc_layers = nn.Sequential(*stack)

    def forward(self, cat_inputs, num_inputs):
        """Return one prediction per row; column ``i`` of ``cat_inputs`` feeds embedding table ``i``."""
        embedded_columns = []
        for column, table in enumerate(self.embeddings):
            embedded_columns.append(table(cat_inputs[:, column]))
        embedded = torch.cat(embedded_columns, dim=1)
        features = torch.cat((embedded, num_inputs), dim=1)
        return self.fc_layers(features)

def build_model(cat_sizes, cat_emb_sizes, num_features):
    """Create a fresh ``EmbeddingNetwork`` for the given categorical sizes, embedding widths and numeric column count."""
    network = EmbeddingNetwork(
        cat_sizes=cat_sizes,
        cat_emb_sizes=cat_emb_sizes,
        num_features=num_features,
    )
    return network

def rmse_metric(predictions, targets):
    """Root-mean-squared error between two tensors, returned as a scalar tensor."""
    residuals = predictions - targets
    return residuals.pow(2).mean().sqrt()

def train_model(model, train_loader, valid_loader, optimizer, scheduler, criterion, eval_metric, device, epochs):
    """Train ``model`` for ``epochs`` epochs and print validation results after each one.

    Args:
        model: module called as ``model(cat_inputs, num_inputs)``.
        train_loader: yields ``(cat_inputs, num_inputs, targets)`` training batches.
        valid_loader: yields ``(cat_inputs, num_inputs, targets)`` validation batches.
        optimizer: optimizer over the model parameters.
        scheduler: learning-rate scheduler; stepped once per epoch without arguments.
        criterion: loss, called as ``criterion(outputs, targets)``.
        eval_metric: metric, called as ``eval_metric(outputs, targets)``.
        device: device the batches are moved to.
        epochs: number of passes over ``train_loader``.

    Returns:
        None. The model is updated in place.
    """
    for epoch in range(epochs):
        model.train()
        for cat_inputs, num_inputs, targets in train_loader:
            cat_inputs, num_inputs, targets = cat_inputs.to(device), num_inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(cat_inputs, num_inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
        
        model.eval()
        batch_outputs, batch_targets = [], []
        with torch.no_grad():
            for cat_inputs, num_inputs, targets in valid_loader:
                cat_inputs, num_inputs, targets = cat_inputs.to(device), num_inputs.to(device), targets.to(device)
                batch_outputs.append(model(cat_inputs, num_inputs))
                batch_targets.append(targets)
            if batch_outputs:
                # Score the whole validation set at once. Averaging per-batch values is
                # wrong for a non-linear metric such as RMSE and over-weights a short last batch.
                all_outputs = torch.cat(batch_outputs)
                all_targets = torch.cat(batch_targets)
                valid_loss = criterion(all_outputs, all_targets).item()
                valid_metric = eval_metric(all_outputs, all_targets).item()
            else:
                # empty validation loader: nothing to score
                valid_loss = valid_metric = float("nan")
        
        # Step the scheduler
        scheduler.step()
        
        # Print epoch results including the current learning rate and evaluation metric
        print(f'Epoch {epoch+1}/{epochs}, '
              f'Validation Loss: {valid_loss:.6f}, '
              f'Validation Metric: {valid_metric:.6f}, '
              f'Learning Rate: {scheduler.get_last_lr()[0]:.6f}')

In [None]:
def get_fold_dataloaders(fold, df, cat_cols, num_cols, target_col_name):
    """Build the train and validation DataLoaders for one fold.

    Rows whose ``kfold`` value equals ``fold`` form the validation set; all other rows
    form the training set. Batches hold ``(categorical codes, numeric values, target)``.

    Args:
        fold: fold number to hold out.
        df: frame containing a ``kfold`` column, the feature columns and the target.
        cat_cols: names of the categorical (integer-coded) columns.
        num_cols: names of the numeric columns.
        target_col_name: name of the target column.

    Returns:
        tuple: ``(train_loader, valid_loader, y_valid)`` where ``y_valid`` is the
        validation target as an ``(n, 1)`` float tensor.
    """
    def to_tensors(part):
        # categorical codes as int64, numeric values and target as float32
        cats = torch.LongTensor(part.loc[:, cat_cols].values)
        nums = torch.FloatTensor(part.loc[:, num_cols].values)
        target = torch.FloatTensor(part.loc[:, target_col_name].values).unsqueeze(1)
        return cats, nums, target

    train_part = df[df["kfold"] != fold].reset_index(drop=True)
    valid_part = df[df["kfold"] == fold].reset_index(drop=True)
    train_cats, train_nums, train_target = to_tensors(train_part)
    valid_cats, valid_nums, y_valid = to_tensors(valid_part)

    # only the training batches are shuffled
    train_loader = DataLoader(
        TensorDataset(train_cats, train_nums, train_target),
        batch_size=TabNetConfig.BATCH_SIZE,
        shuffle=True,
    )
    valid_loader = DataLoader(
        TensorDataset(valid_cats, valid_nums, y_valid),
        batch_size=TabNetConfig.BATCH_SIZE,
    )
    return train_loader, valid_loader, y_valid

In [None]:
# version tag used in the names of the saved weight files
VER = 1
if Config.MODEL_TO_USE == ModelName.NeuralNet:
    rmse_scores = []

    # kf.split yields POSITIONAL row numbers. Index plain arrays by position instead of
    # using label-based .loc, which selects different rows whenever the index of `train`
    # is not 0..n-1 (oof and train.price.values are positional as well).
    cat_values = train[cat_cols].values
    num_values = train[num_cols].values
    target_values = train["price"].values

    # The test tensors and loader do not depend on the fold: build them once.
    X_test_cats = torch.LongTensor(test[cat_cols].values)
    X_test_nums = torch.FloatTensor(test[num_cols].values)
    test_dataset = TensorDataset(X_test_cats, X_test_nums)
    test_loader = DataLoader(test_dataset, batch_size=512)

    # number of codes and embedding width of every categorical column
    CAT_SIZE = list(cat_cols_dims.values())
    CAT_EMB = list(cat_cols_emb.values())

    # torch.save does not create missing directories
    os.makedirs(DATA_WRITEPATH, exist_ok=True)

    for i, (train_index, val_index) in enumerate(kf.split(train)):
        print("#"*25)
        print(f"### Fold {i+1} ###")
        print("#"*25)

        X_train_cats = torch.LongTensor(cat_values[train_index])
        X_train_nums = torch.FloatTensor(num_values[train_index])
        y_train = torch.FloatTensor(target_values[train_index]).unsqueeze(1)

        X_valid_cats = torch.LongTensor(cat_values[val_index])
        X_valid_nums = torch.FloatTensor(num_values[val_index])
        y_valid = torch.FloatTensor(target_values[val_index]).unsqueeze(1)

        train_dataset = TensorDataset(X_train_cats, X_train_nums, y_train)
        valid_dataset = TensorDataset(X_valid_cats, X_valid_nums, y_valid)

        train_loader = DataLoader(train_dataset, batch_size=TabNetConfig.BATCH_SIZE, shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size=TabNetConfig.BATCH_SIZE)

        # a fresh model, optimizer and scheduler for every fold
        model = build_model(CAT_SIZE, CAT_EMB, len(num_cols)).to(TabNetConfig.DEVICE)
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=SchedulerConfig.STEP_SIZE, 
                                                    gamma=SchedulerConfig.GAMMA)
        criterion = nn.MSELoss()
        eval_metric = rmse_metric

        train_model(model, 
                    train_loader, valid_loader, optimizer, 
                    scheduler=scheduler,
                    criterion=criterion, 
                    device=TabNetConfig.DEVICE, 
                    epochs=TabNetConfig.NUM_EPOCHS,
                    eval_metric=eval_metric)

        # NOTE(review): weights are saved regardless of Config.PERSIST_MODEL - confirm this is intended
        torch.save(model.state_dict(), os.path.join(DATA_WRITEPATH, f'NN_v{VER}_f{i}.weights.pth'))
        
        # OOF PREDS
        model.eval()
        oof_preds = []
        with torch.no_grad():
            for cat_inputs, num_inputs, _ in valid_loader:
                cat_inputs, num_inputs = cat_inputs.to(TabNetConfig.DEVICE), num_inputs.to(TabNetConfig.DEVICE)
                outputs = model(cat_inputs, num_inputs)
                oof_preds.extend(outputs.cpu().numpy().flatten())

        oof_preds = np.array(oof_preds)
        rmse = np.sqrt(np.mean((oof_preds - y_valid.numpy().flatten())**2))
        rmse_scores.append(rmse)
        print(f' => RMSE = {rmse}\n')
        oof[val_index] = oof_preds
        
        if Config.TRAIN_SINGLE_FOLD:
            break

        # TEST PREDS
        test_preds = []
        with torch.no_grad():
            for cat_inputs, num_inputs in test_loader:
                cat_inputs, num_inputs = cat_inputs.to(TabNetConfig.DEVICE), num_inputs.to(TabNetConfig.DEVICE)
                outputs = model(cat_inputs, num_inputs)
                test_preds.extend(outputs.cpu().numpy().flatten())

        # accumulate the test predictions of every fold
        test_preds = np.array(test_preds)
        if i == 0:
            pred = test_preds
        else:
            pred += test_preds

    # average the accumulated test predictions over the folds
    pred /= Config.NUM_FOLDS
    # Print and log average scores
    avg_rmse = np.mean(rmse_scores)
    std_rmse = np.std(rmse_scores)
    print(f"Average RMSE: {avg_rmse:.4f} (+/- {std_rmse:.4f})")

In [None]:
# COMPUTE AND DISPLAY CV RMSE SCORE
# (only meaningful when every fold was trained, otherwise most of `oof` is still zero)
if not Config.TRAIN_SINGLE_FOLD:
    residuals = oof - train.price.values
    rmse = np.sqrt(np.mean(residuals ** 2))
    print("Overall CV RMSE =", rmse)

    # SAVE OOF: one out-of-fold prediction per training id
    oof_df = train[["id"]].assign(pred=oof)
    oof_df.to_csv(f'{DATA_WRITEPATH}df_val_preds_{Config.MODEL_TO_USE}.csv', index=False)

In [None]:
# Write the fold-averaged test predictions into the sample-submission layout
if not Config.TRAIN_SINGLE_FOLD:
    sub = pd.read_csv(DATA_READPATH + "sample_submission.csv")
    sub["price"] = pred
    print("Submission shape:",sub.shape)
    sub.to_csv(f"{DATA_WRITEPATH}submission_{Config.MODEL_TO_USE}.csv",index=False)
    # a bare `sub.head()` inside an if-block is discarded by the notebook, so print the preview
    print(sub.head())