In [53]:
# Experiment version tag; it is interpolated into the checkpoint directory
# name and the submission CSV file name further down in this notebook.
VERSION: int = 4

In [58]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt

# --- Training split: read it, record its row count, preview the first row.
train = pd.read_csv("data/train.csv")
TRAIN_LN = train.shape[0]
print("Train shape:", train.shape)
display(train.head(1))

# --- Test split: a placeholder `price` column (all zeros) keeps the schema
# aligned with `train` so the same pre-processing steps can run on both frames.
test_df = pd.read_csv("data/test.csv").assign(price=0)
print("Test shape:", test_df.shape)
display(test_df.head(1))

Train shape: (188533, 13)


Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200


Test shape: (125690, 13)


Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes,0


In [59]:
# Continuous feature columns.
NUMS = ['milage']
# Every remaining column, except the row identifier and the target, is
# treated as categorical.
CATS = [
    col
    for col in train.columns
    if col not in ("id", "price") and col not in NUMS
]

In [60]:
# Remove the placeholder target before inference: the original author notes
# that fastai expects the target column to be absent from the test data.
# `errors='ignore'` keeps this cell idempotent (re-running it after the column
# is already gone no longer raises KeyError), and rebinding instead of
# `inplace=True` avoids mutating the frame that an earlier cell displayed.
test_df = test_df.drop(columns=['price'], errors='ignore')

In [73]:
from pathlib import Path

from fastai.tabular.all import *
from sklearn.model_selection import KFold
import torch

NUM_FOLDS = 5
RANDOM_STATE = 42
MAX_EPOCHS = 20
MODEL_CKPTS = f"checkpoints/version_{VERSION}"

# Seed python/numpy/torch so fold scores are repeatable; previously only the
# KFold split was seeded while weight init and batch shuffling were not.
set_seed(RANDOM_STATE)

# Check if MPS (Metal Performance Shaders) is available - MPS is only available on Macs with Apple Silicon.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=RANDOM_STATE)
fold_rmse = []

# Inference must see feature columns only. Dropping a possibly re-attached
# `price` column here keeps this cell correct even when it is re-run after the
# submission cell has written predictions back into `test_df`.
test_features = test_df.drop(columns=['price'], errors='ignore')
test_predictions = np.zeros(len(test_features))

for fold, (train_idx, val_idx) in enumerate(kf.split(train), 1):
    val_fold = train.iloc[val_idx]
    dls_fold = TabularDataLoaders.from_df(train, y_names="price",
                                          cat_names=CATS,
                                          cont_names=NUMS,
                                          procs=[Categorify, FillMissing, Normalize],
                                          valid_idx=val_idx,
                                          device=device)

    learn = tabular_learner(dls_fold, metrics=rmse)
    learn.to(device)

    # A fresh callback per fold so no tracker state is shared between learners.
    early_stop_callback = EarlyStoppingCallback(monitor='valid_loss', patience=3)

    # TODO: add fastai.callback.tensorboard.TensorBoardCallback with
    # log_dir=f"tensorboard_logs/version_{VERSION}/fold_{fold}" once the error
    # it currently raises has been investigated.
    learn.fit_one_cycle(MAX_EPOCHS, cbs=[early_stop_callback])

    # Learner.save writes under learn.path/learn.model_dir and only creates
    # that base directory, so the nested checkpoint folder is created here.
    (Path(learn.path) / learn.model_dir / MODEL_CKPTS).mkdir(parents=True, exist_ok=True)
    # NOTE(review): fastai appends ".pth" itself, so this is stored as
    # "fold_N.pth.pth". The name is left unchanged so that any existing
    # learn.load(...) call using the same string keeps working.
    learn.save(MODEL_CKPTS+f"/fold_{fold}.pth")

    # Score the held-out fold. Predictions and targets are flattened to 1-D
    # float tensors explicitly, so the result does not depend on implicit
    # broadcasting or int/float promotion.
    val_dl = learn.dls.test_dl(val_fold)
    preds, _ = learn.get_preds(dl=val_dl)
    val_preds = torch.as_tensor(preds).detach().cpu().float().reshape(-1)
    val_targs = torch.tensor(val_fold['price'].to_numpy(), dtype=torch.float32)
    fold_rmse.append(torch.sqrt(torch.mean((val_preds - val_targs) ** 2)))

    print(f"Fold {fold} RMSE: {fold_rmse[-1].item():.2f}")

    # Get test predictions for each fold
    test_dl = learn.dls.test_dl(test_features)
    fold_test_predictions = learn.get_preds(dl=test_dl)
    test_predictions += fold_test_predictions[0].cpu().numpy().reshape(-1,)

# Average the predictions across all folds
test_predictions /= NUM_FOLDS

# Convert the 0-d tensors to plain floats before aggregating with numpy.
fold_scores = np.array([score.item() for score in fold_rmse])
print(f"\nMean RMSE across folds: {fold_scores.mean():.2f}")
print(f"Standard deviation of RMSE: {fold_scores.std():.2f}")

Fold 2


epoch,train_loss,valid_loss,_rmse,time
0,8303525888.0,7514689536.0,86687.320312,00:42
1,4832985600.0,7503738880.0,86624.125,00:42
2,8703479808.0,7456662016.0,86351.953125,00:42
3,8098556416.0,7304275968.0,85465.054688,00:42
4,5334394368.0,7003043840.0,83684.171875,00:42
5,8833270784.0,6643122688.0,81505.34375,00:42
6,6041159680.0,6180436992.0,78615.75,00:42
7,5640912384.0,5816144896.0,76263.664062,00:42
8,6184829440.0,5495723008.0,74133.148438,00:42
9,7196210688.0,5181336576.0,71981.492188,00:42


Fold 2 RMSE: 69027.81


Fold 3


epoch,train_loss,valid_loss,_rmse,time
0,10582999040.0,8249888256.0,90828.898438,00:42
1,5151597568.0,8239401984.0,90771.148438,00:42
2,6192506880.0,8192693760.0,90513.515625,00:42
3,7603979776.0,8058383360.0,89768.484375,00:40
4,6810397696.0,7820807680.0,88435.34375,00:41
5,6535215104.0,7373929472.0,85871.59375,00:41
6,4898815488.0,6950466048.0,83369.421875,00:41
7,9345357824.0,6644310528.0,81512.65625,00:41
8,2853265664.0,6648114176.0,81535.960938,00:42
9,6389785088.0,6588287488.0,81168.265625,00:42


No improvement since epoch 14: early stopping


Fold 3 RMSE: 76060.12


Fold 4


epoch,train_loss,valid_loss,_rmse,time
0,9188651008.0,8741425152.0,93495.570312,00:42
1,7423348736.0,8731347968.0,93441.695312,00:43
2,10728170496.0,8681977856.0,93177.140625,00:43
3,6817683456.0,8544870912.0,92438.476562,00:42
4,3974677248.0,8243345920.0,90792.890625,00:42
5,8314019328.0,7858260992.0,88646.828125,00:42
6,7333822976.0,7426873344.0,86179.304688,00:42
7,4885980160.0,7104579072.0,84288.679688,00:42
8,5222077952.0,6593533440.0,81200.546875,00:42
9,5595426816.0,6283907584.0,79271.101562,00:42


No improvement since epoch 15: early stopping


Fold 4 RMSE: 76904.29


Fold 5


epoch,train_loss,valid_loss,_rmse,time
0,5911291392.0,8694432768.0,93243.984375,00:42
1,5689995776.0,8684393472.0,93190.101562,00:43
2,5011401216.0,8639185920.0,92947.21875,00:42
3,4994456064.0,8490302464.0,92142.84375,00:42
4,7181341696.0,8205058048.0,90581.742188,00:42
5,6126220288.0,7861803520.0,88666.835938,00:43
6,8431433728.0,7434397184.0,86222.953125,00:43
7,5685537280.0,6993481728.0,83627.054688,00:42
8,5328416768.0,6717258240.0,81958.898438,00:42
9,6405854208.0,6564970496.0,81024.5,00:42


No improvement since epoch 13: early stopping


Fold 5 RMSE: 78163.42



Mean RMSE across folds: 73803.54
Standard deviation of RMSE: 4023.47


In [74]:
from pathlib import Path

# Attach the fold-averaged predictions and write the (id, price) submission.
test_df['price'] = test_predictions
submission_path = Path(f'predictions/submission_v{VERSION}.csv')
# to_csv does not create missing parent directories, so create it up front.
submission_path.parent.mkdir(parents=True, exist_ok=True)
test_df[['id','price']].to_csv(submission_path, index=False)