In [None]:
!pip install pytorch_tabular

In [None]:
!pip install scikit-learn==1.2.2 --quiet

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold
from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models.node.config import NodeConfig
from tqdm import tqdm
import os

# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
train_path = "/kaggle/input/optiver/processed_train.csv"
test_path = "/kaggle/input/optiver/processed_test.csv"

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Pull the regression target out of the training frame, then discard the
# identifier columns that are not used as features.
y = train.pop("target")
train = train.drop(columns=["row_id", "time_id"])
test = test.drop(columns=["row_id", "time_id"])

# ---------------------------------------------------------------------------
# Feature preprocessing
# ---------------------------------------------------------------------------
# Everything that is not explicitly categorical is treated as continuous.
categorical_columns = ["stock_id"]
continuous_columns = [
    name for name in train.columns if name not in categorical_columns
]

# Integer-encode the categorical feature. The encoder only learns the ids
# present in `train`, so an id that appears only in `test` makes
# `transform` raise.
le = LabelEncoder().fit(train["stock_id"])
train["stock_id"] = le.transform(train["stock_id"])
test["stock_id"] = le.transform(test["stock_id"])

# Standardise the continuous features using statistics from the training rows.
# NOTE(review): the scaler is fit on all training rows before the K-fold split
# further down, so each validation fold contributes to the scaling statistics.
scaler = StandardScaler().fit(train[continuous_columns])
train[continuous_columns] = scaler.transform(train[continuous_columns])
test[continuous_columns] = scaler.transform(test[continuous_columns])

# Re-attach the target so the training frame carries features and label together.
train["target"] = y

# KFold cross-validation
# Train one NODE regressor per fold, collecting out-of-fold (OOF) predictions
# for every training row and the fold-averaged prediction for the test rows.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# Single source of truth for the fold count: used for the progress bar and for
# averaging the test predictions, so changing `n_splits` cannot mis-scale them.
n_folds = kfold.get_n_splits()

oof_preds = np.zeros(train.shape[0])   # one OOF prediction per training row
test_preds = np.zeros(test.shape[0])   # running mean of per-fold test predictions
fold_scores = []                       # one metrics dict per fold
stats = {}                             # per-fold loss histories read from the trainer logs

for fold, (train_idx, val_idx) in enumerate(
    tqdm(kfold.split(train), total=n_folds, desc="Cross-validation Folds")
):
    print(f"\nTraining Fold {fold + 1}")

    # Positional split; copies keep the per-fold frames independent of `train`.
    df_train = train.iloc[train_idx].copy()
    df_val = train.iloc[val_idx].copy()

    # Configs
    data_config = DataConfig(
        target=["target"],
        continuous_cols=continuous_columns,
        categorical_cols=categorical_columns,
        num_workers=4,
    )

    # NOTE(review): with auto_lr_find enabled, the learning rate chosen by the
    # finder presumably supersedes `learning_rate` in the model config -- confirm.
    trainer_config = TrainerConfig(
        auto_lr_find=True,
        batch_size=256,
        max_epochs=100,
        accelerator="gpu",
        devices=1,
        checkpoints=None,
    )

    optimizer_config = OptimizerConfig()

    model_config = NodeConfig(
        task="regression",
        num_trees=3,
        depth=6,
        learning_rate=1e-3,
    )

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )

    tabular_model.fit(train=df_train, validation=df_val)

    # Predict
    val_preds = tabular_model.predict(df_val)["target_prediction"].values
    test_fold_preds = tabular_model.predict(test)["target_prediction"].values

    oof_preds[val_idx] = val_preds
    # Each fold contributes an equal share to the final test prediction.
    test_preds += test_fold_preds / n_folds

    y_val_np = df_val["target"].values
    fold_scores.append({
        "fold": fold + 1,
        # Root mean squared percentage error; divides by the true target, so it
        # assumes no validation target is exactly zero.
        "RMSPE": np.sqrt(np.mean(np.square((y_val_np - val_preds) / y_val_np))),
        "RMSE": mean_squared_error(y_val_np, val_preds, squared=False),
        "MAE": mean_absolute_error(y_val_np, val_preds),
        "R2": r2_score(y_val_np, val_preds)
    })

    print(f"Fold {fold + 1} Metrics:")
    print(f"RMSPE: {fold_scores[-1]['RMSPE']:.6f}, RMSE: {fold_scores[-1]['RMSE']:.6f}, "
          f"MAE: {fold_scores[-1]['MAE']:.6f}, R2: {fold_scores[-1]['R2']:.4f}")

    # Load loss history
    # NOTE(review): nothing in this cell points the trainer's logger at this
    # directory -- confirm where metrics.csv is actually written.
    log_path = f"./node_fold{fold+1}_outputs/lightning_logs/version_0/metrics.csv"
    if os.path.exists(log_path):
        df_log = pd.read_csv(log_path)
        # Each logged row may fill only one of the two columns, hence dropna().
        if 'val_loss' in df_log:
            stats[f'fold{fold+1}_val_rmspe'] = df_log['val_loss'].dropna().tolist()
        if 'train_loss' in df_log:
            stats[f'fold{fold+1}_train_rmspe'] = df_log['train_loss'].dropna().tolist()
    else:
        # Report the gap instead of skipping silently, so an empty loss plot
        # later on can be traced back to the missing log file.
        print(f"Fold {fold + 1}: no loss history found at {log_path}; "
              f"loss curves for this fold will be missing.")

# Final OOF Evaluation
# Score the out-of-fold predictions against the true targets for every
# training row, using the same four metrics reported per fold.
print("\nFinal OOF Evaluation:")
print(f"RMSPE: {np.sqrt(np.mean(np.square((y - oof_preds) / y))):.6f}")
# squared=False returns the root of the MSE, so the value matches its "RMSE"
# label and the per-fold RMSE (without it this line printed plain MSE).
print(f"RMSE : {mean_squared_error(y, oof_preds, squared=False):.6f}")
print(f"MAE  : {mean_absolute_error(y, oof_preds):.6f}")
print(f"R2   : {r2_score(y, oof_preds):.4f}")

# Dump every collected loss history: the series name, then its values,
# then a blank separator line.
for series_name, losses in stats.items():
    print(f"{series_name}:", losses, "", sep="\n")

# Draw one curve per collected loss history: validation series dashed,
# training series solid. Series whose name matches neither tag are skipped.
# NOTE(review): the plotted values come from the `val_loss` / `train_loss`
# columns of the trainer log and no custom loss is configured in this file --
# confirm they are really RMSPE before relying on the axis label.
fig, ax = plt.subplots()

# Checked in order, so a name containing 'val' is always drawn dashed.
line_styles = (('val', '--'), ('train', '-'))

for series_name, losses in stats.items():
    style = next((ls for tag, ls in line_styles if tag in series_name), None)
    if style is not None:
        ax.plot(losses, label=series_name, linestyle=style)

ax.set_title('Training & Validation RMSPE (NODE)')
ax.set_xlabel('Epoch')
ax.set_ylabel('RMSPE Loss')
ax.legend()
ax.grid(True)
plt.show()