In [None]:
from dotenv import load_dotenv, find_dotenv
# Load the .env file in a statement of its own. An `assert` expression is not
# evaluated at all under `python -O`, so keeping the call inside the assert
# would silently skip the load itself, not just the check.
dotenv_loaded = load_dotenv(find_dotenv(usecwd=False))
assert dotenv_loaded, "The .env file was not loaded."

from pathlib import Path

import numpy as np
import pandas as pd
import torch
from drn import *

# Number of CPU threads PyTorch is allowed to use in this notebook.
NUM_TORCH_THREADS = 1
torch.set_num_threads(NUM_TORCH_THREADS)

In [None]:
# Folder holding the pre-split regression data as CSV files.
DATA_DIR = Path("data/processed/reg")

# Read the four splits in a fixed order: features first, then targets.
x_train, x_val, y_train, y_val = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in ("x_train.csv", "x_val.csv", "y_train.csv", "y_val.csv")
)

In [None]:
# Output folder for every model saved by this notebook.
MODEL_DIR = Path("models/reg")

# Create it (and any missing parents) up front; re-running the cell is a no-op.
MODEL_DIR.mkdir(exist_ok=True, parents=True)

In [None]:
def _to_float_tensor(frame):
    """Copy a DataFrame's values into a new float32 tensor.

    The cast happens in pandas (`to_numpy(dtype=np.float32)`) so the dtype is
    explicit and frames that mix bool and numeric columns still convert.
    The legacy `torch.Tensor(frame.values)` constructor relies on an implicit
    dtype and fails when `.values` falls back to an object array.
    `torch.tensor` always copies, so the tensors do not alias the DataFrames.
    """
    return torch.tensor(frame.to_numpy(dtype=np.float32))


# Upper-case names are the tensor versions of the lower-case DataFrames.
X_train = _to_float_tensor(x_train)
Y_train = _to_float_tensor(y_train)
X_val = _to_float_tensor(x_val)
Y_val = _to_float_tensor(y_val)

# Pair features with targets so the training routine can batch them together.
train_dataset = torch.utils.data.TensorDataset(X_train, Y_train)
val_dataset = torch.utils.data.TensorDataset(X_val, Y_val)

In [None]:
# Name of the response distribution; it is handed to GLM.from_statsmodels when
# the baseline is built further down.
distribution = "gaussian"  # distributional assumption for the GLM, CANN, MDN

## Baseline

In [None]:
# Build the GLM baseline from the training tensors under the chosen distribution.
glm = GLM.from_statsmodels(X_train, Y_train, distribution=distribution)

# Save the whole model object (not only a state dict) in the model folder.
glm_path = MODEL_DIR / "glm.pkl"
torch.save(glm, glm_path)

In [None]:
# Reduce the targets to scalars explicitly. `np.min` / `np.max` on a DataFrame
# defer to `DataFrame.min/max(axis=None)`, which returns a scalar only on
# pandas >= 2.0; older versions return a per-column Series, and the `< 0` test
# below would then raise "The truth value of a Series is ambiguous".
# Chaining the column-wise and Series reductions works on every version and
# keeps pandas' NaN-skipping behaviour.
y_min = y_train.min().min()
y_max = y_train.max().max()

cutpoints_DRN = drn_cutpoints(
    # Lower end: 5% beyond the smallest target when it is negative, otherwise 0.
    c_0=y_min * 1.05 if y_min < 0 else 0.0,
    # Upper end: the largest target scaled by 1.05.
    # NOTE(review): this only lies above the data when the maximum is positive - confirm.
    c_K=y_max * 1.05,
    y=y_train,
    proportion=0.01,
    min_obs=1,
)

## (a) No Regularisation

In [None]:
# Reseed PyTorch before the network is constructed.
torch.manual_seed(23)

# Network settings for the unregularised DRN.
no_penalty_architecture = dict(
    num_features=x_train.shape[1],
    cutpoints=cutpoints_DRN,
    glm=glm,
    hidden_size=100,
    num_hidden_layers=2,
    baseline_start=False,
    dropout_rate=0.2,
)
drn_no_penalty = DRN(**no_penalty_architecture)


def no_penalty_loss(pred, y):
    """DRN loss with all four penalty weights set to zero."""
    return drn_loss(pred, y, kl_alpha=0, mean_alpha=0, dv_alpha=0, tv_alpha=0)


# Optimisation settings (the original cell noted lr = 0.0002 and
# batch_size = 50 as alternative values).
no_penalty_fit_options = dict(
    lr=0.001,
    batch_size=200,
    log_interval=1,
    patience=30,
    epochs=1000,
)
train(
    drn_no_penalty,
    no_penalty_loss,
    train_dataset,
    val_dataset,
    **no_penalty_fit_options,
)

# Save the whole trained model object in the model folder.
torch.save(drn_no_penalty, MODEL_DIR / "drn_no_penalty.pkl")

## (b) Small KL

In [None]:
# Reseed PyTorch before the network is constructed.
torch.manual_seed(23)

# Network settings for the DRN trained with a small KL penalty.
kl_penalty_architecture = dict(
    num_features=x_train.shape[1],
    cutpoints=cutpoints_DRN,
    glm=glm,
    hidden_size=128,
    num_hidden_layers=2,
    baseline_start=False,
    dropout_rate=0.2,
)
drn_kl_penalty = DRN(**kl_penalty_architecture)


def kl_penalty_loss(pred, y):
    """DRN loss with only the KL weight switched on (kl_alpha=0.001).

    The original cell noted 5e-2 and 5e-4 as alternative kl_alpha values.
    """
    return drn_loss(pred, y, kl_alpha=0.001, mean_alpha=0, dv_alpha=0, tv_alpha=0)


# Optimisation settings (the original cell noted lr = 0.0002 and
# batch_size = 50 as alternative values).
kl_penalty_fit_options = dict(
    lr=0.001,
    batch_size=200,
    log_interval=1,
    patience=10,
    epochs=1000,
)
train(
    drn_kl_penalty,
    kl_penalty_loss,
    train_dataset,
    val_dataset,
    **kl_penalty_fit_options,
)

# Save the whole trained model object in the model folder.
torch.save(drn_kl_penalty, MODEL_DIR / "drn_kl_penalty.pkl")

## (c) Excessive Smoothing

In [None]:
# Reseed PyTorch before the network is constructed.
torch.manual_seed(23)

# Network settings for the heavily smoothed DRN; unlike the other variants in
# this notebook, this one is built with baseline_start=True.
dv_large_architecture = dict(
    num_features=x_train.shape[1],
    cutpoints=cutpoints_DRN,
    glm=glm,
    hidden_size=128,
    num_hidden_layers=2,
    baseline_start=True,
    dropout_rate=0.2,
)
drn_dv_large_penalty = DRN(**dv_large_architecture)


def dv_large_penalty_loss(pred, y):
    """DRN loss with only the dv weight switched on, at a large value (dv_alpha=10)."""
    return drn_loss(pred, y, kl_alpha=0, mean_alpha=0, dv_alpha=10, tv_alpha=0)


# Optimisation settings for this variant.
dv_large_fit_options = dict(
    lr=0.01,
    batch_size=300,
    log_interval=1,
    patience=10,
    epochs=1000,
)
train(
    drn_dv_large_penalty,
    dv_large_penalty_loss,
    train_dataset,
    val_dataset,
    **dv_large_fit_options,
)

# Save the whole trained model object in the model folder.
torch.save(drn_dv_large_penalty, MODEL_DIR / "drn_dv_large_penalty.pkl")

## (d) Perfect Smoothing

In [None]:
# Reseed PyTorch before the network is constructed.
torch.manual_seed(23)

# Network settings for the DRN trained with both the KL and dv penalties.
everything_architecture = dict(
    num_features=x_train.shape[1],
    cutpoints=cutpoints_DRN,
    glm=glm,
    hidden_size=128,
    num_hidden_layers=2,
    baseline_start=False,
    dropout_rate=0.2,
)
drn_everything = DRN(**everything_architecture)


def everything_loss(pred, y):
    """DRN loss with the KL (1e-3) and dv (5e-4) weights on; mean and tv stay at zero."""
    return drn_loss(pred, y, kl_alpha=1e-3, mean_alpha=0, dv_alpha=5e-4, tv_alpha=0)


# Optimisation settings for this variant.
everything_fit_options = dict(
    lr=0.001,
    batch_size=100,
    log_interval=1,
    patience=10,
    epochs=1000,
)
train(
    drn_everything,
    everything_loss,
    train_dataset,
    val_dataset,
    **everything_fit_options,
)

# Save the whole trained model object in the model folder.
torch.save(drn_everything, MODEL_DIR / "drn_everything.pkl")