In [None]:
from dotenv import find_dotenv, load_dotenv

# Load the .env file before the remaining imports run. An explicit check is used
# instead of `assert` because assert statements (including the call inside them)
# are removed when Python runs with -O, which would silently skip the load.
if not load_dotenv(find_dotenv(usecwd=False)):
    raise RuntimeError("The .env file was not loaded.")

from pathlib import Path

import drn
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from generate_synthetic_dataset import generate_synthetic_gamma_lognormal, generate_synthetic_gaussian

In [None]:
def split_data(
    features: pd.DataFrame,
    target: pd.Series,
    seed: int = 42,
    train_size: float = 0.6,
    val_size: float = 0.2,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.Series]:
    """
    Split features and target into train, validation, and test sets based on fractions of the entire dataset.

    The test set is split off first; the remainder is then divided into train
    and validation. Both splits shuffle the rows and use the same ``seed``.

    Args:
        features: DataFrame of predictors.
        target: Series of labels.
        seed: Random seed for reproducibility.
        train_size: Fraction of data for training (must be > 0).
        val_size: Fraction of data for validation (must be > 0).
            (test_size is computed as 1 - train_size - val_size)
    Returns:
        x_train_raw, x_val_raw, x_test_raw,
        y_train, y_val, y_test
    Raises:
        ValueError: If train_size or val_size is not positive, or if together
            they leave no room for a test set (train_size + val_size >= 1,
            up to floating-point round-off).
    """
    # Reject non-positive fractions here with a clear message instead of letting
    # them surface later as an invalid relative validation size.
    if train_size <= 0 or val_size <= 0:
        raise ValueError(
            "train_size and val_size must both be > 0. "
            f"Got train_size={train_size}, val_size={val_size}"
        )

    # Compute test fraction
    test_size = 1.0 - train_size - val_size
    # Compare against a small tolerance rather than exactly 0: round-off can leave
    # a tiny positive remainder (1.0 - 0.7 - 0.3 evaluates to ~5.6e-17), which
    # would otherwise be accepted and passed on as a near-zero test fraction.
    if test_size <= 1e-9:
        raise ValueError(
            f"train_size + val_size must be < 1. Got {train_size + val_size}"
        )

    # Split off test set
    x_train_val, x_test_raw, y_train_val, y_test = train_test_split(
        features, target, test_size=test_size, random_state=seed, shuffle=True
    )
    # Split train+val into train and val; the validation fraction is expressed
    # relative to the rows that remain after removing the test set.
    relative_val_size = val_size / (train_size + val_size)
    x_train_raw, x_val_raw, y_train, y_val = train_test_split(
        x_train_val,
        y_train_val,
        test_size=relative_val_size,
        random_state=seed,
        shuffle=True,
    )
    return x_train_raw, x_val_raw, x_test_raw, y_train, y_val, y_test


def generate_categories(
    x_train_raw: pd.DataFrame,
    x_val_raw: pd.DataFrame,
    x_test: pd.DataFrame,
    cat_features: list[str],
) -> dict[str, list]:
    """
    Build the complete list of categories for every categorical feature.

    The lists start from the non-missing values seen in the training split.
    The validation and test splits are then scanned in that order; any value
    not yet known is reported with ``print`` and merged into the list.

    Returns:
        all_categories: feature -> sorted list of categories
    """
    all_categories = {}
    for feature in cat_features:
        all_categories[feature] = sorted(x_train_raw[feature].dropna().unique())

    other_splits = {"validation": x_val_raw, "test": x_test}
    for split_name, split_frame in other_splits.items():
        for feature in cat_features:
            known = all_categories[feature]
            new_vals = set(split_frame[feature].dropna().unique()) - set(known)
            if not new_vals:
                continue
            print(
                f"New categories for '{feature}' in {split_name} split: {new_vals}"
            )
            # Keep the list sorted after adding the newly discovered values.
            all_categories[feature] = sorted(known + list(new_vals))
    return all_categories


def preprocess_data(
    x_train_raw: pd.DataFrame,
    x_val_raw: pd.DataFrame,
    x_test_raw: pd.DataFrame,
    num_features: list[str],
    cat_features: list[str],
    num_standard: bool = True,
) -> tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, ColumnTransformer, dict[str, list]
]:
    """
    Fit a ColumnTransformer on x_train_raw and transform raw splits.
    - Numeric features are optionally standardized.
    - Categorical features are one-hot encoded, using full categories detected from splits.
    - Columns listed in neither num_features nor cat_features are dropped.
    - Each returned DataFrame keeps the row index of its raw input, so it stays
      aligned with a target that carries the same index.

    Args:
        x_train_raw: Raw training features; the transformer is fitted on these.
        x_val_raw: Raw validation features.
        x_test_raw: Raw test features.
        num_features: Names of the numeric columns.
        cat_features: Names of the categorical columns.
        num_standard: If True, numeric columns go through StandardScaler;
            otherwise they are passed through unchanged.

    Returns:
        x_train, x_val, x_test, fitted ColumnTransformer, all_categories mapping
    """
    # Prepare category mapping
    all_categories = generate_categories(
        x_train_raw, x_val_raw, x_test_raw, cat_features
    )

    # OneHotEncoder with fixed categories (one list per feature, in cat_features order)
    ohe = OneHotEncoder(
        categories=[all_categories[f] for f in cat_features],
        handle_unknown="error",
        sparse_output=False,
    )

    # Build transformers list
    transformers = [
        ("num", StandardScaler() if num_standard else "passthrough", num_features),
        ("cat", ohe, cat_features),
    ]
    ct = ColumnTransformer(
        transformers=transformers, remainder="drop", verbose_feature_names_out=False
    )

    # Fit on the training split only, then apply the fitted transformer to the others
    x_train_arr = ct.fit_transform(x_train_raw)
    x_val_arr = ct.transform(x_val_raw)
    x_test_arr = ct.transform(x_test_raw)

    # Build DataFrames with proper feature names. The input index is carried over:
    # without it the outputs get a fresh RangeIndex, which no longer matches the
    # targets when the raw splits still have their shuffled index.
    feature_names = ct.get_feature_names_out()
    x_train = pd.DataFrame(x_train_arr, columns=feature_names, index=x_train_raw.index)
    x_val = pd.DataFrame(x_val_arr, columns=feature_names, index=x_val_raw.index)
    x_test = pd.DataFrame(x_test_arr, columns=feature_names, index=x_test_raw.index)

    return x_train, x_val, x_test, ct, all_categories

# Synthetic dataset

In [None]:
# Generate the synthetic gamma/log-normal dataset with a fixed seed.
(features, target, means, dispersion) = generate_synthetic_gamma_lognormal(
    20000,
    seed=1,
)

In [None]:
# Split the synthetic data using split_data's default train/validation fractions.
(
    x_train_raw,
    x_val_raw,
    x_test_raw,
    y_train_raw,
    y_val_raw,
    y_test_raw,
) = split_data(features, target, seed=42)

In [None]:
# Replace the test split with a larger, separately generated sample (synthetic data is cheap).
# TODO: consider harmonising this with the original generation process; note that doing so
# would change the train/val/test sets.
features_new, target_new, _, _ = generate_synthetic_gamma_lognormal(50000, seed=0)
processed = drn.split_and_preprocess(
    features_new,
    target_new,
    ["X_1", "X_2"],
    [],
    seed=102,
    num_standard=False,
)
# Entries 2 and 5 of the returned sequence are used as the test features and test target.
x_test_raw = processed[2]
y_test_raw = processed[5]

In [None]:
SPLIT_DATA_DIR = Path("data/interim/synth")
SPLIT_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Write every raw split to "<name>.csv" in the interim directory, omitting the row index.
for stem, frame in (
    ("x_train", x_train_raw),
    ("x_val", x_val_raw),
    ("x_test", x_test_raw),
    ("y_train", y_train_raw),
    ("y_val", y_val_raw),
    ("y_test", y_test_raw),
):
    frame.to_csv(SPLIT_DATA_DIR / f"{stem}.csv", index=False)

In [None]:
# Read the six split files back in, so the following cells work from what is on disk.
x_train_raw, x_val_raw, x_test_raw, y_train_raw, y_val_raw, y_test_raw = [
    pd.read_csv(SPLIT_DATA_DIR / f"{stem}.csv")
    for stem in ("x_train", "x_val", "x_test", "y_train", "y_val", "y_test")
]

In [None]:
# Both columns are numeric and passed through unscaled; there are no categorical features.
x_train, x_val, x_test, ct, all_categories = preprocess_data(
    x_train_raw, x_val_raw, x_test_raw,
    num_features=["X_1", "X_2"], cat_features=[], num_standard=False,
)

# The target variable is used as-is, without any transformation.
y_train, y_val, y_test = y_train_raw, y_val_raw, y_test_raw

In [None]:
PROCESSED_DATA_DIR = Path("data/processed/synth")
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Write every processed split to "<name>.csv" in the processed directory, omitting the row index.
for stem, frame in (
    ("x_train", x_train),
    ("x_val", x_val),
    ("x_test", x_test),
    ("y_train", y_train),
    ("y_val", y_val),
    ("y_test", y_test),
):
    frame.to_csv(PROCESSED_DATA_DIR / f"{stem}.csv", index=False)

In [None]:
# NOTE(review): this repeats the synthetic split cell from earlier in this section
# (same inputs, same seed); consider removing the duplicated cells.
(
    x_train_raw,
    x_val_raw,
    x_test_raw,
    y_train_raw,
    y_val_raw,
    y_test_raw,
) = split_data(features, target, seed=42)

In [None]:
# NOTE(review): repeat of the earlier "larger test set" cell in this section.
# Replace the test split with a larger, separately generated sample (synthetic data is cheap).
# TODO: consider harmonising this with the original generation process; note that doing so
# would change the train/val/test sets.
features_new, target_new, _, _ = generate_synthetic_gamma_lognormal(50000, seed=0)
processed = drn.split_and_preprocess(
    features_new,
    target_new,
    ["X_1", "X_2"],
    [],
    seed=102,
    num_standard=False,
)
# Entries 2 and 5 of the returned sequence are used as the test features and test target.
x_test_raw = processed[2]
y_test_raw = processed[5]

In [None]:
# NOTE(review): repeat of the earlier interim-save cell; it rewrites the same files.
SPLIT_DATA_DIR = Path("data/interim/synth")
SPLIT_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Write every raw split to "<name>.csv" in the interim directory, omitting the row index.
for stem, frame in (
    ("x_train", x_train_raw),
    ("x_val", x_val_raw),
    ("x_test", x_test_raw),
    ("y_train", y_train_raw),
    ("y_val", y_val_raw),
    ("y_test", y_test_raw),
):
    frame.to_csv(SPLIT_DATA_DIR / f"{stem}.csv", index=False)

In [None]:
# NOTE(review): repeat of the earlier reload cell in this section.
# Read the six split files back in, so the following cells work from what is on disk.
x_train_raw, x_val_raw, x_test_raw, y_train_raw, y_val_raw, y_test_raw = [
    pd.read_csv(SPLIT_DATA_DIR / f"{stem}.csv")
    for stem in ("x_train", "x_val", "x_test", "y_train", "y_val", "y_test")
]

In [None]:
# NOTE(review): repeat of the earlier synthetic preprocessing cell.
# Both columns are numeric and passed through unscaled; there are no categorical features.
x_train, x_val, x_test, ct, all_categories = preprocess_data(
    x_train_raw, x_val_raw, x_test_raw,
    num_features=["X_1", "X_2"], cat_features=[], num_standard=False,
)

# The target variable is used as-is, without any transformation.
y_train, y_val, y_test = y_train_raw, y_val_raw, y_test_raw

In [None]:
# NOTE(review): repeat of the earlier processed-save cell; it rewrites the same files.
PROCESSED_DATA_DIR = Path("data/processed/synth")
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Write every processed split to "<name>.csv" in the processed directory, omitting the row index.
for stem, frame in (
    ("x_train", x_train),
    ("x_val", x_val),
    ("x_test", x_test),
    ("y_train", y_train),
    ("y_val", y_val),
    ("y_test", y_test),
):
    frame.to_csv(PROCESSED_DATA_DIR / f"{stem}.csv", index=False)

# Real dataset

In [None]:
df = pd.read_csv("data/raw/freMPL1.csv")

# Keep only the rows with a strictly positive claim amount.
claims = df.loc[df["ClaimAmount"] > 0, :]

# Scaling: the target is the claim amount divided by 1000.
target = claims["ClaimAmount"] / 1000
features = claims.drop("ClaimAmount", axis=1)
features = features.drop(
    ["RecordBeg", "RecordEnd", "ClaimInd", "Garage"], axis=1
)  # Drop garage due to missing values

# Convert "VehAge" categories to numeric.
# Labels that are not keys of this mapping become NaN.
features["VehAge"] = features["VehAge"].map(
    {
        "0": 0,
        "1": 1,
        "2": 2,
        "3": 3,
        "4": 4,
        "5": 5,
        "6-7": 6,
        "8-9": 8,
        "10+": 11,
    }
)

# Encode "VehMaxSpeed" as 1-based ranks of its sorted unique labels.
speed_ranges = list(np.unique(features["VehMaxSpeed"]))
speed_series = pd.Series(speed_ranges)
mapping = {speed_range: i + 1 for i, speed_range in enumerate(speed_ranges)}
features["VehMaxSpeed"] = features["VehMaxSpeed"].map(mapping)

# Keep the first run of digits in "SocioCateg" as an integer code. The pattern is a
# raw string because "\d" is an invalid escape sequence in a normal string literal,
# and expand=False makes extract return a Series rather than a one-column DataFrame.
features["SocioCateg"] = (
    features["SocioCateg"].str.extract(r"(\d+)", expand=False).astype(int)
)

In [None]:
# Split the real claims data using split_data's default train/validation fractions.
(
    x_train_raw,
    x_val_raw,
    x_test_raw,
    y_train_raw,
    y_val_raw,
    y_test_raw,
) = split_data(features, target, seed=0)

In [None]:
SPLIT_DATA_DIR = Path("data/interim/real")
SPLIT_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Write every raw split to "<name>.csv" in the interim directory, omitting the row index.
for stem, frame in (
    ("x_train", x_train_raw),
    ("x_val", x_val_raw),
    ("x_test", x_test_raw),
    ("y_train", y_train_raw),
    ("y_val", y_val_raw),
    ("y_test", y_test_raw),
):
    frame.to_csv(SPLIT_DATA_DIR / f"{stem}.csv", index=False)

In [None]:
# Read the six split files back in, so the following cells work from what is on disk.
x_train_raw, x_val_raw, x_test_raw, y_train_raw, y_val_raw, y_test_raw = [
    pd.read_csv(SPLIT_DATA_DIR / f"{stem}.csv")
    for stem in ("x_train", "x_val", "x_test", "y_train", "y_val", "y_test")
]

In [None]:
# Columns that are one-hot encoded.
cat_features = [
    "HasKmLimit", "Gender", "MariStat", "VehUsage", "VehBody",
    "VehPrice", "VehEngine", "VehEnergy", "VehClass", "SocioCateg",
]

# Every remaining column of the training frame is treated as numeric.
num_features = [col for col in x_train_raw.columns if col not in cat_features]

# Numeric columns are standardised; categorical columns are one-hot encoded.
x_train, x_val, x_test, ct, all_categories = preprocess_data(
    x_train_raw, x_val_raw, x_test_raw,
    num_features=num_features, cat_features=cat_features, num_standard=True,
)

# The target variable is used as-is, without any transformation.
y_train, y_val, y_test = y_train_raw, y_val_raw, y_test_raw

In [None]:
PROCESSED_DATA_DIR = Path("data/processed/real")
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Write every processed split to "<name>.csv" in the processed directory, omitting the row index.
for stem, frame in (
    ("x_train", x_train),
    ("x_val", x_val),
    ("x_test", x_test),
    ("y_train", y_train),
    ("y_val", y_val),
    ("y_test", y_test),
):
    frame.to_csv(PROCESSED_DATA_DIR / f"{stem}.csv", index=False)

# Regularisation data

In [None]:
# Generate the Gaussian synthetic dataset and store features and target side by side as the raw file.
# NOTE(review): no seed is passed here, unlike the gamma/log-normal calls — confirm that
# generate_synthetic_gaussian is reproducible by default.
(features, target, means, dispersion) = generate_synthetic_gaussian(40000)
df = pd.concat([features, target], axis="columns")
df.to_csv("data/raw/reg.csv", index=False)

In [None]:
# Split the regularisation data using split_data's default train/validation fractions.
(
    x_train_raw,
    x_val_raw,
    x_test_raw,
    y_train_raw,
    y_val_raw,
    y_test_raw,
) = split_data(features, target, seed=0)

In [None]:
SPLIT_DATA_DIR = Path("data/interim/reg")
SPLIT_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Write every raw split to "<name>.csv" in the interim directory, omitting the row index.
for stem, frame in (
    ("x_train", x_train_raw),
    ("x_val", x_val_raw),
    ("x_test", x_test_raw),
    ("y_train", y_train_raw),
    ("y_val", y_val_raw),
    ("y_test", y_test_raw),
):
    frame.to_csv(SPLIT_DATA_DIR / f"{stem}.csv", index=False)

In [None]:
# Read the six split files back in, so the following cells work from what is on disk.
x_train_raw, x_val_raw, x_test_raw, y_train_raw, y_val_raw, y_test_raw = [
    pd.read_csv(SPLIT_DATA_DIR / f"{stem}.csv")
    for stem in ("x_train", "x_val", "x_test", "y_train", "y_val", "y_test")
]

In [None]:
# Both columns are numeric and passed through unscaled; there are no categorical features.
x_train, x_val, x_test, ct, all_categories = preprocess_data(
    x_train_raw, x_val_raw, x_test_raw,
    num_features=["X_1", "X_2"], cat_features=[], num_standard=False,
)

# The target variable is used as-is, without any transformation.
y_train, y_val, y_test = y_train_raw, y_val_raw, y_test_raw

In [None]:
PROCESSED_DATA_DIR = Path("data/processed/reg")
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Write every processed split to "<name>.csv" in the processed directory, omitting the row index.
for stem, frame in (
    ("x_train", x_train),
    ("x_val", x_val),
    ("x_test", x_test),
    ("y_train", y_train),
    ("y_val", y_val),
    ("y_test", y_test),
):
    frame.to_csv(PROCESSED_DATA_DIR / f"{stem}.csv", index=False)