# Add peer-based features using learned embeddings

This notebook will:

1. Load a dataset and train a neural network to learn embeddings for categorical features.
2. Extract the learned embeddings from the trained network.
3. Construct peer-based features using the learned embeddings.
4. Export the new features for use in other models.

In [None]:
import json
import pandas as pd
from pathlib import Path
from src.ml_backend import load_processed
from src.ml_backend import NeuralNetworkWithEmbeddings
from src.ml_backend import split_data

## Preparation

- load preprocessed data
- load data split configs

In [None]:
# Read the "train_iteration" section of configs.json; these values are the
# inputs for the `split_data` function.
config_path = Path("configs.json")
configs = json.loads(config_path.read_text())["train_iteration"]

# Unpack the split parameters in one go (each one is expected to be an int).
train_start, val_start, val_years, test_years, test_end = (
    configs[key]
    for key in ("train_start", "val_start", "val_years", "test_years", "test_end")
)

# Load the processed dataset; pass True instead if reprocessing is needed.
preprocessed_data = load_processed(reprocesse=False)
    

In [None]:
# Report every column whose dtype is something other than float64.
# Nothing is printed when all columns are float64.
for column_name, column_dtype in preprocessed_data.dtypes.items():
    if column_dtype == "float64":
        continue
    print(f"Column {column_name} has datatype {column_dtype}")

## Train neural network with embeddings

The model is defined in `src/ml_backend/models.py` as `NeuralNetworkWithEmbeddings`.

All models defined in this project have a consistent interface for training and prediction.

- `model.auto_tune()` will auto-tune hyperparameters (defined in `configs.json`)
- `model.train_final()` will train the model on training data + validation data
- `model.predict()` will generate predictions on test data

In [None]:
# Walk-forward loop: every iteration tunes, trains and predicts for one test
# window, exports peer-based features for that window, then slides the window
# forward by `test_years`.
if test_years <= 0:
    # The window advances by `test_years`; a non-positive step would never move
    # past `test_end`, so the loop could not terminate.
    raise ValueError(f"test_years must be positive, got {test_years}")

# Number of leading `train_df` columns that get a peer-based feature.
N_PEER_FEATURE_COLUMNS = 2

# Make sure the export directory exists before any expensive training starts.
output_dir = Path("data/test")
output_dir.mkdir(parents=True, exist_ok=True)

# Advance a local cursor instead of mutating `val_start`, so this cell can be
# re-run without re-running the configuration cell first.
current_val_start = val_start
predictions_list = []
while True:
    # First and last year of the test window handled in this iteration.
    test_period_start = current_val_start + val_years
    test_period_end = test_period_start + test_years - 1
    test_period = f"{test_period_start} to {test_period_end}"

    train_df, val_df, test_df = split_data(
        df=preprocessed_data,
        train_start=train_start,
        val_start=current_val_start,
        val_years=val_years,
        test_years=test_years,
    )

    model = NeuralNetworkWithEmbeddings(
        train_df=train_df,
        val_df=val_df,
        test_df=test_df,
    )

    print(f"{'='*20} New Iteration {'='*20}")
    print(f"start auto-tuning for test period {test_period}")
    model.auto_tune()
    print(f"start final training for test period {test_period}")
    model.train_final()
    print(f"start prediction for test period {test_period}")
    predictions = model.predict()
    predictions_list.append(predictions)

    # Build one peer-based feature per selected column of `train_df` and save
    # them to a CSV named after the last year of the test window.
    all_new_features = []
    for col in train_df.columns[:N_PEER_FEATURE_COLUMNS]:
        new_feature: pd.Series = model.distance_weighted_feature(col)
        all_new_features.append(new_feature.rename(f"peer_based_{col}"))
    new_features_df = pd.concat(all_new_features, axis=1)
    new_features_df.to_csv(
        output_dir / f"peer_based_features_{test_period_end}.csv", index=True
    )

    print(f"Completed predictions for test period {test_period}")

    # Slide the window; stop once the next test window would end after `test_end`.
    # The check runs after the iteration, so the first window is always processed.
    current_val_start += test_years
    if current_val_start + val_years + test_years - 1 > test_end:
        break

# Combine the predictions of all iterations into a single file.
all_predictions = pd.concat(predictions_list, ignore_index=True)
all_predictions.to_csv(output_dir / "predictions_with_embeddings.csv", index=False)
