# **Importación de Librerías**

In [None]:
%load_ext autoreload
%autoreload 2

import json
import mlflow
import mlflow.exceptions
import optuna
import os
import pprint
import time
import torch
import torch.nn as nn
import torch.optim as optim

from constants import *

from torch.utils.data import TensorDataset

from optuna.visualization import plot_pareto_front, plot_intermediate_values

from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import StandardScaler

from utils.load_data import *
from utils.train_predict import train_validate_loop, predict
from utils.early_stopping import EarlyStopping
from utils.metrics import *
from utils.optuna_utils import *
from utils.mlflow_utils import *

from models.lstm import *

from tqdm.notebook import tqdm

# Seed PyTorch's global random number generator so that repeated runs of the
# notebook start from the same random state.
torch.manual_seed(13)

In [None]:
# Point the MLflow client at the project's tracking server.
mlflow.set_tracking_uri(TRACKING_SERVER_URI)

# Create the experiment (with its tags) only when it does not exist yet.
# Checking explicitly, instead of catching and printing every RestException,
# keeps genuine server-side errors visible rather than silently swallowing them.
if mlflow.get_experiment_by_name(EXPERIMENT_NAME) is None:
    mlflow.create_experiment(name=EXPERIMENT_NAME, tags=EXPERIMENT_TAGS)

# Make the experiment the active one for everything logged below.
mlflow.set_experiment(EXPERIMENT_NAME)

# `set_active_run` is a project helper (star-imported); presumably it starts or
# resumes the MLflow run used by the rest of the notebook -- TODO confirm.
run = set_active_run()

In [3]:
# Read the run configuration from the JSON file next to the notebook.
with open("config.json", "r") as config_file:
    config = json.load(config_file)

# Pull out the settings that drive data loading and partitioning.
group, simulation, required_periods, partitions = (
    config[key]
    for key in ('group', 'simulation', 'required_periods', 'partitions')
)

# Path under DATA_DIR for the selected group and simulation.
stata_filepath = os.path.join(DATA_DIR, group, simulation)

In [4]:
# Record which data this run used, so it can be traced back from MLflow.
dataset_params = dict(
    group=group,
    simulation=simulation,
    filepath=stata_filepath,
    required_periods=required_periods,
    partitions=partitions,
)
mlflow.log_params(dataset_params)

**Terminología**:
* Tipo 1: individuos tratados.
* Tipo 2: individuos de control (i.e., podrían haber sido tratados, pero por alguna razón no lo fueron).
* Tipo 3: ni tratados ni de control.

In [5]:
# Value columns: the static `inicio_prog` followed by the lagged outcomes,
# ordered from the oldest lag, y(t-required_periods), down to y(t-1).
lag_columns = [f'y(t-{lag})' for lag in reversed(range(1, required_periods + 1))]
value_columns = ['inicio_prog', *lag_columns]

# One dataframe per individual type (see the terminology note above).
type1_df, type2_df, type3_df = get_dfs(stata_filepath, required_periods)

# **Conjuntos de datos**

In [None]:
# Split the three individual types into train / validation / test partitions.
(
    X_train_df, y_train_df,
    X_valid_df, y_valid_df,
    X_test_df, y_test_df,
) = build_train_valid_test_dfs(type1_df, type2_df, type3_df, partitions)

# Class balancing: "balanced" weights are computed from the training targets
# only, so the rarer class receives the larger weight.
weights = compute_class_weight(
    y=y_train_df,
    classes=np.unique(y_train_df),
    class_weight="balanced",
)
for class_index, description in enumerate(("mayoritaria", "minoritaria")):
    print(f"Peso para la clase {class_index} ({description}): {weights[class_index]:.4f}")

In [7]:
# Standardize using statistics fitted on the training data only, then apply
# that same transformation to every split.
# NOTE(review): the dataframes are overwritten in place, so re-running this
# cell without re-running the split cell would scale the data twice.
scaler = StandardScaler()
scaler.fit(X_train_df[value_columns])

for split_df in (X_train_df, X_valid_df, X_test_df):
    split_df[value_columns] = scaler.transform(split_df[value_columns])

In [None]:
def _with_target(features_df, target):
    """Return a copy of `features_df` with `target` added as a 'target' column."""
    labelled_df = features_df.copy()
    labelled_df['target'] = target
    return labelled_df


# MLflow expects features and target in a single dataframe, so each split is
# rebuilt with its target column before being registered as a run input.
train_df = _with_target(X_train_df, y_train_df)
valid_df = _with_target(X_valid_df, y_valid_df)
test_df = _with_target(X_test_df, y_test_df)

for split_context, split_df in (
    ("train", train_df),
    ("validation", valid_df),
    ("test", test_df),
):
    mlflow.log_input(
        mlflow.data.from_pandas(split_df, targets='target'),
        context=split_context,
    )

# **LSTM**

Basándonos en el documento `docs/lstm.md`, transformamos los datos a la forma que necesitan las LSTM:
`(batch_size, sequence_length, num_features)`.

Cada fila corresponde a un individuo y ya contiene todo lo que necesitamos: los datos temporales (uno por cada período requerido; con 4 períodos son `y(t-4)`, `y(t-3)`, `y(t-2)`, `y(t-1)`) y, además, el dato estático (`inicio_prog`), que tendremos que repetir en cada paso temporal (cuatro veces en este ejemplo) para obtener la dimensión deseada.

In [10]:
# Sequence length seen by the LSTM: one step per required period.
time_steps = required_periods

# Each time step carries 2 features:
#  - inicio_prog: static, remains the same for all time steps
#  - y: dynamic, changes for each time step
num_features = 2

In [11]:
# Feature tensors in the layout expected by the LSTM, one per split.
X_train_tensor, X_valid_tensor, X_test_tensor = (
    get_lstm_input(features_df, time_steps, num_features)
    for features_df in (X_train_df, X_valid_df, X_test_df)
)

# BCEWithLogitsLoss requires floating-point targets, hence the explicit dtype.
y_train_tensor, y_valid_tensor, y_test_tensor = (
    torch.tensor(target.values, dtype=torch.float)
    for target in (y_train_df, y_valid_df, y_test_df)
)

# Pair features with targets so they can be fed to data loaders.
train_set = TensorDataset(X_train_tensor, y_train_tensor)
valid_set = TensorDataset(X_valid_tensor, y_valid_tensor)
test_set = TensorDataset(X_test_tensor, y_test_tensor)

## **Búsqueda de hiperparámetros con Optuna**

In [12]:
# A timestamped name keeps every study distinct in the Optuna storage.
timestamp = time.strftime("%Y%m%d-%H%M%S")
study_name = f"study_{timestamp}"
study_n_trials = 50

# Objective definition comes from the configuration file.
metrics, directions, beta = (
    config[key] for key in ('metrics', 'directions', 'beta')
)

# Every objective metric needs exactly one optimization direction.
if len(directions) != len(metrics):
    raise ValueError("The number of metrics and directions should be the same")

study_params = {
    "study_name": study_name,
    "study_n_trials": study_n_trials,
    "objective_metrics": metrics,
    "directions": directions,
}
mlflow.log_params(study_params)

# beta is only logged when the F-beta score is one of the objectives.
if "f_beta_score" in metrics:
    mlflow.log_param("beta", beta)

In [None]:
# Create the Optuna study for the configured objectives.
# The previous call passed a misspelled `directon=` keyword, which makes
# `optuna.create_study` fail with a TypeError. `directions` accepts a sequence
# of any length, so a one-element list covers the single-objective case and no
# separate `direction` argument is needed.
study = optuna.create_study(
    directions=directions,
    storage=OPTUNA_STORAGE,
    study_name=study_name,
)

# Label the objective values with the metric names from the configuration.
study.set_metric_names(metrics)

# Run the search: it stops after `study_n_trials` trials or 600 seconds,
# whichever comes first, using all available cores (n_jobs=-1).
study.optimize(
    lambda trial: objective(trial, train_set, valid_set, weights, metrics, beta=beta),
    n_trials=study_n_trials,
    timeout=600,
    n_jobs=-1,
    show_progress_bar=True,
)

In [None]:
# Multi-objective studies get a Pareto-front plot; single-objective studies
# get the intermediate-values plot. Either figure is then logged and shown.
if len(metrics) > 1:
    fig, plot_filename = (
        plot_pareto_front(study, target_names=metrics),
        "pareto_front_plot.png",
    )
else:
    fig, plot_filename = (
        plot_intermediate_values(study),
        "intermediate_values_plot.png",
    )

log_plot(fig, plot_filename)
fig.show()

In [None]:
# Read the Pareto-front trials once: `study.best_trials` is a property, so
# reading it inside the loop would repeat the lookup for every metric.
pareto_trials = study.best_trials
print(f"Number of trials on the Pareto front: {len(pareto_trials)}")

# Maps each optimization direction to the builtin that picks the best value.
selectors = {'maximize': max, 'minimize': min}

# For every objective, report the Pareto-front trial that is best on it.
for i, (metric, direction) in enumerate(zip(metrics, directions)):
    if direction not in selectors:
        # Previously an unexpected direction left `best_trial` unbound, or
        # silently reused the trial from the previous iteration.
        raise ValueError(f"Unsupported optimization direction: {direction!r}")
    best_trial = selectors[direction](pareto_trials, key=lambda t: t.values[i])

    print(f"Metric: {metric}")
    print(f"\tDirection: {direction}")
    print(f"\tTrial number: {best_trial.number}")
    print(f"\tValues: {best_trial.values}")
    print(f"\tParams: {best_trial.params}")

In [19]:
# Gather information about the study's best trials through the project helper
# and hand it to `log_json` under the given file name.
# NOTE(review): presumably `log_json` stores the data as an MLflow artifact --
# confirm against utils.mlflow_utils.
best_trials_info = get_best_trials_info(study, metrics)
log_json(best_trials_info, "best_trials_info.json")

In [20]:
# Keep only the trial numbers of the best trials and record them on the run.
best_trials_numbers = [info['trial_number'] for info in best_trials_info]
mlflow.log_params({"best_trials_numbers": best_trials_numbers})

## **Entrenamiento del modelo con mejores hiperparámetros**

In [None]:
# Hyperparameters of the trial picked by hand from the study results above.
# NOTE(review): `chosen_trial` is a position in `study.trials`; it must be
# updated for every new study and has to point at a completed trial.
chosen_trial = 14
params = study.trials[chosen_trial].params

hidden_size = params['hidden_size']
n_layers = params['n_layers']
lr = params['lr']
batch_size = params['batch_size']
optimizer_name = params['optimizer']
epochs = params['n_epochs']

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Single-output classifier: one logit per individual.
model = LSTMClassifier(num_features, hidden_size, 1, n_layers).to(device)

# The optimizer class is looked up by name in torch.optim.
optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=4)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=4)

# The whole test set is evaluated in one forward pass after every epoch.
X_test_tensor = test_set.tensors[0].to(device)

if 'avg_feats_diff' in metrics:
    train_features_mean = get_features_mean(X_train_tensor, y_train_tensor).to(device)

# The loss module is moved to the model's device so that its `pos_weight`
# buffer does not stay on the CPU when training runs on the GPU.
loss_fn = nn.BCEWithLogitsLoss(
    pos_weight=torch.tensor(weights[1], dtype=torch.float32)
).to(device)

for epoch in tqdm(range(epochs)):
    print(f"Epoch {epoch} -----------------------------------------------------")
    # Explicitly (re-)enter training mode: the evaluation below switches the
    # model to eval mode at the end of every epoch.
    model.train()
    train_step(model, train_loader, loss_fn, optimizer)

    # Evaluate in inference mode and without building an autograd graph, so
    # the metrics are deterministic and no gradient memory is kept for the
    # full test set.
    model.eval()
    with torch.no_grad():
        logits = model(X_test_tensor)
        y_test_pred = predict(logits, loss_fn).squeeze()

    # Extra arguments needed only by some of the configured metrics.
    metrics_kwargs = {}
    if 'avg_feats_diff' in metrics:
        metrics_kwargs['X_valid'] = X_test_tensor
        metrics_kwargs['train_features_mean'] = train_features_mean
    if 'f_beta_score' in metrics:
        metrics_kwargs['beta'] = beta
    metrics_values = compute_metrics(metrics, y_test_tensor, y_test_pred, **metrics_kwargs)
    pprint.pp(metrics_values)

    mlflow.log_metrics(metrics_values, step=epoch)

In [22]:
# Final test-set predictions, computed on the CPU.
model.to('cpu')
# Keep the loss module on the same device as the model (a no-op if it is
# already on the CPU).
loss_fn.to('cpu')

# Inference mode: eval() switches off training-only behaviour such as dropout,
# and no_grad() avoids building an autograd graph for the full test set.
model.eval()
with torch.no_grad():
    test_logits = model(X_test_tensor.to('cpu'))
    y_test_pred = predict(test_logits, loss_fn).squeeze()

In [None]:
# Build the confusion-matrix figure from the test targets and the final
# predictions, pass it to the project's `log_plot` helper under the given file
# name (presumably stored with the MLflow run -- TODO confirm), and display it.
fig = confusion_matrix_plot(y_test_tensor, y_test_pred)
log_plot(fig, "confusion_matrix_plot.png")
fig.show()