# **Importación de Librerías**

In [None]:
%load_ext autoreload
%autoreload 2

import json
import numpy as np
import optuna
import os
import pprint
import time
import torch
torch.cuda.empty_cache()
import torch.nn as nn
import torch.optim as optim

from constants import *

from joblib import Parallel, delayed

from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report

from utils.data_preprocessor import DataPreprocessor
from utils.load_data import *
from utils.train_predict import train_step, validate_step, predict
from utils.metrics import *
from utils.optuna_utils import *
from utils.mlflow_logger import MLflowLogger
from utils.plots import *

from models.instantiate import instantiate_model
from models.lstm_v1 import *
from models.lstm_v2 import *
from models.gru import *
from models.dense import *
from models.conv import *
from models.lstm_conv import *

from tqdm.notebook import tqdm

# Seed NumPy and PyTorch with the same value so sampling and weight
# initialisation are repeatable across runs.
SEED = 13
np.random.seed(SEED)
torch.manual_seed(SEED)

# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

In [None]:
def _as_bool(value):
    """Interpret a configuration flag as a boolean.

    Accepts real JSON booleans (``true``/``false``) as well as their string
    spellings in any letter case (``"True"``, ``"true"``, ...). Every other
    value is treated as ``False``.
    """
    if isinstance(value, bool):
        return value
    return str(value).strip().lower() == "true"


# Read the run configuration from the JSON file next to the notebook.
with open("config.json", "r") as f:
    config = json.load(f)

pprint.pp(config)

# Names of the data group / simulation, as used for folder and file names.
GROUP = "Grupo" + str(config['group'])
SIMULATION = "Simulacion" + str(config['simulation'])
REQ_PERIODS = config['required_periods']
METRICS = config['metrics']
DIRECTIONS = config['directions']
BETA = config['beta']
MODEL_ARCH = config['model_arch']
# Flags may arrive as JSON booleans or as strings; both are handled.
LOG_TO_MLFLOW = _as_bool(config['log_to_mlflow'])
SCALE_DATA = _as_bool(config['scale_data'])

# Temporal features are the lagged columns y(t-REQ_PERIODS) ... y(t-1),
# ordered from the oldest lag to the most recent one.
TEMP_FEATS = [f'y(t-{i})' for i in range(REQ_PERIODS, 0, -1)]
STAT_FEATS = ['inicio_prog']
FEATS = STAT_FEATS + TEMP_FEATS

In [None]:
print(f"Log to MLflow: {LOG_TO_MLFLOW}")

# One experiment per data group: the group name is appended to the base name.
experiment_name = EXPERIMENT_NAME + "-" + GROUP
mlflow_logger = MLflowLogger(
    LOG_TO_MLFLOW, TRACKING_SERVER_URI, experiment_name, EXPERIMENT_TAGS
)

In [None]:
# Folder holding every simulation of the selected group, plus the JSON file
# describing the parameters the group's data was generated with.
GROUP_DIR = os.path.join(DATA_DIR, GROUP)
GROUP_PARAMS_FILE = os.path.join(GROUP_DIR, f"params_{GROUP}.json")

# The params file is optional: when absent we only report it and move on.
if not os.path.exists(GROUP_PARAMS_FILE):
    print(f"Group params file not found: {GROUP_PARAMS_FILE}")
else:
    with open(GROUP_PARAMS_FILE, 'r') as f:
        group_params = json.load(f)
    mlflow_logger.log_param("data_params", group_params)

In [None]:
# Load the selected simulation. Every later cell depends on `df`, so a missing
# file is a hard error: continuing would either fail later with a NameError or
# silently reuse a stale `df` left in the kernel by a previous run.
stata_filepath = os.path.join(GROUP_DIR, SIMULATION + ".dta")
if not os.path.exists(stata_filepath):
    raise FileNotFoundError(f"File {stata_filepath} not found.")
df = pd.read_stata(stata_filepath)

In [None]:
# Record the run configuration so the experiment can be traced back later.
run_params = {
    "group": GROUP,
    "simulation": SIMULATION,
    "filepath": stata_filepath,
    "required_periods": REQ_PERIODS,
    "scale_data": SCALE_DATA,
    "model_arch": MODEL_ARCH,
    "metrics": METRICS,
}
mlflow_logger.log_params(run_params)

# Beta is only meaningful when the F-beta score is one of the objectives.
if 'f_beta_score' in METRICS:
    mlflow_logger.log_param("beta", BETA)

# **Conjuntos de datos**

**Terminología**:
* Tipo 1: individuos tratados.
* Tipo 2: individuos de control (i.e. podrían haber sido tratados pero por alguna razón no lo fueron).
* Tipo 3: ni tratados ni de control.

#### **1. Separamos en tipo 1, tipo 2 y tipo 3.**

In [None]:
# Split the raw data into the three individual types described above.
type1_df, type2_df, type3_df = get_dfs(df, REQ_PERIODS)

# Number of distinct program start periods. This tells us how many duplicates
# exist for each type 2 and type 3 individual.
inicio_prog_type1 = type1_df['inicio_prog']
min_inicio_prog = inicio_prog_type1.min()
max_inicio_prog = inicio_prog_type1.max()
amount_inicio_prog = (max_inicio_prog - min_inicio_prog) + 1

#### **2. Separamos en train (que es sobre el que después se va a hacer KFold) y test.**

In [None]:
# Sample the treated (type 1) individuals used for training.
type1_ids = type1_df.index.unique()
n_type1_train = 1000
type1_train = np.random.choice(type1_ids, n_type1_train, replace=False)
type1_train_df = type1_df.loc[type1_train]

# Sample the type 3 individuals used for training.
type3_ids = type3_df.index.unique()
n_type3_train = 1000
type3_train = np.random.choice(type3_ids, n_type3_train, replace=False)
type3_train_df = type3_df.loc[type3_train]

# The ids that are not in type3_train are candidates for the test set.
# np.setdiff1d returns them sorted, so the draw below depends only on the
# seed and not on set iteration order (an implementation detail that is
# hash-randomised for string ids).
n_type3_test = 2500
type3_test_candidates = np.setdiff1d(type3_ids, type3_train)
type3_test = np.random.choice(type3_test_candidates, n_type3_test, replace=False)
type3_test_df = type3_df.loc[type3_test]

# Persist the sampled ids so the exact split can be recovered later.
for name, ids in [("ninis_ids_train", type3_train), ("ninis_ids_test", type3_test)]:
    mlflow_logger.log_json({name: ids.tolist(), "amount": len(ids)}, f"{name}.json")

In [None]:
# Training set: treated individuals plus sampled type 3 individuals.
train_df = pd.concat([type1_train_df, type3_train_df])
X_train_df = train_df[FEATS]
y_train_df = train_df['target']

# Test set: control individuals plus the held-out type 3 individuals.
test_df = pd.concat([type2_df, type3_test_df])
X_test_df = test_df[FEATS]
y_test_df = test_df['target']

# "balanced" class weights computed from the training labels only.
train_classes = np.unique(y_train_df)
weights = compute_class_weight(
    class_weight="balanced", classes=train_classes, y=y_train_df
)

In [None]:
def _class_percentages(labels):
    """Return ``{class: percentage}`` for a label Series, rounded to 2 decimals."""
    fractions = labels.value_counts(normalize=True).to_dict()
    return {cls: round(frac * 100, 2) for cls, frac in fractions.items()}


# Class balance of each split, expressed in percent.
train_proportions = _class_percentages(y_train_df)
print(f"Train proportions: {train_proportions}")

test_proportions = _class_percentages(y_test_df)
print(f"Test proportions:  {test_proportions}")

mlflow_logger.log_params({
    "train_proportions": train_proportions,
    "test_proportions": test_proportions
})

#### **3. Estandarizamos ambos conjuntos en base a los datos de entrenamiento.**

Las columnas de nuestro dataset corresponden a distintos pasos de tiempo para 
cada individuo. Si estandarizamos a lo largo de cada columna, perdemos la relación
que hay entre los valores de una misma serie de tiempo, que es lo que nos interesa
en este caso (que la red identifique la tendencia creciente).

Lo que vamos a probar es escalar a lo largo de las filas.

In [None]:
data_preprocessor = DataPreprocessor(STAT_FEATS, TEMP_FEATS)

# Scaling is optional and controlled by the run configuration. When disabled,
# the "scaled" names simply alias the unscaled frames so later cells can use
# one set of names either way.
if not SCALE_DATA:
    X_train_df_scaled, X_test_df_scaled = X_train_df, X_test_df
else:
    # NOTE(review): this calls a private method of DataPreprocessor; its exact
    # scaling strategy is not visible here - confirm against the class.
    X_train_df_scaled, X_test_df_scaled = data_preprocessor._scale_data(
        X_train_df, X_test_df
    )

In [None]:
# Treated and type 3 examples come from the training split; control examples
# are the positive class of the test split.
treated_df_scaled = X_train_df_scaled[y_train_df==1]
control_df_scaled = X_test_df_scaled[y_test_df==1]
nini_df_scaled = X_train_df_scaled[y_train_df==0]

# The loop variable is deliberately NOT called `df`: that name holds the raw
# dataset loaded earlier, and overwriting it would break earlier cells when
# they are re-run in the same kernel.
for group_df, label in [
    (treated_df_scaled, "Tratados"), (control_df_scaled, "de Control"), (nini_df_scaled, "Ninis")
]:
    fig, ax = plot_time_series(group_df[TEMP_FEATS], 15, label)
    mlflow_logger.log_plot(fig, f"plot_time_series_{label}.png")

#### **4. Logueamos los datasets finales a MLflow.**

In [None]:
# We build the datasets with the target column to load them into mlflow
train_df = X_train_df_scaled.copy()
train_df['target'] = y_train_df
mlflow_logger.log_input(train_df, "train")

test_df = X_test_df_scaled.copy()
test_df['target'] = y_test_df
mlflow_logger.log_input(test_df, "test")

#### **5. Obtenemos estructura necesaria según la red que queramos usar y construimos Datasets**

In [None]:
# Build the train/test dataset objects from the feature frames and labels.
# MODEL_ARCH is passed along, presumably so the preprocessor can shape the
# inputs for the chosen network - confirm against DataPreprocessor.
train_set, test_set = data_preprocessor.build_datasets(
    X_train_df_scaled, X_test_df_scaled, y_train_df, y_test_df, MODEL_ARCH
)

# **Búsqueda de hiperparámetros con Optuna**

#### **6. Dejamos seleccionado el constructor de modelo para la búsqueda de hiperparámetros**

In [None]:
match MODEL_ARCH:
    case "lstm_v1":
        define_model = define_lstm_v1_model
        input_size = 2
    case "lstm_v2":
        define_model = define_lstm_v2_model
        input_size = 1
    case "gru":
        define_model = define_gru_model
        input_size = 2
    case "dense":
        define_model = define_dense_model
        input_size = len(FEATS)
    case "conv":
        define_model = define_conv_model
        input_size = 1
    case "lstm_conv":
        define_model = define_lstm_conv_model
        input_size = 1
print(f"Selected model: {MODEL_ARCH}, Input size: {input_size}")

#### **7. Definimos los parámetros de la búsqueda de hiperparámetros**

In [None]:
# Every objective metric needs exactly one optimisation direction.
if len(METRICS) != len(DIRECTIONS):
    raise ValueError("The number of metrics and directions should be the same")

# Timestamped study name so each run gets its own study.
timestamp = time.strftime("%d%m%Y-%H%M%S")
study_name = f"study_{timestamp}"

optuna_params = dict(
    optuna_study_name=study_name,
    objective_metrics=METRICS,
    directions=DIRECTIONS,
)

# Beta is recorded only when the F-beta score is one of the objectives.
if "f_beta_score" in METRICS:
    optuna_params["beta"] = BETA

In [None]:
# Optuna takes `direction` for single-objective studies and `directions` for
# multi-objective ones; only the one matching the number of metrics is set.
n_objectives = len(METRICS)
study = optuna.create_study(
    direction=None if n_objectives != 1 else DIRECTIONS[0],
    directions=None if n_objectives <= 1 else DIRECTIONS,
    storage=OPTUNA_STORAGE,
    study_name=study_name,
    load_if_exists=True,
)
study.set_metric_names(METRICS)

In [None]:
def run_worker(
    n_trials, study_name, optuna_storage, define_model, input_size,
    train_set, loss_fn, metrics, beta
):
    """Run a batch of Optuna trials against an existing shared study.

    The study is loaded by name from ``optuna_storage`` and optimised for
    ``n_trials`` trials using ``objective_cv`` as the objective.

    Args:
        n_trials: Number of trials this worker asks Optuna to run.
        study_name: Name of the study to load.
        optuna_storage: Storage the study is loaded from.
        define_model: Model factory forwarded to ``objective_cv``.
        input_size: Model input size forwarded to ``objective_cv``.
        train_set: Training dataset forwarded to ``objective_cv``.
        loss_fn: Loss function forwarded to ``objective_cv``.
        metrics: Objective metric names forwarded to ``objective_cv``.
        beta: Forwarded to ``objective_cv`` as the ``beta`` keyword.
    """
    def _objective(trial):
        # Bind the fixed arguments; Optuna supplies only the trial.
        return objective_cv(
            trial, define_model, input_size, train_set, loss_fn, metrics, beta=beta
        )

    shared_study = optuna.load_study(study_name=study_name, storage=optuna_storage)
    # n_jobs=-1 asks Optuna to parallelise trials inside this worker as well.
    shared_study.optimize(_objective, n_trials=n_trials, n_jobs=-1)

In [None]:
# Binary cross-entropy on logits; the positive class is weighted with the
# "balanced" class weight computed for label 1 on the training set.
pos_weight = torch.tensor(weights[1], dtype=torch.float32)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

print(f"Starting hyperparameter optimization with {N_PROCESSES} processes")

# Every worker receives the same arguments and attaches to the same study.
worker_args = (
    TRIALS_PER_PROCESS, study_name, OPTUNA_STORAGE, define_model, input_size,
    train_set, loss_fn, METRICS, BETA,
)
worker_tasks = [delayed(run_worker)(*worker_args) for _ in range(N_PROCESSES)]
Parallel(n_jobs=N_PROCESSES)(worker_tasks)

In [None]:
# The Pareto front is only requested when more than one metric is optimised.
if len(METRICS) > 1:
    fig = pareto_front(study, METRICS, DIRECTIONS)

In [None]:
# Collect the best trials of the study and their trial numbers.
best_trials_info = get_best_trials_info(study, METRICS)
best_trials_numbers = [trial['trial_number'] for trial in best_trials_info]

# Pretty-print the structure itself: passing a pre-formatted string to
# pprint would only print its quoted repr.
print("Best trials info:")
pprint.pp(best_trials_info)
print(f"Best trials numbers: {best_trials_numbers}")

mlflow_logger.log_json(best_trials_info, "best_trials_info.json")
optuna_params["best_trials_numbers"] = best_trials_numbers

In [None]:
if len(best_trials_numbers) == 1:
    selected_trial = best_trials_numbers[0]
else:
    # Break ties by choosing the trial with the smallest hidden_size.
    min_hidden_size = min(best_trials_info, key=lambda x: x['params']['hidden_size'])
    selected_trial = min_hidden_size['trial_number']

# `FrozenTrial.value` raises RuntimeError on multi-objective studies, so read
# `values` instead: a single objective keeps logging a scalar (as before),
# several objectives are logged as {metric name: value}.
selected_trial_values = study.trials[selected_trial].values
if selected_trial_values is None:
    metric_best_value = None
elif len(selected_trial_values) == 1:
    metric_best_value = selected_trial_values[0]
else:
    metric_best_value = dict(zip(METRICS, selected_trial_values))

optuna_params["selected_trial_number"] = selected_trial
optuna_params["metric_best_value"] = metric_best_value
mlflow_logger.log_param("optuna_params", optuna_params)

mlflow_logger.log_param("metric_best_value_in_optimization", optuna_params["metric_best_value"])

# **Entrenamiento del modelo con mejores hiperparámetros**

In [None]:
# Start from the hyperparameters of the selected trial and add the fixed
# training settings that were not part of the search.
params = study.trials[selected_trial].params
params.update({
    'num_layers': N_LAYERS,
    'n_epochs': N_EPOCHS,
    'optimizer': OPTIMIZER,
    'model_name': MODEL_ARCH,
})
mlflow_logger.log_param("train_params", params)

model = instantiate_model(MODEL_ARCH, input_size, params)
model = model.to(device)
mlflow_logger.log_model_architecture(model)

# The optimizer class is looked up by name in torch.optim and uses the
# learning rate found by the search.
lr = params['lr']
optimizer_cls = getattr(optim, OPTIMIZER)
optimizer = optimizer_cls(model.parameters(), lr=lr)

# The batch size from the search drives both data loaders; only the training
# loader shuffles.
batch_size = params['batch_size']
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=8)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=8)

# Loss function: BCE on logits, positive class weighted by the class weight
# computed for label 1.
pos_weight = torch.tensor(weights[1], dtype=torch.float32)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

In [None]:
# Per-epoch history: losses on both splits and each metric on the test split.
epoch_losses_train, epoch_losses_test = [], []
epoch_metrics_test = {}
for metric in METRICS:
    epoch_metrics_test[metric] = []

print("Training model with best hyperparameters")
for epoch in tqdm(range(N_EPOCHS)):
    print(f"Epoch {epoch} -----------------------------------------------------")

    # Loss reported by the training step itself; used for monitoring only.
    avg_loss_training = train_step(model, train_loader, loss_fn, optimizer)
    print(f"  Average loss in train set during epoch: {avg_loss_training:.6f}")

    # Loss on the training set, measured after the epoch's updates.
    avg_loss_train, _ = validate_step(model, train_loader, loss_fn, METRICS, beta=BETA)
    print(f"  Average loss in train set: {avg_loss_train:.6f}")
    epoch_losses_train.append(avg_loss_train)

    # Loss and metrics on the test set.
    avg_loss_test, metrics_test = validate_step(model, test_loader, loss_fn, METRICS, beta=BETA)
    print(f"  Average loss in test set: {avg_loss_test:.6f}")
    epoch_losses_test.append(avg_loss_test)

    print(f"  Metrics in test set:")
    for metric, value in metrics_test.items():
        print(f"    - {metric}: {value:.6f}")
        epoch_metrics_test[metric].append(value)
print("Training finished")

In [None]:
# Log, for every metric, the value recorded on the test set in the last epoch.
for metric in METRICS:
    mlflow_logger.log_param(f"{metric}_test_after_training", epoch_metrics_test[metric][-1])

In [None]:
# Take one training batch as the example input logged with the model. The
# last element of the batch is dropped (presumably the labels - confirm
# against the dataset classes); everything before it is model input.
*input_batch, _ = next(iter(train_loader))

if len(input_batch) != 1:
    # Two-input models: the first tensor is logged as "temporal", the second
    # as "static".
    example_input = {
        "temporal": input_batch[0].cpu().numpy(),
        "static": input_batch[1].cpu().numpy()
    }
else:
    example_input = input_batch[0].cpu().numpy()

# The model is moved to the CPU before being logged.
mlflow_logger.log_model(model.to('cpu'), "trained_model", example_input)

In [None]:
# Plot the per-epoch train and test losses, show the figure and log it.
fig, ax = epoch_vs_loss_plot(epoch_losses_train, epoch_losses_test)
fig.show()

mlflow_logger.log_plot(fig, "epoch_vs_loss.png")

In [None]:
# Display names for the metrics that need special formatting; any other
# metric falls back to its capitalised identifier.
metric_display_names = {
    'f_beta_score': f"Puntaje $F_{{{BETA}}}$",
    'f1_score': "Puntaje $F_1$",
}

for metric in METRICS:
    metric_name = metric_display_names.get(metric, metric.capitalize())
    fig, ax = epoch_vs_metric_plot(metric_name, epoch_metrics_test[metric])
    fig.show()

    mlflow_logger.log_plot(fig, f"epoch_vs_{metric}.png")

# **Evaluación del modelo entrenado**

In [None]:
# Evaluate on the CPU with the model in eval mode.
model.eval()
model.to('cpu')

# Inference only: disable autograd so no graph is built for the whole test
# set and the outputs carry no gradient history.
with torch.no_grad():
    # Single-input datasets expose `.tensors`; two-input datasets expose
    # temporal/static/labels attributes. The check is explicit so that an
    # AttributeError raised inside the model is not mistaken for a dataset
    # type mismatch.
    if hasattr(test_set, "tensors"):
        X_test_tensor, y_test_tensor = test_set.tensors
        y_test_pred = model(X_test_tensor.to('cpu'))
    else:
        X_test_temporal_tensor, X_test_static_tensor, y_test_tensor = (
            test_set.temporal_data, test_set.static_data, test_set.labels
        )
        y_test_pred = model(X_test_temporal_tensor.to('cpu'), X_test_static_tensor.to('cpu'))

    # Convert the raw model outputs into predictions with the project helper.
    y_test_pred = predict(y_test_pred, loss_fn).squeeze()

In [None]:
# Confusion matrix of true labels vs. predictions on the test set.
fig, ax = confusion_matrix_plot(y_test_tensor, y_test_pred)
fig.show()

mlflow_logger.log_plot(fig, "confusion_matrix_plot.png")

In [None]:
# The text form of the report is printed; the dict form is logged as JSON.
report_dict = classification_report(y_test_tensor, y_test_pred, output_dict=True)
report_str = classification_report(y_test_tensor, y_test_pred)
print(report_str)

mlflow_logger.log_json(report_dict, "classification_report.json")

In [None]:
# ROC curve on the test set.
# NOTE(review): y_test_pred is the output of `predict`; if those are hard
# class labels rather than scores the curve has a single operating point -
# confirm what `predict` returns.
fig, ax = roc_curve_plot(y_test_tensor, y_test_pred)
fig.show()

mlflow_logger.log_plot(fig, "roc_curve_plot.png")

In [None]:
# End the logger's run now that everything has been logged.
mlflow_logger.end_run()