# **Importación de Librerías**

In [None]:
%load_ext autoreload
%autoreload 2

import json
import numpy as np
import mlflow
import mlflow.exceptions
import optuna
import os
import pprint
import time
import torch
import torch.nn as nn
import torch.optim as optim

from constants import *

from imblearn.over_sampling import SMOTE

from torch.utils.data import TensorDataset

from optuna.visualization import plot_pareto_front, plot_intermediate_values
from optuna.study import MaxTrialsCallback
from optuna.trial import TrialState

from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import StandardScaler

from utils.data_classes import TemporalStaticDataset
from utils.load_data import *
from utils.train_predict import train_step, validate_step_with_metrics, predict
from utils.metrics import *
from utils.optuna_utils import *
from utils.mlflow_utils import *

from models.lstm_v1 import *
from models.lstm_v2 import *
from models.gru import *
from models.dense import *
from models.fcn import *

from tqdm.notebook import tqdm

# Seed NumPy's and PyTorch's global random number generators with the same
# fixed value (13) so the random draws made below are repeatable.
np.random.seed(13)
torch.manual_seed(13)

# Use the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

In [None]:
# Point the MLflow client at the tracking server and select the experiment
# that every run of this notebook is logged under.
mlflow.set_tracking_uri(TRACKING_SERVER_URI)

# Create the experiment only when it does not exist yet. Checking first
# (instead of catching RestException and printing it) keeps unrelated server
# errors from being silently swallowed and also works with tracking backends
# that do not raise RestException for an already-existing experiment.
# The explicit create_experiment call is what attaches EXPERIMENT_TAGS.
if mlflow.get_experiment_by_name(EXPERIMENT_NAME) is None:
    mlflow.create_experiment(name=EXPERIMENT_NAME, tags=EXPERIMENT_TAGS)
mlflow.set_experiment(EXPERIMENT_NAME)

# set_active_run is a project helper (star-imported from utils); presumably it
# starts or resumes the MLflow run used by the rest of the notebook -- confirm.
run = set_active_run()

In [None]:
# Read the experiment configuration from config.json (resolved relative to
# the current working directory).
with open("config.json", "r") as config_file:
    config = json.load(config_file)

# Configuration values used throughout the notebook.
# NOTE: the 'partitions' key is currently not read (it was disabled).
GROUP = config['group']
SIMULATION = config['simulation']
REQ_PERIODS = config['required_periods']
METRICS = config['metrics']
DIRECTIONS = config['directions']
BETA = config['beta']
MODEL_ARCH = config['model_arch']

# Temporal feature columns, ordered from the oldest lag to the most recent:
# 'y(t-REQ_PERIODS)', ..., 'y(t-1)'.
TEMP_FEATS = [f'y(t-{lag})' for lag in reversed(range(1, REQ_PERIODS + 1))]
# Static (time-invariant) feature columns.
STAT_FEATS = ['inicio_prog']
# Full feature list: static features first, then the temporal ones.
FEATS = [*STAT_FEATS, *TEMP_FEATS]

# Load the Stata file located at DATA_DIR/GROUP/SIMULATION.
stata_filepath = os.path.join(DATA_DIR, GROUP, SIMULATION)
df = pd.read_stata(stata_filepath)

In [None]:
# Record which data this run was built from as MLflow parameters.
# NOTE: "partitions" is currently not logged (it was disabled).
data_params = {
    "group": GROUP,
    "simulation": SIMULATION,
    "filepath": stata_filepath,
    "required_periods": REQ_PERIODS,
}
mlflow.log_params(data_params)

# **Conjuntos de datos**

**Terminología**:
* Tipo 1: individuos tratados.
* Tipo 2: individuos de control (i.e. podrían haber sido tratados pero por alguna razón no lo fueron)
* Tipo 3: ni tratados ni de control.

#### **1. Separamos en tipo 1, tipo 2 y tipo 3.**

In [None]:
# Split the raw data into one dataframe per individual type (project helper).
type1_df, type2_df, type3_df = get_dfs(df, REQ_PERIODS)

# Sanity check: for type-2 individuals, the rows with target == 1 must carry
# the same inicio_prog as the first inicio_prog recorded for that id in the
# original dataset.
type2_positive_start = type2_df.loc[type2_df['target'] == 1, 'inicio_prog']
type2_original_start = df[df['tipo'] == 2].groupby('id')['inicio_prog'].first()
assert (type2_positive_start == type2_original_start).all()

# Number of distinct program-start periods, taken from the range observed in
# type 1. This tells us how many duplicates exist for each type-2 and type-3
# individual.
type1_starts = type1_df['inicio_prog']
min_inicio_prog = type1_starts.min()
max_inicio_prog = type1_starts.max()
amount_inicio_prog = max_inicio_prog - min_inicio_prog + 1

#### **2. Separamos en train (que es sobre el que después se va a hacer KFold) y test.**

In [None]:
# Number of rows of each type (duplicated individuals included).
n1 = len(type1_df)
n2 = len(type2_df)
n3 = len(type3_df)

# Total number of rows in the POST-TRANSFORMATION dataset. These are the
# samples used for training and testing, which is why duplicates count.
n = n1 + n2 + n3

# Nominal 70/30 split sizes. NOTE: these are informational only right now;
# the type-3 sample sizes actually used are the fixed values set below.
train_size = int(0.7 * n)
test_size = n - train_size

# In type2_df and type3_df the same individual appears several times, once per
# inicio_prog, and therefore with different y(t-1), ..., y(t-required_periods).
# All "copies" of an individual must end up on the same side of the split, so
# we sample individual ids (the dataframe index), not rows.
type3_ids = type3_df.index.unique()

# Every type-1 individual goes to the training set; the rest of the training
# set is made of randomly chosen type-3 individuals. The count used to be
# derived as int((train_size - n1) / amount_inicio_prog); a fixed number of
# ids is used instead.
n_type3_train = 1000
n_type3_test = 500

# Fail early with a clear message instead of a sampling error further down.
if n_type3_train + n_type3_test > len(type3_ids):
    raise ValueError(
        f"Requested {n_type3_train} train + {n_type3_test} test type-3 ids, "
        f"but only {len(type3_ids)} distinct type-3 ids are available"
    )

# Randomly pick n_type3_train type-3 ids for training and keep all their rows.
type3_train = np.random.choice(type3_ids, n_type3_train, replace=False)
type3_train_df = type3_df.loc[type3_train]

# Test ids are drawn from the ids that were NOT selected for training.
# np.setdiff1d returns the remaining ids sorted, so the seeded draw below no
# longer depends on the iteration order of a Python set.
type3_test_candidates = np.setdiff1d(type3_ids, type3_train)
type3_test = np.random.choice(
    type3_test_candidates, n_type3_test, replace=False
)
type3_test_df = type3_df.loc[type3_test]

In [None]:
# Training rows: every type-1 row plus the sampled type-3 rows. The feature
# matrix and target are passed through SMOTE, so X_train_df / y_train_df hold
# the resampled data (only the training set is resampled).
train_df = pd.concat([type1_df, type3_train_df])
smote = SMOTE(sampling_strategy=0.6, k_neighbors=5, random_state=13)
X_train_df, y_train_df = smote.fit_resample(
    train_df[FEATS], train_df['target']
)

# Test rows: every type-2 row plus the sampled type-3 test rows, left as-is.
test_df = pd.concat([type2_df, type3_test_df])
X_test_df = test_df[FEATS]
y_test_df = test_df['target']

# "Balanced" class weights computed on the resampled training target, one
# weight per class in the order returned by np.unique.
train_classes = np.unique(y_train_df)
weights = compute_class_weight(
    class_weight="balanced", classes=train_classes, y=y_train_df
)

In [None]:
# Share of each target class in the (resampled) training set, in percent.
y_train_df.value_counts() / len(y_train_df) * 100

In [None]:
# Share of each target class in the test set, in percent.
y_test_df.value_counts() / len(y_test_df) * 100

#### **3. Estandarizamos ambos conjuntos en base a los datos de entrenamiento.**

In [None]:
# Fit the scaler on the training features only, then apply that same
# transformation to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train_df)

X_train_scaled = scaler.transform(X_train_df)
X_test_scaled = scaler.transform(X_test_df)

# Wrap the scaled arrays back into dataframes that keep the original column
# names and index of the unscaled data.
X_train_df_scaled = pd.DataFrame(
    data=X_train_scaled, index=X_train_df.index, columns=X_train_df.columns
)
X_test_df_scaled = pd.DataFrame(
    data=X_test_scaled, index=X_test_df.index, columns=X_test_df.columns
)

In [None]:
# Quick look at the first rows of the standardized training features.
X_train_df_scaled.head()

#### **4. Logueamos los datasets finales a MLFlow.**

In [None]:
# Attach the target column to the scaled features so each split can be logged
# to MLflow as a single dataset with 'target' marked as the label column.
# NOTE: this rebinds train_df / test_df to the scaled versions.
train_df = X_train_df_scaled.assign(target=y_train_df)
train_dataset = mlflow.data.from_pandas(train_df, targets='target')
mlflow.log_input(train_dataset, context="train")

test_df = X_test_df_scaled.assign(target=y_test_df)
test_dataset = mlflow.data.from_pandas(test_df, targets='target')
mlflow.log_input(test_dataset, context="test")

#### **5. Obtenemos la estructura necesaria según la red que queramos usar y construimos los Datasets.**

Basados en el documento `docs/lstm.md`, transformemos los datos que tenemos a la forma que necesitan las LSTM: `(batch_size, sequence_length, num_features)`.

Cada fila es un individuo y en cada fila ya tenemos todo lo que necesitamos, los 4 datos temporales (`y(t-4)`, `y(t-3)`, `y(t-2)`, `y(t-1)`) y además el dato estático (`inicio_prog`) que lo vamos a tener que repetir cuatro veces para tener la dimensión que deseamos.

In [None]:
# Build the train/test torch datasets in the layout expected by the selected
# model architecture.
#
# Labels are float32 because the loss used later is BCEWithLogitsLoss
# (CrossEntropyLoss would need dtype long instead).
y_train_tensor = torch.tensor(y_train_df.values, dtype=torch.float32)
y_test_tensor  = torch.tensor(y_test_df.values , dtype=torch.float32)

if any(keyword in MODEL_ARCH for keyword in ['rnn', 'gru', 'lstm_v1']):
    # Single-input recurrent models: temporal and static features are combined
    # by get_lstm_input into one tensor.
    # num_features is the model's input size; presumably one temporal value
    # plus the repeated static feature per time step -- confirm against
    # get_lstm_input.
    num_features = 2
    X_train_tensor = get_lstm_input(X_train_df_scaled, TEMP_FEATS, STAT_FEATS)
    X_test_tensor  = get_lstm_input(X_test_df_scaled , TEMP_FEATS, STAT_FEATS)
    train_set = TensorDataset(X_train_tensor, y_train_tensor)
    test_set  = TensorDataset(X_test_tensor , y_test_tensor)

elif 'dense' in MODEL_ARCH:
    # Dense model: one flat row of scaled features per sample.
    X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
    X_test_tensor  = torch.tensor(X_test_scaled, dtype=torch.float32)
    train_set = TensorDataset(X_train_tensor, y_train_tensor)
    test_set  = TensorDataset(X_test_tensor , y_test_tensor)

elif any(keyword in MODEL_ARCH for keyword in ['lstm_v2', 'fcn']):
    # Two-input models: temporal sequence and static feature are kept apart.
    X_train_temp_tensor = get_lstm_input(X_train_df_scaled, TEMP_FEATS)
    X_test_temp_tensor  = get_lstm_input(X_test_df_scaled , TEMP_FEATS)
    # Use the STANDARDIZED static feature, consistent with the temporal
    # features above and with the other branches (the unscaled X_train_df /
    # X_test_df were used here before).
    X_train_static_tensor = torch.tensor(
        X_train_df_scaled['inicio_prog'].values, dtype=torch.float32
    ).view(-1, 1)
    X_test_static_tensor  = torch.tensor(
        X_test_df_scaled['inicio_prog'].values , dtype=torch.float32
    ).view(-1, 1)
    train_set = TemporalStaticDataset(
        X_train_temp_tensor, X_train_static_tensor, y_train_tensor
    )
    test_set = TemporalStaticDataset(
        X_test_temp_tensor, X_test_static_tensor, y_test_tensor
    )

else:
    # Without this branch an unknown architecture would leave train_set and
    # test_set undefined (or stale from a previous execution of the cell).
    raise ValueError(
        f"Unsupported model architecture in config 'model_arch': {MODEL_ARCH!r}"
    )

## **Búsqueda de hiperparámetros con Optuna**

#### **6. Dejamos seleccionado el constructor de modelo para la búsqueda de hiperparámetros**

In [None]:
match MODEL_ARCH:
    case "lstm_v1":
        define_model = define_lstm_v1_model
    case "lstm_v2":
        define_model = define_lstm_v2_model
    case "gru":
        define_model = define_gru_model
    case "dense":
        define_model = define_dense_model
    case "fcn":
        define_model = define_fcn_model

#### **7. Definimos los parámetros de la búsqueda de hiperparámetros**

In [None]:
# The study is named after the moment this cell runs, so every execution
# produces a distinct study name.
timestamp = time.strftime("%Y%m%d-%H%M%S")
study_name = f"study_{timestamp}"
# Trial budget of the hyperparameter search.
study_n_trials = 30

# Each objective metric needs exactly one optimization direction.
metrics_match_directions = len(METRICS) == len(DIRECTIONS)
if not metrics_match_directions:
    raise ValueError("The number of metrics and directions should be the same")

# Record the search setup in MLflow.
search_params = {
    "optuna_study_name": study_name,
    "optuna_study_n_trials": study_n_trials,
    "objective_metrics": METRICS,
    "directions": DIRECTIONS,
}
mlflow.log_params(search_params)

# BETA is only logged when the F-beta score is one of the objectives.
if "f_beta_score" in METRICS:
    mlflow.log_param("f_beta_score", BETA)

In [None]:
# Create the Optuna study: single-objective when there is one metric
# (`direction`), multi-objective otherwise (`directions`). Only one of the two
# arguments is set at a time.
study = optuna.create_study(
    direction=DIRECTIONS[0] if len(METRICS) == 1 else None,
    directions=DIRECTIONS if len(METRICS) > 1 else None,
    storage=OPTUNA_STORAGE,
    study_name=study_name
)
study.set_metric_names(METRICS)

# Run the search. The trial budget is study_n_trials -- the same value that is
# logged to MLflow as "optuna_study_n_trials" and given to MaxTrialsCallback.
# It used to be a hard-coded 20 here, which made the logged budget wrong and
# meant the callback threshold could never be reached.
# The search also stops after 600 seconds, whichever comes first.
study.optimize(
    lambda trial: objective_cv(
        trial, define_model, train_set, weights, METRICS, beta=BETA
    ),
    n_trials=study_n_trials,
    timeout=600,
    n_jobs=-1,
    callbacks=[MaxTrialsCallback(study_n_trials, states=(TrialState.COMPLETE,))],
    show_progress_bar=True
)

In [None]:
# Multi-objective studies only: for each metric, report the Pareto-front trial
# that is best on that metric alone, then plot and log the Pareto front.
if len(METRICS) > 1:
    pareto_trials = study.best_trials
    print(f"Number of trials on the Pareto front: {len(pareto_trials)}")

    for idx, (metric, direction) in enumerate(zip(METRICS, DIRECTIONS)):
        # t.values[idx] is the trial's value for the idx-th objective.
        if direction == 'maximize':
            best_trial = max(pareto_trials, key=lambda t: t.values[idx])
        elif direction == 'minimize':
            best_trial = min(pareto_trials, key=lambda t: t.values[idx])

        print(f"Metric: {metric}")
        print(f"\tDirection: {direction}")
        print(f"\tTrial number: {best_trial.number}")
        print(f"\tValues: {best_trial.values}")
        print(f"\tParams: {best_trial.params}")

    fig = plot_pareto_front(study, target_names=METRICS)
    log_plot(fig, "pareto_front_plot.png")
    fig.show()
# Single-objective studies produce no plot here; a plot_intermediate_values
# figure ("intermediate_values_plot.png") was tried and is currently disabled.

In [None]:
# Collect a summary of the best trials (project helper) and the trial numbers
# it contains.
best_trials_info = get_best_trials_info(study, METRICS)
best_trials_numbers = []
for info in best_trials_info:
    best_trials_numbers.append(info['trial_number'])

# Log the full summary as a JSON file via the project helper, and the trial
# numbers as an MLflow parameter.
log_json(best_trials_info, "best_trials_info.json")
mlflow.log_params({"best_trials_numbers": best_trials_numbers})

In [None]:
# Display the best-trials summary so a trial can be chosen in the next cell.
best_trials_info

In [None]:
# Ask interactively for one of the best trial numbers and keep asking until a
# valid one is entered.
# NOTE: this makes the rest of the notebook depend on manual input.
while True:
    answer = input(
        f"Enter the trial number. Choose from {best_trials_numbers}: "
    )

    # Reject anything that is not an integer.
    try:
        selected_trial = int(answer)
    except ValueError:
        print("Invalid input. Please enter a valid number.")
        continue

    # Reject integers that are not among the offered trial numbers.
    if selected_trial not in best_trials_numbers:
        print(f"Invalid input. Please select a number from {best_trials_numbers}.")
        continue

    print(f"You selected Trial Number: {selected_trial}")
    break

mlflow.log_param("selected_trial_number", selected_trial)

## **Entrenamiento del modelo con mejores hiperparámetros**

In [None]:
# Hyperparameters of the trial selected above.
# NOTE(review): study.trials is indexed by position; this assumes the trial
# number equals its position in that list -- confirm.
params = study.trials[selected_trial].params

# Common parameters for all models
epochs = params['n_epochs']
dropout = params['dropout']

# Model specific parameters
if any(keyword in MODEL_ARCH for keyword in ['rnn', 'gru', 'lstm_v1']):
    # NOTE(review): this branch always builds LSTMClassifier_v1, also when
    # MODEL_ARCH is 'gru' or 'rnn' -- confirm this is intended.
    hidden_size = params['hidden_size']
    n_layers = params['n_layers']
    model = LSTMClassifier_v1(
        input_size=num_features,
        hidden_size=hidden_size,
        output_size=1,
        num_layers=n_layers,
        dropout=dropout
    )

elif MODEL_ARCH == "dense":
    n_layers = params['n_layers']
    # One hidden-layer width per layer, stored by the search as n_units_l<i>.
    hidden_sizes = [params[f"n_units_l{i}"] for i in range(n_layers)]
    input_size = len(FEATS)
    model = DenseClassifier(input_size, hidden_sizes, dropout)

else:
    # 'lstm_v2' used to be a bare `pass` and 'fcn' had no branch at all, which
    # left `model` undefined (or silently reused a stale one from an earlier
    # execution). Fail explicitly instead.
    raise NotImplementedError(
        f"Final training is not implemented for model architecture "
        f"{MODEL_ARCH!r}"
    )

log_model_architecture(model)

# optimizer_name and lr parameters are for specifying the optimizer: the
# optimizer class is looked up by name in torch.optim.
optimizer_name = params['optimizer']
lr = params['lr']
optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)

# batch_size is for the training loop
batch_size = params['batch_size']
train_loader = DataLoader(
    train_set, batch_size=batch_size, shuffle=True, num_workers=4
)

# Weighted binary cross-entropy on logits; the positive-class weight is the
# second entry of the "balanced" class weights.
# NOTE(review): pos_weight is commonly set to n_negative / n_positive, which
# is weights[1] / weights[0] rather than weights[1] -- confirm this matches
# what the hyperparameter search used.
loss_fn = nn.BCEWithLogitsLoss(
    pos_weight=torch.tensor(weights[1], dtype=torch.float32)
)

In [None]:
# Train for the selected number of epochs. After every epoch the model is
# evaluated on the test tensors and the resulting metrics are printed and
# logged to MLflow with the epoch index as the step.
for epoch in tqdm(range(epochs)):
    print(f"Epoch {epoch} -----------------------------------------------------")
    train_step(model, train_loader, loss_fn, optimizer)

    epoch_metrics = validate_step_with_metrics(
        model, X_test_tensor, y_test_tensor, loss_fn, METRICS,
        beta=BETA, train_features_mean=None,
    )
    pprint.pp(epoch_metrics)
    mlflow.log_metrics(epoch_metrics, step=epoch)

# Store the trained model in the active MLflow run.
mlflow.pytorch.log_model(model, "trained_model")

In [None]:
# Final predictions on the test set, computed on the CPU.
model.to('cpu')
# Put the model in evaluation mode so dropout is disabled, and skip gradient
# tracking since this is inference only.
model.eval()
with torch.no_grad():
    y_test_pred = model(X_test_tensor.to('cpu'))
# predict is a project helper; presumably it converts the raw model outputs
# into class predictions according to the loss function -- confirm.
y_test_pred = predict(y_test_pred, loss_fn).squeeze()

In [None]:
# Build the confusion-matrix figure from the test labels and predictions, pass
# it to the project's log_plot helper under "confusion_matrix_plot.png"
# (presumably stored as an MLflow artifact -- confirm), and display it.
fig = confusion_matrix_plot(y_test_tensor, y_test_pred)
log_plot(fig, "confusion_matrix_plot.png")
fig.show()

In [None]:
# ROC curve for the test labels and predictions (project helper).
roc_curve_plot(y_test_tensor, y_test_pred)

In [None]:
# Close the active MLflow run.
mlflow.end_run()