### **Importación de Librerías**

In [None]:
import json
import os
import pandas as pd
import pprint
import sys

from constants import *
from utils.metrics import compute_metrics
from utils.mlflow_logger import MLflowLogger
from utils.plots import confusion_matrix_plot

from sklearn.metrics import classification_report

### **Configuramos Stata**

In [None]:
# pystata is imported from <STATA_PATH>/utilities, so that directory has to be
# on sys.path before the import statement runs (hence the mid-cell import).
sys.path.append(os.path.join(STATA_PATH, 'utilities'))
import pystata
# NOTE: the splash only appears the first time pystata is loaded. In other words,
# if this cell is executed several times, the splash is not shown again.
# 'mp' is the Stata edition passed to pystata — adjust if a different edition is installed.
pystata.config.init('mp', splash=True)

### **Carga de ajustes**

In [None]:
# Read the experiment settings from config.json (resolved against the current
# working directory) and expose them as module-level constants.
with open("config.json", "r") as f:
    config = json.load(f)

pprint.pp(config)

GROUP = "Grupo" + str(config['group'])
SIMULATION = "Simulacion" + str(config['simulation'])
METRICS = config['metrics']
BETA = config['beta']
# Accept both a real JSON boolean (true/false) and the legacy string form
# ("True"/"False"); previously a JSON `true` was silently treated as False.
_log_to_mlflow_raw = config['log_to_mlflow']
LOG_TO_MLFLOW = (
    _log_to_mlflow_raw
    if isinstance(_log_to_mlflow_raw, bool)
    else str(_log_to_mlflow_raw).strip().lower() == "true"
)
COMPARISON = config['comparison']

In [None]:
# Locate and load the per-group parameters file (params_<GROUP>.json).
GROUP_DIR = os.path.join(DATA_DIR, GROUP)
GROUP_PARAMS_FILE = os.path.join(GROUP_DIR, f"params_{GROUP}.json")
if not os.path.exists(GROUP_PARAMS_FILE):
    # Fail here with a clear error instead of printing and then hitting a
    # NameError on `group_params` a few lines below.
    raise FileNotFoundError(f"Group params file not found: {GROUP_PARAMS_FILE}")
with open(GROUP_PARAMS_FILE, 'r') as f:
    group_params = json.load(f)

# Number of periods available before the first treatment period.
REQ_PERIODS = group_params['first_tr_period'] - 1
# Lagged-outcome feature names, from the oldest lag (t-REQ_PERIODS) to the most recent (t-1).
TEMP_FEATS = [f'y(t-{i})' for i in range(REQ_PERIODS, 0, -1)]
STAT_FEATS = ['inicio_prog']
FEATS = STAT_FEATS + TEMP_FEATS

N_PER_DEP = group_params['n_per_dep']

### **Carga de datos**

In [None]:
# Load the simulation's Stata dataset (<DATA_DIR>/<GROUP>/<SIMULATION>.dta) into `df`.
GROUP_DIR = os.path.join(DATA_DIR, GROUP)
stata_filepath = os.path.join(GROUP_DIR, SIMULATION + ".dta")
if not os.path.exists(stata_filepath):
    # Stop immediately: without the data every later cell would fail with a
    # NameError on `df`, which hides the real cause.
    raise FileNotFoundError(f"File {stata_filepath} not found.")
df = pd.read_stata(stata_filepath)

### **Loguear parámetros a MLFlow**

In [None]:
# print(f"Log to MLflow: {LOG_TO_MLFLOW}")
# mlflow_logger = MLflowLogger(
#     LOG_TO_MLFLOW,
#     TRACKING_SERVER_URI,
#     f"{EXPERIMENT_PREFIX}-{GROUP}-Comp{COMPARISON}",
#     EXPERIMENT_TAGS
# )

In [None]:
# mlflow_logger.log_params({
#     "group": GROUP,
#     "simulation": SIMULATION,
#     "filepath": stata_filepath,
#     "required_periods": REQ_PERIODS,
#     "n_per_dep": N_PER_DEP,
#     "model_arch": "psm",
#     "metrics": METRICS,
#     "ups_max_count": group_params['ups_max_count']
# })

### **Carga de IDs de NiNis**
Los IDs de los NiNis fueron seleccionados aleatoriamente en los primeros experimentos
hechos con LSTM. Necesitamos traerlos para hacer las comparaciones con los mismos
conjuntos de datos.

In [None]:
import mlflow

# Retrieve the NiNi id lists that were stored as artifacts of the LSTM run for
# this same group / simulation / comparison.
mlflow.set_tracking_uri(TRACKING_SERVER_URI)

experiment_name = f"{EXPERIMENT_PREFIX}-{GROUP}-Comp{COMPARISON}"
experiment = mlflow.get_experiment_by_name(experiment_name)
# get_experiment_by_name returns None for an unknown name; fail with a clear
# message instead of an AttributeError on `.experiment_id`.
if experiment is None:
    raise LookupError(f"MLflow experiment not found: {experiment_name}")

runs_list = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    filter_string=(
        f"params.simulation = '{SIMULATION}' AND params.model_arch = 'lstm_v2'"
    ),
    output_format="list"
)
# An empty result would otherwise surface as a bare IndexError on runs_list[0].
if not runs_list:
    raise LookupError(
        f"No 'lstm_v2' run found for {SIMULATION} in experiment {experiment_name}"
    )

# The first run returned by the search is the one used as the reference.
run = runs_list[0]
run_id = run.info.run_id
artifact_uri = run.info.artifact_uri

ninis_ids_train = mlflow.artifacts.load_dict(artifact_uri + "/ninis_ids_train.json")
ninis_ids_test  = mlflow.artifacts.load_dict(artifact_uri + "/ninis_ids_test.json")

# Sanity checks on the sample sizes recorded in the artifacts.
assert ninis_ids_train['amount'] == 1000, (
    f"Unexpected number of training NiNis: {ninis_ids_train['amount']}"
)
assert ninis_ids_test['amount'] == 2500, (
    f"Unexpected number of test NiNis: {ninis_ids_test['amount']}"
)

type3_ids_train = ninis_ids_train['ninis_ids_train']
type3_ids_test  = ninis_ids_test ['ninis_ids_test']

### **Transformaciones generales**

**Primero, transformamos los datos a formato horizontal (esto lo hacemos una sola vez)**

In [None]:
# Long -> wide: one row per individual (`id`) and one `y<t>` column per period `t`.
y_by_period = df.pivot(index='id', columns='t', values='y')
y_by_period.columns = [f'y{int(period)}' for period in y_by_period.columns]
y_by_period = y_by_period.reset_index()

# Per-individual attributes: keep a single row per id.
static_cols = ['id', 'inicio_prog', 'tratado', 'control']
df_static = df[static_cols].drop_duplicates(subset='id')

# Static attributes first, followed by the per-period outcome columns.
df_wide = df_static.merge(y_by_period, on='id')

df_wide

**Separamos en tipos de individuos**

In [None]:
# Type 1: treated individuals. Type 2: controls. Type 3: neither treated nor control.
type1_df = df_wide.loc[df_wide['tratado'].eq(1)]
type2_df = df_wide.loc[df_wide['control'].eq(1)]
type3_df = df_wide.loc[df_wide['tratado'].eq(0) & df_wide['control'].eq(0)]

# Restrict type 3 to the train / test individuals loaded from the MLflow artifacts.
# NOTE(review): .loc selects by index LABEL, not by the 'id' column. This is only
# correct if the stored ids are index labels of df_wide — confirm against the
# notebook that produced ninis_ids_train.json / ninis_ids_test.json.
type3_df_train = type3_df.loc[type3_ids_train]
type3_df_test  = type3_df.loc[type3_ids_test]

**Obtenemos diferentes cohortes**

In [None]:
# Distinct program start periods among the treated individuals; each value defines one cohort.
treatment_starts = type1_df['inicio_prog'].unique()

### **Lo hacemos para una cohorte**

In [None]:
# "estimates save logit_model" stores the fitted model in a file called
# logit_model. The trailing "replace" overwrites it if it already exists.
# The logit regresses `tratado` on every variable matching y*.
stata_code_estimate_logit = '''
qui ds y*
qui local vars `r(varlist)'
qui logit tratado `vars'
estimates save logit_model, replace
'''

# "estimates use logit_model" loads the model saved by the estimation snippet,
# then propensity scores are predicted and psmatch2 does 1-nearest-neighbour matching.
# NOTE(review): `vars` is a local macro defined in the estimation snippet, which is
# sent to Stata in a separate stata.run call. If locals do not survive across calls,
# it expands to nothing here and the y* columns are not dropped — confirm.
stata_code_infer_logit = '''
estimates use logit_model
predict pscore, pr
psmatch2 tratado, pscore(pscore) neighbor(1) common
qui drop _treated _nn _pscore `vars'
qui rename _weight wlogit
'''

In [None]:
def select_y_columns(row):
    """Keep the identifying columns plus the outcome window preceding the program start.

    Args:
        row: a row of the wide DataFrame (pandas Series) holding 'id',
            'inicio_prog', 'tratado', 'control' and the 'y<t>' outcome columns.

    Returns:
        The same row restricted to the four identifying columns followed by the
        REQ_PERIODS outcome columns y<inicio_prog - REQ_PERIODS> ... y<inicio_prog - 1>,
        in chronological order.
    """
    program_start = int(row['inicio_prog'])
    # Walk the lags from the oldest (REQ_PERIODS) to the most recent (1).
    window_cols = [f'y{program_start - lag}' for lag in range(REQ_PERIODS, 0, -1)]
    id_cols = ['id', 'inicio_prog', 'tratado', 'control']
    return row[id_cols + window_cols]

In [None]:
# Run the full PSM pipeline once per cohort (one cohort per program start period):
# fit the logit, match with psmatch2, and score the identified controls.
for tr_start in treatment_starts:
    print(f"Inicio de programa: {tr_start}")

    # Treated (type 1) and control (type 2) individuals of this cohort, restricted
    # to the outcome window that precedes the program start.
    type1_in_cohort_df = type1_df[type1_df['inicio_prog'] == tr_start].apply(select_y_columns, axis=1)
    type2_in_cohort_df = type2_df[type2_df['inicio_prog'] == tr_start].apply(select_y_columns, axis=1)

    # Type 3 individuals are given the cohort's start period so the same outcome
    # window is selected for them. `.assign` returns new frames, leaving
    # type3_df_train / type3_df_test untouched for the next iteration.
    type3_df_train_for_cohort = (
        type3_df_train.assign(inicio_prog=tr_start).apply(select_y_columns, axis=1)
    )
    type3_df_test_for_cohort = (
        type3_df_test.assign(inicio_prog=tr_start).apply(select_y_columns, axis=1)
    )

    # Estimation sample: treated + training type 3.
    # Inference sample: treated + controls + test type 3.
    logit_weights_df = pd.concat([type1_in_cohort_df, type3_df_train_for_cohort])
    logit_infer_df   = pd.concat([type1_in_cohort_df, type2_in_cohort_df, type3_df_test_for_cohort])

    # Ground truth among the non-treated: 1 = control, 0 = neither treated nor control.
    true_1_ids = logit_infer_df[logit_infer_df['control'] == 1]['id'].to_list()
    true_0_ids = logit_infer_df[(logit_infer_df['control'] == 0) & (logit_infer_df['tratado'] == 0)]['id'].to_list()

    print("    Calculando pesos de la logit...")
    # Fit the logit on the estimation sample
    pystata.stata.pdataframe_to_data(logit_weights_df, force=True)
    pystata.stata.run(stata_code_estimate_logit, quietly=True)

    print("    Haciendo la inferencia...")
    # Run inference / matching on the inference sample
    pystata.stata.pdataframe_to_data(logit_infer_df, force=True)
    pystata.stata.run(stata_code_infer_logit, quietly=True)
    df_psm = pystata.stata.pdataframe_from_data()

    print("    Obteniendo resultados...")
    treated_df_psm     = df_psm[df_psm['tratado'] == 1]
    not_treated_df_psm = df_psm[df_psm['tratado'] == 0]

    # The individuals identified as controls are the ones listed in the _n1 column
    # of the treated rows. Note that _n1 refers to the _id column, NOT to id.
    control_ids_psm = treated_df_psm['_n1']
    is_matched = not_treated_df_psm['_id'].isin(control_ids_psm)
    control_df_psm  = not_treated_df_psm[is_matched]
    control_in_cohort_ids_pred = control_df_psm['id'].to_list()

    ninis_df_psm = not_treated_df_psm[~is_matched]
    ninis_ids_pred = ninis_df_psm['id'].to_list()

    pred_0_ids = ninis_ids_pred
    pred_1_ids = control_in_cohort_ids_pred

    all_ids = list(set(true_0_ids + true_1_ids + pred_0_ids + pred_1_ids))

    # Build the true and predicted label arrays. Sets give O(1) membership tests
    # (the lists made this quadratic) and the loop variable no longer shadows
    # the builtin `id`.
    true_0_lookup = set(true_0_ids)
    pred_0_lookup = set(pred_0_ids)
    y_true = [0 if ind_id in true_0_lookup else 1 for ind_id in all_ids]
    y_pred = [0 if ind_id in pred_0_lookup else 1 for ind_id in all_ids]

    # NOTE(review): one figure is created per cohort and never closed here —
    # consider closing it once logged if the number of cohorts is large.
    fig, ax = confusion_matrix_plot(y_true, y_pred)
    # mlflow_logger.log_plot(fig, f"confusion_matrix_plot_inicio_prog_{tr_start}.png")

    report_str = classification_report(y_true, y_pred)
    print(f"    Métricas de la clasificación:\n {report_str}")
    # mlflow_logger.log_json(
    #     classification_report(y_true, y_pred, output_dict=True),
    #     f"classification_report_inicio_prog_{tr_start}.json"
    # )

    metrics_dict = compute_metrics(METRICS, y_true, y_pred)
    print(f"    Métricas:")
    for metric, value in metrics_dict.items():
        print(f"        - {metric}: {value}")
        # mlflow_logger.log_param(metric, value)
    print("-------------------------------------------------------------")

In [None]:
# Step-by-step walkthrough for a single cohort: the first program start period.
tr_start = treatment_starts[0]

in_cohort_type1 = type1_df['inicio_prog'] == tr_start
in_cohort_type2 = type2_df['inicio_prog'] == tr_start
type1_in_cohort_df = type1_df[in_cohort_type1].apply(select_y_columns, axis=1)
type2_in_cohort_df = type2_df[in_cohort_type2].apply(select_y_columns, axis=1)

# Type 3 individuals get the cohort's start period so the same outcome window is selected.
# NOTE: this rebinds type3_df_train / type3_df_test to the reduced frames, so the
# cohort loop cell cannot be re-run afterwards for other cohorts without first
# re-running the cell that splits the individuals by type.
type3_df_train = type3_df_train.assign(inicio_prog=tr_start).apply(select_y_columns, axis=1)
type3_df_test  = type3_df_test.assign(inicio_prog=tr_start).apply(select_y_columns, axis=1)

In [None]:
# Sample sizes: treated vs. training type 3, then controls vs. test type 3.
print(f"{len(type1_in_cohort_df)} {len(type3_df_train)}")
print(f"{len(type2_in_cohort_df)} {len(type3_df_test)}")

In [None]:
# Estimation sample for the logit: treated + training type 3 individuals.
logit_weights_df = pd.concat(
    [type1_in_cohort_df, type3_df_train]
)
# Inference sample: treated + controls + test type 3 individuals.
logit_infer_df = pd.concat(
    [type1_in_cohort_df, type2_in_cohort_df, type3_df_test]
)

In [None]:
# Ground truth among the non-treated: 1 = control, 0 = neither treated nor control.
is_true_control = logit_infer_df['control'] == 1
is_true_nini = (logit_infer_df['control'] == 0) & (logit_infer_df['tratado'] == 0)

true_1_ids = logit_infer_df.loc[is_true_control, 'id'].to_list()
true_0_ids = logit_infer_df.loc[is_true_nini, 'id'].to_list()

In [None]:
# Calculamos los pesos de la logit
pystata.stata.pdataframe_to_data(logit_weights_df, force=True)
pystata.stata.run(stata_code_estimate_logit)

# Hacemos la inferencia
pystata.stata.pdataframe_to_data(logit_infer_df, force=True)
pystata.stata.run(stata_code_infer_logit)
df_psm = pystata.stata.pdataframe_from_data()

Sobre el comando `psmatch2` de Stata: [Stata Documentation for the psmatch2 command](https://www.pep-net.org/sites/pep-net.org/files/typo3doc/pdf/Training_Material/statadoc.pdf)

**Vemos los controles identificados por el PSM**

In [None]:
treated_df_psm     = df_psm.loc[df_psm['tratado'] == 1]
not_treated_df_psm = df_psm.loc[df_psm['tratado'] == 0]

# The individuals identified as controls are the ones listed in the _n1 column
# of the treated rows. Note that _n1 refers to the _id column, NOT to id.
control_ids_psm = treated_df_psm['_n1']
matched_by_psm = not_treated_df_psm['_id'].isin(control_ids_psm)

# Non-treated rows matched to some treated row -> predicted controls;
# the remaining non-treated rows -> predicted NiNis.
control_df_psm = not_treated_df_psm[matched_by_psm]
ninis_df_psm   = not_treated_df_psm[~matched_by_psm]

control_in_cohort_ids_pred = control_df_psm['id'].to_list()
ninis_ids_pred = ninis_df_psm['id'].to_list()

In [None]:
pred_0_ids = ninis_ids_pred
pred_1_ids = control_in_cohort_ids_pred

all_ids = list(set(true_0_ids + true_1_ids + pred_0_ids + pred_1_ids))

# Build the true and predicted label arrays (0 = NiNi, 1 = control), aligned
# with all_ids. Sets give O(1) membership tests (list lookups made this
# quadratic) and the loop variable no longer shadows the builtin `id`.
true_0_lookup = set(true_0_ids)
pred_0_lookup = set(pred_0_ids)
y_true = [0 if ind_id in true_0_lookup else 1 for ind_id in all_ids]
y_pred = [0 if ind_id in pred_0_lookup else 1 for ind_id in all_ids]

In [None]:
# Plot the confusion matrix of true vs. predicted labels with the project helper,
# which is unpacked here as a (figure, axes) pair.
fig, ax = confusion_matrix_plot(y_true, y_pred)
# NOTE(review): whether fig.show() is needed (or warns) depends on the figure type
# and notebook backend — confirm; ending the cell with `fig` may be enough.
fig.show()

# mlflow_logger.log_plot(fig, f"confusion_matrix_plot_inicio_prog_{tr_start}.png")

In [None]:
# Text report produced by scikit-learn's classification_report for the
# true vs. predicted labels.
report_str = classification_report(y_true, y_pred)
print(report_str)

# MLflow logging of the same report as a JSON artifact (currently disabled).
# mlflow_logger.log_json(
#     classification_report(y_true, y_pred, output_dict=True),
#     f"classification_report_inicio_prog_{tr_start}.json"
# )

In [None]:
# Compute the metrics requested in the config and print one "name: value" line each.
metrics_dict = compute_metrics(METRICS, y_true, y_pred)

for metric in metrics_dict:
    value = metrics_dict[metric]
    print(f"{metric}: {value}")
    # mlflow_logger.log_param(metric value)