### **Importación de Librerías**

In [None]:
import json
import numpy as np
import os
import pandas as pd
import pprint
import sys

from constants import *
from utils.metrics import compute_metrics
from utils.mlflow_logger import MLflowLogger
from utils.plots import confusion_matrix_plot

from collections import defaultdict
from sklearn.metrics import classification_report

Ya chequeé que los IDs de NiNis de train y test sean los mismos para todas las simulaciones,
entonces basta con elegir los de una simulación cualquiera.

In [None]:
# NOTE(review): machine-specific absolute path; this cell only runs as-is on a
# machine where this directory exists.
base_path = '/home/basbenja/Facultad/TrabajoFinal/'

ninis_ids_train_path = os.path.join(base_path, 'ninis_ids_train.json')
ninis_ids_test_path  = os.path.join(base_path, 'ninis_ids_test.json')

# Each JSON file keeps its ID list under a key named after the split.
with open(ninis_ids_train_path, 'r') as train_ids_file:
    ninis_ids_train = json.load(train_ids_file)['ninis_ids_train']
with open(ninis_ids_test_path, 'r') as test_ids_file:
    ninis_ids_test = json.load(test_ids_file)['ninis_ids_test']

# Sanity check on the expected size of each split.
assert len(ninis_ids_train) == 1000
assert len(ninis_ids_test) == 2500

# The NiNi ID lists are reused below under the "type 3" names.
type3_ids_train, type3_ids_test = ninis_ids_train, ninis_ids_test

### **Configuramos Stata**

In [None]:
# Add Stata's `utilities` directory to the import path before importing pystata.
# NOTE(review): presumably pystata is only importable from that directory, which
# would explain why this import is not in the imports cell — confirm.
sys.path.append(os.path.join(STATA_PATH, 'utilities'))
import pystata
# Initialise pystata for the 'mp' edition with the splash message disabled.
pystata.config.init('mp', splash=False)

### **Carga de ajustes**

In [None]:
# Read the run settings from config.json (resolved against the working directory).
with open("config.json", "r") as f:
    config = json.load(f)

pprint.pp(config)

GROUP = "Grupo" + str(config['group'])
SIMULATION = "Simulacion" + str(config['simulation'])
METRICS = config['metrics']
BETA = config['beta']
# Accept a real JSON boolean (true) as well as the legacy string "True".
# Comparing only against "True" silently turned logging off whenever the flag
# was stored as a boolean.
_log_flag = config['log_to_mlflow']
LOG_TO_MLFLOW = (_log_flag is True) or (_log_flag == "True")
COMPARISON = config['comparison']

In [None]:
# Directory and parameter file of the selected group.
GROUP_DIR = os.path.join(DATA_DIR, GROUP)
GROUP_PARAMS_FILE = os.path.join(GROUP_DIR, f"params_{GROUP}.json")

# Fail fast when the file is missing. Only printing a message let execution
# continue to the lines below, which then either raised a NameError or silently
# reused a stale `group_params` left over in the kernel.
if not os.path.exists(GROUP_PARAMS_FILE):
    raise FileNotFoundError(f"Group params file not found: {GROUP_PARAMS_FILE}")
with open(GROUP_PARAMS_FILE, 'r') as f:
    group_params = json.load(f)

# Number of periods before the first treatment period.
REQ_PERIODS = group_params['first_tr_period'] - 1
# Temporal feature names, ordered from the oldest lag y(t-REQ_PERIODS) to y(t-1).
TEMP_FEATS = [f'y(t-{i})' for i in range(REQ_PERIODS, 0, -1)]
STAT_FEATS = ['inicio_prog']
FEATS = STAT_FEATS + TEMP_FEATS

N_PER_DEP = group_params['n_per_dep']

### **Carga de datos**

In [None]:
GROUP_DIR = os.path.join(DATA_DIR, GROUP)
stata_filepath = os.path.join(GROUP_DIR, SIMULATION + ".dta")

# Fail fast when the simulation file is missing. Only printing a message left
# `df` undefined (or stale from a previous run) for every later cell.
if not os.path.exists(stata_filepath):
    raise FileNotFoundError(f"File {stata_filepath} not found.")
df = pd.read_stata(stata_filepath)

### **Loguear parámetros a MLFlow**

In [None]:
print(f"Log to MLflow: {LOG_TO_MLFLOW}")

# Third constructor argument, built from the prefix, the group and the comparison
# number (presumably the MLflow experiment name — confirm against MLflowLogger).
experiment_name = f"{EXPERIMENT_PREFIX}-{GROUP}-Comp{COMPARISON}"
mlflow_logger = MLflowLogger(
    LOG_TO_MLFLOW, TRACKING_SERVER_URI, experiment_name, EXPERIMENT_TAGS
)

In [None]:
# Parameters describing this run: which data were used and how the PSM baseline
# is set up (logit estimated on treated + train NiNis; inference and matching on
# treated + controls + test NiNis).
run_params = dict(
    group=GROUP,
    simulation=SIMULATION,
    filepath=stata_filepath,
    required_periods=REQ_PERIODS,
    n_per_dep=N_PER_DEP,
    model_arch="psm",
    metrics=METRICS,
    ups_max_count=group_params['ups_max_count'],
    estimacion_logit="tratados + ninis_train",
    inferencia_y_matching="tratados + controles + ninis_test",
)
mlflow_logger.log_params(run_params)

### **Transformaciones generales**

**Primero, transformamos los datos a formato horizontal (esto lo hacemos una sola vez)**

In [None]:
# One row per individual and one column per period, holding the outcome `y`.
y_by_period = df.pivot(index='id', columns='t', values='y')
# Rename the period columns to y<t> (e.g. y1, y2, ...).
y_by_period.columns = [f'y{int(period)}' for period in y_by_period.columns]

# Time-invariant attributes: keep the first row found for each individual.
static_cols = ['id', 'inicio_prog', 'tratado', 'control']
df_static = df[static_cols].drop_duplicates(subset='id')

# Static attributes first, followed by the per-period outcome columns.
df_wide = df_static.merge(y_by_period.reset_index(), on='id')

**Separamos en tipos de individuos**

In [None]:
# Type 1: treated. Type 2: controls. Type 3: neither treated nor control (NiNis).
type1_df = df_wide[df_wide['tratado'] == 1]
type2_df = df_wide[df_wide['control'] == 1]
type3_df = df_wide[(df_wide['tratado'] == 0) & (df_wide['control'] == 0)]

# Select the train/test NiNis by their `id` VALUE. After the merge, df_wide has a
# plain positional index, so `.loc[ids]` on it would look up row labels rather
# than ids and only returns the right rows if ids happen to equal row positions.
# `drop=False` keeps `id` as a column; `rename_axis(None)` avoids having `id` as
# both an index name and a column label.
type3_by_id = type3_df.set_index('id', drop=False).rename_axis(None)
type3_df_train = type3_by_id.loc[type3_ids_train]
type3_df_test  = type3_by_id.loc[type3_ids_test]

**Obtenemos diferentes cohortes**

In [None]:
# Distinct programme-start values among treated individuals, in order of first
# appearance; each one is processed as a separate cohort below.
treatment_starts = pd.unique(type1_df['inicio_prog'])

### **Lo hacemos para cada cohorte**

In [None]:
# "estimates save logit_model" stores the fitted model in a file named
# logit_model; the trailing "replace" overwrites it if it already exists.
stata_code_estimate_logit = '''
qui ds y*
qui local vars `r(varlist)'
qui logit tratado `vars'
estimates save logit_model, replace
'''

# - "estimates use logit_model" loads the model saved by the snippet above.
# - "noreplacement" forces a different control to be chosen for each treated unit.
#   This is needed because the data are generated with the same number of controls
#   as treated units in each cohort. Without it, an individual chosen as the
#   control for several treated units would degrade the F1 score, since we expect
#   as many distinct controls as possible to be selected.
# NOTE(review): `vars' is a local macro defined in the estimation snippet. If the
#   two snippets run as separate Stata executions the macro is probably empty
#   here, so the drop would only remove _treated, _nn and _pscore — TODO confirm.
stata_code_infer_logit = '''
estimates use logit_model
predict propensity_score, pr
psmatch2 tratado, pscore(propensity_score) neighbor(1) common noreplacement
qui drop _treated _nn _pscore `vars'
qui rename _weight wlogit
'''

In [None]:
def select_y_columns(row, inicio_prog=None, req_periods=None):
    """Keep the identifying columns plus the outcomes that precede the programme start.

    Parameters
    ----------
    row : pandas.Series
        One wide-format row with 'id', 'inicio_prog', 'tratado', 'control' and
        the per-period outcome columns 'y<t>'.
    inicio_prog : int or float, optional
        Programme start period. When None, it is read from ``row['inicio_prog']``.
        An explicit value (including 0) is always honoured.
    req_periods : int, optional
        Number of periods to keep before the start. When None, the module-level
        ``REQ_PERIODS`` is used.

    Returns
    -------
    pandas.Series
        The four identifying fields followed by
        ``y<inicio_prog - req_periods>`` ... ``y<inicio_prog - 1>``.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``row``.
    """
    # `is None` (not truthiness) so that an explicit 0 is not mistaken for "missing";
    # the int() cast lets float-valued periods (e.g. 5.0) be used with range().
    inicio_prog = int(row['inicio_prog'] if inicio_prog is None else inicio_prog)
    if req_periods is None:
        req_periods = REQ_PERIODS
    start = inicio_prog - req_periods
    end   = inicio_prog - 1
    selected_cols = [f'y{t}' for t in range(start, end + 1)]
    return row[['id', 'inicio_prog', 'tratado', 'control'] + selected_cols]

In [None]:
# One list of per-cohort values for each metric; a later cell averages them.
avg_metrics = defaultdict(list)
for tr_start in treatment_starts:
    print(f"Inicio de programa: {tr_start}")
    inicio_prog_folder = f"inicio_prog_{tr_start}"

    # Treated (type 1) and control (type 2) individuals of this cohort, reduced to
    # the identifying columns plus the outcomes that precede the programme start.
    type1_in_cohort_df = type1_df[type1_df['inicio_prog'] == tr_start].apply(select_y_columns, axis=1)
    type2_in_cohort_df = type2_df[type2_df['inicio_prog'] == tr_start].apply(select_y_columns, axis=1)

    # Type-3 individuals are assigned this cohort's start so that the same outcome
    # window is selected for them as for the treated and control individuals.
    type3_df_train_for_cohort = type3_df_train.copy()
    type3_df_test_for_cohort  = type3_df_test.copy()
    type3_df_train_for_cohort['inicio_prog'] = tr_start
    type3_df_test_for_cohort['inicio_prog']  = tr_start
    type3_df_train_for_cohort = type3_df_train_for_cohort.apply(select_y_columns, axis=1)
    type3_df_test_for_cohort  = type3_df_test_for_cohort.apply(select_y_columns, axis=1)

    # Logit is estimated on treated + train NiNis; inference and matching run on
    # treated + controls + test NiNis.
    logit_weights_df = pd.concat([type1_in_cohort_df, type3_df_train_for_cohort])
    logit_infer_df   = pd.concat([type1_in_cohort_df, type2_in_cohort_df, type3_df_test_for_cohort])

    # Ground truth among the non-treated: 1 = control, 0 = neither treated nor control.
    true_1_ids = logit_infer_df[logit_infer_df['control'] == 1]['id'].to_list()
    true_0_ids = logit_infer_df[(logit_infer_df['control'] == 0) & (logit_infer_df['tratado'] == 0)]['id'].to_list()

    print("    Calculando pesos de la logit...")
    # Estimate the logit weights in Stata and log the saved model file.
    pystata.stata.pdataframe_to_data(logit_weights_df, force=True)
    pystata.stata.run(stata_code_estimate_logit, quietly=True)
    logit_model_path = os.path.join(os.getcwd(), 'logit_model.ster')
    mlflow_logger.log_artifact(logit_model_path, inicio_prog_folder)

    print("    Haciendo la inferencia...")
    # Run inference and matching on the inference set, then read the result back.
    pystata.stata.pdataframe_to_data(logit_infer_df, force=True)
    pystata.stata.run(stata_code_infer_logit, quietly=True)
    df_psm = pystata.stata.pdataframe_from_data()

    # Best-effort cleanup of the model file; a failure here must not abort the run.
    try:
        os.remove(logit_model_path)
    except FileNotFoundError:
        print(f"File '{logit_model_path}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

    print("    Obteniendo resultados...")
    treated_df_psm     = df_psm[df_psm['tratado'] == 1]
    not_treated_df_psm = df_psm[df_psm['tratado'] == 0]

    # Individuals identified as controls are those listed in the _n1 column of the
    # treated rows. Note that _n1 refers to the _id column, NOT to id.
    control_ids_psm = treated_df_psm['_n1']
    is_matched_control = not_treated_df_psm['_id'].isin(control_ids_psm)
    control_df_psm  = not_treated_df_psm[is_matched_control]
    control_in_cohort_ids_pred = control_df_psm['id'].to_list()

    # Every non-treated individual that was not matched is predicted as a NiNi.
    ninis_df_psm = not_treated_df_psm[~is_matched_control]
    ninis_ids_pred = ninis_df_psm['id'].to_list()

    pred_0_ids = ninis_ids_pred
    pred_1_ids = control_in_cohort_ids_pred

    all_ids = list(set(true_0_ids + true_1_ids + pred_0_ids + pred_1_ids))

    # Build the true and predicted label arrays over the same ID order. Sets make
    # each membership test O(1) instead of scanning a list for every ID.
    true_0_set = set(true_0_ids)
    pred_0_set = set(pred_0_ids)
    y_true = [0 if individual_id in true_0_set else 1 for individual_id in all_ids]
    y_pred = [0 if individual_id in pred_0_set else 1 for individual_id in all_ids]

    fig, ax, confusion_dict = confusion_matrix_plot(y_true, y_pred)
    mlflow_logger.log_plot(fig, "confusion_matrix_plot.png", inicio_prog_folder)
    mlflow_logger.log_json(confusion_dict, "confusion_dict.json", inicio_prog_folder)

    report_dict = classification_report(y_true, y_pred, output_dict=True)
    mlflow_logger.log_json(report_dict, "classification_report.json", inicio_prog_folder)

    # Accumulate this cohort's metrics for the cross-cohort average.
    cohort_metrics_dict = compute_metrics(METRICS, y_true, y_pred)
    for metric, value in cohort_metrics_dict.items():
        avg_metrics[metric].append(value)
    print("-------------------------------------------------------------")

In [None]:
# Replace each metric's per-cohort list with its mean and log that mean.
# The F1 average is logged under its own parameter name.
for metric in list(avg_metrics):
    metric_mean = np.asarray(avg_metrics[metric]).mean()
    avg_metrics[metric] = metric_mean
    param_name = "cohorts_avg_f1" if metric == "f1_score" else f"cohort_avg_{metric}"
    mlflow_logger.log_param(param_name, metric_mean)

In [None]:
# Finish the logger's run once everything has been logged
# (presumably closes the active MLflow run — confirm against MLflowLogger).
mlflow_logger.end_run()