### **Importación de Librerías**

In [None]:
import json
import os
import pandas as pd
import pprint
import sys

from constants import *
from utils.mlflow_logger import MLflowLogger
from utils.load_data import transform
from utils.plots import confusion_matrix_plot

### **Configuramos Stata**

In [None]:
# Add <STATA_PATH>/utilities to the import path before importing pystata
# (presumably where the Stata installation ships the pystata package).
# The order matters: the import below fails if the path has not been added yet.
sys.path.append(os.path.join(STATA_PATH, 'utilities'))
import pystata
# NOTE: the splash is only shown the first time pystata is loaded. That is, if this
# cell is run several times, the splash is not shown again.
pystata.config.init('mp', splash=True)

In [None]:
# Load the run parameters from config.json (path relative to the working directory).
with open("config.json", "r") as f:
    config = json.load(f)

# Echo the configuration so the run's parameters are visible in the notebook output.
pprint.pp(config)

# Names used to locate the input file: <DATA_DIR>/Grupo<N>/Simulacion<M>.dta
GROUP = "Grupo" + str(config['group'])
SIMULATION = "Simulacion" + str(config['simulation'])
REQ_PERIODS = config['required_periods']
METRICS = config['metrics']
BETA = config['beta']
# Accept a real JSON boolean (true/false) as well as the legacy "True"/"False"
# strings. The previous `== "True"` comparison silently evaluated to False when
# the config held a JSON boolean `true`.
LOG_TO_MLFLOW = str(config['log_to_mlflow']).strip().lower() == "true"

In [None]:
# print(f"Log to MLflow: {LOG_TO_MLFLOW}")
# mlflow_logger = MLflowLogger(
#     LOG_TO_MLFLOW,
#     TRACKING_SERVER_URI,
#     EXPERIMENT_NAME + "-" + GROUP,
#     EXPERIMENT_TAGS
# )

In [None]:
# Resolve <DATA_DIR>/<GROUP>/<SIMULATION>.dta and load it into a DataFrame.
GROUP_DIR = os.path.join(DATA_DIR, GROUP)
stata_filepath = os.path.join(GROUP_DIR, SIMULATION + ".dta")
if not os.path.exists(stata_filepath):
    # Fail fast. Merely printing here left `df_orig` undefined (or, worse, stale
    # from a previous run of the kernel), so later cells either crashed with an
    # unrelated NameError or silently analysed the wrong data.
    raise FileNotFoundError(f"File {stata_filepath} not found.")
df_orig = pd.read_stata(stata_filepath)

In [None]:
# Stata program executed on the transformed data via pystata.stata.run().
# Step by step:
#   1. `ds yt*`             lists the columns whose name matches the pattern yt*.
#   2. `local vars ...`     stores that list (returned in r(varlist)) in the local macro `vars`.
#   3. `logit tratado ...`  fits a logistic regression of `tratado` on those columns.
#   4. `predict pscore, pr` adds a `pscore` column with the predicted probability of being treated.
#   5. `psmatch2 ...`       performs the matching on `pscore` with options neighbor(1) and common.
#   6. `drop` / `rename`    removes _treated, _nn and the yt* columns, and renames _weight to wlogit.
# `qui` suppresses the output of the command it prefixes.
stata_code = '''
qui ds yt*
qui local vars `r(varlist)'
qui logit tratado `vars'
predict pscore, pr
psmatch2 tratado, pscore(pscore) neighbor(1) common
qui drop _treated _nn `vars'
qui rename _weight wlogit
'''

In [None]:
def get_results(df_stata):
    """Placeholder that is not implemented yet.

    Args:
        df_stata: currently ignored (presumably the DataFrame exported from
            Stata -- TODO confirm once the function is implemented).

    Returns:
        None, always.
    """
    return None

In [None]:
# Partition the original data by role, using the `tratado` and `control` columns.
df_treated = df_orig[df_orig['tratado'] == 1]

df_control = df_orig[df_orig['control'] == 1]
control_ids = df_control['id'].unique().tolist()

# Everything that is not treated (this includes the controls).
df_not_treated = df_orig[df_orig['tratado'] == 0]

# "nini": neither treated nor control.
df_nini = df_orig[(df_orig['tratado'] == 0) & (df_orig['control'] == 0)]
nini_ids = df_nini['id'].unique().tolist()

# Distinct non-zero programme start values; each one defines a cohort.
treatment_starts = df_orig['inicio_prog'][df_orig['inicio_prog'] != 0].unique()

# NOTE: the `[:1]` slice means only the first cohort is processed.
for tr_start in treatment_starts[:1]:
    df_treated_in_cohort = df_treated[df_treated['inicio_prog'] == tr_start]

    control_in_cohort = df_control[df_control['inicio_prog'] == tr_start]
    control_in_cohort_ids = control_in_cohort['id'].unique().tolist()

    # One transform() result per individual: first the cohort's treated units,
    # then every non-treated unit. A comprehension is used so that no loop
    # variable leaks into the notebook namespace; the previous top-level
    # `for id, data in ...` loop rebound the builtin `id` for every later cell.
    transformed_data = [
        transform(unit_id, unit_rows, tr_start, REQ_PERIODS)
        for cohort_part in (df_treated_in_cohort, df_not_treated)
        for unit_id, unit_rows in cohort_part.groupby('id')
    ]

    df_transformed = pd.DataFrame(transformed_data)
    df_transformed = df_transformed.drop(columns=['inicio_prog', 'control'])
    # Normalise column names: lowercase, without '-', '(' or ')'
    # (presumably so they are valid Stata variable names -- TODO confirm).
    df_transformed.columns = [
        col.lower().replace('-', '').replace('(', '').replace(')', '')
        for col in df_transformed.columns
    ]

    # Round trip through Stata: load the frame, run the matching program, and
    # read the resulting dataset back as a DataFrame.
    pystata.stata.pdataframe_to_data(df_transformed, force=True)
    pystata.stata.run(stata_code)
    df_stata = pystata.stata.pdataframe_from_data()

In [None]:
# Preview the first 50 rows of the DataFrame read back from Stata.
df_stata.head(50)

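Opciones de la magia `%%stata` usadas en la celda comentada de más abajo:

* `-d df_transformed`: carga el DataFrame `df_transformed` en Stata como dataset de trabajo.
* `-force`: fuerza la carga del DataFrame aunque el dataset que Stata tiene en memoria haya cambiado desde la última vez que se guardó.
* `-doutd df_stata`: exporta el dataset de Stata al DataFrame `df_stata`, que queda accesible desde Python.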

Sobre el comando `psmatch2` de Stata: [Stata Documentation for the psmatch2 command](https://www.pep-net.org/sites/pep-net.org/files/typo3doc/pdf/Training_Material/statadoc.pdf)

In [None]:
# %%stata -d df_transformed -force -doutd df_stata -qui
# * Listar columnas que tengan el patron yt*
# qui ds yt*

# * El resultado del comando ds es guardado en la macro r(varlist). Guardamos este
# * resultado en una variable local llamada vars
# qui local vars `r(varlist)'

# * Regresión logística con las variables temporales
# qui logit tratado `vars'

# * Predecimos la probabilidad de ser tratado de cada individuo y la almacenamos en una
# * nueva variable llamada pscore. Esto agrega una nueva columna al dataset llamada pscore

# * Predict probability of being treated, store it in a new variable called pscore
# predict pscore, pr

# * Histograma de la probabilidad de ser tratado
# * histogram prob, by(tratado)

# * Hacemos el matching
# psmatch2 tratado, pscore(pscore) neighbor(1) common

# * Eliminamos las columnas auxiliares y renombramos _weight con un nombre más interpretable
# qui drop _treated _nn `vars'
# qui rename _weight wlogit

In [None]:
# Rows of the Stata output that belong to treated individuals.
treated = df_stata.loc[df_stata['tratado'] == 1]

# The individuals selected as controls are the ones listed in the `_n1` column
# of the treated rows. Note that `_n1` refers to the `_id` column, NOT to `id`.
control_ids_stata = treated['_n1']
matched_mask = df_stata['_id'].isin(control_ids_stata)
control_stata = df_stata.loc[matched_mask]
control_in_cohort_ids_pred = control_stata['id'].tolist()

# Non-treated individuals that were not selected as controls are the predicted "nini".
not_treated = df_stata.loc[df_stata['tratado'] == 0]
not_treated_ids = not_treated['id'].unique()
nini_ids_pred = list(set(not_treated_ids) - set(control_in_cohort_ids_pred))

In [None]:
# Ground truth.
#   1 -> true controls of the cohort being evaluated.
#   0 -> "nini" individuals plus the controls that belong to other cohorts.
# The other-cohort controls must be derived from the TRUE cohort controls.
# Subtracting the predicted ones (as was done before) leaked the prediction into
# the ground truth: a missed true control was relabelled 0 (hiding false
# negatives) and a wrongly matched other-cohort control was relabelled 1.
true_1_ids = control_in_cohort_ids
true_0_ids = nini_ids + list(set(control_ids) - set(control_in_cohort_ids))

# Predictions derived from the matching output.
pred_0_ids = nini_ids_pred
pred_1_ids = control_in_cohort_ids_pred

all_ids = list(set(true_0_ids + true_1_ids + pred_0_ids + pred_1_ids))

# Create true and predicted label arrays, aligned with `all_ids`.
# Sets give O(1) membership tests instead of scanning a list for every id.
true_0_lookup = set(true_0_ids)
pred_0_lookup = set(pred_0_ids)
y_true = [0 if unit_id in true_0_lookup else 1 for unit_id in all_ids]
y_pred = [0 if unit_id in pred_0_lookup else 1 for unit_id in all_ids]

In [None]:
# Plot the confusion matrix of true vs. predicted labels using the project helper
# from utils.plots; its result is unpacked into `fig` and `ax`
# (presumably a Matplotlib figure/axes pair -- TODO confirm).
fig, ax = confusion_matrix_plot(y_true, y_pred)