<b> Replication Notebooks for [The Impact of High School Financial Education: Evidence from a Large-Scale Evaluation in Brazil]( https://doi.org/10.1257/app.20150149) </b>

By [Charlie Zhang](mailto:charlie.zhang@georgetown.edu) and Hannah Roemer

In [1]:
import os
os.chdir("../")

import pandas as pd
import numpy as np
import statsmodels as sm
import statsmodels.formula.api as smf
import scipy

import warnings
warnings.filterwarnings("ignore")

In [2]:
panel_final = os.getcwd() + "/Data/school_intervention_panel_final.dta"

In [3]:
pf = pd.read_stata(panel_final)
pf.head(5)

Unnamed: 0,id_geral,cd_escola,nm_uf_bl,matriculas,docentes,abandonona1sriemdio,aprovaona1sriemdio,treatment,pair_all,treatment_workshop,...,dumm_rp_18p_fup,dumm_rp_19p_fup,dumm_rp_21p_fup,dumm_rp_23p_fup,dumm_formal_saving_fup,dumm_rp_33p_fup,dumm_rp_34p_fup,dumm_rp_36p_fup,dumm_rp_37p_fup,dumm_rp_41p_fup
0,1,17002648.0,TOCANTINS,273.0,34.0,6.8,76.7,yes,17017.0,,...,,,,,,,,,,
1,1,17002648.0,TOCANTINS,273.0,34.0,6.8,76.7,yes,17017.0,,...,,,,,,,,,,
2,10,17018390.0,TOCANTINS,641.0,29.0,4.2,90.7,no,17008.0,,...,1.0,1.0,,,1.0,1.0,0.0,0.0,0.0,
3,10,17018390.0,TOCANTINS,641.0,29.0,4.2,90.7,no,17008.0,,...,,,,,,,,,,
4,100,33002614.0,RIO DE JANEIRO,199.0,35.0,13.3,80.0,yes,33031.0,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,


In [4]:
pf.columns

Index(['id_geral', 'cd_escola', 'nm_uf_bl', 'matriculas', 'docentes',
       'abandonona1sriemdio', 'aprovaona1sriemdio', 'treatment', 'pair_all',
       'treatment_workshop',
       ...
       'dumm_rp_18p_fup', 'dumm_rp_19p_fup', 'dumm_rp_21p_fup',
       'dumm_rp_23p_fup', 'dumm_formal_saving_fup', 'dumm_rp_33p_fup',
       'dumm_rp_34p_fup', 'dumm_rp_36p_fup', 'dumm_rp_37p_fup',
       'dumm_rp_41p_fup'],
      dtype='object', length=212)

In [5]:
variables = pd.read_stata(panel_final, iterator="True")
labels = variables.variable_labels()

In [6]:
variables.value_labels()

{'rp_41p_fup': {1: 'only expenses',
  2: 'only income',
  3: 'both',
  4: "don't know"},
 'rp_37p': {1: 'more than today',
  2: 'the sames as today',
  3: 'less than today',
  4: "don't know"},
 'rp_36p': {1: 'bank a', 2: 'bank b', 3: "don't know"},
 'rp_34p': {1: 'yes', 2: 'no'},
 'rp_33p': {1: 'yes', 2: 'no'},
 'rp_23p': {1: 'yes', 2: 'no', 3: "don't know"},
 'rp_21p': {1: 'yes', 2: 'no', 3: "don't know"},
 'rp_19p': {1: 'yes', 2: 'no', 3: "don't know"},
 'rp_18p': {1: 'yes', 2: 'no', 3: "don't know"},
 'rp_14p': {1: 'yes, always', 2: 'yes, somtimes', 3: 'no', 4: "don't know"},
 'rp_13p_fup': {1: 'nothing',
  2: 'up to 25%',
  3: 'between 26% and 50%',
  4: 'between 51% and 75%',
  5: 'between 76% and 100%'},
 'rp_09p': {1: 'formal employee',
  2: 'informal employee',
  3: 'businessperson',
  4: 'independent professional',
  5: 'self-employed',
  6: 'government employee',
  7: 'housewife',
  8: 'retired',
  9: 'pensioner',
  10: 'unemployed/no income',
  11: 'other'},
 'rp_08p': {1: 

## Table 1

In [7]:
# Line 41-43
pf_t1 = (pf[(pf["round"] == "yes")& (pf["treatment"].isna() == False)]
      .sort_values(by=["cd_escola", "id_geral"])
      .reset_index()
      .drop("index", axis=1))
pf_t1.head(5)

Unnamed: 0,id_geral,cd_escola,nm_uf_bl,matriculas,docentes,abandonona1sriemdio,aprovaona1sriemdio,treatment,pair_all,treatment_workshop,...,dumm_rp_18p_fup,dumm_rp_19p_fup,dumm_rp_21p_fup,dumm_rp_23p_fup,dumm_formal_saving_fup,dumm_rp_33p_fup,dumm_rp_34p_fup,dumm_rp_36p_fup,dumm_rp_37p_fup,dumm_rp_41p_fup
0,10166,17000386.0,TOCANTINS,414.0,42.0,9.6,65.2,yes,17006.0,,...,,,,,,,,,,
1,10439,17000386.0,TOCANTINS,414.0,42.0,9.6,65.2,yes,17006.0,,...,,,,,,,,,,
2,10448,17000386.0,TOCANTINS,414.0,42.0,9.6,65.2,yes,17006.0,,...,,,,,,,,,,
3,10568,17000386.0,TOCANTINS,414.0,42.0,9.6,65.2,yes,17006.0,,...,,,,,,,,,,
4,13397,17000386.0,TOCANTINS,414.0,42.0,9.6,65.2,yes,17006.0,,...,,,,,,,,,,


In [8]:
test = [
    "female", "dumm_rp_08_bl", "dumm_rp_09_bl", "dumm_rp_24_bl",
    "dumm_rp_14_bl", "dumm_rp_23_bl", "vl_proficiencia_bl"
]
aluno = [
    "dumm_rp_49_bl", "dumm_rp_50_bl", "dumm_rp_65A_bl", "poupar_final2_bl",
    "dumm_rp_64A_bl", "dumm_negotiates_bl", "autonomia_final2_bl"
]
school = [
    "matriculas", "docentes", "abandonona1sriemdio", "aprovaona1sriemdio"
]
xvars = test + aluno

In [9]:
def ttest(stats, var, alpha=0.05):

    mean_1, mean_2 = stats[(var, "mean")]
    std_1, std_2 = stats[(var, "std")]
    n_1, n_2 = stats[(var, "count")]
    se_1, se_2 = std_1 / np.sqrt(n_1), std_2 / np.sqrt(n_2)
    sed = np.sqrt(se_1**2.0 + se_1**2.0)
    t_stat = (mean_1 - mean_2) / sed
    df = n_1 + n_2 - 2
    cv = scipy.stats.t.ppf(1.0 - alpha, df)
    p = (1.0 - scipy.stats.t.cdf(abs(t_stat), df)) * 2.0
    return p

In [10]:
t1p1 = pd.DataFrame()
for var in school:
    temp = ["treatment", "cd_escola"]
    temp.append(var)
    pf_temp = (pf_t1[temp].drop_duplicates()[["treatment", var]])
    stats = pf_temp.groupby("treatment").agg({var: ["mean", "std", "count"]})
    stats[(var, "t-test")] = ttest(stats, var)
    t1p1 = pd.concat([t1p1, stats], axis=1).round(4)

display(t1p1)

Unnamed: 0_level_0,matriculas,matriculas,matriculas,matriculas,docentes,docentes,docentes,docentes,abandonona1sriemdio,abandonona1sriemdio,abandonona1sriemdio,abandonona1sriemdio,aprovaona1sriemdio,aprovaona1sriemdio,aprovaona1sriemdio,aprovaona1sriemdio
Unnamed: 0_level_1,mean,std,count,t-test,mean,std,count,t-test,mean,std,count,t-test,mean,std,count,t-test
treatment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
no,642.8363,461.6997,452,0.2156,37.4949,24.0912,392,0.6504,11.0351,11.2233,447,0.3488,68.051,16.0285,447,0.7751
yes,680.8932,514.618,440,0.2156,38.2751,25.7419,378,0.6504,11.7389,11.8174,435,0.3488,67.7446,15.9359,435,0.7751


In [11]:
t1p2 = pd.DataFrame()
for var in xvars:
    temp = ["treatment", "id_geral"]
    temp.append(var)
    pf_temp = (pf_t1[temp].drop_duplicates()[["treatment", var]])
    stats = pf_temp.groupby("treatment").agg({var: ["mean", "std", "count"]})
    stats[(var, "t-test")] = ttest(stats, var)
    t1p2 = pd.concat([t1p2, stats], axis=1).round(3)

display(t1p2.T)

Unnamed: 0,treatment,no,yes
female,mean,0.549,0.563
female,std,0.498,0.496
female,count,11954.0,11529.0
female,t-test,0.034,0.034
dumm_rp_08_bl,mean,0.446,0.435
dumm_rp_08_bl,std,0.497,0.496
dumm_rp_08_bl,count,11774.0,11350.0
dumm_rp_08_bl,t-test,0.111,0.111
dumm_rp_09_bl,mean,0.414,0.405
dumm_rp_09_bl,std,0.493,0.491


In [12]:
t1p2_cols = t1p2.columns.to_list()
t1p2_new_cols = []
for col in t1p2_cols:
    a, b = col
    a = labels[a]
    t1p2_new_cols.append((a,b))
t1p2.columns = pd.MultiIndex.from_tuples(t1p2_new_cols)

In [13]:
t1p2.T.round(2)

Unnamed: 0,treatment,no,yes
Student is female,mean,0.55,0.56
Student is female,std,0.5,0.5
Student is female,count,11954.0,11529.0
Student is female,t-test,0.03,0.03
Education of student's mother: At least some secondary,mean,0.45,0.44
Education of student's mother: At least some secondary,std,0.5,0.5
Education of student's mother: At least some secondary,count,11774.0,11350.0
Education of student's mother: At least some secondary,t-test,0.11,0.11
Education of student's father: At least some secondary,mean,0.41,0.41
Education of student's father: At least some secondary,std,0.49,0.49


## Table 2

In [14]:
# Line 199-201
pf_t2 = (pf[(pf["round"] == "no") & (pf["treatment"].isna() == False) &
            (pf["fu1_test"] == 1) | (pf["fu1_aluno"] == 1)]
         .sort_values(by=["cd_escola", "id_geral"])
         .reset_index()
         .drop("index",axis=1))
pf_t2.head(5)

Unnamed: 0,id_geral,cd_escola,nm_uf_bl,matriculas,docentes,abandonona1sriemdio,aprovaona1sriemdio,treatment,pair_all,treatment_workshop,...,dumm_rp_18p_fup,dumm_rp_19p_fup,dumm_rp_21p_fup,dumm_rp_23p_fup,dumm_formal_saving_fup,dumm_rp_33p_fup,dumm_rp_34p_fup,dumm_rp_36p_fup,dumm_rp_37p_fup,dumm_rp_41p_fup
0,10166,17000386.0,TOCANTINS,414.0,42.0,9.6,65.2,yes,17006.0,,...,1.0,1.0,,1.0,1.0,1.0,0.0,1.0,0.0,
1,10439,17000386.0,TOCANTINS,414.0,42.0,9.6,65.2,yes,17006.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,
2,10448,17000386.0,TOCANTINS,414.0,42.0,9.6,65.2,yes,17006.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,
3,10568,17000386.0,TOCANTINS,414.0,42.0,9.6,65.2,yes,17006.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,
4,13397,17000386.0,TOCANTINS,414.0,42.0,9.6,65.2,yes,17006.0,,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,


In [15]:
pf_t2["female_coded"] = pf_t2["female_coded"].map({"yes": 1, "no": 0,})
pf_t2["female_coded"].value_counts()

1.0    11029
0.0     8685
Name: female_coded, dtype: int64

In [16]:
responsavel = ["p_employee_fup", "p_selfempl_fup", "p_other_fup"]
for var in responsavel:
    pf_temp = pf_t2[["treatment", "cd_escola", var]].dropna().reset_index().drop("index", axis=1)
    var_res = smf.ols(str(var) + "~treatment", data=pf_temp).fit(
        cov_type='cluster', cov_kwds={'groups': pf_temp["cd_escola"]})
    display(var_res.summary())

0,1,2,3
Dep. Variable:,p_employee_fup,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.04533
Date:,"Mon, 17 Jul 2023",Prob (F-statistic):,0.831
Time:,16:42:08,Log-Likelihood:,-9817.9
No. Observations:,13540,AIC:,19640.0
Df Residuals:,13538,BIC:,19650.0
Df Model:,1,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.4802,0.008,59.314,0.000,0.464,0.496
treatment[T.yes],0.0024,0.011,0.213,0.831,-0.020,0.025

0,1,2,3
Omnibus:,46608.087,Durbin-Watson:,1.925
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2256.631
Skew:,0.074,Prob(JB):,0.0
Kurtosis:,1.005,Cond. No.,2.71


0,1,2,3
Dep. Variable:,p_selfempl_fup,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.03395
Date:,"Mon, 17 Jul 2023",Prob (F-statistic):,0.854
Time:,16:42:08,Log-Likelihood:,-6226.6
No. Observations:,13540,AIC:,12460.0
Df Residuals:,13538,BIC:,12470.0
Df Model:,1,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.1796,0.005,33.476,0.000,0.169,0.190
treatment[T.yes],-0.0014,0.008,-0.184,0.854,-0.016,0.014

0,1,2,3
Omnibus:,3436.679,Durbin-Watson:,1.98
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6705.819
Skew:,1.676,Prob(JB):,0.0
Kurtosis:,3.808,Cond. No.,2.71


0,1,2,3
Dep. Variable:,p_other_fup,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.008471
Date:,"Mon, 17 Jul 2023",Prob (F-statistic):,0.927
Time:,16:42:08,Log-Likelihood:,-9091.8
No. Observations:,13540,AIC:,18190.0
Df Residuals:,13538,BIC:,18200.0
Df Model:,1,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.3401,0.008,43.014,0.000,0.325,0.356
treatment[T.yes],-0.0010,0.011,-0.092,0.927,-0.023,0.021

0,1,2,3
Omnibus:,75465.707,Durbin-Watson:,1.919
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2375.496
Skew:,0.677,Prob(JB):,0.0
Kurtosis:,1.459,Cond. No.,2.71


Unnamed: 0,treatment,cd_escola,female_coded
0,1,17002648.0,0.0
1,1,17002648.0,0.0
2,0,17018390.0,0.0
3,0,17018390.0,0.0
4,1,33002614.0,1.0
...,...,...,...
70503,0,53007514.0,1.0
70504,1,33075212.0,1.0
70505,1,33075212.0,1.0
70506,0,35907637.0,1.0


In [23]:
pf_temp = pf[["treatment", "cd_escola", "female_coded"]]
pf_temp = pf_temp[pf_temp.female_coded != "missing"].reset_index().drop("index", axis=1)
pf_temp["female_coded"] = pf_temp["female_coded"].map({"yes": 1, "no": 0})
pf_temp["treatment"] = pf_temp["treatment"].map({"yes": 1, "no": 0})
pf_temp = pf_temp.dropna()
fc_res = smf.ols("female_coded~treatment", data=pf_temp).fit(cov_type='cluster', 
                                                             cov_kwds={'groups': pf_temp["cd_escola"]})
fc_res.summary()

0,1,2,3
Dep. Variable:,female_coded,R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,5.573
Date:,"Mon, 17 Jul 2023",Prob (F-statistic):,0.0185
Time:,16:44:04,Log-Likelihood:,-50600.0
No. Observations:,70480,AIC:,101200.0
Df Residuals:,70478,BIC:,101200.0
Df Model:,1,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.5542,0.005,122.972,0.000,0.545,0.563
treatment[T.1],0.0155,0.007,2.361,0.018,0.003,0.028

0,1,2,3
Omnibus:,250188.183,Durbin-Watson:,0.566
Prob(Omnibus):,0.0,Jarque-Bera (JB):,11746.753
Skew:,-0.25,Prob(JB):,0.0
Kurtosis:,1.063,Cond. No.,2.61


## Table 3

In [24]:
incl_vars = ["vl_proficiencia_fup", "treatment", "cd_escola"]
for rd in ["no", "yes"]:
    data = (pf[pf["round"] == rd][incl_vars]
            .dropna().reset_index().drop("index", axis=1))
    school_id = data["cd_escola"]
    t3_nc = smf.ols("vl_proficiencia_fup~treatment", data=data)
    t3_nc_res = t3_nc.fit(cov_type='cluster', cov_kwds={'groups': school_id})
    display(t3_nc_res.summary())

0,1,2,3
Dep. Variable:,vl_proficiencia_fup,R-squared:,0.021
Model:,OLS,Adj. R-squared:,0.021
Method:,Least Squares,F-statistic:,55.23
Date:,"Mon, 17 Jul 2023",Prob (F-statistic):,2.61e-13
Time:,16:44:20,Log-Likelihood:,-74895.0
No. Observations:,18276,AIC:,149800.0
Df Residuals:,18274,BIC:,149800.0
Df Model:,1,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,56.0501,0.425,131.941,0.000,55.217,56.883
treatment[T.yes],4.2661,0.574,7.432,0.000,3.141,5.391

0,1,2,3
Omnibus:,729.291,Durbin-Watson:,1.922
Prob(Omnibus):,0.0,Jarque-Bera (JB):,317.535
Skew:,0.039,Prob(JB):,1.12e-69
Kurtosis:,2.359,Cond. No.,2.65


0,1,2,3
Dep. Variable:,vl_proficiencia_fup,R-squared:,0.012
Model:,OLS,Adj. R-squared:,0.012
Method:,Least Squares,F-statistic:,32.93
Date:,"Mon, 17 Jul 2023",Prob (F-statistic):,1.33e-08
Time:,16:44:20,Log-Likelihood:,-77954.0
No. Observations:,18953,AIC:,155900.0
Df Residuals:,18951,BIC:,155900.0
Df Model:,1,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,59.0446,0.409,144.341,0.000,58.243,59.846
treatment[T.yes],3.2938,0.574,5.739,0.000,2.169,4.419

0,1,2,3
Omnibus:,748.943,Durbin-Watson:,1.822
Prob(Omnibus):,0.0,Jarque-Bera (JB):,342.039
Skew:,0.089,Prob(JB):,5.34e-75
Kurtosis:,2.367,Cond. No.,2.63


In [25]:
pf = pd.read_stata(panel_final)
for rd in ["no", "yes"]:
    incl_vars_ctrl = [
        "vl_proficiencia_fup", "treatment", "cd_escola", "pair_all"
    ]
    data = (pf[incl_vars_ctrl][(
        pf["round"] == rd)].dropna().reset_index().drop("index", axis=1))
    data["treatment"] = data["treatment"].map({
        "yes": 1,
        "no": 0
    }).astype("float")
   
    flag_temp = pd.DataFrame(
        data.groupby("pair_all")["treatment"].mean().reset_index().rename(
            {"treatment": "flag"}, axis=1))
    data = data.merge(flag_temp, how="left", on="pair_all")

    pairall = []
    for par, flg in zip(data["pair_all"], data["flag"]):
        if (flg > 0) & (flg < 1):
            pairall.append(par)
        else:
            pairall.append(0)
    data["pair_all"] = pairall

    t3_c = smf.ols("vl_proficiencia_fup~treatment+C(pair_all)", data=data)
    t3_c_res = t3_c.fit(cov_type='cluster',
                        cov_kwds={'groups': data["cd_escola"]})
    display(t3_c_res.summary())

0,1,2,3
Dep. Variable:,vl_proficiencia_fup,R-squared:,0.182
Model:,OLS,Adj. R-squared:,0.164
Method:,Least Squares,F-statistic:,14670.0
Date:,"Mon, 17 Jul 2023",Prob (F-statistic):,0.0
Time:,16:44:26,Log-Likelihood:,-73257.0
No. Observations:,18276,AIC:,147300.0
Df Residuals:,17882,BIC:,150400.0
Df Model:,393,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,56.2995,1.090,51.665,0.000,54.164,58.435
C(pair_all)[T.17001.0],-0.6258,1.187,-0.527,0.598,-2.953,1.701
C(pair_all)[T.17002.0],-1.3422,2.519,-0.533,0.594,-6.279,3.594
C(pair_all)[T.17003.0],-1.5951,4.139,-0.385,0.700,-9.708,6.518
C(pair_all)[T.17004.0],-1.6417,3.071,-0.535,0.593,-7.661,4.377
C(pair_all)[T.17005.0],-4.7960,1.980,-2.422,0.015,-8.677,-0.915
C(pair_all)[T.17006.0],-3.6116,4.103,-0.880,0.379,-11.653,4.429
C(pair_all)[T.17007.0],1.4525,1.769,0.821,0.412,-2.014,4.919
C(pair_all)[T.17008.0],-0.3619,1.097,-0.330,0.742,-2.513,1.789

0,1,2,3
Omnibus:,173.569,Durbin-Watson:,1.947
Prob(Omnibus):,0.0,Jarque-Bera (JB):,118.183
Skew:,0.066,Prob(JB):,2.1700000000000002e-26
Kurtosis:,2.629,Cond. No.,91.2


0,1,2,3
Dep. Variable:,vl_proficiencia_fup,R-squared:,0.171
Model:,OLS,Adj. R-squared:,0.154
Method:,Least Squares,F-statistic:,277.3
Date:,"Mon, 17 Jul 2023",Prob (F-statistic):,0.0
Time:,16:44:27,Log-Likelihood:,-76290.0
No. Observations:,18953,AIC:,153400.0
Df Residuals:,18561,BIC:,156400.0
Df Model:,391,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,57.7120,1.092,52.863,0.000,55.572,59.852
C(pair_all)[T.17001.0],4.2310,1.069,3.957,0.000,2.135,6.327
C(pair_all)[T.17002.0],2.1128,4.990,0.423,0.672,-7.667,11.893
C(pair_all)[T.17003.0],-2.0471,3.675,-0.557,0.578,-9.250,5.156
C(pair_all)[T.17004.0],-0.0948,3.492,-0.027,0.978,-6.939,6.749
C(pair_all)[T.17005.0],0.4167,5.842,0.071,0.943,-11.033,11.867
C(pair_all)[T.17006.0],2.3491,2.457,0.956,0.339,-2.466,7.164
C(pair_all)[T.17007.0],2.4467,2.034,1.203,0.229,-1.539,6.433
C(pair_all)[T.17008.0],-5.0975,6.284,-0.811,0.417,-17.413,7.218

0,1,2,3
Omnibus:,218.895,Durbin-Watson:,1.918
Prob(Omnibus):,0.0,Jarque-Bera (JB):,153.195
Skew:,0.106,Prob(JB):,5.42e-34
Kurtosis:,2.614,Cond. No.,88.5


In [27]:
pf = pd.read_stata(panel_final)
incl_vars_ctrl = [
    "vl_proficiencia_fup",
    "vl_proficiencia_bl",
    "treatment",
    "cd_escola",
    "pair_all",
    "female_coded"  ##, "p_employee_fup", "p_selfempl_fup",
]
data = (
    pf[incl_vars_ctrl][(pf["round"] == "no")].reset_index().drop(
                           "index", axis=1))

data["female_coded"] = data["female_coded"].map({
    "yes": 1,
    "no": 0,
    "missing": 9
}).astype("float")

data["treatment"] = data["treatment"].map({"yes": 1, "no": 0}).astype("float")

flag_temp = pd.DataFrame(
    data.groupby("pair_all")["treatment"].mean().reset_index().rename(
        {"treatment": "flag"}, axis=1))
data = data.merge(flag_temp, how="left", on="pair_all")

data["miss_vl_proficiencia_bl"] = np.where(
    data["vl_proficiencia_bl"].isna() == True, 1, 0)
data["vl_proficiencia_bl"] = data["miss_vl_proficiencia_bl"].fillna(0)

pairall = []
for par, flg in zip(data["pair_all"], data["flag"]):
    if (flg > 0) & (flg < 1):
        pairall.append(par)
    elif flg == 0:
        pairall.append(0)
    elif flg == 1:
        pairall.append(0)
    else:
        pairall.append(0)
data["pair_all"] = pairall

data = data.dropna().reset_index().drop("index", axis=1)
t3_c = smf.ols(
    "vl_proficiencia_fup~treatment+C(pair_all)+female_coded+vl_proficiencia_bl+miss_vl_proficiencia_bl",
    data=data)
t3_c_res = t3_c.fit(cov_type='cluster', cov_kwds={'groups': data["cd_escola"]})
display(t3_c_res.summary())

0,1,2,3
Dep. Variable:,vl_proficiencia_fup,R-squared:,0.204
Model:,OLS,Adj. R-squared:,0.185
Method:,Least Squares,F-statistic:,0.1307
Date:,"Mon, 17 Jul 2023",Prob (F-statistic):,0.877
Time:,16:47:41,Log-Likelihood:,-73005.0
No. Observations:,18276,AIC:,146900.0
Df Residuals:,17843,BIC:,150300.0
Df Model:,432,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,53.3332,1.649,32.347,0.000,50.102,56.565
C(pair_all)[T.17001.0],1.5004,1.711,0.877,0.380,-1.852,4.853
C(pair_all)[T.17002.0],0.8434,2.792,0.302,0.763,-4.629,6.315
C(pair_all)[T.17003.0],0.2494,4.158,0.060,0.952,-7.901,8.399
C(pair_all)[T.17004.0],0.4954,3.167,0.156,0.876,-5.712,6.703
C(pair_all)[T.17005.0],-2.5060,2.314,-1.083,0.279,-7.041,2.029
C(pair_all)[T.17006.0],-1.8912,4.189,-0.451,0.652,-10.102,6.319
C(pair_all)[T.17007.0],3.6646,2.093,1.751,0.080,-0.438,7.767
C(pair_all)[T.17008.0],1.8096,1.645,1.100,0.271,-1.415,5.035

0,1,2,3
Omnibus:,130.53,Durbin-Watson:,1.968
Prob(Omnibus):,0.0,Jarque-Bera (JB):,96.384
Skew:,0.075,Prob(JB):,1.18e-21
Kurtosis:,2.678,Cond. No.,494000000000000.0
