# Workgroup 3

**Group 3**: Valerie Dube, Erzo Garay, Juan Marcos Guerrero and Matías Villalba

## 1. Neyman Orthogonality Proof

## 2. Code Section

### 2.1. Orthogonal Learning

### 2.2. Double Lasso - Using School data

In [151]:
# Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from stargazer.stargazer import Stargazer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV

#### 2.2.1. Preprocessing data

In [152]:
# Load the comma-separated data set from the local data folder
df = pd.read_csv(filepath_or_buffer='./data/bruhn2016.csv', sep=',')

In [153]:
# Preview the first rows to check column names and spot missing values
df.head()

Unnamed: 0,outcome.test.score,treatment,school,is.female,mother.attended.secondary.school,father.attened.secondary.school,failed.at.least.one.school.year,family.receives.cash.transfer,has.computer.with.internet.at.home,is.unemployed,has.some.form.of.income,saves.money.for.future.purchases,intention.to.save.index,makes.list.of.expenses.every.month,negotiates.prices.or.payment.methods,financial.autonomy.index
0,47.367374,0,17018390,,,,,,,1.0,1.0,0.0,29.0,0.0,1.0,52.0
1,58.176758,1,33002614,,,,,,,0.0,0.0,0.0,41.0,0.0,0.0,27.0
2,56.671661,1,35002914,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,48.0,0.0,1.0,56.0
3,29.079376,0,35908915,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.0,0.0,0.0,27.0
4,49.563534,1,33047324,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,50.0,0.0,1.0,31.0


In [154]:
# Keep complete cases only: 5077 rows are lost (from 17299 to 12222 rows).
# The index is rebuilt as 0..n-1 so later positional arrays line up with df.
df = df.dropna(axis=0).reset_index(drop=True)

In [155]:
# List the available columns (outcomes, treatment and covariates)
df.columns

Index(['outcome.test.score', 'treatment', 'school', 'is.female',
       'mother.attended.secondary.school', 'father.attened.secondary.school',
       'failed.at.least.one.school.year', 'family.receives.cash.transfer',
       'has.computer.with.internet.at.home', 'is.unemployed',
       'has.some.form.of.income', 'saves.money.for.future.purchases',
       'intention.to.save.index', 'makes.list.of.expenses.every.month',
       'negotiates.prices.or.payment.methods', 'financial.autonomy.index'],
      dtype='object')

In [156]:
# Outcome columns; everything else in df is treated as treatment or covariate
dependent_vars = [
    'outcome.test.score',
    'intention.to.save.index',
    'negotiates.prices.or.payment.methods',
    'has.some.form.of.income',
    'makes.list.of.expenses.every.month',
    'financial.autonomy.index',
    'saves.money.for.future.purchases',
    'is.unemployed',
]

For the Lasso regressions, we split the data into training and test sets and standardize the covariate matrix.

In [157]:
# Separate outcomes from the remaining columns (treatment is still inside X here)
y = df[dependent_vars]
X = df.drop(columns=dependent_vars)

# 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [158]:
# Pull the treatment indicator out of each split ...
T_train, T_test = X_train['treatment'], X_test['treatment']

# ... and leave only the covariates in the design matrices
X_train, X_test = X_train.drop(columns='treatment'), X_test.drop(columns='treatment')



In [159]:
# Standardize the covariates; the scaler is fitted on the training split only
# and then applied to both splits, so the test split does not influence it
scale = StandardScaler()
scale.fit(X_train)

X_train_scaled = pd.DataFrame(scale.transform(X_train), index=X_train.index)
X_test_scaled = pd.DataFrame(scale.transform(X_test), index=X_test.index)

In [160]:
# Stack both splits back together and sort by index so that the rows are in
# the same order as df (and therefore as y)
X_scaled = pd.concat([X_train_scaled, X_test_scaled], axis=0).sort_index()
T = pd.concat([T_train, T_test], axis=0).sort_index()

#### 2.2.2. Regressions

##### a. OLS

Regressions 1–3 measure the treatment impact on **student financial proficiency**.

Regressions 4–6 measure the treatment impact on **student savings behavior and attitudes**.

Regressions 7–9 measure the treatment impact on **student money management behavior and attitudes**.

Regressions 10–12 measure the treatment impact on **student entrepreneurship and work outcomes**.

In [161]:
def _fit_ols_specs(outcome, data):
    """Fit the three nested OLS specifications used for one outcome.

    Specification 1 regresses the outcome on ``treatment`` only;
    specification 2 adds ``school`` and ``failed.at.least.one.school.year``;
    specification 3 further adds ``is.female`` and
    ``mother.attended.secondary.school``.

    Parameters
    ----------
    outcome : str
        Name of the dependent-variable column in ``data``.
    data : pandas.DataFrame
        Data set holding the outcome, the treatment and the control columns.

    Returns
    -------
    list
        The three fitted results, ordered from specification 1 to 3.
    """
    # Q("...") is required because the column names contain dots
    spec_1 = f'Q("{outcome}") ~ treatment'
    spec_2 = spec_1 + ' + school + Q("failed.at.least.one.school.year")'
    spec_3 = spec_2 + ' + Q("is.female") + Q("mother.attended.secondary.school")'

    fitted = []
    for formula in (spec_1, spec_2, spec_3):
        fitted.append(sm.OLS.from_formula(formula, data=data).fit())
    return fitted


# Regressions with "Student Financial Proficiency" as dependent variable
ols_score_1, ols_score_2, ols_score_3 = _fit_ols_specs('outcome.test.score', df)

# Regressions with "Intention to save index" as dependent variable
ols_saving_1, ols_saving_2, ols_saving_3 = _fit_ols_specs('intention.to.save.index', df)

# Regressions with "Negotiates prices or payment methods" as dependent variable
ols_negotiates_1, ols_negotiates_2, ols_negotiates_3 = _fit_ols_specs('negotiates.prices.or.payment.methods', df)

# Regressions with "Has some form of income" as dependent variable
ols_manage_1, ols_manage_2, ols_manage_3 = _fit_ols_specs('has.some.form.of.income', df)

# Show parameters in table (three columns per dependent variable)
st = Stargazer([ols_score_1, ols_score_2, ols_score_3, ols_saving_1, ols_saving_2, ols_saving_3, ols_negotiates_1, ols_negotiates_2, ols_negotiates_3, ols_manage_1, ols_manage_2, ols_manage_3])
st.custom_columns(["Dependent var 1: Student Financial Proficiency", "Dependent var 2: Intention to save index", "Dependent var 3: Negotiates prices or payment methods", "Dependent var 4: Has some form of income"], [3, 3, 3, 3])
# Give every Q("...") control a readable label, including the mother's-education one
st.rename_covariates({
    'Q("failed.at.least.one.school.year")': 'Failed at least one school year',
    'Q("is.female")': 'Female',
    'Q("mother.attended.secondary.school")': 'Mother attended secondary school',
})
st



0,1,2,3,4,5,6,7,8,9,10,11,12
,,,,,,,,,,,,
,,,,,,,,,,,,
,Dependent var 1: Student Financial Proficiency,Dependent var 1: Student Financial Proficiency,Dependent var 1: Student Financial Proficiency,Dependent var 2: Intention to save index,Dependent var 2: Intention to save index,Dependent var 2: Intention to save index,Dependent var 3: Negotiates prices or payment methods,Dependent var 3: Negotiates prices or payment methods,Dependent var 3: Negotiates prices or payment methods,Dependent var 4: Has some form of income,Dependent var 4: Has some form of income,Dependent var 4: Has some form of income
,(1),(2),(3),(4),(5),(6),(7),(8),(9),(10),(11),(12)
,,,,,,,,,,,,
Intercept,57.591***,59.377***,57.013***,49.016***,46.725***,45.407***,0.763***,0.856***,0.901***,0.639***,0.534***,0.553***
,(0.187),(0.556),(0.583),(0.240),(0.728),(0.767),(0.006),(0.017),(0.018),(0.006),(0.019),(0.020)
Failed at least one school year,,-7.218***,-6.759***,,-3.614***,-3.354***,,0.024***,0.016*,,0.005,0.002
,,(0.288),(0.289),,(0.377),(0.380),,(0.009),(0.009),,(0.010),(0.010)
Female,,,2.836***,,,1.357***,,,-0.066***,,,-0.054***


##### b. Double Lasso using cross validation

Dependent var 1: Student Financial Proficiency

Step 1: We run a Lasso regression of Y (student financial proficiency) on X, and a Lasso regression of T on X

In [162]:
# First-stage Lasso for the outcome: the penalty is chosen by 10-fold
# cross-validation over a fixed grid of alphas, using the training split
lasso_CV_yX = LassoCV(
    alphas=np.arange(0.0001, 0.5, 0.001),
    cv=10,
    max_iter=5000,
).fit(X_train_scaled, y_train['outcome.test.score'])

# Report the penalty selected by cross-validation
lasso_CV_lambda = lasso_CV_yX.alpha_
print(f"Mejor lambda: {lasso_CV_lambda:.4f}")

Mejor lambda: 0.0001


In [163]:
# Predict the outcome for every row (train and test), in df row order,
# so that residuals can be computed on the full sample
y_pred_yX = lasso_CV_yX.predict(X_scaled)

In [164]:
# First-stage Lasso for the treatment: regress T on the standardized covariates,
# choosing the penalty by 10-fold cross-validation over a fixed grid of alphas
lasso_CV_TX = LassoCV(alphas = np.arange(0.0001, 0.5, 0.001), cv = 10, max_iter = 5000)
lasso_CV_TX.fit(X_train_scaled, T_train)

# Report the penalty selected by cross-validation
# (this reuses the name that held the outcome model's penalty)
lasso_CV_lambda = lasso_CV_TX.alpha_
print(f"Mejor lambda: {lasso_CV_lambda:.4f}")

Mejor lambda: 0.0011


In [165]:
# Predict the treatment for every row (train and test), in df row order,
# so that residuals can be computed on the full sample
y_pred_TX = lasso_CV_TX.predict(X_scaled)

Step 2: Obtain the resulting residuals

In [170]:
# Residualize outcome and treatment: observed value minus Lasso prediction.
# The prediction arrays follow the sorted index, i.e. the row order of y and T.
res_yX = y['outcome.test.score'].sub(y_pred_yX)
res_TX = T.sub(y_pred_TX)

Step 3: We run an OLS regression of res_yX on res_TX

In [172]:
# Final stage: OLS of the outcome residuals on the treatment residuals.
# The residuals are passed in their own DataFrame so the formula variables are
# resolved from `data` itself rather than from whatever globals happen to
# exist (df holds neither res_yX nor res_TX).
ols_score = sm.OLS.from_formula(
    'res_yX ~ res_TX',
    data=pd.DataFrame({'res_yX': res_yX, 'res_TX': res_TX}),
).fit()

# Show parameters in table
st = Stargazer([ols_score])
st



0,1
,
,Dependent variable: res_yX
,
,(1)
,
Intercept,0.033
,(0.126)
res_TX,4.324***
,(0.253)
Observations,12222


##### c. Double Lasso using theoretical lambda

##### d. Double Lasso using partialling out method

#### Results

We know that p/n is small (8/12222 ≈ 0.00065).