In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Example: Instrumental Variables Estimation of the Effect of Social Spillovers on Movie-going

This notebook will illustrate the entire supervised machine learning process in the context of predicting movie attendance based on the weather on opening weekend.

### Figure out your question

What is the effect of opening-weekend attendance on subsequent weekend attendance at a movie?

## Obtain a labeled dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
moviedata=pd.read_csv('/content/gdrive/My Drive/Econ 484/datasets/movies_cleaned.csv')
print(moviedata.head())
print("Shape: {}".format(str(moviedata.shape)))

   x_openingsales  open_res_own_snow_6  ...  open_res_own_prec_5_0  y_ticketsales
0        0.441295             0.035468  ...               0.068204       0.231157
1        2.056574             0.001406  ...              -0.024064       1.293781
2        1.516240             0.003928  ...              -0.016685       0.973964
3        5.888784             0.104670  ...               0.058988       3.219681
4        3.570493             0.071977  ...              -0.003570       2.345072

[5 rows x 54 columns]
Shape: (1671, 54)


Let's define our "label" (y) vector, our "treatment" vector (d), and our instrument matrix (Z):

In [None]:
y = moviedata.loc[:,'y_ticketsales']
d = moviedata.loc[:,['x_openingsales']]
Z = moviedata.filter(like='open_',axis=1)
print('our y vector is:\n',y.head)
print('our d vector is:\n',d.head)
print('our instrument matrix is:\n',Z.head)

our y vector is:
 <bound method NDFrame.head of 0       0.231157
1       1.293781
2       0.973964
3       3.219681
4       2.345072
          ...   
1666    3.459683
1667    3.373921
1668    1.825269
1669    1.152968
1670    1.050490
Name: y_ticketsales, Length: 1671, dtype: float64>
our d vector is:
 <bound method NDFrame.head of       x_openingsales
0           0.441295
1           2.056574
2           1.516240
3           5.888784
4           3.570493
...              ...
1666        9.754463
1667        6.293130
1668        2.908898
1669        1.672062
1670        3.026325

[1671 rows x 1 columns]>
our instrument matrix is:
 <bound method NDFrame.head of       open_res_own_snow_6  ...  open_res_own_prec_5_0
0                0.035468  ...               0.068204
1                0.001406  ...              -0.024064
2                0.003928  ...              -0.016685
3                0.104670  ...               0.058988
4                0.071977  ...              -0.003570
...    

## Start with OLS of y on d. Be sure to print import necessary packages and print out coefficients!

### Try yourself first!

In [None]:
from sklearn import linear_model

ols = linear_model.LinearRegression()
ols_reg = ols.fit(d,y)
print('OLS coefficient: ',ols_reg.coef_)

OLS coefficient:  [0.4712296]


### Cheat if you need to

In [None]:
from sklearn import linear_model

ols = linear_model.LinearRegression()
ols_reg = ols.fit(d,y)
print('OLS coefficient: ',ols_reg.coef_)

## Now do "manual" two-stage least squares where you first regress d on Z, obtain predicted values, then regress y on the predicted values. Be sure to print out final coefficient on d-hat!

### Try yourself first

In [None]:
first_stage = linear_model.LinearRegression().fit(Z,d)
d_hat = first_stage.predict(Z)
second_stage = linear_model.LinearRegression().fit(d_hat,y)
print(second_stage.coef_)

[0.55182418]


### Cheat if you need to

In [None]:
ols_fs = ols.fit(Z,d)
dhat = ols_fs.predict(Z)
tsls = ols.fit(dhat,y)
print('2SLS coefficient: ',tsls.coef_)


2SLS coefficient:  [0.55182418]


## Now do ML-augmented two-stage least squares using Random Forest to obtain the fitted values

###Try yourself first

In [None]:
# import necessary packages and create prediction "object"
# first grow random forest: create d-hat that is less overfit as ols
####EXPLANATION OF WHY THIS WORKS CORRECTLY in MY WRITTEN NOTES
from sklearn.ensemble import RandomForestRegressor
tree = RandomForestRegressor(max_depth=2, max_features='sqrt').fit(Z,d)

#Then use d_hat in 2sls. so then you get a new d_hat ols and regress it on y
# now get random forest predictions to use as instrument:
predictions = tree.predict(Z)
print(predictions)
# do "first stage" using random forest predictions as instrument:
fstage = linear_model.LinearRegression().fit(np.reshape(predictions,(-1,1)),d)
fstage_hat = fstage.predict(np.reshape(predictions,(-1,1)))
# finally, 2nd stage regression:
second = linear_model.LinearRegression().fit(fstage_hat,y)
print(second.coef_)

  after removing the cwd from sys.path.


[2.48262527 2.43507865 2.55821101 ... 2.41848227 2.41848227 2.41848227]
[0.44975107]


### Cheat

In [None]:
# import necessary packages and create prediction "object"
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(max_depth=2,max_features='sqrt')
# first grow random forest:
rf_fs=rf.fit(Z,np.ravel(d))
# now get random forest predictions to use as instrument:
iv_rf=np.reshape(rf_fs.predict(Z),(-1,1))
# do "first stage" using random forest predictions as instrument:
fs_rf=ols.fit(iv_rf,d)
dhat_rf=fs_rf.predict(iv_rf)
# finally, 2nd stage regression:
tsls_rf=ols.fit(np.reshape(dhat_rf,(-1,1)),y)
print('2SLS+Random Forest coefficient: ',tsls_rf.coef_)

##Now do Belloni, Chernozhukov, Hansen Post-Lasso 2SLS

### Try yourself first

In [None]:
# hint: to select the columns of a matrix corresponding to a set of nonzero coefficients, you can do something like:
#don't forget to scale the z's before doing lasso
# Z_selected = Z[:,model.coef_!=0]

from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler() #create scaler object
scaler.fit(Z) #feed the scaler object the x
z_scaled = scaler.transform(Z)


lasso = linear_model.Lasso(alpha=0.1)
lasso_model = lasso.fit(z_scaled,y)
print(lasso_model.coef_)

##Unfinished but follow process in notes. THis is cool

[ 0.          0.         -0.         -0.         -0.         -0.
  0.         -0.          0.          0.          0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -0.         -0.
 -0.         -0.00460241 -0.         -0.          0.         -0.
  0.          0.          0.          0.          0.          0.
 -0.         -0.         -0.          0.         -0.         -0.
  0.         -0.         -0.         -0.        ]


### Cheat

In [None]:
# Lasso tends to work better with standardized variables, so:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(Z)
Z_scaled = scaler.transform(Z)

# create lasso object, setting the penalty parameter
lasso=linear_model.Lasso(alpha=.1)

# predict d using Z_scaled:
lasso.fit(Z_scaled,d)

# grab just the Zs with nonzero coeffs
Z_selected=Z_scaled[:,lasso.coef_!=0]

# do the first stage regression via OLS using the selected Zs and get the fitted values:
postlasso_fs = ols.fit(Z_selected,d)
dhat_postlasso = postlasso_fs.predict(Z_selected)

# do 2nd stage regression using the post-lasso fitted values:
tsls_postlasso = ols.fit(dhat_postlasso,y)
print('Post-Lasso 2SLS coefficient: ',tsls_postlasso.coef_)


## Now go back to ML-augmented 2SLS and try with several different prediction methods