## Welcome

This is material for the **Synthetic Control** chapter in Scott Cunningham's book, [Causal Inference: The Mixtape](https://mixtape.scunning.com/).



In [None]:
# This cell makes sure the R package `Synth` is available. We utilize R via rpy2.

from rpy2.robjects.packages import importr, isinstalled
from rpy2.robjects.conversion import localconverter
utils = importr('utils')
# Install only when the package is missing, so that re-running the notebook
# does not download and rebuild Synth every time.
if not isinstalled('Synth'):
    utils.install_packages('Synth')

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

from rpy2 import robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()
from rpy2.robjects.vectors import IntVector

import plotnine as p
import matplotlib.pyplot as plt

%matplotlib inline



In [None]:
import ssl
# NOTE(review): this swaps ssl's default HTTPS context factory for the
# unverified one, so HTTPS requests that rely on that default skip certificate
# verification for the rest of the kernel session, not only inside read_data.
# Presumably a workaround for certificate errors when downloading the data;
# confirm it is still needed before reusing this cell elsewhere.
ssl._create_default_https_context = ssl._create_unverified_context

def read_data(file):
    """Read a Stata file from the mixtape GitHub repository.

    Parameters
    ----------
    file : str
        File name relative to the repository's master branch,
        e.g. ``"texas.dta"``.

    Returns
    -------
    Whatever ``pd.read_stata`` returns for the resulting URL.
    """
    base_url = "https://raw.github.com/scunning1975/mixtape/master/"
    dataset_url = base_url + file
    return pd.read_stata(dataset_url)

## Prison Construction and Black Male Incarceration


In [None]:
# Fetch texas.dta through read_data (defined above), which reads it with pd.read_stata.
texas = read_data("texas.dta")

In [None]:
# Import the R package Synth through rpy2; its dataprep() and synth() functions are called below.
Synth = importr('Synth')

In [None]:
# Donor pool: every statefip code from 1 to 56 except 48 (the treated unit passed
# as treatment_identifier below) and the codes 3, 7, 14, 43 and 52, which are not
# assigned to any state in the FIPS scheme.
# range() excludes its upper bound, so the 44-47 run needs range(44, 48); the
# previous range(44, 47) silently dropped unit 47 from the donor pool.
control_units = (
    [1, 2, 4, 5, 6]
    + list(range(8, 14))     # 8-13
    + list(range(15, 43))    # 15-42
    + list(range(44, 48))    # 44-47
    + [49, 50, 51, 53, 54, 55, 56]
)

# Make the panel visible inside the embedded R session under the name `texas`.
robjects.globalenv['texas'] = texas

# 1985-1993 (np.arange excludes the stop value): passed both as
# time_predictors_prior and as time_optimize_ssr.
pre_period = np.arange(1985, 1994)
# 1985-2000: the years passed as time_plot.
plot_period = np.arange(1985, 2001)

# Predictors summarised with predictors_op over pre_period.
predictors = robjects.vectors.StrVector(['poverty', 'income'])

# Special predictors: each entry is a [column, year(s), 'mean'] triple.
sp = robjects.vectors.ListVector({
    '1': ['bmprison', IntVector([1988, 1990, 1991, 1992]), 'mean'],
    '2': ['alcohol', 1990, 'mean'],
    '3': ['aidscapita', IntVector([1990, 1991]), 'mean'],
    '4': ['black', IntVector([1990, 1991, 1992]), 'mean'],
    '5': ['perc1519', 1990, 'mean'],
})

# Build the Synth input object: unit 48 is the treated unit, control_units the donors.
dataprep_out = Synth.dataprep(
    texas,
    predictors=predictors,
    predictors_op="mean",
    time_predictors_prior=pre_period,
    special_predictors=sp,
    dependent='bmprison',
    unit_variable='statefip',
    unit_names_variable='state',
    time_variable='year',
    treatment_identifier=48,
    controls_identifier=control_units,
    time_optimize_ssr=pre_period,
    time_plot=plot_period,
)

# Run Synth's synth() on the prepared object; the result is read back in the next cell.
synth_out = Synth.synth(data_prep_obj=dataprep_out)

In [None]:
# Take the 'solution.w' element of the synth() result and pair its flattened
# values with the statefip codes in control_units, position by position.
weights = synth_out.rx['solution.w'][0]
ct_weights = pd.DataFrame(
    {
        'ct_weights': weights.flatten(),
        'statefip': control_units,
    }
)
ct_weights.head()

In [None]:
# Attach each donor's weight to its rows in the panel. how='right' keeps every
# row of `texas`; units that are absent from ct_weights get NaN as their weight.
# Any ct_weights column left over from an earlier run of this cell is dropped
# first. Without that, a re-run would produce ct_weights_x / ct_weights_y and
# the later lookup of 'ct_weights' would fail.
texas = pd.merge(
    ct_weights,
    texas.drop(columns='ct_weights', errors='ignore'),
    how='right',
    on='statefip',
)

In [None]:
# Order rows by year so the series extracted below line up with `years`.
texas = texas.sort_values('year')

# Synthetic control outcome: per-year sum of weight * bmprison. Rows whose weight
# is NaN are skipped by the sum, exactly as in the previous np.sum over a Series.
# This is a vectorised product plus grouped sum rather than
# groupby.apply(lambda ...), which runs Python code per group and triggers
# pandas' deprecation warning about applying over the grouping column.
ct = (texas['ct_weights'] * texas['bmprison']).groupby(texas['year']).sum()

# Observed outcome for the treated unit (statefip 48), in year order.
treated = texas.loc[texas['statefip'] == 48, 'bmprison'].values
years = texas['year'].unique()


# Synthetic control (dashed) against the treated unit (solid), drawn on an
# explicit Axes object.
fig, ax = plt.subplots()
ax.plot(years, ct, linestyle='--', color='black', label='control')
ax.plot(years, treated, linestyle='-', color='black', label='treated')
ax.legend()
ax.set_ylabel('bmprison')
ax.set_xlabel('Time')
ax.set_title('Synthetic Control Performance')

In [None]:
# Gap between the treated unit and its synthetic control, year by year.
ct_diff = treated - ct

# The dashed line is a zero reference and the solid line is the gap itself.
# The legend labels used to be 'control' and 'treated', copied from the previous
# figure, which described neither line.
plt.plot(years, np.zeros(len(years)), linestyle='--', color='black', label='zero gap')
plt.plot(years, ct_diff, linestyle='-', color='black', label='treated - control')
plt.legend()
plt.ylabel('bmprison')
plt.xlabel('Time')
plt.title('Treated - Control')


#### Questions
- In your own words, what do you think the identifying assumptions are for synthetic control to be consistent? 
- What role, if any, does parallel trends play in synthetic control?
- Which unit has the largest ratio of post- to pre-treatment RMSPE?
- Compare the estimated effect for the unit with the largest post- to pre-treatment RMSPE ratio to the Texas effect. How do the weights compare? How does the size of the effects compare? How do the *signs* of the effects compare?
- Can you improve on my fit by experimenting with different combinations? Do so and report your analysis.
- Report results from a variety of different specifications.  How robust does the prison effect appear to be?

