In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import statsmodels.api as sm
import statsmodels.formula.api as smf
from plotnine import *

### Diff in Diff Plot Function

In [2]:
def diff_in_diff_plot(df_treatment, df_control, policy_implementation_year, resp_variable, file_name):
    
    df_treatment['standardized_year'] = df_treatment['Year'] - policy_implementation_year
    df_control['standardized_year'] = df_control['Year'] - policy_implementation_year
    
    lower_lim = df_treatment['standardized_year'].min()
    upper_lim = df_treatment['standardized_year'].max()+1
    
    plot = (
        ggplot() +
        geom_smooth(df_treatment[df_treatment['standardized_year'] < 0], 
                    aes(x='standardized_year', y=resp_variable, color='treatment'), method='lm') + 
        geom_smooth(df_treatment[df_treatment['standardized_year'] >= 0], 
                    aes(x='standardized_year', y=resp_variable, color='treatment'), method='lm') + 

        geom_smooth(df_control[df_control['standardized_year'] < 0], 
                    aes(x='standardized_year', y=resp_variable, color='treatment'), method='lm') + 
        geom_smooth(df_control[df_control['standardized_year'] >= 0], 
                    aes(x='standardized_year', y=resp_variable, color='treatment'), method='lm')
        + geom_vline(xintercept = 0) 
        + xlab('Years before/after Policy Implementation Year: '+str(policy_implementation_year) 
        + '. \nRepresented as "0" on the x-axis.') 
        + ylab(str('Unemployment Rate') + ' \n(Adding 95% confidence interval)')
        + scale_x_continuous(breaks=range(lower_lim,upper_lim,1))
        + labs(title=str("Difference in Difference Plot "))

    )
    
    # Save plot to images folder.
    ggsave(filename=str('../20_Transformed_Source_Data/images/'+file_name),
           plot=plot,
           dpi=150,
           height=8,
           width=8,
           verbose = False)

    
    return plot


### Pre-Post Plot Function

In [3]:
def pre_post_plot(df_treatment, policy_implementation_year, resp_variable, file_name):
    df_treatment['standardized_year'] = df_treatment['Year'] - policy_implementation_year
    lower_lim = df_treatment['standardized_year'].min()
    upper_lim = df_treatment['standardized_year'].max()+1
    plot = (
    ggplot() +
    geom_smooth(df_treatment[df_treatment['standardized_year'] < 0], 
                aes(x='standardized_year', y=resp_variable, color='treatment'), method='lm') + 
    geom_smooth(df_treatment[df_treatment['standardized_year'] >= 0], 
                aes(x='standardized_year', y=resp_variable, color='treatment'), method='lm')
    + geom_vline(xintercept = 0) 
    + xlab('Years before/after Policy Implementation Year: '+str(policy_implementation_year) 
    + '. \nRepresented as "0" on the x-axis.') 
    + ylab(str(resp_variable) + ' \n(Adding 95% confidence interval)')
    + scale_x_continuous(breaks=range(lower_lim,upper_lim,1))
    + labs(title=str("Pre Post Plot"))
    )
    # Save plot to images folder.
    ggsave(filename=str('../20_Transformed_Source_Data/images/pre_post_'+file_name),
           plot=plot,
           dpi=150,
           height=8,
           width=8,
           verbose = False)
    return plot

### Load dataset

In [4]:
df = pd.read_csv('../20_Transformed_Source_Data/final_dataset.csv')
df = df.drop('Unnamed: 0', axis=1)
df['treatment'] = pd.Categorical(df['treatment'])

plot_df_treated = df[df['treatment'] == 1].copy()
plot_df_control = df[df['treatment'] == 0].copy()

### Pre-Post: Plot UnemploymentRate vs Time

In [5]:
# Min wages between treatment and control split in 2015.
policy_implementation_year = 2015
resp_variable = 'Unemployed_Rate'
file_name = 'UnemploymentRate_vs_Time_Pre_Post'
plot = pre_post_plot(plot_df_treated, policy_implementation_year, resp_variable, file_name)
plot

FileNotFoundError: [Errno 2] No such file or directory: '../20_intermediate_files/images/pre_post_UnemploymentRate_vs_Time_Pre_Post.png'

### Diff-in-diff: Plot UnemploymentRate vs Time

In [None]:
# Min wages between treatment and control split in 2015.
policy_implementation_year = 2015
resp_variable = 'Unemployed_Rate'
file_name = 'UnemploymentRate_vs_Time'
plot = diff_in_diff_plot(plot_df_treated, plot_df_control, policy_implementation_year, resp_variable, file_name)
plot

### Difference in Difference Calculation

In [None]:
# Separate the treatement group into pre-treated and post-treated subsets

df_treated=df[df['treatment']==1]

pre_treated=df_treated.loc[df_treated['Year'].isin([2010,2011,2012,2013,2014]),['Unemployed_Rate','Year']]
post_treated=df_treated.loc[df_treated['Year'].isin([2016,2017,2018,2019]),['Unemployed_Rate','Year']]

# Calculate the mean value for the pre-treated subset

pre_treat_mean = np.mean(pre_treated['Unemployed_Rate'])
print(pre_treat_mean)

# Calculate the mean value for the post-treated subset

post_treat_mean = np.mean(post_treated['Unemployed_Rate'])
print(post_treat_mean)


In [None]:
# Separate the control group into pre-treated and post-treated subsets

df_control=df[df['treatment']==0]

pre_control=df_control.loc[df_control['Year'].isin([2010,2011,2012,2013,2014]),['Unemployed_Rate','Year']]
post_control=df_control.loc[df_control['Year'].isin([2016,2017,2018,2019]),['Unemployed_Rate','Year']]

# Calculate the mean value for the pre-treated subset

pre_control_mean = np.mean(pre_control['Unemployed_Rate'])
print(pre_control_mean)

# Calculate the mean value for the post-treated subset

post_control_mean = np.mean(post_control['Unemployed_Rate'])
print(post_control_mean)


In [None]:
# Calculate the value for Difference in Difference

DD = (post_treat_mean - pre_treat_mean) - (post_control_mean - pre_control_mean)
print("DD", DD)

# Regression Analysis

### State Fixed Effects

In [None]:
#Add pre/post
pre = df[np.logical_and(df['Year'] >= 2010,df['Year'] <= 2014)].copy()
pre['Post'] = 0
post = df[np.logical_and(df['Year'] >= 2016,df['Year'] <= 2019)].copy()
post['Post'] = 1 

fe_df = pd.concat([pre, post])
fe_df.head()

In [None]:
model_fe = smf.ols('Unemployed_Rate ~ C(treatment) + C(Post) + C(treatment)*C(Post) + C(State) ', data=fe_df).fit()
model_fe.summary()