In [1]:
from differences import ATTgt
from matplotlib import pyplot as plt
# Use 300 DPI both for figures rendered inline and for figures written to disk.
plt.rcParams.update({"figure.dpi": 300, "savefig.dpi": 300})
from panel_utilities import get_value_variable_names, convert_weekly_panel_to_biweekly_panel, prepare_df_for_DiD
from stats_utilities import select_controls, test_balance, add_missing_indicators
from figure_utilities import aggregate_by_event_time_and_plot, aggregate_by_time_and_plot
import pandas as pd



# Name of the analysis. It is used as the outcome in the DiD formulas below and
# as the name of the output subdirectory, so it is defined once here and the
# paths are derived from it to keep the two from drifting apart.
analysis = 'group_0_crimes_100m'

# Root of the local project checkout; every path below is built from it.
# NOTE(review): this is a machine-specific absolute path -- adjust when running elsewhere.
PROJECT_ROOT = "/Users/arjunshanmugam/Documents/GitHub/seniorthesis"

# Store paths.
INPUT_DATA = f"{PROJECT_ROOT}/data/03_cleaned/crime_analysis_weekly.parquet"
OUTPUT_FIGURES = f"{PROJECT_ROOT}/output/{analysis}/figures"
OUTPUT_TABLES = f"{PROJECT_ROOT}/output/{analysis}/tables"

# Read restricted sample panel dataset into memory.
df = pd.read_parquet(INPUT_DATA)

In [2]:
# Collapse the weekly panel into a biweekly panel for the chosen analysis.
df = convert_weekly_panel_to_biweekly_panel(df, analysis)

# Retrieve the list of value (outcome) columns together with the two lookup
# dictionaries mapping periods to integers and back.
# NOTE(review): the names say "weekly"/"month" although the panel is now
# biweekly -- presumably historical naming; confirm against panel_utilities.
(
    weekly_value_vars_crime,
    month_to_int_dictionary,
    int_to_month_dictionary,
) = get_value_variable_names(df, analysis)

In [3]:
# Explore candidate control variables for this analysis. The helper is handed
# OUTPUT_TABLES as its output directory (presumably it saves a table there --
# verify in stats_utilities). The resulting frame is displayed below.
covariates_exploration_df = select_controls(
    df=df,
    analysis=analysis,
    output_directory=OUTPUT_TABLES,
)
covariates_exploration_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Dependent Variable,Dependent Variable
Unnamed: 0_level_1,\emph{Independent Variable},Unnamed: 2_level_1,Unnamed: 3_level_1
Panel A: Pre-treatment Outcomes,total_twenty_seventeen_group_0_crimes_100m,9.074854e-279,0.05936057
Panel A: Pre-treatment Outcomes,pre_treatment_change_in_group_0_crimes_100m,0.1340638,0.7528347
Panel B: Census Tract Characteristics,frac_coll_plus2010,0.00458933,0.778286
Panel B: Census Tract Characteristics,job_density_2013,7.068208e-28,0.5178933
Panel B: Census Tract Characteristics,med_hhinc2016,1.727645e-10,0.1224981
Panel B: Census Tract Characteristics,poor_share2010,1.275324e-29,0.5297634
Panel B: Census Tract Characteristics,popdensity2010,3.908056e-07,0.006607443
Panel C: Case Initiation,for_cause,0.009106297,1.764482e-08
Panel C: Case Initiation,no_cause,0.3263339,0.8995981
Panel C: Case Initiation,non_payment,0.00375471,4.059912e-07


In [4]:
# Build the balance table and obtain the list of pre-treatment covariates that
# the later cells pass to the DiD preparation step and the conditional formula.
(
    balance_table,
    pre_treatment_covariates,
) = test_balance(df, analysis, covariates_exploration_df, OUTPUT_TABLES)
balance_table

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Difference in Cases Won by Defendant,Difference in Cases Won by Defendant,Difference in Cases Won by Defendant,Difference in Cases Won by Defendant
Unnamed: 0_level_1,Unnamed: 1_level_1,Cases Won by Plaintiff,Unweighted,\emph{p},Weighted,\emph{p}.1
Panel A,total_twenty_seventeen_group_0_crimes_100m,69.220357,4.371557,0.05936057,-0.289097,0.89915
Panel B,frac_coll_plus2010,0.322882,0.002115,0.778286,-0.001299,0.864091
Panel B,job_density_2013,16069.137185,926.908698,0.5178933,-66.872914,0.963087
Panel B,med_hhinc2016,48035.651626,1312.556234,0.1224981,-195.931583,0.815865
Panel B,poor_share2010,0.273292,-0.003079,0.5297634,-0.001086,0.826329
Panel B,popdensity2010,23086.043553,1225.145069,0.006607443,-95.784138,0.823551
Panel C,for_cause,0.165268,0.065981,1.764482e-08,-0.000842,0.925021
Panel C,non_payment,0.781742,-0.067707,4.059912e-07,-0.002946,0.792913
Panel D,hasAttyP,0.879853,-0.035786,0.0006226457,-0.003426,0.728865
Panel D,isEntityD,0.014166,-0.011791,0.01265289,-2.5e-05,0.995716


In [5]:
# No missing-data indicators are generated: the add_missing_indicators step
# (previously applied to 'rent_twobed2015') is disabled, so an empty list is
# passed as `missing_indicators` below.

# Move the current index back into ordinary columns, then reshape the frame
# into the layout used with the `differences` package.
# NOTE(review): this cell rebinds `df`, so re-running it without re-running the
# cells above resets the index a second time.
df = df.reset_index()
df = prepare_df_for_DiD(
    df=df,
    analysis=analysis,
    treatment_date_variable='file_week',
    pre_treatment_covariates=pre_treatment_covariates,
    missing_indicators=[],
    value_vars=weekly_value_vars_crime,
    period_to_int_dictionary=month_to_int_dictionary,
)

In [6]:
# Unconditional DiD: the formula is just the outcome name, so no covariates
# enter the estimation. Never-treated units serve as the control group and
# n_jobs=-1 requests all available workers.
att_gt = ATTgt(
    data=df,
    cohort_name='file_week',
    freq='2W',
    base_period='varying',
)
result = att_gt.fit(
    formula=analysis,
    control_group='never_treated',
    n_jobs=-1,
)

Computing ATTgt [workers=10]  100%|████████████████████| 9648/9648 [03:15<00:00, 49.46it/s] 


In [None]:
# Event-study figure for the unconditional estimates, covering event times
# -6 through 12; the file name is given relative to OUTPUT_FIGURES.
aggregate_by_event_time_and_plot(
    att_gt,
    OUTPUT_FIGURES,
    "att_gt_unconditional_event_study_long_horizon.png",
    start_period=-6,
    end_period=12,
    title="Unconditional Estimates of ATT(g, t), Aggregated by Month Relative to Treatment \n (Long Horizon)",
    treatment_month_variable='file_week',
    df=df,
)

In [None]:
# Calendar-time figure for the unconditional estimates; the integer-to-period
# dictionary is passed along (presumably for labelling the time axis -- verify
# in figure_utilities).
aggregate_by_time_and_plot(
    att_gt,
    int_to_month_dictionary,
    OUTPUT_FIGURES,
    "att_gt_unconditional_time.png",
    title="Unconditional Estimates of ATT(g, t), Aggregated by Month",
)

In [12]:
# Run DiD conditional on covariates.
# The panel was converted to biweekly periods earlier in the notebook and the
# unconditional estimator is built with freq='2W'; the same frequency is used
# here so that both sets of estimates are computed on the same time grid.
att_gt = ATTgt(data=df, cohort_name='file_week', freq='2W', base_period='varying')

# Outcome on the left-hand side, every pre-treatment covariate on the right.
formula = f'{analysis} ~ ' + '+'.join(pre_treatment_covariates)

# NOTE(review): an earlier run of this cell emitted many overflow warnings from
# `np.exp(-z)`; presumably large-magnitude covariates (e.g. income, density) in
# a logistic fit -- consider rescaling the covariates; confirm the source.
result = att_gt.fit(formula=formula, control_group='never_treated', n_jobs=-1, progress_bar=True)

  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-

In [None]:
# Event-study figure for the covariate-conditional (D.R.) estimates, covering
# event times -3 through 20; the file name is given relative to OUTPUT_FIGURES.
aggregate_by_event_time_and_plot(
    att_gt,
    OUTPUT_FIGURES,
    "att_gt_dr_event_study_long_horizon.png",
    start_period=-3,
    end_period=20,
    title="D.R. Estimates of ATT(g, t), Aggregated by Month Relative to Treatment \n (Long Horizon)",
    treatment_month_variable='file_week',
    df=df,
)

In [None]:
# Calendar-time figure for the covariate-conditional (D.R.) estimates.
aggregate_by_time_and_plot(
    att_gt,
    int_to_month_dictionary,
    OUTPUT_FIGURES,
    "att_gt_dr_time.png",
    title="D.R. Estimates of ATT(g, t), Aggregated by Month",
)