In [19]:
## Load packages and data
import pandas as pd
import numpy as np
from plotnine import *
from scipy import stats
from dateutil.relativedelta import relativedelta

import statsmodels.formula.api as sm
import statsmodels.stats.sandwich_covariance as sw
import statsmodels as statsmodels

## Restore variables that were previously saved with IPython's `%store` magic
## (they are not created anywhere in this notebook).
%store -r merged_date_cleaned_new
%store -r bw_only
%store -r mf_only

## Work on a copy so the restored frame itself is not modified
merged = merged_date_cleaned_new.copy()

In [20]:
## Functions that:
# subset the data to a bandwidth around the cutoff and create the before/after indicator
# run the regression for the target group
# compile the regression results

def subset_data(df_input, cutoff, bw_length, verbose=True):
    """Keep rows within ``bw_length`` days of ``cutoff`` whose ID is on one side only.

    Rows whose ``date_reviewed`` falls within ``bw_length`` calendar days of
    ``cutoff`` (both ends inclusive, the cutoff day itself excluded) are
    flagged with ``is_after``.  Every ``Local ID`` that has rows both before
    and after the cutoff is then dropped, so each remaining ID contributes
    to exactly one side of the comparison.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain a datetime-like ``date_reviewed`` column and a
        ``Local ID`` column.  The frame is not modified.
    cutoff : str or datetime-like
        Event date; parsed with ``pd.to_datetime`` and truncated to midnight.
    bw_length : int
        Bandwidth, in days, kept on each side of the cutoff.
    verbose : bool, default True
        Print the shapes of the before/after ID tables, of their outer
        merge, and of the one-sided remainder (merge diagnostics).

    Returns
    -------
    pandas.DataFrame
        Copy of the qualifying rows with two added columns: ``is_after``
        (bool) and ``bw`` (the bandwidth, kept for the visualization).
    """
    cutoff = pd.to_datetime(cutoff).normalize()

    ## Set bandwidth range (DateOffset adds calendar days)
    bandwidth = pd.DateOffset(days=bw_length)
    start, end = cutoff - bandwidth, cutoff + bandwidth

    ## Subset to within range
    # Compare on the calendar day: matching against a list of midnight
    # timestamps would silently drop any row carrying a time-of-day.
    review_day = pd.to_datetime(df_input['date_reviewed']).dt.normalize()
    in_window = review_day.between(start, end) & (review_day != cutoff)
    df = df_input[in_window].copy()
    df['is_after'] = (review_day[in_window] > cutoff).to_numpy()

    ## Unique IDs seen on each side of the cutoff
    ids_before = df.loc[~df['is_after'], ['Local ID']].drop_duplicates()
    ids_after = df.loc[df['is_after'], ['Local ID']].drop_duplicates()

    ## Merge to see which IDs are in both datasets
    id_sides = ids_before.merge(ids_after, how='outer', on='Local ID', indicator=True)
    one_sided = id_sides[id_sides['_merge'] != 'both']

    if verbose:
        # Merge diagnostics, in the order: before, after, merged, one-sided
        print(ids_before.shape)
        print(ids_after.shape)
        print(id_sides.shape)
        print(one_sided.shape)

    ## Keep only the IDs in only one of the two sets (before or after) to avoid IDs in both impacting the results
    df_either = df[df['Local ID'].isin(one_sided['Local ID'])].copy()

    ## Save for visualization
    df_either['bw'] = bw_length

    return df_either

In [29]:
def run_regression(subsetted_data, var_forreg):
    """Fit the before/after-by-group OLS model on one bandwidth subset.

    Parameters
    ----------
    subsetted_data : pandas.DataFrame
        Frame with ``irrit_score``, ``is_after``, the column named by
        ``var_forreg``, and a ``bw`` column (as produced by ``subset_data``).
    var_forreg : str
        Name of the group column to interact with ``is_after``.

    Returns
    -------
    tuple
        ``(fitted results, bandwidth)`` where the bandwidth is read from the
        first row of the ``bw`` column.
    """
    model_data = subsetted_data.copy()

    ## Bandwidth travels with the fit so the visualization can label it
    bandwidth = model_data['bw'].iloc[0]

    ## Right-hand side: before/after flag, group flag, and their product term
    rhs_terms = ['is_after', var_forreg, 'is_after*' + var_forreg]
    formula = 'irrit_score ~ ' + ' + '.join(rhs_terms)

    ## OLS via the formula interface
    fitted = sm.ols(formula=formula, data=model_data).fit(use_t=True)

    return fitted, bandwidth

In [30]:
def run_predict(reg, var_forreg):
    """Predict the outcome for the four group x before/after combinations.

    Parameters
    ----------
    reg : tuple
        ``(fitted results, bandwidth)`` as returned by ``run_regression``.
        Only the fitted results are used for the prediction; the bandwidth
        is attached to the output for the visualization.
    var_forreg : str
        Name of the group column used in the regression.

    Returns
    -------
    pandas.DataFrame
        The prediction summary frame joined row-by-row to its inputs, with
        the extra columns ``index``, ``var_forreg``, ``is_after`` and a
        categorical ``bw``.
    """
    fitted_model = reg[0]

    ## One row per (group, before/after) combination
    scenarios = pd.DataFrame({
        var_forreg: [True, False, True, False],
        'is_after': [False, False, True, True],
    })

    ## Prediction summary (mean, standard error, intervals) for each row
    summary = fitted_model.get_prediction(scenarios).summary_frame()

    # Merge diagnostics
    print(summary.shape)
    print(scenarios.shape)

    ## Pair each prediction with its inputs through a shared row counter
    summary['index'] = pd.RangeIndex(len(summary))
    scenarios['index'] = pd.RangeIndex(len(scenarios))
    combined = pd.merge(summary, scenarios, on='index')
    print(combined.shape)  # Merge diagnostics

    ## Tag with the bandwidth returned by run_regression() for the visualization
    combined['bw'] = reg[1]
    return combined.astype({'bw': 'category'})

In [37]:
## Look at these bandwidths of days before and after each event
# Each value is the number of days kept on either side of the event date.
bandwidths = [30, 60, 90]
# Event dates are written month-day-year; subset_data() parses them with pd.to_datetime.
covid_date = "03-11-2020"
gf_date = "05-25-2020"

In [38]:
## Race - COVID
# Each stage runs over every bandwidth before the next stage starts, so the
# printed diagnostics stay grouped by stage (subsets first, predictions last).

subset_list_covid_race = [
    subset_data(bw_only, covid_date, n_days) for n_days in bandwidths
]

reg_list_covid_race = [
    run_regression(window_df, 'is_black') for window_df in subset_list_covid_race
]

pred_list_covid_race = [
    run_predict(fit_and_bw, 'is_black') for fit_and_bw in reg_list_covid_race
]

## Stack the per-bandwidth prediction frames into one frame for a visualization
# reset_index() keeps the old per-frame row labels as a column.
viz_df_covid_race = pd.concat(pred_list_covid_race).reset_index()
print(viz_df_covid_race)


(260, 1)
(269, 1)
(529, 2)
(529, 2)
(559, 1)
(370, 1)
(925, 2)
(921, 2)
(886, 1)
(550, 1)
(1410, 2)
(1384, 2)
(4, 6)
(4, 2)
(4, 9)
(4, 6)
(4, 2)
(4, 9)
(4, 6)
(4, 2)
(4, 9)
    level_0       mean   mean_se  mean_ci_lower  mean_ci_upper  obs_ci_lower  \
0         0  14.820513  1.181013      12.500551      17.140475     -5.799761   
1         1  17.766304  0.768941      16.255809      19.276799     -2.778648   
2         2  16.223529  1.131338      14.001147      18.445911     -4.385994   
3         3  18.076923  0.746938      16.609650      19.544196     -2.464896   
4         0  14.587838  0.865862      12.888592      16.287084     -6.154101   
5         1  17.540865  0.516455      16.527327      18.554404     -3.156183   
6         2  16.326733  1.048139      14.269770      18.383695     -4.447570   
7         3  17.899642  0.630634      16.662028      19.137255     -2.809590   
8         0  15.483333  0.677142      14.155031      16.811636     -5.137471   
9         1  17.345736  0.4

In [39]:
## LaTeX for Overleaf
# Print one summary table per fitted bandwidth. Iterating over the list keeps
# this cell in step with `bandwidths` instead of hard-coding indices 0-2.
# Each entry is the (fitted results, bandwidth) pair from run_regression().
print(*(fit_and_bw[0].summary().as_latex() for fit_and_bw in reg_list_covid_race), sep="\n")

\begin{center}
\begin{tabular}{lclc}
\toprule
\textbf{Dep. Variable:}                      &   irrit\_score   & \textbf{  R-squared:         } &     0.012   \\
\textbf{Model:}                              &       OLS        & \textbf{  Adj. R-squared:    } &     0.007   \\
\textbf{Method:}                             &  Least Squares   & \textbf{  F-statistic:       } &     2.241   \\
\textbf{Date:}                               & Mon, 14 Mar 2022 & \textbf{  Prob (F-statistic):} &   0.0825    \\
\textbf{Time:}                               &     18:50:26     & \textbf{  Log-Likelihood:    } &   -2037.9   \\
\textbf{No. Observations:}                   &         542      & \textbf{  AIC:               } &     4084.   \\
\textbf{Df Residuals:}                       &         538      & \textbf{  BIC:               } &     4101.   \\
\textbf{Df Model:}                           &           3      & \textbf{                     } &             \\
\bottomrule
\end{tabular}
\begin{tabular}{

In [40]:
## Save the compiled prediction frame with IPython's %store so it can be restored elsewhere via `%store -r`
%store viz_df_covid_race

Stored 'viz_df_covid_race' (DataFrame)


In [41]:
## Race - Death of George Floyd
# Each stage runs over every bandwidth before the next stage starts, so the
# printed diagnostics stay grouped by stage (subsets first, predictions last).

subset_list_gf_race = [
    subset_data(bw_only, gf_date, n_days) for n_days in bandwidths
]

reg_list_gf_race = [
    run_regression(window_df, 'is_black') for window_df in subset_list_gf_race
]

pred_list_gf_race = [
    run_predict(fit_and_bw, 'is_black') for fit_and_bw in reg_list_gf_race
]

## Stack the per-bandwidth prediction frames into one frame for a visualization
# reset_index() keeps the old per-frame row labels as a column.
viz_df_gf_race = pd.concat(pred_list_gf_race).reset_index()
print(viz_df_gf_race)


(114, 1)
(190, 1)
(304, 2)
(304, 2)
(312, 1)
(381, 1)
(692, 2)
(691, 2)
(615, 1)
(582, 1)
(1164, 2)
(1131, 2)
(4, 6)
(4, 2)
(4, 9)
(4, 6)
(4, 2)
(4, 9)
(4, 6)
(4, 2)
(4, 9)
    level_0       mean   mean_se  mean_ci_lower  mean_ci_upper  obs_ci_lower  \
0         0  16.538462  1.642430      13.306700      19.770223     -3.900993   
1         1  18.037975  1.153999      15.767284      20.308666     -2.271704   
2         2  15.536585  1.601870      12.384633      18.688538     -4.890402   
3         3  18.428571  0.826530      16.802232      20.054911     -1.819194   
4         0  17.688312  1.192146      15.347725      20.028899     -2.983191   
5         1  18.024793  0.672461      16.704524      19.345062     -2.556164   
6         2  17.022989  1.121541      14.821023      19.224954     -3.633278   
7         3  18.596026  0.601965      17.414165      19.777888     -1.976516   
8         0  16.420118  0.801323      14.847904      17.992332     -4.079045   
9         1  17.971698  0.5

In [42]:
## LaTeX for Overleaf
# Print one summary table per fitted bandwidth. Iterating over the list keeps
# this cell in step with `bandwidths` instead of hard-coding indices 0-2.
# Each entry is the (fitted results, bandwidth) pair from run_regression().
print(*(fit_and_bw[0].summary().as_latex() for fit_and_bw in reg_list_gf_race), sep="\n")

\begin{center}
\begin{tabular}{lclc}
\toprule
\textbf{Dep. Variable:}                      &   irrit\_score   & \textbf{  R-squared:         } &     0.010   \\
\textbf{Model:}                              &       OLS        & \textbf{  Adj. R-squared:    } &     0.001   \\
\textbf{Method:}                             &  Least Squares   & \textbf{  F-statistic:       } &     1.062   \\
\textbf{Date:}                               & Mon, 14 Mar 2022 & \textbf{  Prob (F-statistic):} &    0.365    \\
\textbf{Time:}                               &     18:50:35     & \textbf{  Log-Likelihood:    } &   -1170.8   \\
\textbf{No. Observations:}                   &         313      & \textbf{  AIC:               } &     2350.   \\
\textbf{Df Residuals:}                       &         309      & \textbf{  BIC:               } &     2365.   \\
\textbf{Df Model:}                           &           3      & \textbf{                     } &             \\
\bottomrule
\end{tabular}
\begin{tabular}{

In [43]:
## Save the compiled prediction frame with IPython's %store so it can be restored elsewhere via `%store -r`
%store viz_df_gf_race

Stored 'viz_df_gf_race' (DataFrame)


In [44]:
## Gender - COVID
# Each stage runs over every bandwidth before the next stage starts, so the
# printed diagnostics stay grouped by stage (subsets first, predictions last).

subset_list_covid_gender = [
    subset_data(mf_only, covid_date, n_days) for n_days in bandwidths
]

reg_list_covid_gender = [
    run_regression(window_df, 'is_male') for window_df in subset_list_covid_gender
]

pred_list_covid_gender = [
    run_predict(fit_and_bw, 'is_male') for fit_and_bw in reg_list_covid_gender
]

## Stack the per-bandwidth prediction frames into one frame for a visualization
# reset_index() keeps the old per-frame row labels as a column.
viz_df_covid_gender = pd.concat(pred_list_covid_gender).reset_index()
print(viz_df_covid_gender)


(407, 1)
(391, 1)
(798, 2)
(798, 2)
(814, 1)
(541, 1)
(1351, 2)
(1347, 2)
(1265, 1)
(818, 1)
(2047, 2)
(2011, 2)
(4, 6)
(4, 2)
(4, 9)
(4, 6)
(4, 2)
(4, 9)
(4, 6)
(4, 2)
(4, 9)
    level_0       mean   mean_se  mean_ci_lower  mean_ci_upper  obs_ci_lower  \
0         0  16.323194  0.644429      15.058249      17.588139     -4.229733   
1         1  18.040541  0.859057      16.354303      19.726778     -2.542611   
2         2  17.294545  0.630212      16.057507      18.531584     -3.256683   
3         3  19.046512  0.920149      17.240358      20.852665     -1.546810   
4         0  16.190647  0.445944      15.315843      17.065452     -4.455476   
5         1  18.781481  0.639935      17.526127      20.036836     -1.884265   
6         2  17.021108  0.540130      15.961540      18.080676     -3.633669   
7         3  19.143678  0.797156      17.579905      20.707451     -1.543094   
8         0  16.370499  0.357462      15.669477      17.071522     -4.211408   
9         1  18.500000  

In [45]:
## LaTeX for Overleaf
# Print one summary table per fitted bandwidth. Iterating over the list keeps
# this cell in step with `bandwidths` instead of hard-coding indices 0-2.
# Each entry is the (fitted results, bandwidth) pair from run_regression().
print(*(fit_and_bw[0].summary().as_latex() for fit_and_bw in reg_list_covid_gender), sep="\n")

\begin{center}
\begin{tabular}{lclc}
\toprule
\textbf{Dep. Variable:}                     &   irrit\_score   & \textbf{  R-squared:         } &     0.008   \\
\textbf{Model:}                             &       OLS        & \textbf{  Adj. R-squared:    } &     0.004   \\
\textbf{Method:}                            &  Least Squares   & \textbf{  F-statistic:       } &     2.193   \\
\textbf{Date:}                              & Mon, 14 Mar 2022 & \textbf{  Prob (F-statistic):} &   0.0875    \\
\textbf{Time:}                              &     18:51:09     & \textbf{  Log-Likelihood:    } &   -3067.0   \\
\textbf{No. Observations:}                  &         815      & \textbf{  AIC:               } &     6142.   \\
\textbf{Df Residuals:}                      &         811      & \textbf{  BIC:               } &     6161.   \\
\textbf{Df Model:}                          &           3      & \textbf{                     } &             \\
\bottomrule
\end{tabular}
\begin{tabular}{lcccccc}

In [46]:
## Save the compiled prediction frame with IPython's %store so it can be restored elsewhere via `%store -r`
%store viz_df_covid_gender

Stored 'viz_df_covid_gender' (DataFrame)


In [47]:
## Gender - Death of George Floyd
# Each stage runs over every bandwidth before the next stage starts, so the
# printed diagnostics stay grouped by stage (subsets first, predictions last).

subset_list_gf_gender = [
    subset_data(mf_only, gf_date, n_days) for n_days in bandwidths
]

reg_list_gf_gender = [
    run_regression(window_df, 'is_male') for window_df in subset_list_gf_gender
]

pred_list_gf_gender = [
    run_predict(fit_and_bw, 'is_male') for fit_and_bw in reg_list_gf_gender
]

## Stack the per-bandwidth prediction frames into one frame for a visualization
# reset_index() keeps the old per-frame row labels as a column.
viz_df_gf_gender = pd.concat(pred_list_gf_gender).reset_index()
print(viz_df_gf_gender)


(186, 1)
(298, 1)
(484, 2)
(484, 2)
(468, 1)
(577, 1)
(1044, 2)
(1043, 2)
(920, 1)
(881, 1)
(1749, 2)
(1697, 2)
(4, 6)
(4, 2)
(4, 9)
(4, 6)
(4, 2)
(4, 9)
(4, 6)
(4, 2)
(4, 9)
    level_0       mean   mean_se  mean_ci_lower  mean_ci_upper  obs_ci_lower  \
0         0  18.328358  0.911873      16.536713      20.120003     -2.488677   
1         1  17.879310  1.386030      15.156042      20.602578     -3.038509   
2         2  17.968912  0.759815      16.476029      19.461795     -2.824541   
3         3  17.846847  1.001902      15.878313      19.815381     -2.986158   
4         0  17.904478  0.581584      16.763290      19.045665     -3.013830   
5         1  19.384615  0.890158      17.637944      21.131287     -1.575445   
6         2  18.160000  0.532237      17.115642      19.204358     -2.753248   
7         3  19.731183  0.780510      18.199662      21.262703     -1.212046   
8         0  17.291598  0.425342      16.457359      18.125837     -3.278790   
9         1  19.021661  0

In [48]:
## LaTeX for Overleaf
# Print one summary table per fitted bandwidth. Iterating over the list keeps
# this cell in step with `bandwidths` instead of hard-coding indices 0-2.
# Each entry is the (fitted results, bandwidth) pair from run_regression().
print(*(fit_and_bw[0].summary().as_latex() for fit_and_bw in reg_list_gf_gender), sep="\n")

\begin{center}
\begin{tabular}{lclc}
\toprule
\textbf{Dep. Variable:}                     &   irrit\_score   & \textbf{  R-squared:         } &     0.000   \\
\textbf{Model:}                             &       OLS        & \textbf{  Adj. R-squared:    } &    -0.006   \\
\textbf{Method:}                            &  Least Squares   & \textbf{  F-statistic:       } &   0.05291   \\
\textbf{Date:}                              & Mon, 14 Mar 2022 & \textbf{  Prob (F-statistic):} &    0.984    \\
\textbf{Time:}                              &     18:51:33     & \textbf{  Log-Likelihood:    } &   -1870.7   \\
\textbf{No. Observations:}                  &         496      & \textbf{  AIC:               } &     3749.   \\
\textbf{Df Residuals:}                      &         492      & \textbf{  BIC:               } &     3766.   \\
\textbf{Df Model:}                          &           3      & \textbf{                     } &             \\
\bottomrule
\end{tabular}
\begin{tabular}{lcccccc}

In [49]:
## Save the compiled prediction frame with IPython's %store so it can be restored elsewhere via `%store -r`
%store viz_df_gf_gender

Stored 'viz_df_gf_gender' (DataFrame)
