In [8]:
import os
from pathlib import Path

import pandas as pd
import statsmodels.api as sm

In [9]:
#### Locate the trends data directory ####
# Defaults to the original local checkout; set the US_CRIME_TRENDS_DIR environment
# variable to the `data/trends` folder to run this notebook on another machine.
TRENDS_DIR = Path(os.environ.get(
    'US_CRIME_TRENDS_DIR',
    '/Users/salma/Research/us_crime_data_analysis/data/trends',
))


def _read_core(measure, sample, size):
    """Read one core-variables CSV from the trends directory.

    Parameters
    ----------
    measure : str
        Sub-folder and file suffix, 'counts' or 'rates'.
    sample : str
        File prefix, 'final_main' or 'final_main_without_lower_3z'.
    size : str
        City-size token in the file name: 'large', 'medium' or 'small'.

    Returns
    -------
    pandas.DataFrame
        Contents of ``<TRENDS_DIR>/<measure>/<sample>_<size>_cities_core_<measure>.csv``.
    """
    csv_path = TRENDS_DIR / measure / f'{sample}_{size}_cities_core_{measure}.csv'
    return pd.read_csv(csv_path)


#### Read in the required counts datasets ####
fnl_mn_large = _read_core('counts', 'final_main', 'large')
fnl_mn_med = _read_core('counts', 'final_main', 'medium')
fnl_mn_sml = _read_core('counts', 'final_main', 'small')

fnl_mn_wdt_low_3z_large = _read_core('counts', 'final_main_without_lower_3z', 'large')
fnl_mn_wdt_low_3z_med = _read_core('counts', 'final_main_without_lower_3z', 'medium')
fnl_mn_wdt_low_3z_sml = _read_core('counts', 'final_main_without_lower_3z', 'small')

#### Read in the required rates datasets ####
fnl_mn_large_rates = _read_core('rates', 'final_main', 'large')
fnl_mn_med_rates = _read_core('rates', 'final_main', 'medium')
fnl_mn_sml_rates = _read_core('rates', 'final_main', 'small')

fnl_mn_wdt_low_3z_large_rates = _read_core('rates', 'final_main_without_lower_3z', 'large')
fnl_mn_wdt_low_3z_med_rates = _read_core('rates', 'final_main_without_lower_3z', 'medium')
fnl_mn_wdt_low_3z_sml_rates = _read_core('rates', 'final_main_without_lower_3z', 'small')

In [26]:
# Mean of `population` within each ORI group of the large-cities counts table,
# rounded to the nearest whole number (displayed as the cell output).
(
    fnl_mn_large
    .groupby('ORI')['population']
    .mean()
    .round()
)

ORI
AK00101    266927.0
AL00100    147178.0
AL00102    231624.0
AL00201    237960.0
AL00301    197330.0
             ...   
WA03400    119971.0
WASPD00    582353.0
WI00502    101903.0
WI01301    216463.0
WIMPD00    603336.0
Name: population, Length: 380, dtype: float64

### REGRESSION MODELS - LARGE AGENCIES

### OLS for core var counts with all

In [11]:
# OLS baseline: violent_crime counts regressed on the nine core count predictors,
# fitted on the large-cities counts table (fnl_mn_large).
model = sm.OLS.from_formula(
    'violent_crime ~ total_officers'
    ' + jail_occupancy_count + prison_occupancy_count'
    ' + drug_tot_arrests + drug_tot_arrests_black + drug_tot_arrests_white'
    ' + disorder_arrests_tot_index + disorder_tot_arrests_black_index + disorder_tot_arrests_white_index',
    data=fnl_mn_large,
)
res = model.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:          violent_crime   R-squared:                       0.830
Model:                            OLS   Adj. R-squared:                  0.830
Method:                 Least Squares   F-statistic:                     3966.
Date:                Sun, 22 Mar 2020   Prob (F-statistic):               0.00
Time:                        09:59:57   Log-Likelihood:                -68719.
No. Observations:                7324   AIC:                         1.375e+05
Df Residuals:                    7314   BIC:                         1.375e+05
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercep

#### !!!! Since OLS assumes the errors are normally distributed, it is suited to continuous outcome variables.
####      For counts, which are discrete variables, we need a Poisson or negative binomial (NB) model instead.
####      Poisson assumes the mean and variance are equal, which may not hold for our data, so we use NB.
####      Fitting both also lets us compare the regression results from OLS and NB !!!! ####

### NB for core counts all

In [18]:
# Negative-binomial GLM: violent_crime counts regressed on the nine core count
# predictors, fitted on the large-cities counts table (fnl_mn_large).
# NOTE(review): NegativeBinomial() is built with no arguments, so its dispersion
# parameter stays at the family default rather than being estimated from the
# data -- confirm that is acceptable for this comparison.
model = sm.GLM.from_formula(
    'violent_crime ~ total_officers'
    ' + jail_occupancy_count + prison_occupancy_count'
    ' + drug_tot_arrests + drug_tot_arrests_black + drug_tot_arrests_white'
    ' + disorder_arrests_tot_index + disorder_tot_arrests_black_index + disorder_tot_arrests_white_index',
    data=fnl_mn_large,
    family=sm.families.NegativeBinomial(),
)
res = model.fit()
print(res.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:          violent_crime   No. Observations:                 7324
Model:                            GLM   Df Residuals:                     7314
Model Family:        NegativeBinomial   Df Model:                            9
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -59879.
Date:                Sun, 22 Mar 2020   Deviance:                       5101.8
Time:                        10:35:21   Pearson chi2:                 4.66e+03
No. Iterations:                    81                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercep

#### !!!! As we can see, the significant factors differ between NB and OLS:
####      drug_tot_arrests, drug_tot_arrests_black and drug_tot_arrests_white are not significant with NB,
####      whereas they are significant with OLS !!!!

#### OBS: When perc_felonies and perc_misdemeanors were added, the significance of the drug arrest variables (all, black, white) decreased further. The next idea was to add total_felonies_agency and total_misdemeanors_agency in place of the percentages, but it doesn't make sense to include felonies and misdemeanors in this regression because they are a subset of the dependent variable (violent crime), so they are left out of the regression.

#### Going forward, we will only use NB for counts

### NB for core counts without low 3z

In [19]:
# Negative-binomial GLM with the same count formula, fitted on the large-cities
# counts table that excludes the "low 3z" rows (fnl_mn_wdt_low_3z_large).
model = sm.GLM.from_formula(
    'violent_crime ~ total_officers'
    ' + jail_occupancy_count + prison_occupancy_count'
    ' + drug_tot_arrests + drug_tot_arrests_black + drug_tot_arrests_white'
    ' + disorder_arrests_tot_index + disorder_tot_arrests_black_index + disorder_tot_arrests_white_index',
    data=fnl_mn_wdt_low_3z_large,
    family=sm.families.NegativeBinomial(),
)
res = model.fit()
print(res.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:          violent_crime   No. Observations:                 7285
Model:                            GLM   Df Residuals:                     7275
Model Family:        NegativeBinomial   Df Model:                            9
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -59554.
Date:                Sun, 22 Mar 2020   Deviance:                       5061.8
Time:                        10:36:06   Pearson chi2:                 4.62e+03
No. Iterations:                    81                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercep

### OLS for core rates all

In [22]:
# OLS on rates: violent_crime_rate regressed on the nine core *_rate predictors,
# fitted on the large-cities rates table (fnl_mn_large_rates).
model = sm.OLS.from_formula(
    'violent_crime_rate ~ total_officers_rate'
    ' + jail_occupancy_count_rate + prison_occupancy_count_rate'
    ' + drug_tot_arrests_rate + drug_tot_arrests_black_rate + drug_tot_arrests_white_rate'
    ' + disorder_arrests_tot_index_rate + disorder_tot_arrests_black_index_rate + disorder_tot_arrests_white_index_rate',
    data=fnl_mn_large_rates,
)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:     violent_crime_rate   R-squared:                       0.373
Model:                            OLS   Adj. R-squared:                  0.373
Method:                 Least Squares   F-statistic:                     484.2
Date:                Sun, 22 Mar 2020   Prob (F-statistic):               0.00
Time:                        10:38:59   Log-Likelihood:                -54430.
No. Observations:                7323   AIC:                         1.089e+05
Df Residuals:                    7313   BIC:                         1.089e+05
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------

### OLS for core rates without low 3z

In [24]:
# OLS on rates with the same formula, fitted on the large-cities rates table
# that excludes the "low 3z" rows (fnl_mn_wdt_low_3z_large_rates).
model = sm.OLS.from_formula(
    'violent_crime_rate ~ total_officers_rate'
    ' + jail_occupancy_count_rate + prison_occupancy_count_rate'
    ' + drug_tot_arrests_rate + drug_tot_arrests_black_rate + drug_tot_arrests_white_rate'
    ' + disorder_arrests_tot_index_rate + disorder_tot_arrests_black_index_rate + disorder_tot_arrests_white_index_rate',
    data=fnl_mn_wdt_low_3z_large_rates,
)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:     violent_crime_rate   R-squared:                       0.375
Model:                            OLS   Adj. R-squared:                  0.375
Method:                 Least Squares   F-statistic:                     485.9
Date:                Sun, 22 Mar 2020   Prob (F-statistic):               0.00
Time:                        10:40:35   Log-Likelihood:                -54132.
No. Observations:                7284   AIC:                         1.083e+05
Df Residuals:                    7274   BIC:                         1.084e+05
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------

### REGRESSION MODELS - MEDIUM AGENCIES

### NB for core counts all

In [27]:
# Negative-binomial GLM: violent_crime counts regressed on the nine core count
# predictors, fitted on the medium-cities counts table (fnl_mn_med).
model = sm.GLM.from_formula(
    'violent_crime ~ total_officers'
    ' + jail_occupancy_count + prison_occupancy_count'
    ' + drug_tot_arrests + drug_tot_arrests_black + drug_tot_arrests_white'
    ' + disorder_arrests_tot_index + disorder_tot_arrests_black_index + disorder_tot_arrests_white_index',
    data=fnl_mn_med,
    family=sm.families.NegativeBinomial(),
)
res = model.fit()
print(res.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:          violent_crime   No. Observations:                12187
Model:                            GLM   Df Residuals:                    12177
Model Family:        NegativeBinomial   Df Model:                            9
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -78539.
Date:                Sun, 22 Mar 2020   Deviance:                       5987.9
Time:                        13:24:32   Pearson chi2:                 6.24e+03
No. Iterations:                    20                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercep

### NB for core counts without low 3z

In [29]:
# Negative-binomial GLM with the same count formula, fitted on the medium-cities
# counts table that excludes the "low 3z" rows (fnl_mn_wdt_low_3z_med).
model = sm.GLM.from_formula(
    'violent_crime ~ total_officers'
    ' + jail_occupancy_count + prison_occupancy_count'
    ' + drug_tot_arrests + drug_tot_arrests_black + drug_tot_arrests_white'
    ' + disorder_arrests_tot_index + disorder_tot_arrests_black_index + disorder_tot_arrests_white_index',
    data=fnl_mn_wdt_low_3z_med,
    family=sm.families.NegativeBinomial(),
)
result = model.fit()
print(result.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:          violent_crime   No. Observations:                12130
Model:                            GLM   Df Residuals:                    12120
Model Family:        NegativeBinomial   Df Model:                            9
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -78167.
Date:                Sun, 22 Mar 2020   Deviance:                       5936.1
Time:                        13:37:52   Pearson chi2:                 6.20e+03
No. Iterations:                    20                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercep

### OLS for core rates all

In [33]:
# OLS on rates: violent_crime_rate regressed on the nine core *_rate predictors,
# fitted on the medium-cities rates table (fnl_mn_med_rates).
model = sm.OLS.from_formula(
    'violent_crime_rate ~ total_officers_rate'
    ' + jail_occupancy_count_rate + prison_occupancy_count_rate'
    ' + drug_tot_arrests_rate + drug_tot_arrests_black_rate + drug_tot_arrests_white_rate'
    ' + disorder_arrests_tot_index_rate + disorder_tot_arrests_black_index_rate + disorder_tot_arrests_white_index_rate',
    data=fnl_mn_med_rates,
)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:     violent_crime_rate   R-squared:                       0.328
Model:                            OLS   Adj. R-squared:                  0.328
Method:                 Least Squares   F-statistic:                     659.7
Date:                Sun, 22 Mar 2020   Prob (F-statistic):               0.00
Time:                        13:43:00   Log-Likelihood:                -88180.
No. Observations:               12163   AIC:                         1.764e+05
Df Residuals:                   12153   BIC:                         1.765e+05
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------

### OLS for core rates without low 3z

In [34]:
# OLS on rates with the same formula, fitted on the medium-cities rates table
# that excludes the "low 3z" rows (fnl_mn_wdt_low_3z_med_rates).
model = sm.OLS.from_formula(
    'violent_crime_rate ~ total_officers_rate'
    ' + jail_occupancy_count_rate + prison_occupancy_count_rate'
    ' + drug_tot_arrests_rate + drug_tot_arrests_black_rate + drug_tot_arrests_white_rate'
    ' + disorder_arrests_tot_index_rate + disorder_tot_arrests_black_index_rate + disorder_tot_arrests_white_index_rate',
    data=fnl_mn_wdt_low_3z_med_rates,
)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:     violent_crime_rate   R-squared:                       0.328
Model:                            OLS   Adj. R-squared:                  0.328
Method:                 Least Squares   F-statistic:                     657.2
Date:                Sun, 22 Mar 2020   Prob (F-statistic):               0.00
Time:                        13:43:58   Log-Likelihood:                -87763.
No. Observations:               12106   AIC:                         1.755e+05
Df Residuals:                   12096   BIC:                         1.756e+05
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------

### REGRESSION MODELS - SMALL AGENCIES

### NB for core counts all

In [36]:
# Negative-binomial GLM: violent_crime counts regressed on the nine core count
# predictors, fitted on the small-cities counts table (fnl_mn_sml).
model = sm.GLM.from_formula(
    'violent_crime ~ total_officers'
    ' + jail_occupancy_count + prison_occupancy_count'
    ' + drug_tot_arrests + drug_tot_arrests_black + drug_tot_arrests_white'
    ' + disorder_arrests_tot_index + disorder_tot_arrests_black_index + disorder_tot_arrests_white_index',
    data=fnl_mn_sml,
    family=sm.families.NegativeBinomial(),
)
result = model.fit()
print(result.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:          violent_crime   No. Observations:                69480
Model:                            GLM   Df Residuals:                    69470
Model Family:        NegativeBinomial   Df Model:                            9
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -3.4319e+05
Date:                Sun, 22 Mar 2020   Deviance:                       49565.
Time:                        13:46:28   Pearson chi2:                 5.41e+04
No. Iterations:                    23                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercep

### NB for core counts without low 3z

In [38]:
# Negative-binomial GLM with the same count formula, fitted on the small-cities
# counts table that excludes the "low 3z" rows (fnl_mn_wdt_low_3z_sml).
model = sm.GLM.from_formula(
    'violent_crime ~ total_officers'
    ' + jail_occupancy_count + prison_occupancy_count'
    ' + drug_tot_arrests + drug_tot_arrests_black + drug_tot_arrests_white'
    ' + disorder_arrests_tot_index + disorder_tot_arrests_black_index + disorder_tot_arrests_white_index',
    data=fnl_mn_wdt_low_3z_sml,
    family=sm.families.NegativeBinomial(),
)
result = model.fit()
print(result.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:          violent_crime   No. Observations:                69193
Model:                            GLM   Df Residuals:                    69183
Model Family:        NegativeBinomial   Df Model:                            9
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -3.4179e+05
Date:                Sun, 22 Mar 2020   Deviance:                       49237.
Time:                        13:51:05   Pearson chi2:                 5.38e+04
No. Iterations:                    23                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercep

### OLS for core rates all

In [41]:
# OLS on rates: violent_crime_rate regressed on the nine core *_rate predictors,
# fitted on the small-cities rates table (fnl_mn_sml_rates).
model = sm.OLS.from_formula(
    'violent_crime_rate ~ total_officers_rate'
    ' + jail_occupancy_count_rate + prison_occupancy_count_rate'
    ' + drug_tot_arrests_rate + drug_tot_arrests_black_rate + drug_tot_arrests_white_rate'
    ' + disorder_arrests_tot_index_rate + disorder_tot_arrests_black_index_rate + disorder_tot_arrests_white_index_rate',
    data=fnl_mn_sml_rates,
)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:     violent_crime_rate   R-squared:                       0.244
Model:                            OLS   Adj. R-squared:                  0.244
Method:                 Least Squares   F-statistic:                     2472.
Date:                Sun, 22 Mar 2020   Prob (F-statistic):               0.00
Time:                        13:52:37   Log-Likelihood:            -4.9493e+05
No. Observations:               69022   AIC:                         9.899e+05
Df Residuals:                   69012   BIC:                         9.900e+05
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------

### OLS for core rates without low 3z

In [42]:
# OLS on rates with the same formula, fitted on the small-cities rates table
# that excludes the "low 3z" rows (fnl_mn_wdt_low_3z_sml_rates).
model = sm.OLS.from_formula(
    'violent_crime_rate ~ total_officers_rate'
    ' + jail_occupancy_count_rate + prison_occupancy_count_rate'
    ' + drug_tot_arrests_rate + drug_tot_arrests_black_rate + drug_tot_arrests_white_rate'
    ' + disorder_arrests_tot_index_rate + disorder_tot_arrests_black_index_rate + disorder_tot_arrests_white_index_rate',
    data=fnl_mn_wdt_low_3z_sml_rates,
)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:     violent_crime_rate   R-squared:                       0.244
Model:                            OLS   Adj. R-squared:                  0.244
Method:                 Least Squares   F-statistic:                     2459.
Date:                Sun, 22 Mar 2020   Prob (F-statistic):               0.00
Time:                        13:52:53   Log-Likelihood:            -4.9284e+05
No. Observations:               68736   AIC:                         9.857e+05
Df Residuals:                   68726   BIC:                         9.858e+05
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------