This notebook contains:

1. Logistic regression with only counties

2. Speeding only and for all stops:

Rate of citation (normalize by # of stops or population) for each county

Rate of arrest (normalize by # of stops or population) for each county

3. Look at stop reasons (is there any other general category like speeding)

4. diversity index

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

  import pandas.util.testing as tm


In [None]:
from google.colab import drive
import os 
# Mount Google Drive at /gdrive/ ; the cells below read their input files from /gdrive/MyDrive/.
drive.mount('/gdrive/')

Mounted at /gdrive/


### Logistic Regression

In [None]:
def run_year_analysis(data_path, output_path,df_years, model, school = False):
  """
  Fit a county-only logistic regression of `citation_issued` on speeding stops.

  With school = False one model is fitted per year and the per-year estimates
  are merged side by side onto `df_years`. With school = True the school-zone
  speeding stops of all years are pooled, a single model is fitted and its
  estimates are returned directly.

  Input:
  data_path: folder holding the yearly files traffic_<year>.parquet. When it is
             empty or None, the Drive folder that used to be hard-coded here is
             used instead.
  output_path: not used yet; the result is returned and saved by the caller.
  df_years: data frame with a 'variable' column. The per-year result columns
            are outer-merged onto it; the frame passed in is not modified.
  model: 'baseline' drops the 'county_name_Austin County' dummy so that Austin
         County is the reference level. Any other value keeps every dummy.
  school: False -> one fit per year, True -> one pooled school-zone fit.

  Output:
  school = False: `df_years` plus, for every year, the columns coef_<year>,
                  odds_ratio_<year>, OR_lower_CI_<year>, OR_upper_CI_<year>
                  and pvalue_<year>.
  school = True:  a frame with the columns variable, coef, odds_ratio,
                  lower_CI, upper_CI and pvalue.
  """
  if not data_path:
    data_path = '/gdrive/MyDrive/traffic_stop/year_data_speeding_only/'

  # 2013 is left out (the notebook notes a singular matrix for that year)
  years = list(range(2006,2013)) + [2014,2015,2016,2017]

  # the response plus every column that does not enter the county-only model
  unused_cols = ['citation_issued',
                 'sunset','sunrise','dawn','dusk','stop_time','time','lat','lng','date',
                 'contraband_found','contraband_drugs','contraband_weapons','search_conducted',
                 'subject_sex','light_cat', 'violation', 'search_vehicle', 'all_violation',
                 'speeding_only','county_type','holiday','subject_race']

  def _read_year(year):
    # load the speeding-only stops of one year
    filename = os.path.join(data_path, 'traffic_' + str(year) + '.parquet')
    return pd.read_parquet(filename, engine = 'pyarrow')

  def _fit_logit(df):
    # fit one logistic regression and tabulate coefficients, odds ratios, CIs and p-values
    y = df['citation_issued']
    # float dummies: bool dummies (the pandas >= 2 default) combined with the float
    # constant column would reach statsmodels as object data
    X = pd.get_dummies(df.drop(columns = unused_cols), dtype = float)
    print(X.columns)

    # for baseline model: Austin County is the base level
    if model == 'baseline':
      X = X.drop(columns = ['county_name_Austin County'])

    #adding constant to X
    X_with_constant = sm.add_constant(X)
    # building the model and fitting the data
    log_reg = sm.Logit(y, X_with_constant).fit()

    conf = log_reg.conf_int()
    return pd.DataFrame({'variable':list(log_reg.params.index), 'coef':list(log_reg.params.values),
                         'odds_ratio':list(np.exp(log_reg.params.values)),
                         'lower_CI':list(np.exp(conf[0].values)),
                         'upper_CI':list(np.exp(conf[1].values)),'pvalue':list(log_reg.pvalues)})

  if school:
    print('***********************************')
    print('school zone')
    # collect the school-zone stops of every year, then concatenate once
    yearly_frames = []
    for year in years:
      df_temp = _read_year(year)
      yearly_frames.append(df_temp.loc[df_temp['violation'] == 'speeding-school zone',:])
    df = pd.concat(yearly_frames, ignore_index = True)
    print(df['county_type'].value_counts())
    print('# of school zone speeding:', len(df))
    print(df['citation_issued'].value_counts())
    return _fit_logit(df)

  for year in years:
    print('***********************************')
    print(year)
    df = _read_year(year)
    # all only speeding violations
    print(year,': # speeding only: ', len(df))

    # tag this year's columns before merging so different years never collide
    res_df = _fit_logit(df).rename(columns = {'coef': ('coef_' + str(year)),
                                              'odds_ratio': ('odds_ratio_' + str(year)),
                                              'lower_CI': ('OR_lower_CI_' + str(year)),
                                              'upper_CI': ('OR_upper_CI_' + str(year)),
                                              'pvalue': ('pvalue_' + str(year))})
    df_years = df_years.merge(res_df, on = 'variable', how = 'outer')

  return df_years

# write result df to csv file


In [None]:
#var_lst_interaction = ['subject_race_black', 'subject_race_hispanic', 'subject_race_asian/pacific islander',
                    #'holiday', 'county_type_Non core', 'county_type_Micropolitan',
                    #'hispanic_metro','black_metro','asian/paci_metro']

# Collect every county name from the 2015 speeding-only file.
df = pd.read_parquet('/gdrive/MyDrive/traffic_stop/year_data_speeding_only/traffic_' + str(2015) + '.parquet')
unq_counties = list(df['county_name'].unique())
# Austin County is the reference level of the baseline model, so it gets no dummy variable.
# The names in the data are spelled '<Name> County' (capital C); the comparison has to use
# that spelling, otherwise the reference county stays in the variable list.
county_names = [('county_name_' + x) for x in unq_counties if x != 'Austin County']
#var_lst_baseline = ['subject_race_black', 'subject_race_hispanic', 'subject_race_asian/pacific islander'] + county_names
var_lst_baseline = county_names

# num_feat = len(df_16.columns)

# this df is later passed into run_year_analysis to get all estimates
#df_years = pd.DataFrame({'variable':var_lst_interaction})
df_years = pd.DataFrame({'variable':var_lst_baseline})


Logit usually converges quickly; the default maximum number of iterations (`maxiter`) is 35. If a fit does not converge, check the levels of every variable: one of them may have almost 99% of its observations in a single category.

2013 has singular matrix.


Years that exceed maximum iteration:

2011

2014

In [None]:
# Per-year baseline fits (Austin County as reference) on the speeding-only files.
data_path = '/gdrive/MyDrive/traffic_stop/year_data_speeding_only/'
res_df = run_year_analysis(
    data_path,
    output_path = ' ',
    df_years = df_years,
    model = 'baseline',
    school = False,
)

***********************************
2006
2006 : # speeding only:  1234128
Index(['county_name_Anderson County', 'county_name_Andrews County',
       'county_name_Angelina County', 'county_name_Aransas County',
       'county_name_Archer County', 'county_name_Armstrong County',
       'county_name_Atascosa County', 'county_name_Austin County',
       'county_name_Bailey County', 'county_name_Bandera County',
       ...
       'county_name_Willacy County', 'county_name_Williamson County',
       'county_name_Wilson County', 'county_name_Winkler County',
       'county_name_Wise County', 'county_name_Wood County',
       'county_name_Yoakum County', 'county_name_Young County',
       'county_name_Zapata County', 'county_name_Zavala County'],
      dtype='object', length=254)


  x = pd.concat(x[::order], 1)


Optimization terminated successfully.
         Current function value: 0.629638
         Iterations 6
***********************************
2007
2007 : # speeding only:  1110125
Index(['county_name_Anderson County', 'county_name_Andrews County',
       'county_name_Angelina County', 'county_name_Aransas County',
       'county_name_Archer County', 'county_name_Armstrong County',
       'county_name_Atascosa County', 'county_name_Austin County',
       'county_name_Bailey County', 'county_name_Bandera County',
       ...
       'county_name_Willacy County', 'county_name_Williamson County',
       'county_name_Wilson County', 'county_name_Winkler County',
       'county_name_Wise County', 'county_name_Wood County',
       'county_name_Yoakum County', 'county_name_Young County',
       'county_name_Zapata County', 'county_name_Zavala County'],
      dtype='object', length=254)
Optimization terminated successfully.
         Current function value: 0.628965
         Iterations 6
*************



***********************************
2012
2012 : # speeding only:  1023783
Index(['county_name_Anderson County', 'county_name_Andrews County',
       'county_name_Angelina County', 'county_name_Aransas County',
       'county_name_Archer County', 'county_name_Armstrong County',
       'county_name_Atascosa County', 'county_name_Austin County',
       'county_name_Bailey County', 'county_name_Bandera County',
       ...
       'county_name_Willacy County', 'county_name_Williamson County',
       'county_name_Wilson County', 'county_name_Winkler County',
       'county_name_Wise County', 'county_name_Wood County',
       'county_name_Yoakum County', 'county_name_Young County',
       'county_name_Zapata County', 'county_name_Zavala County'],
      dtype='object', length=254)
Optimization terminated successfully.
         Current function value: 0.559198
         Iterations 7
***********************************
2014
2014 : # speeding only:  777272
Index(['county_name_Anderson County', 'cou



***********************************
2015
2015 : # speeding only:  728211
Index(['county_name_Anderson County', 'county_name_Andrews County',
       'county_name_Angelina County', 'county_name_Aransas County',
       'county_name_Archer County', 'county_name_Armstrong County',
       'county_name_Atascosa County', 'county_name_Austin County',
       'county_name_Bailey County', 'county_name_Bandera County',
       ...
       'county_name_Willacy County', 'county_name_Williamson County',
       'county_name_Wilson County', 'county_name_Winkler County',
       'county_name_Wise County', 'county_name_Wood County',
       'county_name_Yoakum County', 'county_name_Young County',
       'county_name_Zapata County', 'county_name_Zavala County'],
      dtype='object', length=254)
Optimization terminated successfully.
         Current function value: 0.567624
         Iterations 7
***********************************
2016
2016 : # speeding only:  796595
Index(['county_name_Anderson County', 'coun

In [None]:
# Show the merged per-year estimates (one row per model term).
res_df

Unnamed: 0,variable,coef_2006,odds_ratio_2006,OR_lower_CI_2006,OR_upper_CI_2006,pvalue_2006,coef_2007,odds_ratio_2007,OR_lower_CI_2007,OR_upper_CI_2007,...,coef_2016,odds_ratio_2016,OR_lower_CI_2016,OR_upper_CI_2016,pvalue_2016,coef_2017,odds_ratio_2017,OR_lower_CI_2017,OR_upper_CI_2017,pvalue_2017
0,county_name_Jasper County,0.230644,1.259410,1.168080,1.357881,1.915552e-09,0.439895,1.552545,1.438878,1.675191,...,0.033116,1.033670,0.923137,1.157439,5.660297e-01,0.281541,1.325170,1.186423,1.480143,6.058978e-07
1,county_name_Refugio County,0.403431,1.496951,1.396227,1.604941,7.287641e-30,0.403094,1.496448,1.384002,1.618029,...,0.167149,1.181931,1.002252,1.393821,4.695508e-02,0.978438,2.660296,2.349811,3.011807,7.252367e-54
2,county_name_Denton County,0.299645,1.349380,1.267211,1.436878,8.949667e-21,-0.011450,0.988616,0.921570,1.060539,...,0.749285,2.115487,1.896746,2.359453,2.868314e-41,1.246137,3.476887,3.126508,3.866532,5.413836e-117
3,county_name_Rusk County,-0.517859,0.595795,0.555044,0.639537,1.501045e-46,-0.238501,0.787808,0.728055,0.852465,...,-0.025982,0.974352,0.865322,1.097120,6.678332e-01,0.136133,1.145834,1.020330,1.286775,2.144824e-02
4,county_name_Gonzales County,0.421574,1.524359,1.421446,1.634722,3.045505e-32,0.727734,2.070385,1.922352,2.229816,...,1.824190,6.197773,5.538304,6.935768,1.208651e-221,1.898741,6.677480,5.960427,7.480796,2.203849e-235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250,county_name_Edwards County,-0.530091,0.588551,0.477757,0.725041,6.309118e-07,-0.078999,0.924040,0.710628,1.201544,...,-0.265521,0.766806,0.494823,1.188287,2.348083e-01,0.304757,1.356296,1.100378,1.671732,4.282774e-03
251,county_name_Kent County,-1.144708,0.318317,0.160543,0.631142,1.046266e-03,-0.214042,0.807314,0.469094,1.389393,...,-0.204997,0.814650,0.418169,1.587048,5.468450e-01,-0.283135,0.753418,0.367930,1.542790,4.387756e-01
252,county_name_Borden County,-0.290293,0.748044,0.530660,1.054479,9.749052e-02,-0.275831,0.758941,0.482793,1.193040,...,1.655755,5.237033,2.019388,13.581593,6.606489e-04,0.333051,1.395218,0.144809,13.442742,7.732334e-01
253,county_name_Loving County,0.202184,1.224073,0.506490,2.958310,6.533830e-01,-0.378110,0.685155,0.181549,2.585733,...,-0.500978,0.605938,0.133858,2.742901,5.155199e-01,1.726463,5.620735,3.587810,8.805558,4.785320e-14


In [None]:
# Write the estimates to an Excel file in the current working directory, without the row index.
res_df.to_excel('LR_county_only.xlsx', index = False)

In [None]:
#select_cols = [col for col in res_df.columns if 'pvalue' not in col]
# Strip the dummy-variable prefix so only the county name is left (rows without the prefix are unchanged).
res_df['variable']=res_df['variable'].map(lambda x: x.replace('county_name_',''))
res_df

Unnamed: 0,variable,coef_2006,odds_ratio_2006,OR_lower_CI_2006,OR_upper_CI_2006,pvalue_2006,coef_2007,odds_ratio_2007,OR_lower_CI_2007,OR_upper_CI_2007,...,coef_2016,odds_ratio_2016,OR_lower_CI_2016,OR_upper_CI_2016,pvalue_2016,coef_2017,odds_ratio_2017,OR_lower_CI_2017,OR_upper_CI_2017,pvalue_2017
0,Jasper County,0.230644,1.259410,1.168080,1.357881,1.915552e-09,0.439895,1.552545,1.438878,1.675191,...,0.033116,1.033670,0.923137,1.157439,5.660297e-01,0.281541,1.325170,1.186423,1.480143,6.058978e-07
1,Refugio County,0.403431,1.496951,1.396227,1.604941,7.287641e-30,0.403094,1.496448,1.384002,1.618029,...,0.167149,1.181931,1.002252,1.393821,4.695508e-02,0.978438,2.660296,2.349811,3.011807,7.252367e-54
2,Denton County,0.299645,1.349380,1.267211,1.436878,8.949667e-21,-0.011450,0.988616,0.921570,1.060539,...,0.749285,2.115487,1.896746,2.359453,2.868314e-41,1.246137,3.476887,3.126508,3.866532,5.413836e-117
3,Rusk County,-0.517859,0.595795,0.555044,0.639537,1.501045e-46,-0.238501,0.787808,0.728055,0.852465,...,-0.025982,0.974352,0.865322,1.097120,6.678332e-01,0.136133,1.145834,1.020330,1.286775,2.144824e-02
4,Gonzales County,0.421574,1.524359,1.421446,1.634722,3.045505e-32,0.727734,2.070385,1.922352,2.229816,...,1.824190,6.197773,5.538304,6.935768,1.208651e-221,1.898741,6.677480,5.960427,7.480796,2.203849e-235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250,Edwards County,-0.530091,0.588551,0.477757,0.725041,6.309118e-07,-0.078999,0.924040,0.710628,1.201544,...,-0.265521,0.766806,0.494823,1.188287,2.348083e-01,0.304757,1.356296,1.100378,1.671732,4.282774e-03
251,Kent County,-1.144708,0.318317,0.160543,0.631142,1.046266e-03,-0.214042,0.807314,0.469094,1.389393,...,-0.204997,0.814650,0.418169,1.587048,5.468450e-01,-0.283135,0.753418,0.367930,1.542790,4.387756e-01
252,Borden County,-0.290293,0.748044,0.530660,1.054479,9.749052e-02,-0.275831,0.758941,0.482793,1.193040,...,1.655755,5.237033,2.019388,13.581593,6.606489e-04,0.333051,1.395218,0.144809,13.442742,7.732334e-01
253,Loving County,0.202184,1.224073,0.506490,2.958310,6.533830e-01,-0.378110,0.685155,0.181549,2.585733,...,-0.500978,0.605938,0.133858,2.742901,5.155199e-01,1.726463,5.620735,3.587810,8.805558,4.785320e-14


In [None]:
# Years that have a fitted model (2013 is not among the result columns).
years = list(range(2006,2013))
years += [2014,2015,2016,2017]

coef_cols = ['coef_' + str(year) for year in years]
coefs = ['variable'] + coef_cols

# One row per year, one column per model term. Indexing by 'variable' before transposing
# keeps the values numeric and makes the term names the column header directly.
output_df = res_df.set_index('variable')[coef_cols].T
output_df['year'] = years
# scalar assignment follows the number of rows instead of a hard-coded count
output_df['filter'] = 'coef'
output_df

variable,Jasper County,Refugio County,Denton County,Rusk County,Gonzales County,San Patricio County,Starr County,Hidalgo County,Webb County,Brazoria County,...,Cochran County,Reagan County,Motley County,Edwards County,Kent County,Borden County,Loving County,const,year,filter
coef_2006,0.230644,0.403431,0.299645,-0.517859,0.421574,1.11294,-0.076038,0.142495,0.570828,0.047554,...,0.888362,-1.070852,-0.61154,-0.530091,-1.144708,-0.290293,0.202184,-0.402854,2006,coef
coef_2007,0.439895,0.403094,-0.01145,-0.238501,0.727734,1.281225,0.296974,0.468178,0.465565,0.261104,...,0.796875,-0.987265,-0.341283,-0.078999,-0.214042,-0.275831,-0.37811,-0.602719,2007,coef
coef_2008,0.991492,0.886557,0.684559,0.473578,1.698456,2.205993,-0.45804,0.671458,-0.334116,0.980939,...,1.367505,0.014805,0.453314,-1.506578,-0.300875,0.940588,0.998408,-1.403873,2008,coef
coef_2009,0.679524,0.590625,0.665077,0.395634,1.249007,2.090147,0.25829,0.787447,0.257585,0.559541,...,0.78828,-0.665382,0.02191,-0.413078,-0.500089,0.560783,1.29167,-1.29167,2009,coef
coef_2010,0.556925,0.422574,0.301367,0.281748,1.155042,1.373214,0.027904,0.618091,-0.203339,0.287511,...,0.691996,-0.519244,-0.015696,-0.404038,-0.765062,-0.242612,1.446869,-1.159187,2010,coef
coef_2011,0.227353,0.297059,0.473117,0.014949,1.008458,1.125571,-0.908358,0.533095,-0.269377,0.371508,...,0.532821,-0.444615,-0.37188,-0.595769,-0.451056,0.603385,12.888922,-1.073388,2011,coef
coef_2012,-0.119076,0.666562,0.627772,-0.209614,0.705967,0.905224,-0.429461,0.517699,-0.13973,0.245669,...,0.439631,0.37613,-0.225673,-0.394829,-0.840279,0.636644,1.648244,-0.955097,2012,coef
coef_2014,0.017834,-0.253731,0.238044,-0.303466,0.843231,1.19333,-2.016859,-0.11724,-0.645298,0.825384,...,0.607428,-0.453868,-0.092276,0.42788,-0.742499,0.16072,14.310752,-0.949177,2014,coef
coef_2015,0.401115,0.518624,0.983982,0.205183,1.720161,1.520195,-1.436778,0.453591,-0.011807,1.470587,...,1.423145,1.367507,-0.407835,0.77082,0.452366,0.77082,1.869433,-1.463967,2015,coef
coef_2016,0.033116,0.167149,0.749285,-0.025982,1.82419,1.004429,-1.104399,0.424475,-0.078031,1.373923,...,1.832379,0.431842,-0.203144,-0.265521,-0.204997,1.655755,-0.500978,-1.20377,2016,coef


In [None]:
# 2010 coefficients ranked from largest to smallest. The [:-3] slice drops the last three
# entries of the row, which in the table displayed above are 'const', 'year' and 'filter'.
coef_2010 = pd.Series(output_df.loc['coef_2010'][:-3]).sort_values(ascending=False)

In [None]:
# Two-column table (county, coefficient) in the ranked order of coef_2010.
df_2010 = pd.DataFrame({'county':coef_2010.index,'coef':coef_2010.values})

In [None]:
# Write the ranked 2010 coefficients to an Excel file in the current working directory.
df_2010.to_excel('LR_coef_onlycounty_2010.xlsx', index = False)

In [None]:
# Number of speeding-only stops per county in 2014, missing county names included.
df = pd.read_parquet('/gdrive/MyDrive/traffic_stop/year_data_speeding_only/traffic_' + str(2014) + '.parquet')
df['county_name'].value_counts(dropna = False)

Hidalgo County       26007
Starr County         20129
Montgomery County    17312
Smith County         13500
Harris County        12567
                     ...  
Edwards County          51
Motley County           46
Kent County             45
Borden County           16
Loving County            8
Name: county_name, Length: 254, dtype: int64

ignore the cell below

In [None]:
#select_cols = [col for col in res_df.columns if 'pvalue' not in col]
res_df['variable']=res_df['variable'].map(lambda x: x.replace('county_name_',''))

# Only the years that were actually fitted have result columns; 2013 is not one of them,
# so asking for range(2006, 2018) would fail on the missing 2013 columns.
years = list(range(2006,2013)) + [2014,2015,2016,2017]

# (value written to 'filter', prefix of the matching result columns)
metrics = [('coef', 'coef_'),
           ('odds', 'odds_ratio_'),
           ('ci_low', 'OR_lower_CI_'),
           ('ci_high', 'OR_upper_CI_')]

by_variable = res_df.set_index('variable')
pieces = []
for label, prefix in metrics:
  # rows = years, columns = model terms
  piece = by_variable[[prefix + str(year) for year in years]].T
  piece['year'] = years
  piece['filter'] = label
  pieces.append(piece)

# stack the four metrics; 'filter' tells them apart
output_df = pd.concat(pieces, ignore_index = True)

output_df

### 2. Citation Rate and Arrest Rate 

(By stops) & (By population)

(All stops & Only speeding stops)

Note that 'normalize by population' is calculated by:
using the total population of each county on social explorer data

Exploratory: Note that arrest cnt is 0 for all years.

In [None]:
# Load the 2010 all-stops file and list its columns.
df = pd.read_parquet('/gdrive/MyDrive/traffic_stop/year_data/traffic_' + str(2010) + '.parquet')
df.columns

Index(['raw_row_number', 'date', 'time', 'location', 'lat', 'lng',
       'county_name', 'district', 'precinct', 'region', 'subject_race',
       'subject_sex', 'officer_id_hash', 'type', 'violation',
       'contraband_drugs', 'contraband_weapons', 'search_conducted',
       'search_vehicle', 'search_basis', 'vehicle_color', 'vehicle_make',
       'vehicle_model', 'vehicle_type', 'vehicle_year', 'raw_HA_RACE_SEX',
       'raw_HA_SEARCH_PC_boolean', 'raw_HA_SEARCH_CONCENT_boolean',
       'raw_HA_INCIDTO_ARREST_boolean', 'raw_HA_VEHICLE_INVENT_boolean',
       'year'],
      dtype='object')

If both citation=True and warning=True, the outcome is citation.
If citation=False and warning=True, the outcome is warning.

In [None]:
# Counts of each recorded outcome, missing values included.
df['outcome'].value_counts(dropna = False)

citation     848789
NaN              30
summons           0
arrest            0
Name: outcome, dtype: int64

In [None]:
# Counts of stops with and without a citation, missing values included.
df['citation_issued'].value_counts(dropna = False)

False    1676507
True      848789
Name: citation_issued, dtype: int64

In [None]:
# Counts of stops with and without a warning, missing values included.
df['warning_issued'].value_counts(dropna = False)

True     2011096
False     514200

In [None]:
# Number of stops where both a warning and a citation were issued.
len(df.loc[(df['warning_issued']==True) & (df['citation_issued']==True),:])

334619

In [None]:
# Sanity check against the counts above: 334619 is the number of stops with both a warning and a
# citation, and the sum equals the 2011096 stops with warning_issued == True.
# NOTE(review): 1676477 is presumably the 1676507 stops without a citation minus the 30 stops
# with a missing outcome - confirm.
1676477+334619

2011096

In [None]:
# Print the outcome distribution (missing values included) of the all-stops file for every year 2006-2017.
for year in list(range(2006, 2018)):
  df = pd.read_parquet('/gdrive/MyDrive/traffic_stop/year_data/traffic_' + str(year) + '.parquet')
  print(year)
  print(df['outcome'].value_counts(dropna = False))
  print('********************')

2006
citation    1207142
NaN              79
summons           0
arrest            0
Name: outcome, dtype: int64
********************
2007
citation    1071775
NaN             217
summons           0
arrest            0
Name: outcome, dtype: int64
********************
2008
citation     973574
NaN             263
summons           0
arrest            0
Name: outcome, dtype: int64
********************
2009
citation     896255
NaN              36
summons           0
arrest            0
Name: outcome, dtype: int64
********************
2010
citation     848789
NaN              30
summons           0
arrest            0
Name: outcome, dtype: int64
********************
2011
citation     857791
NaN              13
summons           0
arrest            0
Name: outcome, dtype: int64
********************
2012
citation     850791
NaN              14
summons           0
arrest            0
Name: outcome, dtype: int64
********************
2013
citation     765695
NaN             773
summons          

start here:

In [None]:
def output_rates(year=2010, speeding_only = False):
  """
  Build a per-county table of stop, citation and warning rates for one year.

  Input:
  year: year of the traffic_<year>.parquet file to read.
  speeding_only: False -> all stops (year_data folder); True -> only the
                 speeding stops (year_data_speeding_only folder).

  Output:
  One row per county that has at least one stop in the file and also appears
  among the TX rows of table_county.csv, sorted by 'citation_rate(stops)' in
  descending order. Columns: county (without the ' County' suffix),
  total_stops, total_cites, citation_rate(stops), warning_rate(stops) (only
  when speeding_only is False), population, stop_rate(pop),
  citation_rate(pop), Latitude, Longitude.

  The counties are taken from the file that is read, not from a list built in
  another cell, so the function does not depend on other notebook state and a
  county without stops in the requested year cannot cause a division by zero.

  NOTE(review): the population always comes from '/content/county_pop_10.xlsx',
  whatever `year` is requested - confirm this is intended for other years.
  """
  # all traffic stops, or only the speeding stops
  if speeding_only == False:
    df = pd.read_parquet('/gdrive/MyDrive/traffic_stop/year_data/traffic_' + str(year) + '.parquet')
  else:
    df = pd.read_parquet('/gdrive/MyDrive/traffic_stop/year_data_speeding_only/traffic_' + str(year) + '.parquet')

  # one 0/1 indicator per stop, summed per county in a single pass
  per_stop = pd.DataFrame({
      # plain object labels, so unused categories never become empty groups
      'county': df['county_name'].astype(object),
      'total_stops': 1,
      'total_cites': df['citation_issued'].eq(True).fillna(False).astype(int),
  })
  if not speeding_only:
    # warning_rates means only warning, no citation
    per_stop['warning_only'] = df['outcome'].eq('warning').fillna(False).astype(int)

  rate_stop_df = per_stop.groupby('county', sort = False).sum().reset_index()
  rate_stop_df['citation_rate(stops)'] = rate_stop_df['total_cites'] / rate_stop_df['total_stops']
  if not speeding_only:
    rate_stop_df['warning_rate(stops)'] = rate_stop_df.pop('warning_only') / rate_stop_df['total_stops']
  # match the spelling used by the population and county tables
  rate_stop_df.loc[rate_stop_df['county'] == 'Dewitt County','county'] = 'DeWitt County'

  # reformat pop files: the first data row holds the real header
  pop = pd.read_excel('/content/county_pop_10.xlsx')
  pop.columns = pop.iloc[0]
  pop = pop.rename(columns = {name: name.replace(', Texas','')
                              for name in pop.columns if pd.isnull(name) == False})
  # the 'Total Population:' row, indexed by county name
  total_pop = pop.loc[pop['Statistics']=='Total Population:'].iloc[0]
  rate_stop_df['population'] = rate_stop_df['county'].map(lambda name: int(total_pop[name]))
  rate_stop_df['stop_rate(pop)'] = (rate_stop_df['total_stops'] / rate_stop_df['population']).round(4)
  rate_stop_df['citation_rate(pop)'] = (rate_stop_df['total_cites'] / rate_stop_df['population']).round(4)

  # read in county_info; we are focusing on Texas
  county_info = pd.read_csv('/gdrive/MyDrive/traffic_stop/table_county.csv')
  county_info = county_info.loc[county_info['State'] == 'TX', ['State','County [2]','Latitude','Longitude']]
  county_info = county_info.rename(columns={"County [2]": "county"})
  # replace the first character of the longitude by an ASCII minus sign
  county_info['Longitude'] = '-' + county_info['Longitude'].str[1:]
  for col in ['Latitude','Longitude']:
    # drop the trailing unit character, then convert to float
    county_info[col] = county_info[col].str[:-1].astype('float')

  # county_info identifies counties without the ' County' suffix
  rate_stop_df['county'] = rate_stop_df['county'].map(lambda x: x.replace(' County', ''))
  cnt_df_merged = rate_stop_df.merge(county_info, how = 'inner', on = 'county')

  cnt_df_merged = cnt_df_merged.drop(columns = ['State'])
  cnt_df_merged = cnt_df_merged.sort_values(by='citation_rate(stops)', ascending = False)
  return cnt_df_merged

In [None]:
# Rates for all stops, using the function defaults (year 2010, speeding_only False).
rate_df = output_rates()
rate_df

Unnamed: 0,county,total_stops,total_cites,citation_rate(stops),warning_rate(stops),population,stop_rate(pop),citation_rate(pop),Latitude,Longitude
90,Hall,1693,1039,0.613703,0.386297,3359,0.5040,0.3093,34.453189,-100.576343
46,Sutton,7792,4441,0.569944,0.430056,4062,1.9183,1.0933,30.517865,-100.505395
244,Lipscomb,241,136,0.564315,0.435685,3283,0.0734,0.0414,36.280200,-100.272683
35,Cameron,44213,24884,0.562821,0.437179,407630,0.1085,0.0610,26.102923,-97.478958
202,Caldwell,5888,3245,0.551121,0.448879,38127,0.1544,0.0851,29.840422,-97.631097
...,...,...,...,...,...,...,...,...,...,...
230,Hansford,1010,161,0.159406,0.840594,5599,0.1804,0.0288,36.272847,-101.356930
223,Jeff Davis,3471,529,0.152406,0.847594,2344,1.4808,0.2257,30.617087,-104.187860
138,Stonewall,2195,324,0.147608,0.852392,1496,1.4672,0.2166,33.179580,-100.253807
42,Castro,3941,577,0.146410,0.853590,8126,0.4850,0.0710,34.533621,-102.258786


In [None]:
# Same table restricted to speeding stops (default year 2010); it has no warning-rate column.
speeding_rate_df = output_rates(speeding_only=True)
speeding_rate_df

Unnamed: 0,county,total_stops,total_cites,citation_rate(stops),population,stop_rate(pop),citation_rate(pop),Latitude,Longitude
93,Live Oak,6667,4003,0.600420,11556,0.5769,0.3464,28.351535,-98.126961
218,McMullen,941,555,0.589798,711,1.3235,0.7806,28.384922,-98.578853
90,Hall,1042,612,0.587332,3359,0.3102,0.1822,34.453189,-100.576343
253,Loving,7,4,0.571429,84,0.0833,0.0476,31.844936,-103.561229
202,Caldwell,3517,1952,0.555018,38127,0.0922,0.0512,29.840422,-97.631097
...,...,...,...,...,...,...,...,...,...
171,Titus,6163,549,0.089080,32419,0.1901,0.0169,33.214599,-94.966783
230,Hansford,795,70,0.088050,5599,0.1420,0.0125,36.272847,-101.356930
170,Camp,1041,78,0.074928,12398,0.0840,0.0063,32.974581,-94.979085
243,Terrell,229,17,0.074236,1011,0.2265,0.0168,30.232332,-102.072539


In [None]:
# Give the speeding-only columns their own names so they can sit next to the all-stops columns.
speeding_column_names = {
    'total_stops': 'speeding_stops',
    'total_cites': 'speeding_cites',
    'citation_rate(stops)': 'citation_rate(speeding_stops)',
    'stop_rate(pop)': 'speeding_stop_rate(pop)',
    'citation_rate(pop)': 'speeding_citation_rate(pop)',
}
speeding_rate_df = speeding_rate_df.rename(columns = speeding_column_names)
# population and coordinates are already part of rate_df
speeding_rate_df = speeding_rate_df.drop(columns = ['Latitude','Longitude','population'])

# join on county and keep the ordering by overall citation rate
rate_df = (rate_df
           .merge(speeding_rate_df, on='county')
           .sort_values(by='citation_rate(stops)', ascending=False))
rate_df

Unnamed: 0,county,total_stops,total_cites,citation_rate(stops),warning_rate(stops),population,stop_rate(pop),citation_rate(pop),Latitude,Longitude,speeding_stops,speeding_cites,citation_rate(speeding_stops),speeding_stop_rate(pop),speeding_citation_rate(pop)
0,Hall,1693,1039,0.613703,0.386297,3359,0.5040,0.3093,34.453189,-100.576343,1042,612,0.587332,0.3102,0.1822
1,Sutton,7792,4441,0.569944,0.430056,4062,1.9183,1.0933,30.517865,-100.505395,3387,1845,0.544730,0.8338,0.4542
2,Lipscomb,241,136,0.564315,0.435685,3283,0.0734,0.0414,36.280200,-100.272683,100,45,0.450000,0.0305,0.0137
3,Cameron,44213,24884,0.562821,0.437179,407630,0.1085,0.0610,26.102923,-97.478958,15354,7711,0.502214,0.0377,0.0189
4,Caldwell,5888,3245,0.551121,0.448879,38127,0.1544,0.0851,29.840422,-97.631097,3517,1952,0.555018,0.0922,0.0512
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,Hansford,1010,161,0.159406,0.840594,5599,0.1804,0.0288,36.272847,-101.356930,795,70,0.088050,0.1420,0.0125
250,Jeff Davis,3471,529,0.152406,0.847594,2344,1.4808,0.2257,30.617087,-104.187860,2142,218,0.101774,0.9138,0.0930
251,Stonewall,2195,324,0.147608,0.852392,1496,1.4672,0.2166,33.179580,-100.253807,1378,154,0.111756,0.9211,0.1029
252,Castro,3941,577,0.146410,0.853590,8126,0.4850,0.0710,34.533621,-102.258786,2060,115,0.055825,0.2535,0.0142


In [None]:
# Write the combined rate table to an Excel file, without the row index.
file_name = 'rate_geo_0314.xlsx'
# NOTE(review): `path` is defined but never used; save_path is just the file name, so the
# file is written to the current working directory - confirm whether path + file_name was intended.
path = "/gdrive/MyDrive/traffic_stop/TX-county/summarystat/"
save_path = file_name
rate_df .to_excel(save_path, index = False)

### 3. Stop Reasons

In [None]:
def inspect_violation(year, top_n = 30, data_dir = '/gdrive/MyDrive/traffic_stop/year_data'):
  """
  Print the most frequent violations of one year and return their citation rates.

  Input:
  year: year of the parquet file to read (<data_dir>/traffic_<year>.parquet)
  top_n: how many of the most frequent violation values to inspect (default 30)
  data_dir: folder that holds the yearly parquet files

  Output:
  A dataframe with the columns 'violation' and 'citation_rate', sorted by
  citation_rate in descending order. citation_rate is the share of rows with that
  violation whose 'citation_issued' is True, rounded to 4 decimals; rows with a
  missing 'citation_issued' stay in the denominator. A violation with no cited
  row gets 0.0 instead of raising a KeyError.
  """
  df = pd.read_parquet(data_dir + '/traffic_' + str(year) + '.parquet')
  # Compute the frequency table once and reuse it for both the printout and the loop.
  top_counts = df['violation'].value_counts(dropna=False).head(top_n)
  print(top_counts)
  violations = []
  citation_rates = []
  for vio in top_counts.index:
    # A missing violation never compares equal to itself, so it needs an isna() mask.
    if pd.isna(vio):
      mask = df['violation'].isna()
    else:
      mask = df['violation'] == vio
    issued = df.loc[mask, 'citation_issued']
    n_stops = len(issued)
    # Count the True values directly: indexing value_counts()[True] fails with a
    # KeyError whenever a violation has no cited stop at all.
    n_cited = int((issued == True).sum())
    rate = round(n_cited / n_stops, 4) if n_stops else float('nan')
    violations.append(vio)
    citation_rates.append(rate)
  output_df = pd.DataFrame({'violation':violations,'citation_rate':citation_rates})
  return output_df.sort_values(by = 'citation_rate', ascending = False)

In [None]:
# Most frequent violations in the 2010 file and the citation rate of each.
inspect_violation(2010)

Speeding Over Limit (#)                                                                      806271
Speeding-10% or More Above Posted Speed (#)                                                  300792
No/Improper License Plate Lamp                                                                77504
Operate Motor Vehicle Without License Plates (Or With One Plate)                              72817
No/Non-Compliant Head Lamps (#)                                                               47815
Ride, Not Secured By Safety Belt-Driver (12-1-85) (#)                                         33105
No Valid Inspection Certificate-Expired (#)                                                   30927
Improperly Placed Or Obstructed License Plate (#)                                             29410
Following Too Closely                                                                         28469
Display Expired License Plates/Registration (#)                                               22172


Unnamed: 0,violation,citation_rate
29,Speeding Over Limit (#)|Fail to Maintain Finan...,1.0
25,Speeding Over Limit (#)|No DL When Unlicensed-...,1.0
28,No DL When Unlicensed-Not CDL|Speeding-10% or ...,0.999
20,Fail to Maintain Financial Responsibility (#)|...,0.9851
1,Speeding-10% or More Above Posted Speed (#),0.9735
5,"Ride, Not Secured By Safety Belt-Driver (12-1-...",0.7278
15,"Ride, Not Secured By Safety Belt-Passenger (Wh...",0.6035
6,No Valid Inspection Certificate-Expired (#),0.363
9,Display Expired License Plates/Registration (#),0.2604
23,Disregard Stop Sign (#),0.2503


'Ride, Not Secured By Safety Belt-Driver (12-1-85) (#)'

'Ride, Not Secured By Safety Belt-Passenger (When Required)'

'Disregard Stop Sign (#)'

'No Valid Inspection Certificate-Expired (#)'

'Display Expired License Plates/Registration (#)'

'Use Of Vision Reducing Matter On Windows - Glass Coating Material'

In [None]:
# Same inspection for the 2015 file, for comparison with 2010.
inspect_violation(2015)

Speeding Over Limit (#)                                                                      529145
Speeding-10% or More Above Posted Speed (#)                                                  212943
Operate Motor Vehicle Without License Plates (Or With One Plate)                              51777
No/Non-Compliant Head Lamps (#)                                                               41910
No/Improper License Plate Lamp                                                                32274
Improperly Placed Or Obstructed License Plate (#)                                             25272
Inoperable/Obscured Stop Lamp(s) (#)                                                          20337
Display Expired License Plates/Registration (#)                                               19992
Ride, Not Secured By Safety Belt-Driver (12-1-85) (#)                                         18251
Drive in Left Lane When Not Passing or Where Prohibited                                       16645


Unnamed: 0,violation,citation_rate
16,Speeding Over Limit (#)|Driving While License ...,1.0
13,Speeding Over Limit (#)|No DL When Unlicensed-...,1.0
12,Failure to Pay Toll (For NTTA Use Only),0.9998
21,No DL When Unlicensed-Not CDL|Speeding-10% or ...,0.9987
24,Fail To Control Speed (#),0.9823
1,Speeding-10% or More Above Posted Speed (#),0.9658
27,Fail to Maintain Financial Responsibility (#)|...,0.9654
8,"Ride, Not Secured By Safety Belt-Driver (12-1-...",0.6075
17,"Ride, Not Secured By Safety Belt-Passenger (Wh...",0.6034
20,Disregard Stop Sign (#),0.256


Look at the racial composition of citations for several violations

It seems that

1. Ride, Not Secured By Safety Belt-Passenger (When Required)

2. Use Of Vision Reducing Matter On Windows - Glass Coating Material

have a high rate of citation for Hispanic subjects.

In [None]:
# Violations whose cited subjects' racial composition is printed below.
vios = [
    'Ride, Not Secured By Safety Belt-Driver (12-1-85) (#)',
    'Ride, Not Secured By Safety Belt-Passenger (When Required)',
    'Disregard Stop Sign (#)',
    'No Valid Inspection Certificate-Expired (#)',
    'Display Expired License Plates/Registration (#)',
    'Use Of Vision Reducing Matter On Windows - Glass Coating Material',
]
df = pd.read_parquet('/gdrive/MyDrive/traffic_stop/year_data/traffic_' + str(2015) + '.parquet')

# Baseline: racial composition over every row of the 2015 file.
overall_shares = df['subject_race'].value_counts(normalize = True)
print(overall_shares)

# The citation flag does not depend on the violation, so build it once.
is_cited = df['citation_issued']==True
for vio in vios:
  is_vio = df['violation']==vio
  print(vio)
  # Number of rows recorded with exactly this violation string.
  print(int(is_vio.sum()))
  # Racial composition among the rows of this violation that ended in a citation.
  temp_df = df.loc[is_vio & is_cited,:]
  print(temp_df['subject_race'].value_counts(normalize = True))
  print('*********************')

white                     0.459361
hispanic                  0.387483
black                     0.097297
unknown                   0.035196
asian/pacific islander    0.017604
other                     0.003059
Name: subject_race, dtype: float64
Ride, Not Secured By Safety Belt-Driver (12-1-85) (#)
18251
white                     0.477406
hispanic                  0.358799
black                     0.108415
unknown                   0.038243
asian/pacific islander    0.013620
other                     0.003518
Name: subject_race, dtype: float64
*********************
Ride, Not Secured By Safety Belt-Passenger (When Required)
10438
hispanic                  0.574309
white                     0.263735
black                     0.096062
unknown                   0.039060
asian/pacific islander    0.020800
other                     0.006034
Name: subject_race, dtype: float64
*********************
Disregard Stop Sign (#)
9478
white                     0.473619
hispanic                  0.3887

### 4. Diversity Index

Note that when calculating the diversity index, 'Hispanic or Latino' is treated as one group, and each subgroup under 'Not Hispanic or Latino' is treated as its own individual group.

Simpson's Diversity Index:

$D = 1-\frac{\sum n(n-1)}{N(N-1)}$

where $n$ is the population of each group and $N$ is the total population across all groups.

In [None]:
def diversity_index(row, race_cols = None):
  """
  Simpson's diversity index of one row: D = 1 - sum(n*(n-1)) / (N*(N-1)).

  Input:
  row: a row (pandas Series) with one population count per group and a 'total'
       entry holding N
  race_cols: labels of the group-count entries; by default every entry of the row
             except 'county', 'total' and 'diversity'

  Output:
  The diversity index, or NaN when total is 0 or 1 (the denominator would be 0)
  """
  if race_cols is None:
    # Take the group columns from the row itself rather than from the global df_pop,
    # and skip an already-computed 'diversity' entry so that applying the function
    # a second time gives the same result.
    race_cols = [col for col in row.index if col not in ('county', 'total', 'diversity')]
  total = row['total']
  deno = total*(total-1)
  if deno == 0:
    return np.nan
  counts = row[race_cols].values
  numerator = np.sum(counts*(counts-1))
  return 1-numerator/deno

In [None]:
# Build df_pop: one row per county with a population count per race/ethnicity group,
# the sum of those counts, and the resulting diversity index.
# One accumulator list per output column; each gets one entry per county in the loop below.
counties = []
white = []
black = []
hispanic = []
asian_pacific = []
two_races = []
indian_alaska = []

# Load the population spreadsheet and reshape it
pop2 = pd.read_excel('/content/pop_IShispanic_10.xlsx')
new_header = pop2.iloc[0] # the first data row holds the real column labels
pop2.columns = new_header # use that row as the dataframe header
# Keep rows 3-11 only - presumably the race/ethnicity count rows; TODO confirm against the spreadsheet layout
pop2 = pop2.iloc[3:12,:]

# Strip the ', Texas' suffix from every non-null column label
orig_lst = [x for x in list(pop2.columns) if pd.isnull(x) == False]
county_lst = [x.replace(', Texas','') for x in list(pop2.columns) if pd.isnull(x) == False]
pop2 = pop2.rename(columns = dict(zip(orig_lst,county_lst)))
# Drop the first and last labels so only county columns remain
# (presumably the 'Statistics' label column and a trailing non-county column - verify)
county_lst = county_lst[1:-1]
# Shorten the row labels. 'Asian Alone' and 'Native Hawaiian and Other Pacific Islander Alone'
# are mapped to the same label on purpose: the loop below adds those two rows together.
pop2['Statistics'] = pop2['Statistics'].replace({'Hispanic or Latino:':'hispanic','White Alone':'white','Black Alone':'black',
                                                     'Asian Alone':'asian/pacific islander',
                                                     'Native Hawaiian and Other Pacific Islander Alone':'asian/pacific islander'})
for county in county_lst:
  counties.append(county)
  # .iloc[0] takes the first row that carries the given label in this county's column
  white.append(pop2.loc[pop2['Statistics'] == 'white', county].iloc[0])
  black.append(pop2.loc[pop2['Statistics'] == 'black', county].iloc[0])
  hispanic.append(pop2.loc[pop2['Statistics'] == 'hispanic', county].iloc[0])
  two_races.append(pop2.loc[pop2['Statistics'] == 'Two or More Races', county].iloc[0])
  indian_alaska.append(pop2.loc[pop2['Statistics'] == 'American Indian and Alaska Native Alone', county].iloc[0])
  # Two rows share the 'asian/pacific islander' label after the replace above; sum both
  asian_pacific.append(pop2.loc[pop2['Statistics'] == 'asian/pacific islander', county].iloc[0]+pop2.loc[pop2['Statistics'] == 'asian/pacific islander', county].iloc[1])

df_pop = pd.DataFrame({'county':counties,'white':white,'black':black,'hispanic/latino':hispanic,'asian/pacific':asian_pacific,'multi-races':two_races,'indian_alaska':indian_alaska})
# Remove the ' County' suffix from the county names
df_pop['county'] = df_pop['county'].map(lambda x: x.replace(' County',''))
# Calculate the diversity index

# Every column except 'county' is a group count at this point; 'total' is their row sum
race_cols = [col for col in list(df_pop.columns) if col != 'county']
df_pop['total'] = df_pop[race_cols].sum(axis = 1)
df_pop['diversity'] = df_pop.apply(lambda row: diversity_index(row), axis=1)
# Display the counties from most to least diverse
df_pop.sort_values(by = 'diversity', ascending = False)

Unnamed: 0,county,white,black,hispanic/latino,asian/pacific,multi-races,indian_alaska,total,diversity
78,Fort Bend,213749,124700,140387,101704,8541,1172,590253,0.737758
56,Dallas,784966,521976,910576,121233,26778,7464,2372993,0.692201
100,Harris,1356315,760229,1681495,257115,44123,8389,4107666,0.685112
236,Waller,19199,11064,12442,238,439,137,43519,0.658876
122,Jefferson,112412,84714,43228,8762,2590,749,252455,0.658493
...,...,...,...,...,...,...,...,...,...
252,Zapata,865,11,13151,31,12,17,14087,0.124704
253,Zavala,643,38,11019,6,14,9,11729,0.114395
161,Maverick,1585,79,52076,138,40,514,54432,0.083749
239,Webb,8548,497,240639,1363,228,95,251370,0.082367


In [None]:
# Attach the race counts, the total and the diversity index to the county rate table.
# Columns left over from a previous run of this cell are dropped first, so running the
# cell again does not produce duplicated *_x / *_y columns.
pop_cols = [col for col in df_pop.columns if col != 'county']
rate_df = rate_df.drop(columns = pop_cols, errors = 'ignore').merge(df_pop, on = 'county')
rate_df

Unnamed: 0,county,total_stops,total_cites,citation_rate(stops),warning_rate(stops),population,stop_rate(pop),citation_rate(pop),Latitude,Longitude,...,speeding_stop_rate(pop),speeding_citation_rate(pop),white,black,hispanic/latino,asian/pacific,multi-races,indian_alaska,total,diversity
0,Hall,1693,1039,0.613703,0.386297,3359,0.5040,0.3093,34.453189,-100.576343,...,0.3102,0.1822,1997,240,1092,2,16,12,3359,0.535874
1,Sutton,7792,4441,0.569944,0.430056,4062,1.9183,1.0933,30.517865,-100.505395,...,0.8338,0.4542,1603,7,2437,6,7,2,4062,0.484435
2,Lipscomb,241,136,0.564315,0.435685,3283,0.0734,0.0414,36.280200,-100.272683,...,0.0305,0.0137,2204,7,996,10,42,24,3283,0.457175
3,Cameron,44213,24884,0.562821,0.437179,407630,0.1085,0.0610,26.102923,-97.478958,...,0.0377,0.0189,43564,1205,359255,2625,590,391,407630,0.211790
4,Caldwell,5888,3245,0.551121,0.448879,38127,0.1544,0.0851,29.840422,-97.631097,...,0.0922,0.0512,16886,2463,17997,359,331,91,38127,0.576712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,Hansford,1010,161,0.159406,0.840594,5599,0.1804,0.0288,36.272847,-101.356930,...,0.1420,0.0125,3075,31,2434,16,32,11,5599,0.509407
250,Jeff Davis,3471,529,0.152406,0.847594,2344,1.4808,0.2257,30.617087,-104.187860,...,0.9138,0.0930,1488,10,792,8,37,9,2344,0.482760
251,Stonewall,2195,324,0.147608,0.852392,1496,1.4672,0.2166,33.179580,-100.253807,...,0.9211,0.1029,1211,43,208,14,16,4,1496,0.324573
252,Castro,3941,577,0.146410,0.853590,8126,0.4850,0.0710,34.533621,-102.258786,...,0.2535,0.0142,3014,152,4884,34,13,29,8126,0.500865


In [None]:
# Save the county rate table (now including the race counts and diversity index)
# as an Excel workbook.
# The folder and the file name are joined so the workbook is written under `path`
# (the Drive summary-stat folder) rather than the notebook's current working directory.
file_name = 'rate_geo_0314.xlsx'
path = "/gdrive/MyDrive/traffic_stop/TX-county/summarystat/"
save_path = os.path.join(path, file_name)
rate_df.to_excel(save_path, index = False)