In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

In [None]:
from google.colab import drive
import os
drive.mount('/gdrive/')

Mounted at /gdrive/


### Diversity Index

Note that when calculating the diversity index, we have groups: 'hispanic or Latino', and each subgroup under 'Not hispanic or latino' is treated as an individual group.

Simpson's Diversity Index:

$D = 1-\frac{\sum n(n-1)}{N(N-1)}$

The definition of Simpson's Diversity Index and Shannon's Index are very similar to the definitions of Gini Index and Entropy in Machine Learning(often used to measure purity/impority of a node in tree algorithms), respectively.

Change the diversity_index2 function below of Shannon index to math.log(x+0.0000000001, 2) to change the base from e to 2 to get entropy.

In [23]:
def diversity_index(row):
  deno = row['total']*(row['total']-1)
  race_cols = [col for col in list(df_pop.columns) if col != 'county' and col != 'total']
  row = row[race_cols]
  numerator = np.sum(row.values*(row.values-1))
  #numerator = row['white']+row['black']+row['hispanic/latino']+row['asian/pacific']
  return 1-numerator/deno

import math
def diversity_index2(row):
  race_cols = [col for col in list(df_pop.columns) if col != 'county' and col != 'total']
  row = row[race_cols]/row['total']
  ln_values = np.array([math.log(x+0.000000001) for x in row.values])
  return -np.sum(row.values*ln_values)

### For one year

In [None]:
#df2: for population excel:
counties = []
white = []
black = []
hispanic = []
asian_pacific = []
two_races = []
indian_alaska = []

# reformat pop files
pop2 = pd.read_excel('/content/pop_IShispanic_10.xlsx')
new_header = pop2.iloc[0] #grab the first row for the header
pop2.columns = new_header #set the header row as the df header
pop2 = pop2.iloc[3:12,:]

# reformat county names that are not null
orig_lst = [x for x in list(pop2.columns) if pd.isnull(x) == False]
county_lst = [x.replace(', Texas','') for x in list(pop2.columns) if pd.isnull(x) == False]
pop2 = pop2.rename(columns = dict(zip(orig_lst,county_lst)))
county_lst = county_lst[1:-1]
pop2['Statistics'] = pop2['Statistics'].replace({'Hispanic or Latino:':'hispanic','White Alone':'white','Black Alone':'black',
                                                     'Asian Alone':'asian/pacific islander',
                                                     'Native Hawaiian and Other Pacific Islander Alone':'asian/pacific islander'})
for county in county_lst:
  counties.append(county)
  white.append(pop2.loc[pop2['Statistics'] == 'white', county].iloc[0])
  black.append(pop2.loc[pop2['Statistics'] == 'black', county].iloc[0])
  hispanic.append(pop2.loc[pop2['Statistics'] == 'hispanic', county].iloc[0])
  two_races.append(pop2.loc[pop2['Statistics'] == 'Two or More Races', county].iloc[0])
  indian_alaska.append(pop2.loc[pop2['Statistics'] == 'American Indian and Alaska Native Alone', county].iloc[0])
  asian_pacific.append(pop2.loc[pop2['Statistics'] == 'asian/pacific islander', county].iloc[0]+pop2.loc[pop2['Statistics'] == 'asian/pacific islander', county].iloc[1])

df_pop = pd.DataFrame({'county':counties,'white':white,'black':black,'hispanic/latino':hispanic,'asian/pacific':asian_pacific,'multi-races':two_races,'indian_alaska':indian_alaska})
df_pop['county'] = df_pop['county'].map(lambda x: x.replace(' County',''))
#calculate diversity index

#df_pop['diversity'] =df_pop.apply(lambda row: )
race_cols = [col for col in list(df_pop.columns) if col != 'county']
df_pop['total'] = df_pop[race_cols].sum(axis = 1)
df_pop['diversity'] = df_pop.apply(lambda row: diversity_index(row), axis=1)
df_pop['diversity_shannon'] = df_pop.apply(lambda row: diversity_index2(row), axis=1)
df_pop.sort_values(by = 'diversity', ascending = False)

Unnamed: 0,county,white,black,hispanic/latino,asian/pacific,multi-races,indian_alaska,total,diversity,diversity_shannon
78,Fort Bend,213749,124700,140387,101704,8541,1172,590253,0.737758,1.414511
56,Dallas,784966,521976,910576,121233,26778,7464,2372993,0.692201,1.287255
100,Harris,1356315,760229,1681495,257115,44123,8389,4107666,0.685112,1.278528
236,Waller,19199,11064,12442,238,439,137,43519,0.658876,1.160330
122,Jefferson,112412,84714,43228,8762,2590,749,252455,0.658493,1.209779
...,...,...,...,...,...,...,...,...,...,...
252,Zapata,865,11,13151,31,12,17,14087,0.124704,0.268807
253,Zavala,643,38,11019,6,14,9,11729,0.114395,0.253945
161,Maverick,1585,79,52076,138,40,514,54432,0.083749,0.219298
239,Webb,8548,497,240639,1363,228,95,251370,0.082367,0.206683


In [None]:
# new 230811 code
region_df = pd.read_excel('/gdrive/MyDrive/traffic_stop/countyinfo.xlsx')

# transform column
df_pop = df_pop.join(region_df.set_index('county'), on='county')
df_pop.rename(columns={'region':'county_region'}, inplace=True)

In [None]:
# new 230811 code
file_name = 'rate_geo_0811.xlsx'
#path = "/gdrive/MyDrive/traffic_stop/"
path = '/content/'
save_path = file_name
df_pop.to_excel(save_path, index = False)

### For all years from 2010 to 2017

In [25]:
file_names = ['ACS2006_2010.xlsx','ACS2007_2011.xlsx','ACS2008_2012.xlsx','ACS2009_2013.xlsx',
              'ACS2010_2014.xlsx','ACS2011_2015.xlsx','ACS2012_2016.xlsx','ACS2013_2017.xlsx']

all_year_df = pd.DataFrame()

for file_name in file_names:
  counties = []
  white = []
  black = []
  hispanic = []
  asian_pacific = []
  two_races = []
  indian_alaska = []

  # reformat pop files
  #pop2 = pd.read_excel('/content/pop_IShispanic_10.xlsx')
  pop2 = pd.read_excel('/content/'+file_name)
  new_header = pop2.iloc[0] #grab the first row for the header
  pop2.columns = new_header #set the header row as the df header
  pop2 = pop2.iloc[3:13,:]

  # reformat county names that are not null
  orig_lst = [x for x in list(pop2.columns) if pd.isnull(x) == False]
  county_lst = [x.replace(', Texas','') for x in list(pop2.columns) if pd.isnull(x) == False]
  pop2 = pop2.rename(columns = dict(zip(orig_lst,county_lst)))
  county_lst = county_lst[1:-1]
  pop2['Statistics'] = pop2['Statistics'].replace({'Hispanic or Latino:':'hispanic','White Alone':'white','Black or African American Alone':'black',
                                                      'Asian Alone':'asian/pacific islander',
                                                      'Native Hawaiian and Other Pacific Islander Alone':'asian/pacific islander'})
  for county in county_lst:
    counties.append(county)
    white.append(pop2.loc[pop2['Statistics'] == 'white', county].iloc[0])
    black.append(pop2.loc[pop2['Statistics'] == 'black', county].iloc[0])
    hispanic.append(pop2.loc[pop2['Statistics'] == 'hispanic', county].iloc[0])
    two_races.append(pop2.loc[pop2['Statistics'] == 'Two or More Races', county].iloc[0])
    indian_alaska.append(pop2.loc[pop2['Statistics'] == 'American Indian and Alaska Native Alone', county].iloc[0])
    asian_pacific.append(pop2.loc[pop2['Statistics'] == 'asian/pacific islander', county].iloc[0]+pop2.loc[pop2['Statistics'] == 'asian/pacific islander', county].iloc[1])

  df_pop = pd.DataFrame({'county':counties,'white':white,'black':black,'hispanic/latino':hispanic,'asian/pacific':asian_pacific,'multi-races':two_races,'indian_alaska':indian_alaska})
  df_pop['county'] = df_pop['county'].map(lambda x: x.replace(' County',''))
  #calculate diversity index

  #df_pop['diversity'] =df_pop.apply(lambda row: )
  race_cols = [col for col in list(df_pop.columns) if col != 'county']
  df_pop['total'] = df_pop[race_cols].sum(axis = 1)
  df_pop['diversity'] = df_pop.apply(lambda row: diversity_index(row), axis=1)
  df_pop['diversity_shannon'] = df_pop.apply(lambda row: diversity_index2(row), axis=1)
  df_pop['year'] = file_name[8:12]
  df_pop.sort_values(by = 'diversity', ascending = False)
  all_year_df = pd.concat([all_year_df, df_pop], ignore_index=True)

all_year_df.to_csv('tx_county_race_pop.csv')

### Aug 11 2023

In [None]:
import pandas as pd
import numpy as np

In [None]:
data = pd.read_excel('/content/Traffic Stop Data Summary.xlsx')
data.head()

Unnamed: 0,State,Region,Stops,Time Start,Time End,Stop Location,Race,Sex,Violation,Search,Contraband,Citation,Citation Valid,Pedestrian Included
0,AR,Little Rock,13641,2016-12-01,2017-11-01,False,True,True,False,False,False,True,,False
1,AZ,Gilbert,480599,2017-12-01,2018-05-01,False,False,False,False,False,False,False,,True
2,AZ,Mesa,157015,2013-12-01,2019-09-01,False,True,True,True,False,False,True,,True
3,AZ,State Patrol,3494153,2009-01-01,2017-12-01,True,True,True,False,True,True,True,,True
4,CA,Anaheim,87876,2011-12-01,2017-03-01,False,False,False,False,False,False,False,,False


In [None]:
var_cols = ['Stop Location', 'Race', 'Sex', 'Violation', 'Citation']
valid_data = data.loc[data[var_cols].all(axis='columns')]
valid_data = valid_data.loc[~valid_data['Pedestrian Included']]
valid_data.drop('Citation Valid', axis = 1, inplace = True)
valid_data.to_csv('citation_data.csv')

In [None]:
var_cols = ['Stop Location', 'Race', 'Sex', 'Violation', 'Citation', 'Search', 'Contraband']
valid_data_strict = data.loc[data[var_cols].all(axis='columns')]
valid_data_strict = valid_data_strict.loc[~valid_data_strict['Pedestrian Included']]
valid_data_strict

Unnamed: 0,State,Region,Stops,Time Start,Time End,Stop Location,Race,Sex,Violation,Search,Contraband,Citation,Citation Valid,Pedestrian Included
20,CT,State Patrol,1175339,2013-09-01,2015-09-01,True,True,True,True,True,True,True,,False
28,IL,State Patrol,12748173,2011-12-01,2017-12-01,True,True,True,True,True,True,True,,False
36,MD,State Patrol,3587052,2006-12-01,2014-03-01,True,True,True,True,True,True,True,,False
69,TN,Nashville,3092351,2009-12-01,2019-03-01,True,True,True,True,True,True,True,,False
78,TX,State Patrol,27426840,2005-12-01,2017-12-01,True,True,True,True,True,True,True,,False
86,WI,State Patrol,1058816,2009-12-01,2016-05-01,True,True,True,True,True,True,True,,False


In [None]:
valid_data_strict.drop('Citation Valid', axis = 1, inplace = True)
valid_data_strict.to_csv('allTargets_data.csv')

### Gini Index

In [None]:
gini_df = pd.read_csv('gini_2012_5year.csv')
gini_df = gini_df[gini_df.columns.drop(list(gini_df.filter(regex='Margin')))]
gini_df.drop('Label (Grouping)', axis = 1, inplace = True)
gini_df = gini_df.melt(var_name="county", value_name="gini_index")
gini_df['county'] = gini_df['county'].map(lambda x: x[:-24])
gini_df

Unnamed: 0,county,gini_index
0,Anderson,0.4633
1,Andrews,0.5065
2,Angelina,0.4745
3,Aransas,0.4722
4,Archer,0.4212
...,...,...
249,Wood,0.4370
250,Yoakum,0.4320
251,Young,0.5074
252,Zapata,0.5047


In [None]:
region_df = pd.read_excel('/gdrive/MyDrive/traffic_stop/countyinfo.xlsx')

# transform column
gini_df = gini_df.join(region_df.set_index('county'), on='county')
gini_df.rename(columns={'region':'county_region'}, inplace=True)
gini_df

Unnamed: 0,county,gini_index,county_region
0,Anderson,0.4633,Upper East
1,Andrews,0.5065,West Texas
2,Angelina,0.4745,Southeast
3,Aransas,0.4722,South Texas
4,Archer,0.4212,Northwest
...,...,...,...
249,Wood,0.4370,Upper East
250,Yoakum,0.4320,High Plains
251,Young,0.5074,Northwest
252,Zapata,0.5047,South Texas


In [None]:
gini_df.to_csv('gini_texas_county.csv')

### Sanity check data for US

In [None]:
import pandas as pd
import numpy as np

In [None]:
data = pd.read_excel('/content/Traffic Stop Data Summary.xlsx')
data.head()

Unnamed: 0,State,Region,Stops,Time Start,Time End,Stop Location,Race,Sex,Violation,Search,Contraband,Citation,Citation Valid,Pedestrian Included
0,AR,Little Rock,13641,2016-12-01,2017-11-01,False,True,True,False,False,False,True,,False
1,AZ,Gilbert,480599,2017-12-01,2018-05-01,False,False,False,False,False,False,False,,True
2,AZ,Mesa,157015,2013-12-01,2019-09-01,False,True,True,True,False,False,True,,True
3,AZ,State Patrol,3494153,2009-01-01,2017-12-01,True,True,True,False,True,True,True,,True
4,CA,Anaheim,87876,2011-12-01,2017-03-01,False,False,False,False,False,False,False,,False


In [None]:
len(data.loc[data['Region'] == 'State Patrol'])

33