In [1]:
import numpy as np
import pandas as pd


    What code sections are most frequent?
    Which ones most often lead to convictions?
    Which ones have the most severe racial disparities?
    In what localities (fips) are these disparities most severe?


In [2]:
cases = pd.read_csv('data100k.csv')
cases.head(3).T

Unnamed: 0,0,1,2
person_id,102090000000110,343221000000125,343221000000125
HearingDate,2019-02-28,2009-12-07,2011-01-20
CodeSection,A.46.2-862,B.46.2-301,A.46.2-707
codesection,covered elsewhere,covered elsewhere,covered elsewhere
ChargeType,Misdemeanor,Misdemeanor,Misdemeanor
chargetype,Misdemeanor,Misdemeanor,Misdemeanor
Class,1,1,3
DispositionCode,Guilty,Guilty,Guilty
disposition,Conviction,Conviction,Conviction
Plea,,,


In [3]:
cases['CodeSection'].value_counts()

CodeSection
A.46.2-862         26379
B.46.2-301         25967
46.2-300           17934
C.46.2-862         11728
18.2-250.1         10573
                   ...  
54.1-3805              1
27-97 9-22 901.        1
9999999999999          1
Z.18.2-71              1
4VAC20-610-30B         1
Name: count, Length: 4207, dtype: int64

In [4]:
cases['DispositionCode'].value_counts()

DispositionCode
Guilty                     156563
Nolle Prosequi              54680
Dismissed                   42520
Guilty In Absentia          31958
Not Guilty                   5807
Not Guilty/Acquitted         1623
Not True Bill                 250
No Indictment Presented       178
Dismissed/Other                19
Name: count, dtype: int64

In [5]:
cases['conviction'] = [x in ['Guilty', 'Guilty In Absentia'] for x in cases['DispositionCode']]

In [6]:
cases.head(3).T

Unnamed: 0,0,1,2
person_id,102090000000110,343221000000125,343221000000125
HearingDate,2019-02-28,2009-12-07,2011-01-20
CodeSection,A.46.2-862,B.46.2-301,A.46.2-707
codesection,covered elsewhere,covered elsewhere,covered elsewhere
ChargeType,Misdemeanor,Misdemeanor,Misdemeanor
chargetype,Misdemeanor,Misdemeanor,Misdemeanor
Class,1,1,3
DispositionCode,Guilty,Guilty,Guilty
disposition,Conviction,Conviction,Conviction
Plea,,,


In [7]:
#Highest conviction rates by code section

convict_rate = cases.groupby('CodeSection').agg({'conviction': ['count', 'mean']})
convict_rate = convict_rate.reset_index()
convict_rate.columns = ['CodeSection', 'count', 'mean']
convict_rate = convict_rate.query("count > 50")
convict_rate = convict_rate.sort_values('mean', ascending=False)
convict_rate

Unnamed: 0,CodeSection,count,mean
1806,23-55,55,0.981818
1755,23-22.1(A),131,0.954198
2103,29-17(C),70,0.942857
3961,A.46.2-862,26379,0.929414
3992,B.18.2-266,1879,0.905269
...,...,...,...
1436,19.2-123,125,0.096000
1102,18.2-374.1:1(A),112,0.089286
1450,19.2-135,83,0.084337
1433,19.2-100,238,0.000000


In [8]:
#Which ones have the most severe racial disparities?
cases['Race'].unique()

array(['Black(Non-Hispanic)', 'Hispanic', 'White Caucasian(Non-Hispanic)',
       'MISSING', 'Asian Or Pacific Islander', 'Black (Non-Hispanic)',
       'White Caucasian (Non-Hispanic)',
       'Other(Includes Not Applicable.. Unknown)',
       'Other (Includes Not Applicable.. Unknown)', 'Black', 'White',
       'Unknown (Includes Not Applicable.. Unknown)', 'American Indian',
       'Unknown', 'Asian or Pacific Islander',
       'American Indian Or Alaskan Native'], dtype=object)

In [9]:
replace_map = {'Black(Non-Hispanic)':'Black (Non-Hispanic)', 
               'Hispanic':'Hispanic', 
               'White Caucasian(Non-Hispanic)':'White (Non-Hispanic)',
               'MISSING':'Other or Missing', 
               'Asian Or Pacific Islander':'Asian or Pacific Islander', 
               'Black (Non-Hispanic)':'Black (Non-Hispanic)',
               'White Caucasian (Non-Hispanic)':'White (Non-Hispanic)',
               'Other(Includes Not Applicable.. Unknown)':'Other or Missing',
               'Other (Includes Not Applicable.. Unknown)':'Other or Missing', 
               'Black':'Black (Non-Hispanic)', 
               'White':'White (Non-Hispanic)',
               'Unknown (Includes Not Applicable.. Unknown)':'Other or Missing', 
               'American Indian':'American Indian or Alaskan Native',
               'Unknown':'Other or Missing', 
               'Asian or Pacific Islander':'Asian or Pacific Islander',
               'American Indian Or Alaskan Native':'American Indian or Alaskan Native'}
cases['Race'] = cases['Race'].replace(replace_map)
cases['Race'].value_counts()

Race
White (Non-Hispanic)                 159627
Black (Non-Hispanic)                 115627
Hispanic                               9319
Other or Missing                       5928
Asian or Pacific Islander              2794
American Indian or Alaskan Native       303
Name: count, dtype: int64

In [10]:
convict_rate_race = cases.groupby('Race').agg({'conviction': ['count', 'mean']})
convict_rate_race

Unnamed: 0_level_0,conviction,conviction
Unnamed: 0_level_1,count,mean
Race,Unnamed: 1_level_2,Unnamed: 2_level_2
American Indian or Alaskan Native,303,0.785479
Asian or Pacific Islander,2794,0.662491
Black (Non-Hispanic),115627,0.632638
Hispanic,9319,0.830347
Other or Missing,5928,0.739879
White (Non-Hispanic),159627,0.633715


In [11]:
convict_rate_race_cs = cases.groupby(['CodeSection', 'Race']).agg({'conviction': ['count', 'mean']})
convict_rate_race_cs = convict_rate_race_cs.reset_index()
convict_rate_race_cs.columns = ['CodeSection', 'Race', 'count', 'convictrate']

In [12]:
convict_rate_race_cs = convict_rate_race_cs.query("count > 30")

In [13]:
convict_rate_race_cs = convict_rate_race_cs.drop('count', axis=1)
convict_rate_race_cs

Unnamed: 0,CodeSection,Race,convictrate
4,1-12,Black (Non-Hispanic),0.435484
75,10-42,White (Non-Hispanic),0.395349
76,10-43,Black (Non-Hispanic),0.170732
78,10-43,White (Non-Hispanic),0.353659
99,10-62,Black (Non-Hispanic),0.212121
...,...,...,...
6558,NO DMV,Black (Non-Hispanic),0.640000
6561,NO DMV,White (Non-Hispanic),0.608911
6620,Z.18.2-47,Black (Non-Hispanic),0.363636
6633,Z.18.2-91,Black (Non-Hispanic),0.725191


In [14]:
convict_rate_wide = pd.pivot_table(convict_rate_race_cs,
                                   index = 'CodeSection', 
                                   columns = 'Race', 
                                   values = 'convictrate')
convict_rate_wide

Race,American Indian or Alaskan Native,Asian or Pacific Islander,Black (Non-Hispanic),Hispanic,Other or Missing,White (Non-Hispanic)
CodeSection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-12,,,0.435484,,,
10-42,,,,,,0.395349
10-43,,,0.170732,,,0.353659
10-62,,,0.212121,,,0.228261
13-1-5,,,0.578125,,,0.658537
...,...,...,...,...,...,...
G.46.2-870,,,,,,0.593750
MISSING,,,0.589147,,,0.307692
NO DMV,,,0.640000,,,0.608911
Z.18.2-47,,,0.363636,,,


In [15]:
convict_rate_wide['black_white_diff'] = convict_rate_wide['Black (Non-Hispanic)'] - convict_rate_wide['White (Non-Hispanic)']
convict_rate_wide.sort_values('black_white_diff', ascending=False)

Race,American Indian or Alaskan Native,Asian or Pacific Islander,Black (Non-Hispanic),Hispanic,Other or Missing,White (Non-Hispanic),black_white_diff
CodeSection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MISSING,,,0.589147,,,0.307692,0.281455
23-10,,,0.448276,,,0.213592,0.234684
46.2-752,,,0.690647,,,0.492813,0.197834
19.2-128(B),,,0.660714,,,0.482143,0.178571
14.2-81,,,0.676190,,,0.500000,0.176190
...,...,...,...,...,...,...,...
D.18.2-266,,,,,,0.763889,
D.46.2-894,,,,,,0.678571,
G.18.2-266,,,,,,0.909091,
G.46.2-870,,,,,,0.593750,


In [24]:
# Pull census data from Weldon Cooper for Public Service 

race_url = 'https://virginia.box.com/shared/static/i8i5onrkveks849pkky0gwgxlax8d8fe.xlsx'
hisp_url = 'https://virginia.box.com/shared/static/fegrn0p0igzl95snji3ku6edwu0hy3dj.xlsx'

race_pop = pd.read_excel(race_url, skiprows=[0,1,2,3,5,6,7])
race_pop = race_pop[['FIPS', 'Jurisdiction', 'Total Population', 'American Indian', 'Asian ', 'Black',
       'Pacific Islander', 'Two or more races','White ']]
race_pop

  warn(f"Print area cannot be set to Defined name: {defn.value}.")


Unnamed: 0,FIPS,Jurisdiction,Total Population,American Indian,Asian,Black,Pacific Islander,Two or more races,White
0,1,Accomack County,33246,441,370,9859,79,609,23125
1,3,Albemarle County,113535,1069,7925,12581,242,3210,95210
2,5,Alleghany County,14986,126,92,906,17,283,14136
3,7,Amelia County,13268,177,138,2759,15,259,10445
4,9,Amherst County,31273,495,339,6475,55,828,24796
...,...,...,...,...,...,...,...,...,...
128,800,Suffolk city,96194,1052,3115,43068,232,2975,51977
129,810,Virginia Beach city,457672,6241,44491,104827,1846,20837,324018
130,820,Waynesboro city,22550,309,522,3665,35,792,18840
131,830,Williamsburg city,15590,202,1164,2754,37,591,12055


In [30]:
hisp_pop = pd.read_excel(hisp_url, skiprows=[0,1,2,3,5,6,7,8,9])
hisp_pop = hisp_pop[['FIPS', 'Unnamed: 6']]
hisp_pop = hisp_pop.rename(columns={'Unnamed: 6': 'Hispanic'})
hisp_pop.head(10)

  warn(f"Print area cannot be set to Defined name: {defn.value}.")


Unnamed: 0,FIPS,Hispanic
0,1,3170
1,3,6750
2,5,265
3,7,507
4,9,849
5,11,360
6,13,36284
7,15,2849
8,17,101
9,19,2179


In [31]:
# The big difference between pandas merging and SQL joins is: SQL is usually cleaner because of Codd's nomral form rules. Pandas? Anything goes.

# Two problems:
# 1. You might expect a one-to-one merge, but the data are messy and it's many-to-one or many-to-many
# 2. You expect row to match, and they don'try

# Merge 1: checking for problems
pop = pd.merge(race_pop, hisp_pop, 
               on='FIPS', 
               how='outer', 
               validate='one_to_one',
               indicator='matched')

In [32]:
pop['matched'].value_counts()

matched
both          133
left_only       0
right_only      0
Name: count, dtype: int64

In [44]:
#Merge 2: if there aren't problems, run a simpler merge
pop = pd.merge(race_pop, hisp_pop, on='FIPS', how='inner')
pop = pop.rename({'FIPS': 'fips', 
                  'Jurisdiction': 'jurisdiction', 
                  'Total Population': 'total_pop', 
                  'American Indian': 'amerind_pop', 
                  'Asian ': 'asian_pop',
                  'Black': 'black_pop', 
                  'Pacific Islander': 'pacific_pop', 
                  'Two or more races': 'twomore_pop', 
                  'White ': 'white_pop', 
                  'Hispanic': 'hisp_pop'}, axis=1)

In [36]:
convictions = cases.query("conviction == True")

In [42]:
convictions_race = convictions.groupby(['CodeSection', 'fips', 'Race']).size().reset_index()
convictions_race = convictions_race.rename(columns={0: 'count'})
convictions_race

Unnamed: 0,CodeSection,fips,Race,count
0,01-2007,51,White (Non-Hispanic),1
1,1,550,Black (Non-Hispanic),3
2,1,550,White (Non-Hispanic),1
3,1-12,650,Black (Non-Hispanic),27
4,1-12,650,White (Non-Hispanic),6
...,...,...,...,...
27500,Z.18.2-91,840,White (Non-Hispanic),2
27501,Z.18.2-91; 26,700,Black (Non-Hispanic),1
27502,Z.18.2-95,67,Black (Non-Hispanic),1
27503,Z.18.2-95,83,Black (Non-Hispanic),1


In [48]:
convictions_race_wide = pd.pivot_table(convictions_race, 
                                       index=['CodeSection', 'fips'],
                                       values='count',
                                       columns='Race', fill_value=0).reset_index()
convictions_race_wide

Race,CodeSection,fips,American Indian or Alaskan Native,Asian or Pacific Islander,Black (Non-Hispanic),Hispanic,Other or Missing,White (Non-Hispanic)
0,01-2007,51,0.0,0.0,0.0,0.0,0.0,1.0
1,1,550,0.0,0.0,3.0,0.0,0.0,1.0
2,1-12,650,0.0,0.0,27.0,0.0,0.0,6.0
3,1-200,29,0.0,0.0,1.0,0.0,0.0,0.0
4,1-200,105,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
18700,Z.18.2-91,840,0.0,0.0,0.0,0.0,0.0,2.0
18701,Z.18.2-91; 26,700,0.0,0.0,1.0,0.0,0.0,0.0
18702,Z.18.2-95,67,0.0,0.0,1.0,0.0,0.0,0.0
18703,Z.18.2-95,83,0.0,0.0,1.0,0.0,0.0,0.0


In [49]:
convict_census = pd.merge(convictions_race_wide, pop, 
                          on='fips', 
                          how='outer',
                          indicator='matched',
                          validate='many_to_one')

In [50]:
convict_census['matched'].value_counts()

matched
both          17691
left_only      1014
right_only        8
Name: count, dtype: int64

In [51]:
convict_census.query("matched == 'right_only'")

Unnamed: 0,CodeSection,fips,American Indian or Alaskan Native,Asian or Pacific Islander,Black (Non-Hispanic),Hispanic,Other or Missing,White (Non-Hispanic),jurisdiction,total_pop,amerind_pop,asian_pop,black_pop,pacific_pop,twomore_pop,white_pop,hisp_pop,matched
5918,,95,,,,,,,James City County,79882.0,903.0,3289.0,12333.0,267.0,2425.0,65745.0,5290.0,right_only
13277,,580,,,,,,,Covington city,5717.0,61.0,82.0,874.0,6.0,156.0,4851.0,153.0,right_only
14231,,660,,,,,,,Harrisonburg city,51430.0,830.0,2987.0,5846.0,212.0,1796.0,43415.0,10790.0,right_only
14365,,678,,,,,,,Lexington city,7456.0,76.0,296.0,723.0,18.0,206.0,6556.0,294.0,right_only
14575,,683,,,,,,,Manassas city,42708.0,1034.0,3338.0,7686.0,204.0,1660.0,32201.0,16796.0,right_only
14576,,685,,,,,,,Manassas Park city,17002.0,488.0,2235.0,2990.0,78.0,651.0,11882.0,7311.0,right_only
15786,,720,,,,,,,Norton city,3666.0,35.0,88.0,289.0,4.0,90.0,3341.0,144.0,right_only
15962,,735,,,,,,,Poquoson city,12574.0,148.0,428.0,335.0,19.0,306.0,11953.0,461.0,right_only


In [52]:
convict_census.query("matched == 'left_only'")['fips'].drop_duplicates()

7416     122
13122    560
14922    701
15074    702
15486    711
15703    712
16532    761
16686    762
16834    764
Name: fips, dtype: int64

In [54]:
convict_census['total_convictions'] = convict_census['American Indian or Alaskan Native'] + convict_census['Asian or Pacific Islander'] + convict_census['Black (Non-Hispanic)'] + convict_census['White (Non-Hispanic)']

In [None]:
convict_census = convict_census.query("total_convictions > 50")


Unnamed: 0,CodeSection,fips,American Indian or Alaskan Native,Asian or Pacific Islander,Black (Non-Hispanic),Hispanic,Other or Missing,White (Non-Hispanic),jurisdiction,total_pop,amerind_pop,asian_pop,black_pop,pacific_pop,twomore_pop,white_pop,hisp_pop,matched,total_convictions
85,46.2-300,1,1.0,0.0,28.0,92.0,4.0,32.0,Accomack County,33246.0,441.0,370.0,9859.0,79.0,609.0,23125.0,3170.0,both,61.0
136,C.46.2-862,1,0.0,1.0,39.0,3.0,2.0,52.0,Accomack County,33246.0,441.0,370.0,9859.0,79.0,609.0,23125.0,3170.0,both,92.0
247,46.2-300,3,0.0,0.0,9.0,55.0,0.0,61.0,Albemarle County,113535.0,1069.0,7925.0,12581.0,242.0,3210.0,95210.0,6750.0,both,70.0
281,A.18.2-266,3,0.0,0.0,9.0,3.0,0.0,44.0,Albemarle County,113535.0,1069.0,7925.0,12581.0,242.0,3210.0,95210.0,6750.0,both,53.0
292,A.46.2-862,3,0.0,4.0,26.0,8.0,4.0,78.0,Albemarle County,113535.0,1069.0,7925.0,12581.0,242.0,3210.0,95210.0,6750.0,both,108.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18596,18.2-250.1,840,0.0,0.0,10.0,1.0,1.0,49.0,Winchester city,28136.0,479.0,928.0,3908.0,62.0,1000.0,23797.0,5322.0,both,59.0
18613,18.2-388,840,0.0,1.0,19.0,3.0,3.0,177.0,Winchester city,28136.0,479.0,928.0,3908.0,62.0,1000.0,23797.0,5322.0,both,197.0
18654,46.2-300,840,0.0,0.0,14.0,90.0,1.0,45.0,Winchester city,28136.0,479.0,928.0,3908.0,62.0,1000.0,23797.0,5322.0,both,59.0
18681,A.18.2-266,840,1.0,1.0,8.0,2.0,1.0,51.0,Winchester city,28136.0,479.0,928.0,3908.0,62.0,1000.0,23797.0,5322.0,both,61.0


In [64]:
convict_census['black_overrep_index'] = (convict_census['Black (Non-Hispanic)']/convict_census['total_convictions']) / (convict_census['black_pop']/convict_census['total_pop'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  convict_census['black_overrep_index'] = (convict_census['Black (Non-Hispanic)']/convict_census['total_convictions']) / (convict_census['black_pop']/convict_census['total_pop'])


In [61]:
convict_census

Unnamed: 0,CodeSection,fips,American Indian or Alaskan Native,Asian or Pacific Islander,Black (Non-Hispanic),Hispanic,Other or Missing,White (Non-Hispanic),jurisdiction,total_pop,amerind_pop,asian_pop,black_pop,pacific_pop,twomore_pop,white_pop,hisp_pop,matched,total_convictions,black_overrep_index
85,46.2-300,1,1.0,0.0,28.0,92.0,4.0,32.0,Accomack County,33246.0,441.0,370.0,9859.0,79.0,609.0,23125.0,3170.0,both,61.0,1.547871
136,C.46.2-862,1,0.0,1.0,39.0,3.0,2.0,52.0,Accomack County,33246.0,441.0,370.0,9859.0,79.0,609.0,23125.0,3170.0,both,92.0,1.429497
247,46.2-300,3,0.0,0.0,9.0,55.0,0.0,61.0,Albemarle County,113535.0,1069.0,7925.0,12581.0,242.0,3210.0,95210.0,6750.0,both,70.0,1.160270
281,A.18.2-266,3,0.0,0.0,9.0,3.0,0.0,44.0,Albemarle County,113535.0,1069.0,7925.0,12581.0,242.0,3210.0,95210.0,6750.0,both,53.0,1.532432
292,A.46.2-862,3,0.0,4.0,26.0,8.0,4.0,78.0,Albemarle County,113535.0,1069.0,7925.0,12581.0,242.0,3210.0,95210.0,6750.0,both,108.0,2.172522
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18596,18.2-250.1,840,0.0,0.0,10.0,1.0,1.0,49.0,Winchester city,28136.0,479.0,928.0,3908.0,62.0,1000.0,23797.0,5322.0,both,59.0,1.220270
18613,18.2-388,840,0.0,1.0,19.0,3.0,3.0,177.0,Winchester city,28136.0,479.0,928.0,3908.0,62.0,1000.0,23797.0,5322.0,both,197.0,0.694377
18654,46.2-300,840,0.0,0.0,14.0,90.0,1.0,45.0,Winchester city,28136.0,479.0,928.0,3908.0,62.0,1000.0,23797.0,5322.0,both,59.0,1.708377
18681,A.18.2-266,840,1.0,1.0,8.0,2.0,1.0,51.0,Winchester city,28136.0,479.0,928.0,3908.0,62.0,1000.0,23797.0,5322.0,both,61.0,0.944209


In [63]:
convict_census.sort_values('black_overrep_index', ascending=False).head(15)

Unnamed: 0,CodeSection,fips,American Indian or Alaskan Native,Asian or Pacific Islander,Black (Non-Hispanic),Hispanic,Other or Missing,White (Non-Hispanic),jurisdiction,total_pop,amerind_pop,asian_pop,black_pop,pacific_pop,twomore_pop,white_pop,hisp_pop,matched,total_convictions,black_overrep_index
2249,46.2-300,35,1.0,2.0,35.0,54.0,20.0,59.0,Carroll County,29048.0,275.0,141.0,530.0,21.0,436.0,28529.0,1250.0,both,97.0,19.775919
2293,A.46.2-862,35,0.0,0.0,28.0,6.0,21.0,83.0,Carroll County,29048.0,275.0,141.0,530.0,21.0,436.0,28529.0,1250.0,both,111.0,13.825327
10492,46.2-300,173,0.0,0.0,27.0,33.0,3.0,36.0,Smyth County,29477.0,187.0,227.0,969.0,24.0,369.0,28451.0,693.0,both,63.0,13.037152
2302,C.46.2-862,35,1.0,9.0,95.0,27.0,81.0,306.0,Carroll County,29048.0,275.0,141.0,530.0,21.0,436.0,28529.0,1250.0,both,411.0,12.668411
11721,A.46.2-862,191,1.0,5.0,28.0,21.0,12.0,116.0,Washington County,53635.0,315.0,431.0,1110.0,38.0,547.0,52296.0,893.0,both,150.0,9.0197
10189,46.2-300,169,0.0,0.0,9.0,10.0,0.0,65.0,Scott County,21419.0,151.0,99.0,301.0,24.0,223.0,21072.0,342.0,both,74.0,8.65453
12127,46.2-300,197,0.0,0.0,38.0,28.0,1.0,77.0,Wythe County,28178.0,194.0,251.0,1101.0,20.0,452.0,27072.0,413.0,both,115.0,8.45685
9742,18.2-250,165,0.0,0.0,16.0,1.0,2.0,35.0,Rockingham County,84394.0,930.0,1250.0,3186.0,94.0,1440.0,80404.0,6904.0,both,51.0,8.310279
5214,46.2-1172,85,0.0,0.0,81.0,1.0,0.0,13.0,Hanover County,111603.0,1106.0,3178.0,11853.0,235.0,2337.0,97666.0,3913.0,both,94.0,8.113435
11738,C.46.2-862,191,0.0,0.0,10.0,2.0,3.0,54.0,Washington County,53635.0,315.0,431.0,1110.0,38.0,547.0,52296.0,893.0,both,64.0,7.549972
