In [19]:
from datascience import *
import math
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np

# Question: Are certain types of crime less likely to end in arrest in SF compared to the US?

# Code below does the following:
# 1. Groups together the over thirty categories of crime in the original table under broad categories for ease of analysis.
# 2. Hypothesis testing:
#   a) Null: Arrest rates for given category of crime in SF match arrest rates nationwide for that given category - stated
#            another way, a criminal is as likely to be arrested in SF as in the US for a given crime.
#      Alternative: The chance of a criminal being arrested in SF is lower than nationwide.    
#   b) Simulate probability distribution of test statistic (proportion of given crime which ends in arrests) under null.
#   c) Because the crimes table we work with is just a random sample of the total population of crime reports, we bootstrap
#      from the random sample and construct a 95% confidence interval which captures the true proportion of arrests for a given 
#      crime in the total population. This is done because we have to consider that the population parameter could be
#      anywhere from the low end of the confidence interval to the high end (with 95% confidence). In order to definitively 
#      prove criminals have a lower chance of being arrested in SF, we should consider the true proportion of being at the 
#      high end of the confidence interval, the scenario most favorable to the null hypothesis, so that we know if that value
#      fails the p-cutoff, we can categorically reject the null for lower values within that confidence interval too. 
#   d) Use P-value testing to see how likely it is that the right endpoint of the confidence interval and values below occur 
#      under the null, with a P-value cutoff of 5. 
#   e) Observe types of crime for which we reject the null. We can then say with 95% confidence that those types of crime 
#      see lower arrest rates in SF for reasons other than random variation of nationwide rates. 

# Note: for ease of analysis, we isolate our question to categories of crime for which national arrest rates are readily
# available, and we'll be using 2016 arrest data from statista.com, which is closely comparable to 2013 arrest data 
# provided by the FBI. Both sources are as follows:
# https://www.statista.com/statistics/194213/crime-clearance-rate-by-type-in-the-us/
# https://ucr.fbi.gov/crime-in-the-u.s/2013/crime-in-the-u.s.-2013/offenses-known-to-law-enforcement/clearances/clearancetopic_final

# Random sample with which we'll be working, since the original is too large to easily process with over a million reports.  
clean = Table.read_table('SF_Crime_Heat_Map.csv').sample(10000)

In [2]:
# Function relabels the reported category of crime incidents as a new category 
def repurpose(old_category, new_category, clean):
    new_table = clean.where('Category', old_category).drop('Category').with_column('Category', new_category)
    clean = clean.where('Category', are.not_equal_to(old_category)).append(new_table)
    return clean

In [3]:
# Drops categories which were infractions, not strictly criminal, or difficult to categorize/unimportant to question (e.g. stolen property could either be a felony or misdemeanor)
clean = clean.where('Category', are.not_equal_to('MISSING PERSON')).where('Category', are.not_equal_to('SECONDARY CODES')).where('Category', are.not_equal_to('RUNAWAY')).where('Category', are.not_equal_to('NON-CRIMINAL')).where('Category', are.not_equal_to('OTHER OFFENSES')).where('Category', are.not_equal_to('WARRANTS')).where('Category', are.not_equal_to('SUICIDE')).where('Category', are.not_equal_to('STOLEN PROPERTY')).where('Category', are.not_equal_to('SUSPICIOUS OCC')).where('Category', are.not_equal_to('RECOVERED VEHICLE')).where('Category', are.not_equal_to('LOITERING')).where('Category', are.not_equal_to('KIDNAPPING'))

In [4]:
# Groups nonviolent misdemeanors under common label 'Nonviolent Misdemeanors' #
nv_misd = make_array('DRUNKENNESS', 'PROSTITUTION', 'DRUG/NARCOTIC', 'SEX OFFENSES, NON FORCIBLE', 'TRESPASS', 'VANDALISM')
nv_misd = np.append(nv_misd, make_array('LIQUOR LAWS', 'DISORDERLY CONDUCT', 'GAMBLING', 'BAD CHECKS', 'PORNOGRAPHY/OBSCENE MAT', 'TREA'))
for i in nv_misd:
    if i in clean.column('Category'):
        clean = repurpose(i, 'NONVIOLENT MISDEMEANORS', clean)

In [5]:
# Groups aggravated misdemeanors under common label 'Aggravated Misdemeanors' #
# We exclude larceny/theft, vehicle theft, and burglary, as we lack nationwide arrest data on those crimes specifically. 
agg_misd = make_array('FAMILY OFFENSES', 'DRIVING UNDER THE INFLUENCE', 'WEAPON LAWS')
for i in agg_misd:
    if i in clean.column('Category'):
        clean = repurpose(i, 'AGGRAVATED MISDEMEANORS', clean)

In [6]:
# Groups nonviolent felonies under common label 'Nonviolent Felonies' #
nv_felon = make_array('EXTORTION', 'BRIBERY', 'FORGERY/COUNTERFEITING', 'FRAUD', 'EMBEZZLEMENT')
for i in nv_felon:
    if i in clean.column('Category'):
        clean = repurpose(i, 'NONVIOLENT FELONIES', clean)

In [7]:
# Sample of what table currently looks like
clean.show(10)

Category,Resolution
LARCENY/THEFT,NONE
LARCENY/THEFT,NONE
LARCENY/THEFT,NONE
LARCENY/THEFT,"ARREST, BOOKED"
ASSAULT,"ARREST, BOOKED"
ASSAULT,"ARREST, CITED"
ASSAULT,"ARREST, BOOKED"
LARCENY/THEFT,NONE
LARCENY/THEFT,NONE
"SEX OFFENSES, FORCIBLE",NONE


In [8]:
# This table gives a snapshot of the composition of crime in SF. Per our note in the first cell, we won't be including categories
# for which nationwide arrest data isn't available, so crimes in nonviolent misdemeanors/felonies and aggravated misdemeanors
# will not be included in our analysis.
clean.group('Category').sort('count', descending=True).show()

Category,count
LARCENY/THEFT,2006
NONVIOLENT MISDEMEANORS,1375
ASSAULT,916
VEHICLE THEFT,589
BURGLARY,414
NONVIOLENT FELONIES,323
ROBBERY,274
AGGRAVATED MISDEMEANORS,122
"SEX OFFENSES, FORCIBLE",47
ARSON,16


In [9]:
# Dropping nonviolent misdemeanors/felonies, aggrvatated misdemeanors as mentioned above
clean = clean.where('Category', are.not_equal_to('NONVIOLENT MISDEMEANORS')).where('Category', are.not_equal_to('NONVIOLENT FELONIES')).where('Category', are.not_equal_to('AGGRAVATED MISDEMEANORS'))

In [10]:
# Current types of resolution to the crime
clean.group('Resolution').sort('count', descending=True)

Resolution,count
NONE,3426
"ARREST, BOOKED",561
"ARREST, CITED",119
UNFOUNDED,48
DISTRICT ATTORNEY REFUSES TO PROSECUTE,29
JUVENILE CITED,16
JUVENILE BOOKED,16
NOT PROSECUTED,14
COMPLAINANT REFUSES TO PROSECUTE,14
JUVENILE ADMONISHED,9


In [11]:
# For ease of data analysis, will organize resolutions into 'None' and 'Arrest', with cases where 
# accused is arrested or cited (handed to criminal in lieu of jail, with promise to appear in court for arraignment or fine)
# funneled into 'Arrest'. All other types of resolutions will be dropped. 

def repurpose_resolution(old_category, new_category, clean):
    new_table = clean.where('Resolution', old_category).drop('Resolution').with_column('Resolution', new_category)
    clean = clean.where('Resolution', are.not_equal_to(old_category)).append(new_table)
    return clean

arrest = make_array('ARREST, BOOKED', 'ARREST, CITED', 'JUVENILE BOOKED', 'JUVENILE CITED', 'PROSECUTED BY OUTSIDE AGENCY')
arrest = np.append(arrest, make_array('JUVENILE ADMONISHED', 'PROSECUTED FOR LESSER OFFENSE', 'JUVENILE DIVERTED'))
for i in arrest:
    if i in clean.column('Resolution'):
        clean = repurpose_resolution(i, 'ARREST', clean)

In [12]:
# Dropping other resolutions, per above
clean = clean.where('Resolution', are.not_equal_to('UNFOUNDED')).where('Resolution', are.not_equal_to('NOT PROSECUTED')).where('Resolution', are.not_equal_to('DISTRICT ATTORNEY REFUSES TO PROSECUTE')).where('Resolution', are.not_equal_to('COMPLAINANT REFUSES TO PROSECUTE')).where('Resolution', are.not_equal_to('EXCEPTIONAL CLEARANCE')).where('Resolution', are.not_equal_to('PSYCHOPATHIC CASE')).where('Resolution', are.not_equal_to('CLEARED-CONTACT JUVENILE FOR MORE INFO'))

In [13]:
# Cleaning complete. Hypothesis testing next.
clean.group('Category').sort('count', descending=True)

Category,count
LARCENY/THEFT,1976
ASSAULT,881
VEHICLE THEFT,564
BURGLARY,411
ROBBERY,267
"SEX OFFENSES, FORCIBLE",35
ARSON,15


In [14]:
# We take the example of assault and walk it through the process. Nationwide, assault crimes result in arrests 53.3% of the
# time, and so we simulate under the null the proportion of assaults that end in arrests. Then we construct a confidence 
# interval to capture the true proportion of assault crimes that end in arrest, and see how likely it is that the upper end 
# of the confidence interval and values below occur under the null hypothesis.
# Our test statistic is the proportion of a given category of crime that ends in arrest.

def under_null(nationwide_proportion, crime_category):
    crime_count = clean.group('Category').where('Category', crime_category).column('count')
    model_proportions = [float(nationwide_proportion), 1 - float(nationwide_proportion)]
    collection = make_array()
    for i in range(10000):
        collection = np.append(collection, sample_proportions(crime_count, model_proportions).item(0))
    return Table().with_column('Proportions', collection)

In [15]:
# Bootstrap process for capturing with 95% confidence what proportion of assaults end in arrest in the original population. 
def bootstrap_prop(label):
    proportions = make_array()
    for i in range(1000):
        bootstrap_proportion = clean.sample().where('Category', label).where('Resolution', 'ARREST').num_rows/clean.sample().where('Category', label).num_rows
        proportions = np.append(proportions, bootstrap_proportion)
    return make_array(percentile(2.5, proportions), percentile(97.5, proportions))

In [None]:
# This tells us the likelihood that the right end of the confidence interval and values below occur due to random chance
# under the null hypothesis, for assault crimes. An answer of 0 means there is a 0% possibility that the values within the 
# confidence interval occur due to random variation under the null.
np.count_nonzero(under_null(0.533, 'ASSAULT').column(0) <= bootstrap_prop('ASSAULT').item(1)) / 10000

In [17]:
# Repeat the process for all other categories of crime, and collect likelihoods in table for easy comparison. 
props_and_crimes = make_array(make_array(0.365, 'SEX OFFENSES, FORCIBLE'), make_array(0.204, 'LARCENY/THEFT'), make_array(0.296, 'ROBBERY'), make_array(0.533, 'ASSAULT'), make_array(0.133, 'VEHICLE THEFT'), make_array(0.131, 'BURGLARY'), make_array(0.208, 'ARSON'))
collector = make_array()
for i in props_and_crimes:
    collector = np.append(collector, np.count_nonzero(under_null(i.item(0), i.item(1)).column(0) <= bootstrap_prop(i.item(1)).item(1)) /10000)
collector

array([ 0.9998,  0.    ,  0.3802,  0.    ,  0.    ,  1.    ,  0.9784])

In [18]:
# Final table. We reject the null for larceny/theft, assault, and vehicle theft, and accept the null for the rest. 
Table().with_column('Category of Crime', make_array('SEX OFFENSES, FORCIBLE', 'LARCENY/THEFT', 'ROBBERY', 'ASSAULT', 'VEHICLE THEFT', 'BURGLARY', 'ARSON'), 'P-Value', collector) 

Category of Crime,P-Value
"SEX OFFENSES, FORCIBLE",0.9998
LARCENY/THEFT,0.0
ROBBERY,0.3802
ASSAULT,0.0
VEHICLE THEFT,0.0
BURGLARY,1.0
ARSON,0.9784


Conclusion: 
Hypothesis testing says with 95% confidence that criminals commiting larceny/theft, assault, or vehicle theft are definitively less likely to be arrested in SF than nationwide. Hypothesis testing for other types of crime is inconclusive, because the distribution of the true proportion of arrested criminals for those types of crimes in SF overlaps significantly with the bootstrapped distribution of the nationwide arrest rates, meaning we cannot reject the null. Thus, our answer is that for larceny/theft, assault, and vehicle theft, criminals are less likely to be arrested in SF than nationwide. Other types of crime are inconclusive according to our model. 