# Taxonomy

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('data/2017.csv')
taxonomy = df[['stat','stat_desc','category']]\
    .groupby(['stat','stat_desc','category'])\
    .count()\
    .reset_index()\
    .sort_values('stat')

taxonomy.head()

Unnamed: 0,stat,stat_desc,category
0,11,criminal homicide: murder,criminal homicide
1,21,"rape, forcible: rape by force, female 18 yrs +",forcible rape
2,22,"rape, forcible: attempt rape by force, female ...",forcible rape
3,23,"rape by force, female under 18",forcible rape
4,24,"attempt forcible rape, female under 18",forcible rape


Categorization changes through the years, new sub-categories arrive, other drop; we need them all merged together:

In [3]:
print('Year distinct `STAT` values count:')
for Y in range(2005, 2017):
    df = pd.read_csv('data/{}.csv'.format(Y))[['stat','stat_desc','category']]\
        .groupby(['stat','stat_desc','category'])\
        .count()\
        .reset_index()
    print(Y, len(df))
    # merge with year-suffix
    taxonomy = taxonomy.join(df.set_index('stat'), on = 'stat', how = 'outer',
                             lsuffix = '', rsuffix = '_{}'.format(Y))
    # make sure the main columns have all the values set: combined all the years through
    taxonomy.loc[taxonomy['stat_desc'].isnull(),'stat_desc'] =\
        taxonomy.loc[taxonomy['stat_desc'].isnull(),'stat_desc_{}'.format(Y)]
    taxonomy.loc[taxonomy['category'].isnull(),'category'] =\
        taxonomy.loc[taxonomy['category'].isnull(),'category_{}'.format(Y)]

print(2017, len(taxonomy))

Year distinct `STAT` values count:
2005 249
2006 249
2007 249
2008 249
2009 247
2010 244
2011 241
2012 247
2013 244
2014 239
2015 246
2016 328
2017 370


In [4]:
print('Joined: {}'.format(len(taxonomy)))
taxonomy.tail()

Joined: 370


Unnamed: 0,stat,stat_desc,category,stat_desc_2005,category_2005,stat_desc_2006,category_2006,stat_desc_2007,category_2007,stat_desc_2008,...,stat_desc_2012,category_2012,stat_desc_2013,category_2013,stat_desc_2014,category_2014,stat_desc_2015,category_2015,stat_desc_2016,category_2016
329,701,non-criminal: alarm-robbery,miscellaneous non-criminal,,,,,,,,...,,,,,,,,,non-criminal: alarm-robbery,miscellaneous non-criminal
329,703,non-criminal: fire alarm,miscellaneous non-criminal,,,,,,,,...,,,,,,,,,non-criminal: fire alarm,miscellaneous non-criminal
329,722,"vehicle/boating laws, citations: parking",miscellaneous non-criminal,,,,,,,,...,,,,,,,,,"vehicle/boating laws, citations: parking",miscellaneous non-criminal
329,739,"vehicle/boat, other non-criminal: right capsiz...",miscellaneous non-criminal,,,,,,,,...,,,,,,,,,"vehicle/boat, other non-criminal: right capsiz...",miscellaneous non-criminal
329,777,assist citizen -notification of high risk sex ...,miscellaneous non-criminal,,,,,,,,...,,,,,,,,,assist citizen -notification of high risk sex ...,miscellaneous non-criminal


In [5]:
len(taxonomy)

370

In [6]:
taxonomy = taxonomy[['stat','stat_desc','category']]

In [7]:
def collapse(x):
    """short category: single word"""

    if x == 'criminal homicide':
        return 'homicide'
    if x == 'forcible rape':
        return 'rape'
    if x == 'grand theft auto':
        return 'gta'
    if x[:13] == 'drunk driving':
        return 'dui'
    if x == 'larceny theft':
        return 'theft'
    if x == 'offenses against family':
        return 'family'
    if 'assault' in x:
        return 'assault'
    return x.split()[0]


taxonomy['short_cat'] = taxonomy['category'].apply(collapse)
taxonomy.head()

Unnamed: 0,stat,stat_desc,category,short_cat
0,11,criminal homicide: murder,criminal homicide,homicide
1,21,"rape, forcible: rape by force, female 18 yrs +",forcible rape,rape
2,22,"rape, forcible: attempt rape by force, female ...",forcible rape,rape
3,23,"rape by force, female under 18",forcible rape,rape
4,24,"attempt forcible rape, female under 18",forcible rape,rape


Let us set some useful indicators.

In [8]:
taxonomy['criminal'] = taxonomy['stat_desc'].apply(lambda x: 0 if 'non-criminal' in x else 1)
taxonomy.loc[taxonomy['stat'] > 700, 'criminal'] = 0
for x in ['accident','suicide','person dead']:
    taxonomy.loc[:,'criminal'] = taxonomy.apply(lambda r: 0 if x in r['stat_desc'] else r['criminal'], axis = 1)

In [9]:
taxonomy['violent'] = taxonomy['stat'].apply(lambda x: 1 if x < 62 else 0)
for x in ['violence','by force','assault','abuse','hit and run','hate','arson','kidnapping','vandalism']:
    taxonomy.loc[:,'violent'] = taxonomy.apply(lambda r: 1 if x in r['stat_desc'] else r['violent'], axis = 1)

In [10]:
taxonomy['weapon'] = taxonomy['stat_desc'].apply(lambda x: 1 if 'weapon' in x else 0)
for x in ['gun','knife','firearm','shooting']:
    taxonomy.loc[:,'weapon'] = taxonomy.apply(lambda r: 1 if x in r['stat_desc'] else r['weapon'], axis = 1)

In [11]:
taxonomy['gun'] = taxonomy['stat_desc'].apply(lambda x: 1 if 'gun' in x else 0)
for x in ['firearm','shooting']:
    taxonomy.loc[:,'gun'] = taxonomy.apply(lambda r: 1 if x in r['stat_desc'] else r['gun'], axis = 1)

In [12]:
taxonomy['misdemeanor'] = taxonomy['stat_desc'].apply(lambda x: 1 if 'misd' in x else 0)
taxonomy = taxonomy.sort_values(by = 'stat')

There we go:

In [13]:
print('stat C:?  M:?  V:?  W:?  G:?   description')
print('----------------------------------------------------------------------------------')
for i in range(len(taxonomy)):
    print('{:<4} C:{}  M:{}  V:{}  W:{}  G:{}   {}'.format(taxonomy.iloc[i]['stat'],
                                                           taxonomy.iloc[i]['criminal'],
                                                           taxonomy.iloc[i]['misdemeanor'],
                                                           taxonomy.iloc[i]['violent'],
                                                           taxonomy.iloc[i]['weapon'],
                                                           taxonomy.iloc[i]['gun'],
                                                           taxonomy.iloc[i]['stat_desc']))

stat C:?  M:?  V:?  W:?  G:?   description
----------------------------------------------------------------------------------
11   C:1  M:0  V:1  W:0  G:0   criminal homicide: murder
12   C:1  M:0  V:1  W:0  G:0   criminal homicide: manslaughter, voluntary/involuntary
13   C:1  M:0  V:1  W:1  G:1   deputy involved shooting/suspect dead
21   C:1  M:0  V:1  W:0  G:0   rape, forcible: rape by force, female 18 yrs +
22   C:1  M:0  V:1  W:0  G:0   rape, forcible: attempt rape by force, female 18 yrs +
23   C:1  M:0  V:1  W:0  G:0   rape by force, female under 18
24   C:1  M:0  V:1  W:0  G:0   attempt forcible rape, female under 18
25   C:1  M:0  V:1  W:0  G:0   rape by force, male 18 and over
26   C:1  M:0  V:1  W:0  G:0   attempt rape by force, male 18 and over
27   C:1  M:0  V:1  W:0  G:0   rape by force, male under 18
28   C:1  M:0  V:1  W:0  G:0   attempt rape by force, male under 18
31   C:1  M:0  V:1  W:1  G:0   robbery, weapon: highway, etc.
32   C:1  M:0  V:1  W:1  G:0   robbery, we

In [14]:
taxonomy.loc[:,'description'] = taxonomy['stat_desc'].apply(lambda x: x.strip())
fields = ['stat','description','category','short_cat','criminal','misdemeanor','violent','weapon','gun']
taxonomy[fields].to_csv('data/taxonomy.csv', index = False)