# Features

In [1]:
import os
import re
import numpy as np
import pandas as pd
from collections import Counter

from viz import local
viz = local.VizNotebook()

import nltk
stopwords = nltk.corpus.stopwords.words('english')

In [2]:
YEAR = 2016

sample = pd.read_csv('data/F{}.csv'.format(YEAR))
sample.head()

Unnamed: 0,category,stat,stat_desc,city,zip,year,month,day,weekday,hour,longitude,latitude,gang_related,aggravated,incident_date
0,vehicle,255,vehicle and boating laws: misdemeanor,covina,91722,2016,11,13,6,14.25,-117.884594,34.10673,0,0,2016-11-13
1,gta,93,"grand theft vehicle (gta): truck, motor home, ...",west valinda,91746,2016,12,29,3,9.5,-117.971268,34.064801,0,0,2016-12-29
2,federal,315,federal offenses with money: counterfeiting u....,agoura hills,91301,2016,12,30,4,15.733333,-118.76,34.12,0,0,2016-12-30
3,sex,129,sex felonies: all other sex felonies,rancho palos verdes,90275,2016,4,29,4,18.25,-118.330651,33.736982,0,0,2016-04-29
4,vehicle,255,vehicle and boating laws: misdemeanor,la puente,91744,2016,5,19,3,7.417222,-117.961194,34.034889,0,0,2016-05-19


In [3]:
print(len(sample))
print(len(sample[-sample['stat_desc'].str.contains('misd')]))

162825
126207


### 1. Categorical

In [4]:
sample = sample[-sample['stat_desc'].str.contains('misd')]

cat = sample.groupby(['category'])\
            .size().reset_index(name='count')\
            .sort_values('count', ascending = False)
cat

Unnamed: 0,category,count
13,larceny,35489
1,assault,22113
2,burglary,12982
11,gta,12701
9,fraud,7645
20,vandalism,6378
15,narcotics,5787
17,robbery,4695
23,weapon,2917
7,felonies,2353


In [5]:
cat = cat['category'].tolist()
violent = ['assault','arson','burglary','homicide','rape','robbery','vandalism']
terms = [t for t in re.split('[\W+]',' '.join(sample[sample['category'].isin(violent)]['stat_desc'].tolist())) \
             if len(t) > 1 and t not in cat and t not in stopwords]

In [6]:
print(len(terms))
terms = Counter(terms)
terms.most_common(20)

182674


[('non', 15253),
 ('aggravated', 13014),
 ('force', 12907),
 ('entry', 12194),
 ('etc', 11192),
 ('hands', 8980),
 ('feet', 8980),
 ('fist', 8980),
 ('agg', 8957),
 ('residence', 7646),
 ('violence', 7408),
 ('domestic', 7408),
 ('structure', 5554),
 ('adw', 5485),
 ('night', 4994),
 ('felony', 4761),
 ('day', 4081),
 ('unknown', 3907),
 ('strong', 2615),
 ('arm', 2615)]

In [7]:
def set_spec(df):
    for term in ['gun','shooting','knife','carjacking','kidnapping']:
        df[term] = df['stat_desc'].apply(lambda x: 1 if term in x else 0)
    df['gun'] = df.apply(lambda r: 1 if r['shooting'] == 1 else r['gun'], axis = 1)

set_spec(sample)

In [8]:
def set_violent(r):
    value = 0
    if r['category'] in violent:
        value = 1
    if r['kidnapping'] == 1:
        value = 1
    if 'by force' in r['stat_desc']:
        value = 1
    elif 'no force' in r['stat_desc']:
        value = 0
    return value

sample['violent'] = sample.apply(set_violent, axis = 1)

In [9]:
f = ['category','stat_desc','aggravated','violent','gun','shooting','knife','carjacking','kidnapping']
n = np.random.randint(0, len(sample) - 20)
sample[sample['violent'] != 0].loc[n:n + 20, f]

Unnamed: 0,category,stat_desc,aggravated,violent,gun,shooting,knife,carjacking,kidnapping
53881,assault,"assault, aggravated: adw - hands, feet, fist, ...",1,1,0,0,0,0,0
53889,assault,"assault, aggravated: adw - knife",1,1,0,0,1,0,0
53895,vandalism,vandalism felony,0,1,0,0,0,0,0
53896,assault,"assault, non-agg: hands, feet, fist, etc.",0,1,0,0,0,0,0


### 2. Temporal

In [10]:
print(len(sample))
sample = sample[sample['year'] == YEAR]
print(len(sample))

139151
135981


In [11]:
trends = sample\
    .groupby(['incident_date','violent']).size()\
    .reset_index(name = 'total')

trends['incident_date'] = trends['incident_date'].apply(lambda x: str(x))
trends['violent'] = trends['violent'].apply(lambda x: 'Violent' if x == 1 else 'Non-Violent')

In [12]:
viz.static(trends.to_json(orient='records'), type='line', height=350, width='100%',
           x = 'incident_date',
           y = 'total',
           z = 'violent',
           colormap = { 'Violent':'crimson', 'Non-Violent':'gray' },
           title = '{} Violent vs. Non-Violent Criminal Incidents Daily Count'.format(YEAR))

In [13]:
# missing time flagged with -1
# we might decide to add ucertain time to this category too
def time_uncertain(x):
    #if x == 0: return -1
    #if x == 12: return -1
    return x

sample['hour'] = sample['hour'].apply(time_uncertain)

In [14]:
viz.static(sample[(sample['violent'] == 1)][['weekday','hour']].to_json(orient='records'),
           type = 'matrix', height = 300, width = '95%',
           x = 'hour',
           y = 'weekday',
           colormap = ['lightgray','crimson'],
           xlabel = 'Day Time',
           ylabel = 'Week Time',
           title = '{} Violent Incidents Heatmap'.format(YEAR),
           xbins = 25,
           ybins = 7,
           zbins = 3)

In [15]:
print(len(sample))
sample = sample[sample['hour'] != -1]
print(len(sample))

135981
135981


### 3. Spacial

In [16]:
viz.static(sample[sample['category'] == 'homicide'][['latitude','longitude']].to_json(orient='records'),
           type = 'matrix', height = 400, width = '95%',
           x = 'longitude',
           y = 'latitude',
           colormap = ['lightgray','crimson'],
           xlabel = 'Longitude',
           ylabel = 'Latitude',
           title = '{} Homicide Incidents Heatmap'.format(YEAR),
           xbins = 20,
           ybins = 20,
           zbins = 5)

In [17]:
viz.static(sample[sample['knife'] == 1][['latitude','longitude']].to_json(orient='records'),
           type = 'matrix', height = 400, width = '95%',
           x = 'longitude',
           y = 'latitude',
           colormap = ['lightgray','crimson'],
           xlabel = 'Longitude',
           ylabel = 'Latitude',
           title = '{} Knife-Incidents Heatmap'.format(YEAR),
           xbins = 20,
           ybins = 20,
           zbins = 5)

In [18]:
viz.static(sample[sample['gun'] == 1][['latitude','longitude']].to_json(orient='records'),
           type = 'matrix', height = 400, width = '95%',
           x = 'longitude',
           y = 'latitude',
           colormap = ['lightgray','crimson'],
           xlabel = 'Longitude',
           ylabel = 'Latitude',
           title = '{} Gun-Incidents Heatmap'.format(YEAR),
           xbins = 20,
           ybins = 20,
           zbins = 5)

In [19]:
viz.static(sample[sample['category'] == 'rape'][['latitude','longitude']].to_json(orient='records'),
           type = 'matrix', height = 400, width = '95%',
           x = 'longitude',
           y = 'latitude',
           colormap = ['lightgray','crimson'],
           xlabel = 'Longitude',
           ylabel = 'Latitude',
           title = '{} Rape-Incidents Heatmap'.format(YEAR),
           xbins = 20,
           ybins = 20,
           zbins = 5)

In [19]:
sample.columns

Index(['category', 'stat', 'stat_desc', 'city', 'zip', 'year', 'month', 'day',
       'weekday', 'hour', 'longitude', 'latitude', 'gang_related',
       'aggravated', 'incident_date', 'gun', 'shooting', 'knife', 'carjacking',
       'kidnapping', 'violent'],
      dtype='object')

In [20]:
keep = ['category', 'city', 'zip', 'year', 'month', 'day',
        'weekday', 'hour', 'longitude', 'latitude', 'gang_related',
        'aggravated', 'incident_date', 'gun', 'shooting', 'knife', 'carjacking',
        'kidnapping', 'violent']

In [21]:
# process all
for YEAR in range(2005, 2017):
    df = pd.read_csv('data/F{}.csv'.format(YEAR))
    N = len(df)
    df = df[-df['stat_desc'].str.contains('misd')]
    set_spec(df)
    df['violent'] = df.apply(set_violent, axis = 1)
    #df['hour'] = df['hour'].apply(time_uncertain)
    #df = df[df['hour'] != -1]
    df[keep].to_csv('data/F{}.csv'.format(YEAR), index = False)
    print('{} --- using {:.2f}%'.format(YEAR, 100 * len(df)/N))

2005 --- using 77.04%
2006 --- using 75.42%
2007 --- using 74.11%
2008 --- using 72.51%
2009 --- using 73.00%
2010 --- using 73.88%
2011 --- using 78.24%
2012 --- using 80.53%
2013 --- using 81.34%
2014 --- using 82.35%
2015 --- using 77.95%
2016 --- using 77.51%


In [22]:
import folium
from folium.plugins import MarkerCluster

MAP = folium.Map(
    location = [34.0, -118.2],
    tiles = 'Stamen Toner',
    zoom_start = 12
)

cluster = MarkerCluster(
    name = 'Clustered homicide-incident locations',
    overlay = True,
    control = False,
    icon_create_function = None
)

homicide = df[(df['category'] == 'homicide') & (df['year'] == YEAR)]
for i in range(len(homicide)):
    marker = folium.Marker((homicide['latitude'].values[i], homicide['longitude'].values[i]),
                               icon = folium.Icon(color = 'red'))
    cluster.add_child(marker)

cluster.add_to(MAP)
folium.LayerControl().add_to(MAP)

MAP.save('{}/LA-crime/node-app/static/Homicide{}.html'.format(os.environ['PROJECTS_HOME'], YEAR))

In [23]:
from IPython.display import IFrame
IFrame('/projects/LA-crime/Homicide{}.html'.format(YEAR), width='100%', height=500)