# Process Crime Data

In [12]:
import pickle
import numpy as np
import pandas as pd
import csv
import pandas as pd
import re # Regular Expressions
import math

from peewee import *
from Models import *
from geopy.distance import vincenty
from datetime import datetime

import dglim

# Load Datasets

In [13]:
# Raw crime incident reports.
crime_incidents_df = pd.read_csv(
    "../City_Data/Crime_Incidents_2011_-_Present.csv"
)

# City list of active businesses (the "fixed coords" revision of the spreadsheet).
act_bus_df = pd.read_excel(
    "../City_Data/DGLIM_survey_data/city of Gainesville active businesses dataset with uniqueid (fixed coords).xlsx"
)

# Master dataset, loaded through the project's dglim helper.
# TODO: I'd like to avoid using master_df, but the coordinates in act_bus_df are not reliable
# TODO: Maybe generate a separate dataset just for coordinates?
master_df = dglim.loadData('Master Dataset')

# Process Crime Data

In [14]:
# Sanity check: number of incident rows loaded from the CSV.
# A single pre-formatted argument in parentheses prints the same text on
# Python 2 and Python 3 (the bare print statement is a SyntaxError on 3).
print("Total number of crimes: %d" % len(crime_incidents_df))

Total number of crimes: 123590


**Format GPS coordinates**

In [15]:
def extractCoordinates(string):
    """Extract a [latitude, longitude] pair from a location entry.

    A location entry is expected to be free text (typically an address) that
    contains the coordinates as a parenthesised, comma-separated pair, e.g.
    "... (25.691447, -80.282131)".

    Parameters
    ----------
    string : object
        The location entry. Non-string values (such as NaN for a missing
        entry) are converted to text first.

    Returns
    -------
    list of float
        [latitude, longitude] taken from the last parenthesised group whose
        first two comma-separated fields both parse as floats, or
        [nan, nan] when no such group exists.
    """
    try:
        text = str(string)  # Convert unicode / NaN / other objects to text
    except UnicodeError:
        # Python 2: unicode containing non-ASCII characters cannot be coerced
        # to str; re.split() and float() below accept unicode as-is.
        text = string

    # Splitting on either parenthesis leaves the text that was inside
    # parentheses at the odd positions:
    #   "a (b) c (d)" -> ['a ', 'b', ' c ', 'd', '']
    inside_parens = re.split('[()]', text)[1::2]

    # Prefer the last group, but skip groups that are not a numeric pair
    # (e.g. a parenthetical note inside the address) instead of raising.
    for candidate in reversed(inside_parens):
        parts = candidate.split(',')
        if len(parts) < 2:
            continue
        try:
            return [float(parts[0]), float(parts[1])]
        except ValueError:
            continue

    return [np.nan, np.nan]

In [16]:
# Create the two coordinate columns, then fill them with the
# [latitude, longitude] pair parsed from each 'Location' entry
# (NaN where extractCoordinates finds no coordinates).
# list(map(...)) yields a concrete list on both Python 2 and Python 3;
# a bare map() is a lazy iterator on Python 3 and cannot be assigned here.
crime_incidents_df['Latitude'], crime_incidents_df['Longitude'] = [np.nan, np.nan]
crime_incidents_df[['Latitude', 'Longitude']] = list(
    map(extractCoordinates, crime_incidents_df['Location'].tolist()))

**Sort coordinates by latitude first, then longitude**

In [17]:
# Keep only fully populated rows, order them by latitude and then longitude
# (countCrimesWithinRadius binary-searches these columns, so they must be
# sorted), and renumber the index so that labels equal row positions.
# NOTE(review): dropna() without a subset also discards incidents whose
# coordinates are valid but that are missing a value in some other column;
# confirm that this is intended.
crime_incidents_df = (
    crime_incidents_df
    .dropna()
    .sort_values(by=['Latitude', 'Longitude'])
    .reset_index(drop=True)
)

**Gather crimes within 500 meters of each business**

In [18]:
# Number of incidents recorded at each exact latitude value, most frequent first.
# value_counts() already returns a Series, so no extra wrapping is needed.
crime_incidents_df['Latitude'].value_counts()

29.662839    1532
29.661941     847
29.624470     827
29.652108     756
29.651923     587
29.659540     558
29.659421     538
29.633966     450
29.708669     427
29.652738     405
29.710005     392
29.657548     383
29.652098     372
29.682581     369
29.659383     357
29.670571     342
29.651934     328
29.659533     326
29.659533     321
29.659564     314
29.659173     312
29.651979     311
29.688433     295
29.634364     294
29.684060     285
29.636855     281
29.659729     281
29.634011     272
29.642844     272
29.659278     261
             ... 
29.944414       1
29.629997       1
29.614497       1
29.668117       1
29.617423       1
29.714387       1
29.724723       1
29.629436       1
29.649440       1
29.632217       1
29.692181       1
29.634181       1
29.693426       1
29.722317       1
29.831061       1
29.620671       1
29.661643       1
29.651031       1
29.639070       1
29.655950       1
29.629031       1
29.635904       1
29.671904       1
29.674891       1
29.645740 

In [19]:
# Integer positions of the two coordinate columns within the frame.
lat_column_index, lng_column_index = (
    crime_incidents_df.columns.get_loc(column_name)
    for column_name in ('Latitude', 'Longitude')
)

In [20]:
# Preview the first rows only; the full frame has over a hundred thousand rows.
crime_incidents_df.head(10)

Unnamed: 0,ID,Incident Type,Report Date,Offense Date,City,State,Location,Latitude,Longitude
0,213000051,HARASSING/ OBSCENE CALLS,01/01/2013 09:24:20 PM,01/01/2013 05:00:19 PM,MIAMI,FL,"5449 SW 86TH ST\nMIAMI, FL\n(25.69144767000006...",25.691448,-80.282131
1,213002719,IDENTITY THEFT,02/08/2013 11:51:10 AM,10/24/2012 12:00:00 AM,HIALEAH,FL,"134 E 9TH ST\nHIALEAH, FL\n(25.830013015000077...",25.830013,-80.278180
2,213020222,ASSIST OTHER AGENCY,10/09/2013 04:02:00 AM,10/09/2013 01:40:59 AM,HAWTHORN,FL,"21419 SE 61ST AVE\nHAWTHORN, FL\n(26.733470912...",26.733471,-80.229968
3,514000376,DRUG VIOLATION,11/17/2014 01:40:33 PM,11/17/2014 01:40:32 PM,COUNTY,FL,"377 MM SR 93 SOUTHBOUND\nCOUNTY, FL\n(26.77438...",26.774381,-80.112507
4,514000517,DRUG VIOLATION,11/18/2014 11:58:01 AM,11/18/2014 11:58:00 AM,COUNTY,FL,"380 MM SR 93 NORTHBOUND\nCOUNTY, FL\n(26.77438...",26.774381,-80.112507
5,514000519,DRUG VIOLATION (SID),12/04/2014 09:35:15 AM,12/02/2014 02:41:14 PM,COUNTY,FL,"380 MM SR 93 NORTHBOUND\nCOUNTY, FL\n(26.77438...",26.774381,-80.112507
6,511000080,DRUG VIOLATION,02/28/2011 03:22:03 PM,02/28/2011 03:00:02 PM,COUNTY,FL,"SR 93 392 SB\nCOUNTY, FL\n(26.774380653000037,...",26.774381,-80.112507
7,214024443,SUSPICIOUS INCIDENT,12/10/2014 10:31:04 AM,12/10/2014 10:31:03 AM,COUNTY,FL,"3333 NE 39TH AVE\nCOUNTY, FL\n(26.774380653000...",26.774381,-80.112507
8,213019493,WARRANT ARREST,09/28/2013 06:25:25 PM,09/28/2013 06:25:00 PM,COUNTY,FL,"220 S MAIN ST\nCOUNTY, FL\n(26.774380653000037...",26.774381,-80.112507
9,511000493,INFORMATION,08/21/2011 08:30:35 PM,08/19/2011 09:45:34 AM,COUNTY,FL,"SR 93 MM 403 SB\nCOUNTY, FL\n(26.7743806530000...",26.774381,-80.112507


In [21]:
def countCrimesWithinRadius(center_coords, year_set, search_radius=500, crimes_df=None):
    """Count, per year, the crimes that occurred within a radius of a point.

    Distances are computed on a flat approximation using fixed
    meters-per-degree scales for the Gainesville area.

    Parameters
    ----------
    center_coords : sequence of float
        (latitude, longitude) of the point to search around.
    year_set : iterable of int
        Years to tally; becomes the index of the returned Series. Crimes
        from other years are ignored.
    search_radius : number, optional
        Search radius in meters (default 500).
    crimes_df : pandas.DataFrame, optional
        Crime table to search. Must be sorted by 'Latitude' and then
        'Longitude', and must hold 'Offense Date' strings formatted as
        '%m/%d/%Y %I:%M:%S %p'. Defaults to the notebook-level
        ``crime_incidents_df``.

    Returns
    -------
    pandas.Series
        Crime counts indexed by the years in ``year_set``.

    Raises
    ------
    ValueError
        If an 'Offense Date' inside the radius does not match the expected
        date format.
    """
    if crimes_df is None:
        crimes_df = crime_incidents_df

    total = pd.Series(index=year_set, data=0)

    # Use fixed values to find meters between lat/lng coordinates
    # The area of interest is small enough that variations in longitude at different latitudes can be ignored
    lat_scale = 110846.58  # Approximate length in meters of one degree of latitude in Gainesville
    lng_scale = 96821.46  # Approximate length in meters of one degree of longitude in Gainesville
    search_radius_squared = search_radius ** 2  # meters^2
    lat_search_threshold = search_radius / lat_scale

    center_lat, center_lng = center_coords[0], center_coords[1]

    # Binary-search the sorted latitudes for the band of rows that could lie
    # within the radius. numpy's searchsorted returns a plain scalar for a
    # scalar needle, independent of the installed pandas version.
    latitudes = crimes_df['Latitude'].values
    start_lat_index = int(np.searchsorted(
        latitudes, center_lat - lat_search_threshold, side='left'))
    stop_lat_index = int(np.searchsorted(
        latitudes, center_lat + lat_search_threshold, side='right'))

    # Empty band (this also covers a NaN center latitude, which sorts past the end)
    if start_lat_index >= stop_lat_index:
        return total

    # stop_lat_index may equal len(crimes_df); positional slicing handles that,
    # whereas looking up a row at that index would fail.
    lat_slice_df = crimes_df.iloc[start_lat_index:stop_lat_index]

    # For each distinct latitude in the band, do the same for longitudes.
    # Rows keep their original order inside each group, so longitudes within
    # a group are still sorted.
    for lat_value, lng_slice_df in lat_slice_df.groupby('Latitude', sort=False):
        # Calculate max difference in longitude where crimes are still within the search radius.
        # Clamp at zero: at the very edge of the band, rounding can push the
        # latitude offset a hair past the radius and make the difference negative.
        lat_offset = (lat_value - center_lat) * lat_scale
        remaining_squared = max(0.0, search_radius_squared - lat_offset ** 2)
        lng_search_threshold = math.sqrt(remaining_squared) / lng_scale

        # Get range of positions that are within the lower/upper longitude bounds
        longitudes = lng_slice_df['Longitude'].values
        start_lng_pos = int(np.searchsorted(
            longitudes, center_lng - lng_search_threshold, side='left'))
        stop_lng_pos = int(np.searchsorted(
            longitudes, center_lng + lng_search_threshold, side='right'))

        # Every row in this positional range is within the specified radius
        offense_dates = lng_slice_df['Offense Date'].iloc[start_lng_pos:stop_lng_pos]
        for offense_date in offense_dates:
            year = datetime.strptime(offense_date, '%m/%d/%Y %I:%M:%S %p').year
            if year in total.index:
                total[year] += 1

    return total

In [22]:
# Get coordinates for each business.
# They are taken from master_df: the coordinates in act_bus_df are not
# reliable, so its 'Location' column is not parsed here.

# Years to tally: 2011 through 2017 inclusive
year_set = list(range(2011, 2018))

# Count crimes near each business
# list(zip(...)) materialises the (latitude, longitude) pairs so the column
# assignment also works on Python 3, where zip() returns a one-shot iterator.
temp_df = pd.DataFrame(index=master_df.index)
temp_df['Coords'] = list(zip(master_df['Latitude'], master_df['Longitude']))
crime_counts_df = temp_df['Coords'].apply(
    lambda coords: countCrimesWithinRadius(coords, year_set, 500))

# Save results
dglim.saveData(crime_counts_df, 'Crimes Within 500m by DGLIM ID')

In [23]:
# Preview the per-business yearly crime counts (first rows only).
crime_counts_df.head(10)

Unnamed: 0_level_0,2011,2012,2013,2014,2015,2016,2017
DGLIM ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
16,79,84,79,65,32,48,64
17,675,625,596,543,337,532,564
24,63,71,73,80,31,49,44
25,1174,1170,957,711,393,572,642
38,210,205,172,164,120,173,148
43,789,799,603,549,273,425,428
51,53,62,52,65,25,67,43
52,80,88,114,78,49,86,85
59,325,293,304,222,105,216,179
75,333,309,313,221,101,216,182
