# Process Crime Data

In [1]:
# Data processing
import numpy as np
import pandas as pd
import re # Regular Expressions
import math
from datetime import datetime

# Common DGLIM utilities
import os, sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import dglim
dglim.setProjectPath('../../')

## Load Datasets

In [12]:
# Business locations are loaded by dataset name through the project's dglim helper.
business_locations_df = dglim.loadData('Active Business Locations')
# Build the CSV path with os.path.join so it resolves whether or not
# dglim.datasets_path ends with a path separator.
crime_incidents_df = pd.read_csv(
    os.path.join(dglim.datasets_path, 'Crime_Incidents_2011_-_Present.csv'))

## Process Crime Data

In [3]:
# Single-argument print: emits the same text under the Python 2 print statement
# and the Python 3 print function.
print("Total number of crimes: %d" % len(crime_incidents_df))

Total number of crimes: 123590


**Format GPS coordinates**

In [4]:
def extractCoordinates(string):
    """Extract ``[latitude, longitude]`` from a crime 'Location' entry.

    The coordinates are read from the text that follows the last opening
    parenthesis of the entry, e.g. ``"<address lines> (25.69, -80.28)"``.

    Parameters
    ----------
    string : object
        Location entry. It is converted with ``str()``, so non-string values
        (for example a float NaN for a missing entry) are accepted.

    Returns
    -------
    list of float
        ``[latitude, longitude]``, or ``[np.nan, np.nan]`` when the entry has
        no parenthesised pair of numbers.
    """
    text = str(string) # Convert unicode to str if necessary

    # No parenthesis at all means the entry carries no coordinates
    open_pos = text.rfind('(')
    if open_pos == -1:
        return [np.nan, np.nan]

    # Keep only what sits between the last '(' and the next ')' (if any)
    coords = text[open_pos + 1:].split(')')[0].split(',')
    try:
        return [float(coords[0]), float(coords[1])]
    except (ValueError, IndexError):
        # Parenthesised text that is not a numeric pair, e.g. "(REAR)" or "()"
        return [np.nan, np.nan]

In [5]:
# Pre-create both columns so the two-column assignment below targets existing columns
crime_incidents_df['Latitude'], crime_incidents_df['Longitude'] = [np.nan, np.nan]
# A list comprehension yields a concrete list of [lat, lng] pairs on both Python 2 and
# Python 3 (map() is a lazy iterator on Python 3) and drops the redundant lambda wrapper.
crime_incidents_df[['Latitude', 'Longitude']] = [
    extractCoordinates(location) for location in crime_incidents_df['Location'].tolist()]

**Drop incomplete rows, then sort crimes by latitude first, then longitude**

In [6]:
# NOTE: dropna() without arguments discards every row with a missing value in ANY
# column, not only the rows that lack coordinates.
# The rows are then ordered by latitude, ties broken by longitude, and renumbered from 0.
crime_incidents_df = (
    crime_incidents_df
    .dropna()
    .sort_values(by=['Latitude', 'Longitude'])
    .reset_index(drop=True)
)

**Gather crimes within 500 meters of each business**

In [7]:
# Preview the first three processed rows
crime_incidents_df.head(3)

Unnamed: 0,ID,Incident Type,Report Date,Offense Date,City,State,Location,Latitude,Longitude
0,213000051,HARASSING/ OBSCENE CALLS,01/01/2013 09:24:20 PM,01/01/2013 05:00:19 PM,MIAMI,FL,"5449 SW 86TH ST\nMIAMI, FL\n(25.69144767000006...",25.691448,-80.282131
1,213002719,IDENTITY THEFT,02/08/2013 11:51:10 AM,10/24/2012 12:00:00 AM,HIALEAH,FL,"134 E 9TH ST\nHIALEAH, FL\n(25.830013015000077...",25.830013,-80.27818
2,213020222,ASSIST OTHER AGENCY,10/09/2013 04:02:00 AM,10/09/2013 01:40:59 AM,HAWTHORN,FL,"21419 SE 61ST AVE\nHAWTHORN, FL\n(26.733470912...",26.733471,-80.229968


In [8]:
def countCrimesWithinRadius(center_coords, year_set, search_radius=500):
    """Count, per year, the crimes located within ``search_radius`` meters of a point.

    Reads the module-level ``crime_incidents_df``. Its 'Latitude' column must be
    sorted in ascending order (a binary search is run on it), and its
    'Offense Date' strings must match the format ``'%m/%d/%Y %I:%M:%S %p'``.

    Parameters
    ----------
    center_coords : sequence of float
        ``(latitude, longitude)`` of the search center, in degrees.
    year_set : iterable of int
        Years to report; crimes from other years are ignored.
    search_radius : number, optional
        Search radius in meters (default 500).

    Returns
    -------
    pandas.Series
        Indexed by ``year_set``; each value is the number of crimes within the
        radius whose offense date falls in that year.

    Raises
    ------
    ValueError
        If an 'Offense Date' inside the radius does not match the expected format.
    """
    # Use fixed values to find meters between lat/lng coordinates
    # The area of interest is small enough that variations in longitude at different latitudes can be ignored
    lat_scale = 110846.58 # Approximate length in meters of one degree of latitude in Gainesville
    lng_scale = 96821.46 # Approximate length in meters of one degree of longitude in Gainesville
    search_radius_squared = search_radius ** 2 # meters^2
    lat_search_threshold = search_radius / lat_scale

    total = pd.Series(index=year_set, data=0)

    # Binary-search the sorted latitudes for the band of crimes close enough (north/south)
    # to potentially be within the search radius. Searching the underlying array returns a
    # plain scalar position regardless of the pandas version.
    latitudes = crime_incidents_df['Latitude'].values
    start_lat_index = int(latitudes.searchsorted(center_coords[0] - lat_search_threshold, side='left'))
    stop_lat_index = int(latitudes.searchsorted(center_coords[0] + lat_search_threshold, side='right'))
    if start_lat_index >= stop_lat_index:
        # Empty band: no crime is close enough in latitude
        return total

    # Positional slice; stop_lat_index may equal len(crime_incidents_df), which slicing accepts
    lat_slice_df = crime_incidents_df[start_lat_index:stop_lat_index]

    # Squared planar distance (meters^2) from the center for every crime in the band.
    # Comparing squared distances avoids a square root (and its domain error when
    # rounding pushes a boundary row marginally outside the radius).
    north_south = (lat_slice_df['Latitude'] - center_coords[0]) * lat_scale
    east_west = (lat_slice_df['Longitude'] - center_coords[1]) * lng_scale
    within_radius = (north_south ** 2 + east_west ** 2) <= search_radius_squared

    # Tally the matching crimes by offense year
    for offense_date in lat_slice_df.loc[within_radius, 'Offense Date']:
        year = datetime.strptime(offense_date, '%m/%d/%Y %I:%M:%S %p').year
        if year in total.index:
            total.loc[year] += 1

    return total

**WARNING:** This may take several hours!

In [9]:
year_set = range(2011, 2018)

# Count crimes near each business
temp_df = pd.DataFrame(index=business_locations_df.index)
# list(...) materialises the (lat, lng) pairs so the column assignment also works on
# Python 3, where zip() returns a lazy iterator.
temp_df['Coords'] = list(zip(business_locations_df['Latitude'], business_locations_df['Longitude']))
# Each call returns a Series indexed by year, so apply() yields one column per year
crime_counts_df = temp_df['Coords'].apply(lambda x: countCrimesWithinRadius(x, year_set, 500))

# Save results
dglim.saveData(crime_counts_df, 'Crimes Within 500m by GRU ID')

In [10]:
# Show a sample of the per-business yearly counts rather than the whole table
crime_counts_df.head(10)

Unnamed: 0_level_0,2011,2012,2013,2014,2015,2016,2017
DGLIM ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
16,79,84,79,65,32,48,64
17,675,625,596,543,337,532,564
24,63,71,73,80,31,49,44
25,1174,1170,957,711,393,572,642
38,210,205,172,164,120,173,148
43,789,799,603,549,273,425,428
51,53,62,52,65,25,67,43
52,80,88,114,78,49,86,85
59,325,293,304,222,105,216,179
75,333,309,313,221,101,216,182
