In [23]:
%matplotlib inline
import pandas as pd
import numpy as np
from time import time
import matplotlib.pyplot as plt

import itertools

from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets.species_distributions import construct_grids
from sklearn.neighbors import KernelDensity
from sklearn import preprocessing

# Gathering Data

- Yelp
- Police Stations
- Crime
- Weather

In [14]:
def parse_location(loc):
    loc = loc.strip("()").split(',')
    lat = loc[0].strip()
    long = loc[1].strip()
    return float(lat), float(long)

In [15]:
def get_yelp_data():
    df = pd.read_csv('../data/weather_police_traffic_yelp_crime_data.csv', parse_dates=['timestamp'])
    return df[df.yelp == 1][['lat', 'long']].drop_duplicates().reset_index()[['lat', 'long']]

def get_police_station_data():
    df = pd.read_csv('../data/weather_police_traffic_yelp_crime_data.csv', parse_dates=['timestamp'])
    return df[df.police_station == 1][['lat', 'long']].drop_duplicates().reset_index()[['lat', 'long']]

def get_crime_data_for_year(year):
    crime_data = pd.read_csv("../data/Preprocessed_Crime_Data_%s.csv"%year, parse_dates=['Date'], usecols=["Date","IUCR","Location"])
    crime_data['lat']  = crime_data.Location.apply(lambda x: parse_location(x)[0])
    crime_data['long']  = crime_data.Location.apply(lambda x: parse_location(x)[1])
    del crime_data["Location"]
    crime_data.rename(columns={"Date":"timestamp"}, inplace=True)
    crime_data['timestamp'] = crime_data.timestamp.apply(lambda x: x.date())
    return crime_data

def get_weather_data_for_year(year):
    weather_data = pd.read_csv("../data/PreProcessed_Weather_Data_%s.csv"%year)
    weather_data.rename(columns={"Weather_Date":"timestamp"}, inplace=True)
    return weather_data

In [16]:
%time yelp_df = get_yelp_data()
%time police_df = get_police_station_data()
%time crime_df = get_crime_data_for_year(2006)
%time weather_df = get_weather_data_for_year(2006)

  call = lambda f, *a, **k: f(*a, **k)
  call = lambda f, *a, **k: f(*a, **k)


CPU times: user 14 s, sys: 1.36 s, total: 15.3 s
Wall time: 15.3 s
CPU times: user 8.88 s, sys: 952 ms, total: 9.83 s
Wall time: 12.7 s
CPU times: user 1min 9s, sys: 120 ms, total: 1min 9s
Wall time: 1min 10s
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 3.48 ms


# Dividing Chicago City in a Grid

In [24]:
class City:
    def __init__(self, x_left_lower_corner, y_left_lower_corner, x_upper_right_corner, 
                 y_upper_right_corner, grid_size=100):
    
        self.x_left_lower_corner = x_left_lower_corner
        self.y_left_lower_corner = y_left_lower_corner
        self.x_upper_right_corner = x_upper_right_corner
        self.y_upper_right_corner = y_upper_right_corner
        self.grid_size = grid_size
        self.dividers = self.construct_grid()

    
    def construct_grid(self):
        # x coordinates of the grid cells
        xgrid = np.linspace(self.x_left_lower_corner, self.x_upper_right_corner, self.grid_size)
        # y coordinates of the grid cells
        ygrid = np.linspace(self.y_left_lower_corner, self.y_upper_right_corner, self.grid_size)
        return (xgrid, ygrid)
    
    def get_map_coordinates(self):
        map_coordinates = []
        for i in itertools.product(self.dividers[0].tolist(),self.dividers[1].tolist()):
            map_coordinates.append(i)
        return map_coordinates
    
    def get_diagonals(self):
        map_coordinates = self.get_map_coordinates()
        n=self.grid_size
        diagonals = []
        for i in range(0, n*n - n-1):
            if(i!=0 and i%n==0):
                continue
            #print(i)
            diagonals.append((map_coordinates[i],map_coordinates[i+n+1]))
        return diagonals

In [26]:
chicago = City(41.5487, -88.3713, 42.1176, -87.094, 51)
cells = chicago.get_diagonals()
len(cells)

2500

# Counting number of Crime, Yelp and Poice Stations in a grid cell

In [27]:
def build_freq_df(cells):
    timestamps = []
    grid_cells = []
    for i in list(itertools.product(pd.date_range(start=pd.datetime(2006, 1, 1), periods=365, freq='D').tolist(), cells)):
        timestamps.append(i[0].date())
        grid_cells.append(i[1])
        
    df = pd.DataFrame({'timestamp': timestamps, 'cell_range': grid_cells})    
    df['crime_freq'] = df.apply(lambda x: crime_df[(crime_df.timestamp == x.timestamp)&
                                                   (crime_df.lat >= x.cell_range[0][0])&
                                                   (crime_df.lat < x.cell_range[1][0])&
                                                   (crime_df.long >= x.cell_range[0][1])&
                                                   (crime_df.long < x.cell_range[1][1])].shape[0], axis=1)
    spatial_df = df[['cell_range']].drop_duplicates()
    spatial_df['yelp_freq'] = df[['cell_range']].apply(lambda x: yelp_df[(yelp_df.lat >= x.cell_range[0][0])&
                                                   (yelp_df.lat < x.cell_range[1][0])&
                                                   (yelp_df.long >= x.cell_range[0][1])&
                                                   (yelp_df.long < x.cell_range[1][1])].shape[0], axis=1)
    spatial_df['police_freq'] = df[['cell_range']].apply(lambda x: police_df[(police_df.lat >= x.cell_range[0][0])&
                                               (police_df.lat < x.cell_range[1][0])&
                                               (police_df.long >= x.cell_range[0][1])&
                                               (police_df.long < x.cell_range[1][1])].shape[0], axis=1)

    return pd.merge(df, spatial_df, on=['cell_range'], how='left')


In [None]:
#cells = [((41.5487, -88.3713), ( 42.1176, -87.094))]
%time df = build_freq_df(cells) 

In [None]:
df.to_csv('grids_full_year_2500_v2.tsv', sep='\t', index=False)

In [None]:
1

In [44]:
#cells = [((41.5487, -88.3713), ( 42.1176, -87.094))]
%time df = build_freq_df(cells) 

CPU times: user 7min 7s, sys: 592 ms, total: 7min 8s
Wall time: 7min 7s


In [59]:
df.to_csv('grids_Jan_400.tsv', sep='\t', index=False)

In [61]:
%time df = build_freq_df(cells)

CPU times: user 39min 10s, sys: 3.74 s, total: 39min 13s
Wall time: 39min 11s


In [62]:
df.to_csv('grids_Jan_2500.tsv', sep='\t', index=False)

In [69]:
%time df = build_freq_df(cells)

CPU times: user 1h 13min 54s, sys: 7 s, total: 1h 14min 1s
Wall time: 1h 13min 57s


In [70]:
df.to_csv('grids_full_year_400_v2.tsv', sep='\t', index=False)