In [1]:
%matplotlib inline
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import numpy as np
from time import time
import matplotlib.pyplot as plt
from sklearn.datasets.species_distributions import construct_grids
from sklearn.neighbors import KernelDensity
from sklearn import preprocessing
import itertools
import datetime

from mpl_toolkits.basemap import Basemap

def parse_location(loc):
    loc = loc.strip("()").split(',')
    lat = loc[0].strip()
    long = loc[1].strip()
    return float(lat), float(long)

In [2]:
def get_yelp_data():
    df = pd.read_csv('../data/weather_police_traffic_yelp_crime_data.csv', parse_dates=['timestamp'])
    return df[df.yelp == 1][['lat', 'long']].drop_duplicates().reset_index()[['lat', 'long']]

def get_police_station_data():
    df = pd.read_csv('../data/weather_police_traffic_yelp_crime_data.csv', parse_dates=['timestamp'])
    return df[df.police_station == 1][['lat', 'long']].drop_duplicates().reset_index()[['lat', 'long']]

def get_crime_data_for_year(year):
    crime_data = pd.read_csv("../data/Preprocessed_Crime_Data_%s.csv"%year, parse_dates=['Date'], usecols=["Date","IUCR","Location"])
    crime_data['lat']  = crime_data.Location.apply(lambda x: parse_location(x)[0])
    crime_data['long']  = crime_data.Location.apply(lambda x: parse_location(x)[1])
    del crime_data["Location"]
    crime_data.rename(columns={"Date":"timestamp"}, inplace=True)
    crime_data['timestamp'] = crime_data.timestamp.apply(lambda x: x.date())
    return crime_data

def get_weather_data_for_year(year):
    weather_data = pd.read_csv("../data/PreProcessed_Weather_Data_%s.csv"%year)
    weather_data.rename(columns={"Weather_Date":"timestamp"}, inplace=True)
    return weather_data

In [3]:
def get_normalizer(column):
    X = np.array(column).reshape(column.shape[0], 1)
    return preprocessing.Normalizer().fit(X)

def normalize(df, column_name):
    X = np.array(df[column_name]).reshape(df[column_name].shape[0], 1)
    return get_normalizer(df[column_name]).transform(X)

def get_scaler(column):
    X = np.array(column).reshape(column.shape[0], 1)
    return preprocessing.MinMaxScaler().fit(X)

def scale(df, column_name):
    X = np.array(df[column_name]).reshape(df[column_name].shape[0], 1)
    return get_scaler(df[column_name]).transform(X)

def normalize_columns(df, column_names):
    for column_name in column_names:
        df['%s_norm' % column_name] = normalize(df, column_name)
        
def scale_columns(df, column_names):
    for column_name in column_names:
        df['%s_scaled' % column_name] = scale(df, column_name)

In [4]:
def get_locations(df, metric='radians', drop_duplicates=True):
    if drop_duplicates:
        locations_df = df[['lat','long']].drop_duplicates()
    else:
        locations_df = df[['lat','long']]
    locations_array = np.vstack([locations_df['lat'],locations_df['long']]).T
    if metric == 'radians':
        locations_array *= np.pi / 180.  # Convert lat/long to radians
    return locations_array

def get_kde(df, drop_duplicates=True, bandwidth=0.00025, rtol=1E-8):
    locations = get_locations(df, drop_duplicates=drop_duplicates)
    #KDE initialization
    kde = KernelDensity(bandwidth=bandwidth, metric='haversine', kernel='gaussian', algorithm='ball_tree', rtol=rtol)
    #fit with given location
#     print(locations.shape, drop_duplicates)
    kde.fit(locations)
    return kde

def get_weights(df, kernel, metric='radians', lat_label='lat', long_label='long'):
    locations_array = np.vstack([df[lat_label], df[long_label]]).T
    if metric == 'radians':
        locations_array *= np.pi / 180.  # Convert lat/long to radians
    if kernel == None:
        return None
    else:
        return np.exp(kernel.score_samples(locations_array))

In [5]:
%time yelp_df = get_yelp_data()
%time police_df = get_police_station_data()
%time crime_df = get_crime_data_for_year(2006)
%time weather_df = get_weather_data_for_year(2006)

  call = lambda f, *a, **k: f(*a, **k)
  call = lambda f, *a, **k: f(*a, **k)


CPU times: user 13.5 s, sys: 160 ms, total: 13.6 s
Wall time: 13.6 s
CPU times: user 8.76 s, sys: 220 ms, total: 8.98 s
Wall time: 8.98 s
CPU times: user 1min 9s, sys: 0 ns, total: 1min 9s
Wall time: 1min 9s
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 3.64 ms


In [6]:
df = pd.read_csv('grids_full_year_2500_v2.tsv', sep='\t')
df.describe()

Unnamed: 0,crime_freq,yelp_freq,police_freq
count,912500.0,912500.0,912500.0
mean,0.488268,5.1756,0.0092
std,2.139128,34.54762,0.095474
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,63.0,957.0,1.0


# Adding KDE based spatial features

In [7]:
def parse_location(loc):
    loc = loc.strip("()").split(',')
    lat = loc[0].strip()
    long = loc[1].strip()
    return float(lat), float(long)

def parse_cell_range(loc):
    loc = loc.strip("()").split('), (')
    ll = parse_location(loc[0].strip())
    ur = parse_location(loc[1].strip())
    return tuple(ll), tuple(ur)

df['lat'] = df.cell_range.apply(lambda x: (parse_cell_range(x)[0][0]+parse_cell_range(x)[1][0])/2)
df['long'] = df.cell_range.apply(lambda x: (parse_cell_range(x)[0][1]+parse_cell_range(x)[1][1])/2)
df.head()

Unnamed: 0,cell_range,timestamp,crime_freq,yelp_freq,police_freq,lat,long
0,"((41.5487, -88.3713), (41.560078, -88.345754))",2006-01-01,0,0,0,41.554389,-88.358527
1,"((41.5487, -88.345754), (41.560078, -88.320208...",2006-01-01,0,0,0,41.554389,-88.332981
2,"((41.5487, -88.32020800000001), (41.560078, -8...",2006-01-01,0,0,0,41.554389,-88.307435
3,"((41.5487, -88.294662), (41.560078, -88.269116...",2006-01-01,0,0,0,41.554389,-88.281889
4,"((41.5487, -88.26911600000001), (41.560078, -8...",2006-01-01,0,0,0,41.554389,-88.256343


In [8]:
%time df['police_factor'] = get_weights(df, get_kde(police_df, bandwidth=0.0008))

CPU times: user 13.1 s, sys: 0 ns, total: 13.1 s
Wall time: 13.1 s


In [9]:
%time df['yelp_factor'] = get_weights(df, get_kde(yelp_df, bandwidth=0.0008, drop_duplicates=False))

CPU times: user 22min 35s, sys: 892 ms, total: 22min 36s
Wall time: 22min 34s


In [10]:
# %time crime_kde = get_kde(crime_df, bandwidth=0.0008, drop_duplicates=False)

In [11]:
# %time get_weights(df.head(), crime_kde)

In [12]:
# %time df['crime_factor'] = get_weights(df, crime_kde)

In [18]:
print('hi')

hi


In [19]:
%time df.to_csv('features_full_year_2500.tsv', sep='\t', index=False)
%time df = pd.read_csv('features_full_year_2500.tsv', sep='\t', parse_dates=['timestamp'])

CPU times: user 18.1 s, sys: 88 ms, total: 18.2 s
Wall time: 19.5 s
CPU times: user 1.91 s, sys: 20 ms, total: 1.93 s
Wall time: 1.93 s


# Adding temporal features

In [21]:
df.tail()

Unnamed: 0,cell_range,timestamp,crime_freq,yelp_freq,police_freq,lat,long,police_factor,yelp_factor
912495,"((42.106222, -87.22173), (42.1176, -87.1961839...",2006-12-31,0,0,0,42.111911,-87.208957,4.879026e-10,1.166006e-09
912496,"((42.106222, -87.19618399999999), (42.1176, -8...",2006-12-31,0,0,0,42.111911,-87.183411,2.087249e-11,1.626442e-10
912497,"((42.106222, -87.170638), (42.1176, -87.145091...",2006-12-31,0,0,0,42.111911,-87.157865,7.554142e-13,2.534506e-13
912498,"((42.106222, -87.14509199999999), (42.1176, -8...",2006-12-31,0,0,0,42.111911,-87.132319,2.313308e-14,5.294079e-11
912499,"((42.106222, -87.119546), (42.1176, -87.094))",2006-12-31,0,0,0,42.111911,-87.106773,5.994916e-16,1.012007e-15


In [23]:
def get_previous_day_crime_freq(self, main_df):
    prev_day = main_df[main_df.cell_range == self.cell_range][main_df.timestamp == \
                                                              (self.timestamp - datetime.timedelta(1))]['crime_freq']
    if prev_day.empty:
        return None
    else:
        return prev_day.values[0]
    
    
def get_previous_k_days_crime_freq(self, main_df, k):
    prev_day = main_df[main_df.cell_range == self.cell_range]\
                [main_df.timestamp >= (self.timestamp - datetime.timedelta(k))]\
                        [main_df.timestamp < self.timestamp]['crime_freq']
    if prev_day.empty:
        return None
    else:
        return prev_day.mean()

In [25]:
# %time df['prev_day_crime_freq'] = df.apply(lambda x: get_previous_day_crime_freq(x, df), axis=1)
# %time df['prev_7_days_crime_freq'] = df.apply(lambda x: get_previous_k_days_crime_freq(x, df,7), axis=1)

In [None]:
# %time df.to_csv('features_temporal_full_year_2500.tsv', sep='\t', index=False)

In [35]:
feature_df = pd.read_csv('features_full_year_temporal_features_2500.tsv', sep='\t')
feature_df.describe()

Unnamed: 0,crime_freq,yelp_freq,police_freq,lat,long,police_factor,yelp_factor,avg_one_week,prev_day_crime
count,912500.0,912500.0,912500.0,912500.0,912500.0,912500.0,912500.0,895000.0,910000.0
mean,0.488268,5.1756,0.0092,41.833038,-87.720383,6612.75,6611.821,0.488941,0.488588
std,2.139128,34.54762,0.095474,0.164193,0.357876,15599.98,18660.1,2.009222,2.140379
min,0.0,0.0,0.0,41.554389,-88.358527,7.041842999999999e-26,1.012007e-15,0.0,0.0
25%,0.0,0.0,0.0,41.690925,-88.026429,2.674969e-06,2.056975,0.0,0.0
50%,0.0,0.0,0.0,41.830306,-87.726264,1.873981,33.09285,0.0,0.0
75%,0.0,0.0,0.0,41.975375,-87.413325,2256.473,1888.561,0.0,0.0
max,63.0,957.0,1.0,42.111911,-87.106773,78997.88,127526.4,27.714286,63.0


In [36]:
%time weather_df = get_weather_data_for_year(2006)
weather_df.head()

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 3 ms


Unnamed: 0,timestamp,PRCP,SNOW,TMAX,TMIN
0,2006-01-01,0.8,0,7.2,-3.3
1,2006-01-02,9.4,0,5.6,4.4
2,2006-01-03,0.0,0,5.6,3.9
3,2006-01-04,0.5,0,6.1,2.2
4,2006-01-05,0.0,0,2.8,-0.6


In [45]:
new_feature_df = pd.merge(feature_df, weather_df, on='timestamp', how='left')
new_feature_df.rename(columns={
        'prev_day_crime': 'prev_day_crime_freq',
        'avg_one_week': 'prev_7_days_crime_freq',
    }, inplace=True)

In [48]:
new_feature_df.describe()

Unnamed: 0,crime_freq,yelp_freq,police_freq,lat,long,police_factor,yelp_factor,prev_7_days_crime_freq,prev_day_crime_freq,PRCP,SNOW,TMAX,TMIN
count,912500.0,912500.0,912500.0,912500.0,912500.0,912500.0,912500.0,895000.0,910000.0,912500.0,912500.0,912500.0,912500.0
mean,0.488268,5.1756,0.0092,41.833038,-87.720383,6612.75,6611.821,0.488941,0.488588,2.921096,1.405479,15.898082,6.308493
std,2.139128,34.54762,0.095474,0.164193,0.357876,15599.98,18660.1,2.009222,2.140379,7.50011,10.512619,10.456403,9.397525
min,0.0,0.0,0.0,41.554389,-88.358527,7.041842999999999e-26,1.012007e-15,0.0,0.0,0.0,0.0,-13.3,-21.7
25%,0.0,0.0,0.0,41.690925,-88.026429,2.674969e-06,2.056975,0.0,0.0,0.0,0.0,6.7,-1.1
50%,0.0,0.0,0.0,41.830306,-87.726264,1.873981,33.09285,0.0,0.0,0.0,0.0,15.6,5.0
75%,0.0,0.0,0.0,41.975375,-87.413325,2256.473,1888.561,0.0,0.0,1.3,0.0,25.0,14.4
max,63.0,957.0,1.0,42.111911,-87.106773,78997.88,127526.4,27.714286,63.0,52.1,147.0,37.2,26.7


In [49]:
new_feature_df.to_csv('features_temporal_full_year_with_weather_2500_final.tsv', sep='\t', index=False)