# Data Preprocessing

This notebook cleans, reduces, transforms, integrates, and discretizes the crime data to produce the final data set.

In [1]:
import pandas as pd
import numpy as np
from math import sin, cos, sqrt, atan2, radians
import pickle

## Methods

In [None]:
def discretizeTime(time):
    '''
    Bucket a 24-hour 'HH:MM' clock string into one of six 4-hour slots.

    Slots are half-open: T1 = [00:00, 04:00), T2 = [04:00, 08:00), ...,
    T6 = [20:00, 24:00). The comparison is lexicographic on the string
    itself, so the value has to be zero-padded ('09:30', not '9:30') for
    the slot to be meaningful.

    Returns the slot label, or None when the value falls in no slot.
    '''
    slots = (
        ('00:00', '04:00', 'T1'),
        ('04:00', '08:00', 'T2'),
        ('08:00', '12:00', 'T3'),
        ('12:00', '16:00', 'T4'),
        ('16:00', '20:00', 'T5'),
        ('20:00', '24:00', 'T6'),
    )
    for start, end, label in slots:
        if start <= time < end:
            return label
    return None

In [None]:
def categorizeResolution(status):
    '''
    Collapse a resolution status into a Yes/No "resolved" flag.

    Only the exact, case-sensitive value 'NONE' counts as unresolved and
    yields 'No'; every other value yields 'Yes'.
    '''
    return 'No' if status == 'NONE' else 'Yes'

In [None]:
def calculateDistance(src, dst):
    '''
    Great-circle (haversine) distance in miles between two points on earth.

    src, dst: (latitude, longitude) tuples in signed decimal degrees
              (south latitudes and west longitudes are negative).

    Returns the distance as a float, in miles.

    The coordinates are used with their sign. Taking the absolute value
    would fold the southern/western hemispheres onto the northern/eastern
    ones and under-report the distance between points that lie on opposite
    sides of the equator or the prime meridian.
    '''
    # approximate radius of earth in km
    R = 6373.0

    # approximate 1 km to miles conversion
    to_miles = 0.621371

    lat1, lon1 = radians(src[0]), radians(src[1])
    lat2, lon2 = radians(dst[0]), radians(dst[1])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Floating-point rounding can leave `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c * to_miles

In [None]:
def isNear(location, data, radius):
    '''
    Report whether `location` lies within `radius` of any row in `data`.

    location: (latitude, longitude) tuple.
    data:     dataframe with 'latitude' and 'longitude' columns.
    radius:   threshold in the unit calculateDistance returns (miles).

    Returns 'Yes' as soon as one row is close enough, otherwise 'No'
    (including when `data` has no rows).
    '''
    within_radius = any(
        calculateDistance(location, (row['latitude'], row['longitude'])) <= radius
        for _, row in data.iterrows()
    )
    return 'Yes' if within_radius else 'No'

## Process

In [2]:
# Load the crime records; the path is relative to the notebook's working directory.
crime_data = pd.read_csv('data/crime_data.csv')

In [3]:
# Preview the columns exactly as loaded, before any cleaning.
crime_data.head()

Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location,PdId
0,150060275,NON-CRIMINAL,LOST PROPERTY,Monday,01/19/2015,14:00,MISSION,NONE,18TH ST / VALENCIA ST,-122.421582,37.761701,"(37.7617007179518, -122.42158168137)",15006027571000
1,150098210,ROBBERY,"ROBBERY, BODILY FORCE",Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821003074
2,150098210,ASSAULT,AGGRAVATED ASSAULT WITH BODILY FORCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821004014
3,150098210,SECONDARY CODES,DOMESTIC VIOLENCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821015200
4,150098226,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Tuesday,01/27/2015,19:00,NORTHERN,NONE,LOMBARD ST / LAGUNA ST,-122.431119,37.800469,"(37.8004687042875, -122.431118543788)",15009822628160


In [4]:
# Fill missing values with the 'N/A' sentinel, drop the IncidntNum, Descript,
# Location and PdId columns, then keep only rows that have a police district
# and are not tagged NON-CRIMINAL.
crime_data = (
    crime_data
    .replace(np.nan, 'N/A')
    .drop(columns=['IncidntNum', 'Descript', 'Location', 'PdId'])
    .loc[lambda frame: (frame['PdDistrict'] != 'N/A') & (frame['Category'] != 'NON-CRIMINAL')]
)

In [5]:
# Lowercase every column label first, then give the less readable names an alias.
# NOTE: index=str also converts the row labels to strings.
column_aliases = {
    "dayofweek": "day",
    "pddistrict": "district",
    "address": "location",
    "x": "longitude",
    "y": "latitude",
}
crime_data = (
    crime_data
    .rename(columns=str.lower)
    .rename(index=str, columns=column_aliases)
)

In [6]:
# Check that the dropped columns are gone and the remaining ones carry their new lowercase names.
crime_data.head()

Unnamed: 0,category,day,date,time,district,resolution,location,longitude,latitude
1,ROBBERY,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191
2,ASSAULT,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191
3,SECONDARY CODES,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191
4,VANDALISM,Tuesday,01/27/2015,19:00,NORTHERN,NONE,LOMBARD ST / LAGUNA ST,-122.431119,37.800469
6,SECONDARY CODES,Saturday,01/31/2015,21:00,BAYVIEW,NONE,700 Block of KIRKWOOD AV,-122.374019,37.729203


In [7]:
# 'date' is an 'MM/DD/YYYY' string: keep the month and year pieces as their
# own columns (still strings) and discard the original column.
date = crime_data['date'].str.split('/')
month = date.apply(lambda parts: parts[0])
year = date.apply(lambda parts: parts[2])
crime_data = crime_data.assign(month=month, year=year).drop(columns='date')

In [8]:
# Check that `date` has been replaced by the derived `month` and `year` columns.
crime_data.head()

Unnamed: 0,category,day,time,district,resolution,location,longitude,latitude,month,year
1,ROBBERY,Sunday,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,2,2015
2,ASSAULT,Sunday,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,2,2015
3,SECONDARY CODES,Sunday,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,2,2015
4,VANDALISM,Tuesday,19:00,NORTHERN,NONE,LOMBARD ST / LAGUNA ST,-122.431119,37.800469,1,2015
6,SECONDARY CODES,Saturday,21:00,BAYVIEW,NONE,700 Block of KIRKWOOD AV,-122.374019,37.729203,1,2015


In [9]:
# Tag every record with the 4-hour slot (T1..T6) its time of day falls into.
interval = crime_data['time'].apply(discretizeTime)
crime_data = crime_data.assign(interval=interval)

In [10]:
# Reduce the free-form resolution to a Yes/No `resolved` flag and drop the source column.
resolved = crime_data['resolution'].apply(categorizeResolution)
crime_data = crime_data.assign(resolved=resolved).drop(columns='resolution')

In [11]:
# Check the derived `interval` and `resolved` columns; `resolution` should no longer be present.
crime_data.head()

Unnamed: 0,category,day,time,district,location,longitude,latitude,month,year,interval,resolved
1,ROBBERY,Sunday,15:45,TENDERLOIN,300 Block of LEAVENWORTH ST,-122.414406,37.784191,2,2015,T4,No
2,ASSAULT,Sunday,15:45,TENDERLOIN,300 Block of LEAVENWORTH ST,-122.414406,37.784191,2,2015,T4,No
3,SECONDARY CODES,Sunday,15:45,TENDERLOIN,300 Block of LEAVENWORTH ST,-122.414406,37.784191,2,2015,T4,No
4,VANDALISM,Tuesday,19:00,NORTHERN,LOMBARD ST / LAGUNA ST,-122.431119,37.800469,1,2015,T5,No
6,SECONDARY CODES,Saturday,21:00,BAYVIEW,700 Block of KIRKWOOD AV,-122.374019,37.729203,1,2015,T6,No


In [13]:
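# Disabled smoke test of isNear: compares the first five rows against the first four rows using a radius of 1 (miles, the unit calculateDistance returns).
#near_crime = crime_data.iloc[:5,:].apply(lambda row: isNear((row['latitude'], row['longitude']), crime_data.iloc[:4,:], 1), axis=1)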