# Data Preprocessing

This notebook is used to clean, reduce, transform, integrate and discretize the final data set.

In [1]:
import pandas as pd
import numpy as np
from math import sin, cos, sqrt, atan2, radians
import pickle

## Methods

In [2]:
def discretizeTime(time):
    '''
    Map a 24-hour 'HH:MM' time string onto one of six 4-hour interval labels.

    The intervals are half-open: 't1' is ['00:00', '04:00'), 't2' is
    ['04:00', '08:00'), and so on up to 't6' for ['20:00', '24:00').
    Values are compared as plain strings, so they are expected to be
    zero-padded. Returns None when the value lies outside every interval.
    '''
    # anything sorting below '00:00' belongs to no interval
    if not '00:00' <= time:
        return None

    # (exclusive upper bound, label) pairs in ascending order; the first
    # bound the value falls under decides the label
    upper_bounds = (
        ('04:00', 't1'),
        ('08:00', 't2'),
        ('12:00', 't3'),
        ('16:00', 't4'),
        ('20:00', 't5'),
        ('24:00', 't6'),
    )
    for upper, label in upper_bounds:
        if time < upper:
            return label
    return None

In [3]:
def categorizeResolution(status):
    '''
    Flag whether a crime report was resolved.

    Returns 'no' when the resolution status is exactly 'NONE' and 'yes'
    for any other value.
    '''
    return 'no' if status == 'NONE' else 'yes'

In [4]:
def calculateDistance(src, dst):
    '''
    function to calculate the great-circle (haversine) distance between two
    locations on earth.

    src, dst: tuples in the format (latitude, longitude), given in signed
              decimal degrees (south and west are negative).
    returns:  the distance between the two locations in miles (float).
    '''
    # approximate radius of earth in km
    R = 6373.0

    # approximate 1 km to miles conversion
    to_miles = 0.621371

    # Keep the sign of every coordinate. Taking abs() would fold all four
    # hemispheres onto one quadrant, so e.g. (10, 0) and (-10, 0) would be
    # reported as the same place.
    lat1, lon1 = radians(src[0]), radians(src[1])
    lat2, lon2 = radians(dst[0]), radians(dst[1])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # guard against floating point drift pushing a outside [0, 1], which
    # would make sqrt(1 - a) fail for (near-)antipodal points
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c * to_miles

In [5]:
def isNear(location, data, radius):
    '''
    Tell whether a location lies within a radius of any row in a dataframe.

    location: (latitude, longitude) tuple.
    data:     dataframe whose rows carry 'latitude' and 'longitude' columns.
    radius:   threshold compared against the result of calculateDistance.

    Returns 'yes' as soon as one row is within the radius (inclusive),
    otherwise 'no'.
    '''
    # lazy generator, so any() stops at the first row that is close enough
    within_radius = (
        calculateDistance(location, (entry['latitude'], entry['longitude'])) <= radius
        for _, entry in data.iterrows()
    )
    return 'yes' if any(within_radius) else 'no'

In [6]:
def labelCategory(category):
    '''
    Look up the label ('low', 'moderate' or 'high') assigned to a crime category.

    The category is matched exactly against the lowercase names below.
    Returns None when the category is not listed in any group.
    '''
    # (label, categories) pairs, checked in this order
    groups = (
        ('low', ["trespass", "bribery", "bad checks", "drunkenness", "suicide", "runaway", "family offenses", "loitering", "trea", "liquor laws", "warrants", "other offenses", "forgery/counterfeiting", "sex offenses, non forcible"]),
        ('moderate', ["arson", "driving under the influence", "stolen property", "prostitution", "recovered vehicle", "suspicious occ", "pornography/obscene mat", "disorderly conduct"]),
        ('high', ["vehicle theft", "weapon laws", "vandalism", "assault", "robbery", "sex offenses, forcible", "missing person", "larceny/theft", "kidnapping", "fraud", "extortion", "burglary", "drug/narcotic"]),
    )
    for label, members in groups:
        if category in members:
            return label
    return None

## Process

In [7]:
# Load the raw crime records from CSV; the path is relative to the working directory.
crime_data = pd.read_csv('data/crime_data.csv')

Clean the data. Replace 'nan' values with 'N/A'. Drop columns that do not help with the goal. Remove rows that do not fall under the goal criteria.

In [8]:
# Fill missing values with the 'N/A' marker and drop the columns that are not needed.
crime_data = (
    crime_data
    .replace(np.nan, 'N/A')
    .drop(columns=['IncidntNum', 'Descript', 'Location', 'PdId'])
)
# Keep only rows that have a police district and whose category is not excluded.
crime_data = crime_data[
    (crime_data['PdDistrict'] != 'N/A')
    & ~crime_data['Category'].isin(['NON-CRIMINAL', 'SECONDARY CODES', 'GAMBLING', 'EMBEZZLEMENT'])
]

Modify the column names and values to match the scenario and necessity.

In [9]:
# Lowercase every column name, then give a few columns friendlier names.
# The second rename also converts the index labels to strings (index=str).
crime_data = (
    crime_data
    .rename(columns=str.lower)
    .rename(index=str, columns={"dayofweek": "day", "pddistrict": "district", "x": "longitude", "y": "latitude"})
)
# Lowercase the values of the text columns.
crime_data = crime_data.assign(**{
    name: crime_data[name].apply(str.lower)
    for name in ('category', 'day', 'district', 'address')
})

Split the 'date' column into 'month' and 'year' for better classification.

In [10]:
# Split each date on '/' and take the first piece as the month and the third as the year.
date = crime_data['date'].str.split('/')
month = date.apply(lambda parts: parts[0])
year = date.apply(lambda parts: parts[2])
# Append both new columns and discard the original 'date' column.
crime_data = crime_data.assign(month=month, year=year).drop(columns='date')

Discretize the 'time' column into six 4-hour interval classes ('t1' to 't6').

In [11]:
# Map every time value to its 4-hour interval label and append it as the 'interval' column.
interval = crime_data['time'].apply(discretizeTime)
crime_data = crime_data.assign(interval=interval)

Create a 'resolved' column to represent if a crime report was resolved or not.

In [12]:
# Derive the yes/no 'resolved' flag from 'resolution', then drop the source column.
resolved = crime_data['resolution'].apply(categorizeResolution)
crime_data = crime_data.assign(resolved=resolved).drop(columns='resolution')

Externally label each data point as 'low', 'moderate' or 'high' according to its crime category, to reflect what is to be achieved.

In [13]:
# Look up the label for every crime category and append it as the 'label' column.
label = crime_data['category'].apply(labelCategory)
crime_data = crime_data.assign(label=label)

In [14]:
# Preview the first rows to check the columns produced so far.
crime_data.head()

Unnamed: 0,category,day,time,district,address,longitude,latitude,month,year,interval,resolved,label
1,robbery,sunday,15:45,tenderloin,300 block of leavenworth st,-122.414406,37.784191,2,2015,t4,no,high
2,assault,sunday,15:45,tenderloin,300 block of leavenworth st,-122.414406,37.784191,2,2015,t4,no,high
4,vandalism,tuesday,19:00,northern,lombard st / laguna st,-122.431119,37.800469,1,2015,t5,no,high
7,vandalism,saturday,21:00,bayview,700 block of kirkwood av,-122.374019,37.729203,1,2015,t6,no,high
8,burglary,saturday,16:09,central,200 block of stockton st,-122.406568,37.787809,1,2015,t5,no,high


Integrate support datasets into crime data.

In [15]:
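# NOTE: the integration step is currently disabled. The commented-out line below is a
# small-sample experiment: it flags each of the first 5 rows as near / not near any of
# the first 4 rows by calling isNear with a radius of 1.
#near_crime = crime_data.iloc[:5,:].apply(lambda row: isNear((row['latitude'], row['longitude']), crime_data.iloc[:4,:], 1), axis=1)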

Save the dataframe as a pickle file in the 'store' directory.

In [16]:
# Write the processed dataframe to a pickle file; the path is relative to the working directory.
# NOTE(review): assumes the 'store' directory already exists — confirm.
crime_data.to_pickle('store/crime_data.pkl')