### Import our libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

### Read our csv files into a dataframe

In [101]:
rawCallData = pd.read_csv('Data/Seattle_Police_Department_911_Incident_Response.csv')

rawWeatherData = pd.read_csv('Data/Seattle_Weather.csv')

rawEventData = pd.read_csv('Data/Special_Events_Permits.csv')

  interactivity=interactivity, compiler=compiler, result=result)


### Lets take a look at our data frames to see what we've got 

In [77]:
rawCallData.head()

Unnamed: 0,CAD CDW ID,CAD Event Number,General Offense Number,Event Clearance Code,Event Clearance Description,Event Clearance SubGroup,Event Clearance Group,Event Clearance Date,Hundred Block Location,District/Sector,Zone/Beat,Census Tract,Longitude,Latitude,Incident Location,Initial Type Description,Initial Type Subgroup,Initial Type Group,At Scene Time
0,﻿15736,10000246357,2010246357,242.0,FIGHT DISTURBANCE,DISTURBANCES,DISTURBANCES,07/17/2010 08:49:00 PM,3XX BLOCK OF PINE ST,M,M2,8100.2001,-122.338147,47.610975,"(47.610975163, -122.338146748)",,,,
1,15737,10000246471,2010246471,65.0,THEFT - MISCELLANEOUS,THEFT,OTHER PROPERTY,07/17/2010 08:50:00 PM,36XX BLOCK OF DISCOVERY PARK BLVD,Q,Q1,5700.1012,-122.404613,47.658325,"(47.658324899, -122.404612874)",,,,
2,15738,10000246255,2010246255,250.0,"MISCHIEF, NUISANCE COMPLAINTS","NUISANCE, MISCHIEF COMPLAINTS","NUISANCE, MISCHIEF",07/17/2010 08:55:00 PM,21XX BLOCK OF 3RD AVE,M,M2,7200.2025,-122.342843,47.613551,"(47.613551471, -122.342843234)",,,,
3,15739,10000246473,2010246473,460.0,TRAFFIC (MOVING) VIOLATION,TRAFFIC RELATED CALLS,TRAFFIC RELATED CALLS,07/17/2010 09:00:00 PM,7XX BLOCK OF ROY ST,D,D1,7200.1002,-122.341847,47.625401,"(47.625401388, -122.341846999)",,,,
4,15740,10000246330,2010246330,250.0,"MISCHIEF, NUISANCE COMPLAINTS","NUISANCE, MISCHIEF COMPLAINTS","NUISANCE, MISCHIEF",07/17/2010 09:00:00 PM,9XX BLOCK OF ALOHA ST,D,D1,6700.1009,-122.339709,47.627425,"(47.627424837, -122.339708605)",,,,


### Let's start by cleaning up those column names

In [102]:
rawCallData.columns = rawCallData.columns.str.lower().str.replace(" ", "_").str.replace("/", "_") 
# Remove the white space and slashes in our column names
rawCallData.columns # Check our work

Index(['cad_cdw_id', 'cad_event_number', 'general_offense_number',
       'event_clearance_code', 'event_clearance_description',
       'event_clearance_subgroup', 'event_clearance_group',
       'event_clearance_date', 'hundred_block_location', 'district_sector',
       'zone_beat', 'census_tract', 'longitude', 'latitude',
       'incident_location', 'initial_type_description',
       'initial_type_subgroup', 'initial_type_group', 'at_scene_time'],
      dtype='object')

### Thats better, now lets examine the NaN values

In [79]:
print(rawCallData.isnull().sum())

cad_cdw_id                           0
cad_event_number                     0
general_offense_number               0
event_clearance_code             10797
event_clearance_description      10798
event_clearance_subgroup         10798
event_clearance_group            10798
event_clearance_date             10951
hundred_block_location            3487
district_sector                   1162
zone_beat                            1
census_tract                      2792
longitude                            1
latitude                             1
incident_location                    1
initial_type_description        577813
initial_type_subgroup           577813
initial_type_group              577813
at_scene_time                  1029344
dtype: int64


### We have a significant number of records missing some information
They seem to be clustered. There is a cluster of 10,790+ records missing event data, 577,813 records missing initial data and most of the records are missing the at scene time. The event clearance and initial type columns seem to all describe the same data, what the call was about. The at scene time and call clearance date similiarly overlap. The bad news is, most of the missing data pertains to what we care about, which is what happened and when. They good news is these columns seem to provide redundant information, so we can use one to impute the other. Let's see if there are any that provide no relevant information.

In [103]:
mask = (rawCallData.event_clearance_code.isnull() == True)  & (rawCallData.initial_type_subgroup.isnull() == True)
noEvent = rawCallData[mask] # See if we have any NaN in both description and intitial 
print(noEvent.shape)
print(len(noEvent.event_clearance_subgroup.isnull())) # Verify if they all miss clearance subgroup as well
print(len(noEvent.initial_type_subgroup.isnull())) # Verify if they all miss initial subgroup as well

(931, 19)
931
931


### We have 931 records with no event descriptor, we will have to remove them

In [106]:
print(rawCallData.shape)
tempDF = rawCallData.merge(noEvent, how='left', indicator=True) # Merge our subset and original with an indicator column
tempDF = tempDF[tempDF['_merge'] == 'left_only'] # Select those which are not duplicated
rawCallData = tempDF # Make that change to our original DF
del rawCallData['_merge'] # Clean up
del tempDF
print(rawCallData.shape) # Verify our subtraction

(1444135, 19)
(1444135, 19)


### Now we can start to add clearance codes for the records that have a description but no code
We will start by finding initial type descriptions that share an exact match with an event clearance description. Then we will use that shared description to find the code and impute it on the records with only an initial description.

In [124]:
initDesc = rawCallData.initial_type_description.dropna().unique() # Generate lists of the unique descriptions 
eventDesc = rawCallData.event_clearance_description.dropna().unique()
sharedDesc = np.intersect1d(initDesc, eventDesc) # Find our overlap
sharedDesc # This is not a very encouraging list length


array(['ABANDONED VEHICLE', 'AUTO RECOVERY', 'GAMBLING',
       'HARBOR - WATER EMERGENCIES', 'PROWLER',
       'SEX OFFENDER - FAILURE TO REGISTER', 'TRESPASS'], dtype=object)

### To find our codes
We will run a loop that looks up the first value in the series of clearance codes generated for each match between our list of overlapping codes sharedDesc and rawCallData.event_clearance_description, save that to a list and then create a dictionary of key value pairs from our description array, sharedDesc and our codes list descCode.

In [127]:
descCode = []
for i in sharedDesc:
    descCode.append(rawCallData.loc[rawCallData['event_clearance_description'] == i, 'event_clearance_code'].iloc[0])

codeDict = dict(zip(sharedDesc, descCode))

In [128]:
codeDict

{'ABANDONED VEHICLE': 410.0,
 'AUTO RECOVERY': 73.0,
 'GAMBLING': 121.0,
 'HARBOR - WATER EMERGENCIES': 342.0,
 'PROWLER': 160.0,
 'SEX OFFENDER - FAILURE TO REGISTER': 143.0,
 'TRESPASS': 161.0}

In [130]:
rawCallData['event_clearance_code'] = rawCallData['initial_type_description'].map(codeDict)

In [131]:
rawCallData.event_clearance_code.isnull().sum()

1407869