### Import our libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

### Read our csv files into a dataframe

In [2]:
rawCallData = pd.read_csv('Data/Seattle_Police_Department_911_Incident_Response.csv')

  interactivity=interactivity, compiler=compiler, result=result)


### An error right out of the gate, let see what the problem is

In [3]:
rawCallData.dtypes

CAD CDW ID                      object
CAD Event Number                 int64
General Offense Number           int64
Event Clearance Code           float64
Event Clearance Description     object
Event Clearance SubGroup        object
Event Clearance Group           object
Event Clearance Date            object
Hundred Block Location          object
District/Sector                 object
Zone/Beat                       object
Census Tract                   float64
Longitude                      float64
Latitude                       float64
Incident Location               object
Initial Type Description        object
Initial Type Subgroup           object
Initial Type Group              object
At Scene Time                   object
dtype: object

### Let's start by cleaning up those column names in rawCallData

In [4]:
rawCallData.columns = rawCallData.columns.str.lower().str.replace(" ", "_").str.replace("/", "_") 
# Remove the white space and slashes in our column names
rawCallData.columns # Check our work

Index(['cad_cdw_id', 'cad_event_number', 'general_offense_number',
       'event_clearance_code', 'event_clearance_description',
       'event_clearance_subgroup', 'event_clearance_group',
       'event_clearance_date', 'hundred_block_location', 'district_sector',
       'zone_beat', 'census_tract', 'longitude', 'latitude',
       'incident_location', 'initial_type_description',
       'initial_type_subgroup', 'initial_type_group', 'at_scene_time'],
      dtype='object')

### That's better,  but an attempt to coerce the datatype leads us to the problems,
Let's take a minute and fix these

In [5]:
rawCallData.loc[rawCallData.cad_cdw_id == 'ï»¿1875911']

Unnamed: 0,cad_cdw_id,cad_event_number,general_offense_number,event_clearance_code,event_clearance_description,event_clearance_subgroup,event_clearance_group,event_clearance_date,hundred_block_location,district_sector,zone_beat,census_tract,longitude,latitude,incident_location,initial_type_description,initial_type_subgroup,initial_type_group,at_scene_time
857606,ï»¿1875911,14000012658,201412658,41.0,"HARASSMENT, THREATS","THREATS, HARASSMENT","THREATS, HARASSMENT",01/13/2014 12:51:00 AM,39XX BLOCK OF S JUNEAU ST,S,S1,10300.3006,-122.282103,47.549712,"(47.549711879, -122.282103194)",THREATS (INCLS IN-PERSON/BY PHONE/IN WRITING),"THREATS, HARASSMENT","THREATS, HARASSMENT",01/12/2014 08:00:00 PM


In [6]:
rawCallData.loc[857606,'cad_cdw_id'] = '1875911'

In [7]:
rawCallData.loc[rawCallData.cad_cdw_id == '\ufeff15736']

Unnamed: 0,cad_cdw_id,cad_event_number,general_offense_number,event_clearance_code,event_clearance_description,event_clearance_subgroup,event_clearance_group,event_clearance_date,hundred_block_location,district_sector,zone_beat,census_tract,longitude,latitude,incident_location,initial_type_description,initial_type_subgroup,initial_type_group,at_scene_time
0,﻿15736,10000246357,2010246357,242.0,FIGHT DISTURBANCE,DISTURBANCES,DISTURBANCES,07/17/2010 08:49:00 PM,3XX BLOCK OF PINE ST,M,M2,8100.2001,-122.338147,47.610975,"(47.610975163, -122.338146748)",,,,


In [8]:
rawCallData.loc[0, 'cad_cdw_id'] = '15736'

In [9]:
rawCallData.loc[rawCallData.cad_cdw_id == 'ï»¿880295']

Unnamed: 0,cad_cdw_id,cad_event_number,general_offense_number,event_clearance_code,event_clearance_description,event_clearance_subgroup,event_clearance_group,event_clearance_date,hundred_block_location,district_sector,zone_beat,census_tract,longitude,latitude,incident_location,initial_type_description,initial_type_subgroup,initial_type_group,at_scene_time
926010,ï»¿880295,12000276191,2012276191,71.0,AUTO THEFT,AUTO THEFTS,AUTO THEFTS,08/19/2012 03:16:00 PM,18XX BLOCK OF EASTLAKE AVE E,D,D2,6600.2,-122.325401,47.63537,"(47.635369535, -122.325400816)",AUTO THEFT - VEH THEFT OR THEFT & RECOVERY,AUTO THEFTS,AUTO RECOVERIES,08/19/2012 02:38:00 PM


In [10]:
rawCallData.loc[926010, 'cad_cdw_id'] = '880295'

In [11]:
rawCallData.cad_cdw_id = rawCallData.cad_cdw_id.astype('int64', errors='raise') # Coerce the data to int64

### Fixed that, on we go to our next dataframe

In [12]:
rawWeatherData = pd.read_csv('Data/Seattle_Weather.csv')

### Lets take a look at our data frames to see what we've got 

In [13]:
rawCallData.head()

Unnamed: 0,cad_cdw_id,cad_event_number,general_offense_number,event_clearance_code,event_clearance_description,event_clearance_subgroup,event_clearance_group,event_clearance_date,hundred_block_location,district_sector,zone_beat,census_tract,longitude,latitude,incident_location,initial_type_description,initial_type_subgroup,initial_type_group,at_scene_time
0,15736,10000246357,2010246357,242.0,FIGHT DISTURBANCE,DISTURBANCES,DISTURBANCES,07/17/2010 08:49:00 PM,3XX BLOCK OF PINE ST,M,M2,8100.2001,-122.338147,47.610975,"(47.610975163, -122.338146748)",,,,
1,15737,10000246471,2010246471,65.0,THEFT - MISCELLANEOUS,THEFT,OTHER PROPERTY,07/17/2010 08:50:00 PM,36XX BLOCK OF DISCOVERY PARK BLVD,Q,Q1,5700.1012,-122.404613,47.658325,"(47.658324899, -122.404612874)",,,,
2,15738,10000246255,2010246255,250.0,"MISCHIEF, NUISANCE COMPLAINTS","NUISANCE, MISCHIEF COMPLAINTS","NUISANCE, MISCHIEF",07/17/2010 08:55:00 PM,21XX BLOCK OF 3RD AVE,M,M2,7200.2025,-122.342843,47.613551,"(47.613551471, -122.342843234)",,,,
3,15739,10000246473,2010246473,460.0,TRAFFIC (MOVING) VIOLATION,TRAFFIC RELATED CALLS,TRAFFIC RELATED CALLS,07/17/2010 09:00:00 PM,7XX BLOCK OF ROY ST,D,D1,7200.1002,-122.341847,47.625401,"(47.625401388, -122.341846999)",,,,
4,15740,10000246330,2010246330,250.0,"MISCHIEF, NUISANCE COMPLAINTS","NUISANCE, MISCHIEF COMPLAINTS","NUISANCE, MISCHIEF",07/17/2010 09:00:00 PM,9XX BLOCK OF ALOHA ST,D,D1,6700.1009,-122.339709,47.627425,"(47.627424837, -122.339708605)",,,,


In [14]:
rawWeatherData.head()

Unnamed: 0,dt,dt_iso,city_id,city_name,lat,lon,temp,temp_min,temp_max,pressure,...,rain_today,snow_1h,snow_3h,snow_24h,snow_today,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1349096400,2012-10-01 13:00:00 +0000 UTC,5809844,,,,281.8,278.15,287.59,1027,...,,,,,,1,800,Clear,sky is clear,01n
1,1349186400,2012-10-02 14:00:00 +0000 UTC,5809844,,,,281.62,278.15,286.48,1046,...,,,,,,66,800,Clear,sky is clear,02d
2,1349190000,2012-10-02 15:00:00 +0000 UTC,5809844,,,,282.71,279.82,289.82,1026,...,,,,,,1,800,Clear,sky is clear,01d
3,1349193600,2012-10-02 16:00:00 +0000 UTC,5809844,,,,285.05,281.48,293.15,1026,...,,,,,,1,800,Clear,sky is clear,01d
4,1349197200,2012-10-02 17:00:00 +0000 UTC,5809844,,,,287.97,282.59,296.48,1027,...,,,,,,1,800,Clear,sky is clear,01d


### Lets examine the NaN values, starting with rawCallData

In [16]:
print(rawCallData.isnull().sum())

cad_cdw_id                           0
cad_event_number                     0
general_offense_number               0
event_clearance_code             10797
event_clearance_description      10798
event_clearance_subgroup         10798
event_clearance_group            10798
event_clearance_date             10951
hundred_block_location            3487
district_sector                   1162
zone_beat                            1
census_tract                      2792
longitude                            1
latitude                             1
incident_location                    1
initial_type_description        577813
initial_type_subgroup           577813
initial_type_group              577813
at_scene_time                  1029344
dtype: int64


### We have a significant number of records missing some information
They seem to be clustered. There is a cluster of 10,790+ records missing event data, 577,813 records missing initial data and a third large cluster where most of the records are missing  at scene time. The event clearance and initial type columns seem to all describe the same data, what the call was about. The at scene time and call clearance date similiarly overlap. The bad news is, most of the missing data pertains to what we care about, which is what happened and when. They good news is these columns seem to provide redundant information, so we can use one to impute the other. We can see a few columns provide no relevant information and can be dropped, subgroup and group. Let's see if there are any records that provide no relevant information.

In [17]:
rawCallData.drop(['event_clearance_subgroup', 'event_clearance_group', 'initial_type_subgroup', 
                  'initial_type_group'], axis=1,inplace=True)

In [18]:
mask = (rawCallData.event_clearance_description.isnull()) & (rawCallData.initial_type_description.isnull())
noEvent = rawCallData[mask] # Our mask selects records that have a null value in both description columns
print(noEvent.shape) # Check the size of our haul

(932, 15)


### We have 932 records with no event descriptor, we will have to remove them,
While not shown here we also checked against the other descriptive columns

In [19]:
print(rawCallData.shape) # Check original datafile shape
rawCallData = rawCallData[~mask] # Remove by selecting the inverse of our mask as subset
print(rawCallData.shape) # Verify our subtraction

(1445066, 15)
(1444134, 15)


In [20]:
mask2 = (rawCallData.at_scene_time.isnull()) & (rawCallData.event_clearance_date.isnull())
noTime = rawCallData[mask2] # Our mask selects records that have a null value in both time columns
print(noTime.shape) # Check the size of our haul

(9, 15)


### We have 9 records with no time, we will have to remove them

In [21]:
print(rawCallData.shape) # Check original datafile shape
rawCallData = rawCallData[~mask2] # Remove by selecting the inverse of our mask as subset
print(rawCallData.shape) # Verify our subtraction

(1444134, 15)
(1444125, 15)


### Looking again at the time values,
We can only use the overlap in our data sets date range, so lets switch gears and make sure that our dataframes cover the same time period. Continuing to clean up the null values in rawCallData could be a waste if those records don't overlap with our weather data. We will start by cleaning up the assorted time columns and getting one formatted datetime column.

In [22]:
rawCallData['formatted_time'] = pd.to_datetime(rawCallData.event_clearance_date, 
                                               format='%m/%d/%Y %I:%M:%S %p', errors='coerce')

In [23]:
rawCallData.formatted_time.head()

0   2010-07-17 20:49:00
1   2010-07-17 20:50:00
2   2010-07-17 20:55:00
3   2010-07-17 21:00:00
4   2010-07-17 21:00:00
Name: formatted_time, dtype: datetime64[ns]

In [24]:
rawWeatherData['formatted_time'] = pd.to_datetime(rawWeatherData.dt, unit='s')

In [25]:
print(rawWeatherData.formatted_time.head())
print(rawWeatherData.head())

0   2012-10-01 13:00:00
1   2012-10-02 14:00:00
2   2012-10-02 15:00:00
3   2012-10-02 16:00:00
4   2012-10-02 17:00:00
Name: formatted_time, dtype: datetime64[ns]
           dt                         dt_iso  city_id  city_name  lat  lon  \
0  1349096400  2012-10-01 13:00:00 +0000 UTC  5809844        NaN  NaN  NaN   
1  1349186400  2012-10-02 14:00:00 +0000 UTC  5809844        NaN  NaN  NaN   
2  1349190000  2012-10-02 15:00:00 +0000 UTC  5809844        NaN  NaN  NaN   
3  1349193600  2012-10-02 16:00:00 +0000 UTC  5809844        NaN  NaN  NaN   
4  1349197200  2012-10-02 17:00:00 +0000 UTC  5809844        NaN  NaN  NaN   

     temp  temp_min  temp_max  pressure         ...          snow_1h  snow_3h  \
0  281.80    278.15    287.59      1027         ...              NaN      NaN   
1  281.62    278.15    286.48      1046         ...              NaN      NaN   
2  282.71    279.82    289.82      1026         ...              NaN      NaN   
3  285.05    281.48    293.15      1026    

### We can see this matches the UTC time given in the next column, so we need to convert to PDT.
This is a three step process, first changing our naive time to UTC zone aware time, then converting to PDT through a lambda function. We can't use tz_convert inline because it defaults to changing the index rather than the value. We then undo making it zone aware with tz_localize as another lambda function.

In [26]:
rawWeatherData.formatted_time = rawWeatherData.formatted_time.dt.tz_localize('UTC')
rawWeatherData.formatted_time = rawWeatherData.formatted_time.apply(lambda x: x.tz_convert('America/Los_Angeles'))
rawWeatherData.formatted_time = rawWeatherData.formatted_time.apply(lambda x: x.tz_localize(None))

rawWeatherData.formatted_time.head()

0   2012-10-01 06:00:00
1   2012-10-02 07:00:00
2   2012-10-02 08:00:00
3   2012-10-02 09:00:00
4   2012-10-02 10:00:00
Name: formatted_time, dtype: datetime64[ns]

In [27]:
mask3 = (rawCallData.formatted_time > '2012-10-01 13:00:00')
rawCallData = rawCallData[mask3]

In [28]:
rawCallData.shape

(879393, 16)

In [29]:
rawCallData.isnull().sum()

cad_cdw_id                          0
cad_event_number                    0
general_offense_number              0
event_clearance_code                0
event_clearance_description         0
event_clearance_date                0
hundred_block_location              1
district_sector                   937
zone_beat                           0
census_tract                     1615
longitude                           0
latitude                            0
incident_location                   0
initial_type_description        72663
at_scene_time                  516351
formatted_time                      0
dtype: int64

# This is an important learning note:
# _Always start with a plan_
To be honest when cleaning this data, I just started with the 911 call data, looking for ways to clean out the NaN values. I'm leaving the original plan in this notebook, which was to use one column to impute the other, without regard to the bigger picture. This involved some neat data cleaning tricks, but in the end would have been completely wasted work. Without the corresponding weather data, those records are worthless to us. Luckily, I stopped and thought it out after hitting a few road blocks, realizing that the work might not be necessary. Turns out had I stopped and made a plan in the first place, I could've save quite a bit of time.
## _Always make a plan and then tackle your data, don't just start coding_
### Duly noted, on we go

### The columns with NaN values still are offering us redundant information about the description or location and can be dropped

In [30]:
rawCallData.drop(['event_clearance_date', 'hundred_block_location', 'district_sector', 'census_tract',
                 'incident_location', 'initial_type_description', 'at_scene_time'], inplace=True, axis=1)
rawCallData.shape

(879393, 9)

In [31]:
print(rawCallData.isnull().sum())
rawCallData.head()

cad_cdw_id                     0
cad_event_number               0
general_offense_number         0
event_clearance_code           0
event_clearance_description    0
zone_beat                      0
longitude                      0
latitude                       0
formatted_time                 0
dtype: int64


Unnamed: 0,cad_cdw_id,cad_event_number,general_offense_number,event_clearance_code,event_clearance_description,zone_beat,longitude,latitude,formatted_time
49,1658027,16000028163,201628163,245.0,"DISTURBANCE, OTHER",N2,-122.34777,47.731678,2016-01-24 11:54:55
70,1658028,16000028161,201628161,280.0,SUSPICIOUS PERSON,S1,-122.280685,47.523026,2016-01-24 11:57:35
105,1658029,16000028159,201628159,65.0,THEFT - MISCELLANEOUS,M1,-122.342,47.609535,2016-01-24 11:54:28
190,1658030,16000028134,201628134,200.0,ALACAD - COMMERCIAL BURGLARY (FALSE),U1,-122.31302,47.668995,2016-01-24 11:53:22
255,1658031,16000028114,201628114,161.0,TRESPASS,D2,-122.34467,47.64158,2016-01-24 11:59:45


### Now we can look to see if the event numbers and ID numbers offer unique ID's

In [32]:
print(rawCallData.cad_cdw_id.nunique())
print(rawCallData.cad_event_number.nunique())
print(rawCallData.general_offense_number.nunique())

879366
878480
878480


In [33]:
rawCallData[rawCallData.cad_cdw_id.duplicated()]

Unnamed: 0,cad_cdw_id,cad_event_number,general_offense_number,event_clearance_code,event_clearance_description,zone_beat,longitude,latitude,formatted_time
835500,1873964,14000010381,201410381,64.0,SHOPLIFT,N3,-122.324615,47.708603,2014-01-10 15:44:00
835504,1873952,16000405795,2016405795,280.0,SUSPICIOUS PERSON,B1,-122.38563,47.67061,2016-11-09 11:31:57
835506,1873969,16000405783,2016405783,470.0,PARKING VIOLATION (EXCEPT ABANDONED VEHICLES),C3,-122.29553,47.62301,2016-11-09 11:44:46
1131303,1873957,16000405748,2016405748,245.0,"DISTURBANCE, OTHER",B2,-122.37318,47.673977,2016-11-09 11:32:22
1131304,1873958,16000405734,2016405734,470.0,PARKING VIOLATION (EXCEPT ABANDONED VEHICLES),B1,-122.378334,47.668423,2016-11-09 11:22:51
1133136,1873928,16000405451,2016405451,450.0,DRIVING WHILE UNDER INFLUENCE (DUI),U1,-122.31303,47.66944,2016-11-09 09:47:05
1133166,1873926,16000405578,2016405578,64.0,SHOPLIFT,D1,-122.34854,47.616524,2016-11-09 09:48:19
1133167,1873927,16000405572,2016405572,282.0,SUSPICIOUS CIRCUMSTANCES - BUILDING (OPEN DOOR...,D1,-122.3461,47.615475,2016-11-09 09:50:21
1133185,1873923,16000405632,2016405632,430.0,MOTOR VEHICLE COLLISION,F1,-122.33214,47.534145,2016-11-09 09:59:05
1133280,1873921,16000405658,2016405658,63.0,THEFT - CAR PROWL,U1,-122.302925,47.669205,2016-11-09 09:51:28


In [34]:
rawCallData.sort_values(by=['cad_cdw_id'],inplace=True) # Sort by the cad_cdw_id

In [35]:
 rawCallData.loc[rawCallData.cad_cdw_id.duplicated(keep=False) == True] # Now we can see our duplicates

Unnamed: 0,cad_cdw_id,cad_event_number,general_offense_number,event_clearance_code,event_clearance_description,zone_beat,longitude,latitude,formatted_time
835468,1873921,14000010389,201410389,184.0,"NARCOTICS, OTHER",K1,-122.338247,47.610246,2014-01-10 15:03:00
1133280,1873921,16000405658,2016405658,63.0,THEFT - CAR PROWL,U1,-122.302925,47.669205,2016-11-09 09:51:28
1133281,1873922,16000405655,2016405655,460.0,TRAFFIC (MOVING) VIOLATION,J1,-122.36472,47.700382,2016-11-09 10:01:35
835469,1873922,14000010384,201410384,64.0,SHOPLIFT,F2,-122.369349,47.521037,2014-01-10 14:51:00
1133185,1873923,16000405632,2016405632,430.0,MOTOR VEHICLE COLLISION,F1,-122.33214,47.534145,2016-11-09 09:59:05
835470,1873923,14000010370,201410370,470.0,PARKING VIOLATION (EXCEPT ABANDONED VEHICLES),D1,-122.34427,47.62627,2014-01-10 14:57:00
1133166,1873926,16000405578,2016405578,64.0,SHOPLIFT,D1,-122.34854,47.616524,2016-11-09 09:48:19
835471,1873926,14000010083,201410083,363.0,MISSING PERSON,K2,-122.331124,47.601718,2014-01-10 14:58:00
835472,1873927,14000010066,201410066,363.0,MISSING PERSON,G3,-122.297222,47.593315,2014-01-10 14:55:00
1133167,1873927,16000405572,2016405572,282.0,SUSPICIOUS CIRCUMSTANCES - BUILDING (OPEN DOOR...,D1,-122.3461,47.615475,2016-11-09 09:50:21


### Well we can see here that these are definitely different events, so our ID numbers are not unique identifiers, lets check cad_event number next

In [36]:
rawCallData.cad_event_number = rawCallData.cad_event_number.astype('int64', errors='raise')

In [37]:
rawCallData.sort_values(by=['cad_event_number'], inplace=True)

In [38]:
rawCallData.loc[rawCallData.cad_event_number.duplicated(keep=False) == True]

Unnamed: 0,cad_cdw_id,cad_event_number,general_offense_number,event_clearance_code,event_clearance_description,zone_beat,longitude,latitude,formatted_time
997757,931933,12000252943,2012252943,63.0,THEFT - CAR PROWL,G1,-122.302771,47.608007,2012-10-15 11:05:00
1285548,579794,12000252943,2012252943,63.0,THEFT - CAR PROWL,G2,-122.302771,47.608007,2012-10-15 11:05:00
1185456,2142696,14000275708,2014275708,63.0,THEFT - CAR PROWL,N1,-122.338090,47.712302,2014-08-19 13:25:00
1185694,2143025,14000275708,2014275708,65.0,THEFT - MISCELLANEOUS,N1,-122.338090,47.712302,2014-08-19 19:35:00
1199203,2157228,14000314989,2014314989,71.0,AUTO THEFT,O3,-122.321684,47.546745,2014-09-20 13:08:00
1200366,2158430,14000314989,2014314989,71.0,AUTO THEFT,O3,-122.321684,47.546745,2014-09-20 13:08:00
1264603,2230530,14000422775,2014422775,71.0,AUTO THEFT,O3,-122.293751,47.538912,2014-12-22 22:02:00
1266669,2234938,14000422775,2014422775,71.0,AUTO THEFT,O3,-122.293751,47.538912,2014-12-22 22:02:00
1265028,2230986,14000423288,2014423288,71.0,AUTO THEFT,N3,-122.324615,47.708603,2014-12-23 15:02:00
1266801,2235076,14000423288,2014423288,71.0,AUTO THEFT,N3,-122.324615,47.708603,2014-12-23 15:02:00


### So here we have duplicated records, and a valid ID column, 
Which we could just use the index and get rid of these ID columns altogether, but I would like to have a name for each record that will travel with the record as it gets moved into different dataframes as we merge in the future.
### We will drop our duplicates

In [39]:
rawCallData = rawCallData.drop_duplicates(['cad_event_number'], keep='first')

In [40]:
rawCallData.drop(['general_offense_number', 'cad_cdw_id'], axis=1, inplace=True)

In [41]:
rawCallData.event_clearance_code = rawCallData.event_clearance_code.astype('int64', errors='raise')
rawCallData.sort_values(by=['event_clearance_code'],inplace=True) 

# So now we have a nice clean dataframe that just needs some feature engineering,
We wil start with the types of events. There are 111 different codes, let's see if we can bin these into categories based on the code. First we will make a code book to see what each code means. Then we will bin the records into general types of calls.

In [42]:
eventCode = rawCallData.event_clearance_code
eventDesc = rawCallData.event_clearance_description

eventBook = dict(zip(eventCode, eventDesc))

eventBook


{10: 'HOMICIDE',
 30: 'ARMED ROBBERY',
 31: 'STRONG ARM ROBBERY',
 40: 'ASSAULTS, OTHER',
 41: 'HARASSMENT, THREATS',
 42: 'HARASSMENT, THREATS - BY TELEPHONE, WRITING',
 43: 'ASSAULTS, FIREARM INVOLVED',
 49: 'ASSAULTS, GANG RELATED',
 50: 'BURGLARY - RESIDENTIAL, UNOCCUPIED',
 51: 'BURGLARY - RESIDENTIAL, OCCUPIED',
 52: 'BURGLARY - COMMERCIAL',
 53: 'BURGLARY - UNOCCUPIED STRUCTURE ON RESIDENTIAL PROPERTY',
 61: 'THEFT - AUTO ACCESSORIES',
 62: 'BICYCLE THEFT',
 63: 'THEFT - CAR PROWL',
 64: 'SHOPLIFT',
 65: 'THEFT - MISCELLANEOUS',
 71: 'AUTO THEFT',
 72: 'AUTO THEFT AND RECOVERY',
 73: 'AUTO RECOVERY',
 74: 'LICENSE PLATE THEFT OR LOSS',
 92: 'RECKLESS BURNING',
 100: 'FRAUD (INCLUDING IDENTITY THEFT)',
 101: 'FORGERY, BAD CHECKS',
 121: 'GAMBLING',
 122: 'LIQUOR VIOLATIONS (BUSINESS)',
 124: 'PORNOGRAPHY',
 125: 'PROSTITUTION',
 126: 'VICE, OTHER',
 127: 'SOAP (STAY OUT OF AREA OF PROSTITUTION) ORDER VIOLATION',
 130: 'PROPERTY DESTRUCTION',
 139: 'GANG GRAFFITI',
 141: 'LEWD CON

### The codes seem to follow a pattern of general types of crime being grouped into a ten base system. 
The system skips around a little bit and we want to hold out some of the codes as individual bins, so we can't just linspace to generate our bins. We will have to manually create a list of our bins unfortunately, then do the same for our labels. We can then add our bins and labels to the rawCallData. Then we can tackle our other categorical variable, zone_beat

In [43]:
bins = [0,10,39,49,59,69,79,92,109,129,139,149,169,171,177,179,189,199,209,229,259,269,289,299,323,339,346,
        347,359,369,379,389,449,450,495,620]
categories = ['homicide', 'robbery', 'assault', 'burglary', 'theft', 'car_theft', 'reckless_burning', 'fraud', 
          'vice', 'property', 'lewd_behavior', 'trespass', 'park_violation', 'liquor_violation', 'drive_by_shooting', 
          'narcotics', 'warrant_service', 'security_alarms', 'mental_health', 'disturbance', 'animals', 
          'suspicious_case', 'person_with_weapon', 'harbor_theft','casualty', 'harbor_boat_assist', 'harbor_dui', 
          'hazards', 'missing_person', 'missing_property', 'crowd_control', 'traffic_general', 'dui', 
          'traffic_minor', 'human_trafficking']
rawCallData['event_type'] = pd.cut(rawCallData['event_clearance_code'], bins, labels=categories)
rawCallData['bins'] = pd.cut(rawCallData['event_clearance_code'], bins) # Add two columns with our bins and labels

In [44]:
pd.value_counts(rawCallData['event_type']) # Take a look at our totals by bin

disturbance           152722
traffic_minor         128685
suspicious_case       126012
theft                  83536
traffic_general        63670
liquor_violation       46224
security_alarms        41753
trespass               33340
burglary               26335
assault                25241
car_theft              21647
mental_health          16839
narcotics              15903
property               14435
hazards                13861
dui                    11030
missing_property       11015
fraud                  10713
warrant_service         6518
robbery                 5118
casualty                4657
park_violation          4465
missing_person          3945
lewd_behavior           2743
animals                 2394
person_with_weapon      2278
vice                    1610
harbor_boat_assist       733
drive_by_shooting        381
crowd_control            374
reckless_burning         157
homicide                  93
harbor_theft              45
harbor_dui                 5
human_traffick

### So, we have managed to cut down our 111 codes into 35 categories and binned the data as such.
We will now use the event type column to create our dummy columns and do the same with zone_beat.

In [45]:
rawCallData = pd.concat([rawCallData, pd.get_dummies(rawCallData['event_type'])], axis=1)

In [46]:
rawCallData = pd.concat([rawCallData, pd.get_dummies(rawCallData['zone_beat'])], axis=1)
rawCallData.columns = rawCallData.columns.str.lower() # Fix our column names 

In [47]:
print(rawCallData.shape)
rawCallData.head()

(878480, 133)


Unnamed: 0,cad_event_number,event_clearance_code,event_clearance_description,zone_beat,longitude,latitude,formatted_time,event_type,bins,homicide,...,u1,u2,u3,us,w,w1,w2,w3,wp,ws
1079591,17000056904,10,HOMICIDE,K3,-122.32506,47.59793,2017-02-15 21:15:45,homicide,"(0, 10]",1,...,0,0,0,0,0,0,0,0,0,0
196931,16000050525,10,HOMICIDE,D1,-122.34586,47.61299,2016-02-11 13:21:02,homicide,"(0, 10]",1,...,0,0,0,0,0,0,0,0,0,0
1253712,14000405538,10,HOMICIDE,O3,-122.319282,47.54983,2014-12-07 14:00:00,homicide,"(0, 10]",1,...,0,0,0,0,0,0,0,0,0,0
525375,17000073072,10,HOMICIDE,B3,-122.33886,47.661385,2017-02-28 17:10:27,homicide,"(0, 10]",1,...,0,0,0,0,0,0,0,0,0,0
1359327,17000298506,10,HOMICIDE,G3,-122.31153,47.59587,2017-08-13 01:56:59,homicide,"(0, 10]",1,...,0,0,0,0,0,0,0,0,0,0


# Looks like we finally have a nicely cleaned data frame, although it has some extra columns from our binning and making dummy columns.
We will go ahead and save this data frame to disk and create a codebook in markdown so anyone else who wants to work with this data in the future (most likely us) will have a nice and organized dataframe that can be used with dummy columns or without, binned events or not, and a nice record of what each column means. Then we can clean it up for our present use and change the name to just callData.

In [49]:
rawCallData.to_pickle('rawCallData.pkl') # We will save it as a pickle file, since we love python so much

In [53]:
callData = rawCallData.drop(['event_clearance_code', 'event_clearance_description', 
                             'zone_beat', 'event_type', 'bins'], axis=1) # Drop 'em like they're hot 

In [55]:
del rawCallData # We have great memory, the best memory, it's yuuuge, but waste not want not
callData.head()

Unnamed: 0,cad_event_number,longitude,latitude,formatted_time,homicide,robbery,assault,burglary,theft,car_theft,...,u1,u2,u3,us,w,w1,w2,w3,wp,ws
1079591,17000056904,-122.32506,47.59793,2017-02-15 21:15:45,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
196931,16000050525,-122.34586,47.61299,2016-02-11 13:21:02,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1253712,14000405538,-122.319282,47.54983,2014-12-07 14:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
525375,17000073072,-122.33886,47.661385,2017-02-28 17:10:27,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1359327,17000298506,-122.31153,47.59587,2017-08-13 01:56:59,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Now we will tackle our other dataset, rawWeather

In [58]:
print(rawWeatherData.shape)
rawWeatherData.head()

(38335, 29)


Unnamed: 0,dt,dt_iso,city_id,city_name,lat,lon,temp,temp_min,temp_max,pressure,...,snow_1h,snow_3h,snow_24h,snow_today,clouds_all,weather_id,weather_main,weather_description,weather_icon,formatted_time
0,1349096400,2012-10-01 13:00:00 +0000 UTC,5809844,,,,281.8,278.15,287.59,1027,...,,,,,1,800,Clear,sky is clear,01n,2012-10-01 06:00:00
1,1349186400,2012-10-02 14:00:00 +0000 UTC,5809844,,,,281.62,278.15,286.48,1046,...,,,,,66,800,Clear,sky is clear,02d,2012-10-02 07:00:00
2,1349190000,2012-10-02 15:00:00 +0000 UTC,5809844,,,,282.71,279.82,289.82,1026,...,,,,,1,800,Clear,sky is clear,01d,2012-10-02 08:00:00
3,1349193600,2012-10-02 16:00:00 +0000 UTC,5809844,,,,285.05,281.48,293.15,1026,...,,,,,1,800,Clear,sky is clear,01d,2012-10-02 09:00:00
4,1349197200,2012-10-02 17:00:00 +0000 UTC,5809844,,,,287.97,282.59,296.48,1027,...,,,,,1,800,Clear,sky is clear,01d,2012-10-02 10:00:00


### We know for starters that we have our time column squared away already, so we can lose the old columns

In [60]:
rawWeatherData = rawWeatherData.drop(['dt', 'dt_iso'], axis=1)

### It appears that our temperature columns are in Kelvin,
so we will need to convert those. We only need the temp, we don't need the range for each hour, so let's drop min and max while we're at it

In [62]:
rawWeatherData = rawWeatherData.drop(['temp_min', 'temp_max'], axis=1)

In [68]:
rawWeatherData.temp = rawWeatherData.temp.apply(lambda x: (x*(9/5))-459.67)

In [69]:
rawWeatherData.temp.head()

0    47.570
1    47.246
2    49.208
3    53.420
4    58.676
Name: temp, dtype: float64

### That is oddly specific, let's coerce to int64

In [70]:
rawWeatherData.temp = rawWeatherData.temp.astype('int64', errors='raise')

### We also know that we are only working with data from one city, Seattle, and we will use this data for the whole city, so we can lose the geo information as well

In [72]:
rawWeatherData = rawWeatherData.drop(['city_id', 'city_name', 'lat', 'lon', 'sea_level', 'grnd_level'], axis=1)

In [73]:
rawWeatherData.head()

Unnamed: 0,temp,pressure,humidity,wind_speed,wind_deg,rain_1h,rain_3h,rain_24h,rain_today,snow_1h,snow_3h,snow_24h,snow_today,clouds_all,weather_id,weather_main,weather_description,weather_icon,formatted_time
0,47,1027,81,0,0,,,,,,,,,1,800,Clear,sky is clear,01n,2012-10-01 06:00:00
1,47,1046,67,1,180,,,,,,,,,66,800,Clear,sky is clear,02d,2012-10-02 07:00:00
2,49,1026,87,2,140,,,,,,,,,1,800,Clear,sky is clear,01d,2012-10-02 08:00:00
3,53,1026,76,0,0,,,,,,,,,1,800,Clear,sky is clear,01d,2012-10-02 09:00:00
4,58,1027,58,2,30,,,,,,,,,1,800,Clear,sky is clear,01d,2012-10-02 10:00:00
