In [1]:
import os
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import Markdown

%matplotlib inline

In [2]:
# List the entries of the current working directory to see which data files are present.
os.listdir(os.curdir)

['.gitignore',
 '.ipynb_checkpoints',
 'chicago_crimes_eda.ipynb',
 'README.md',
 'Chicago_crimes.csv',
 '.git',
 'fbi_codes.csv']

In [3]:
# Load the raw crime records from the CSV in the working directory.
crimes_path = 'Chicago_crimes.csv'
df = pd.read_csv(crimes_path)

# Parse the textual 'Date' column (month/day/year, 12-hour clock with AM/PM) into datetimes.
date_format = '%m/%d/%Y %I:%M:%S %p'
df['Date'] = pd.to_datetime(df['Date'], format=date_format)

# Preview the first rows.
df.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,10000092,HY189866,2015-03-18 19:44:00,047XX W OHIO ST,041A,BATTERY,AGGRAVATED: HANDGUN,STREET,False,False,...,28.0,25.0,04B,1144606.0,1903566.0,2015,02/10/2018 03:50:01 PM,41.891399,-87.744385,"(41.891398861, -87.744384567)"
1,10000094,HY190059,2015-03-18 23:00:00,066XX S MARSHFIELD AVE,4625,OTHER OFFENSE,PAROLE VIOLATION,STREET,True,False,...,15.0,67.0,26,1166468.0,1860715.0,2015,02/10/2018 03:50:01 PM,41.773372,-87.665319,"(41.773371528, -87.665319468)"
2,10000095,HY190052,2015-03-18 22:45:00,044XX S LAKE PARK AVE,0486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,...,4.0,39.0,08B,1185075.0,1875622.0,2015,02/10/2018 03:50:01 PM,41.813861,-87.596643,"(41.81386068, -87.596642837)"
3,10000096,HY190054,2015-03-18 22:30:00,051XX S MICHIGAN AVE,0460,BATTERY,SIMPLE,APARTMENT,False,False,...,3.0,40.0,08B,1178033.0,1870804.0,2015,02/10/2018 03:50:01 PM,41.800802,-87.622619,"(41.800802415, -87.622619343)"
4,10000097,HY189976,2015-03-18 21:00:00,047XX W ADAMS ST,031A,ROBBERY,ARMED: HANDGUN,SIDEWALK,False,False,...,28.0,25.0,03,1144920.0,1898709.0,2015,02/10/2018 03:50:01 PM,41.878065,-87.743354,"(41.878064761, -87.743354013)"


In [4]:
# Show the dtype of every column in the loaded frame.
df.dtypes

ID                               int64
Case Number                     object
Date                    datetime64[ns]
Block                           object
IUCR                            object
Primary Type                    object
Description                     object
Location Description            object
Arrest                            bool
Domestic                          bool
Beat                             int64
District                       float64
Ward                           float64
Community Area                 float64
FBI Code                        object
X Coordinate                   float64
Y Coordinate                   float64
Year                             int64
Updated On                      object
Latitude                       float64
Longitude                      float64
Location                        object
dtype: object

In [5]:
# Remove columns that are not useful here.
unused_columns = ['Case Number', 'Updated On', 'Location']
df = df.drop(columns=unused_columns)

In [6]:
# Target unsigned-integer dtype for each column that is to be down-cast.
dtypes = {
    'ID': 'uint32',
    'Beat': 'uint16',
    'Year': 'uint16',
    'District': 'uint8',
    'Ward': 'uint8',
    'Community Area': 'uint8',
    'X Coordinate': 'uint32',
    'Y Coordinate': 'uint32',
}

In [7]:
# Fill unknown values with a sentinel and convert to a less memory-consuming datatype.
# Sentinels are unchanged for the dtypes that can hold them: 255 for uint8 columns and
# 3000000 for uint32 columns. The sentinel is capped at the target dtype's maximum so a
# uint16 column (max 65535) is never filled with 3000000, which would not survive the cast.
for key, value in dtypes.items():
    sentinel = min(3000000, np.iinfo(np.dtype(value)).max)
    df[key] = df[key].fillna(sentinel).astype(value)

In [8]:
# Dataframe that maps the 'FBI Code' column to a human-readable description.
# The scraped table is cached in 'fbi_codes.csv'; the web page is only fetched
# when that file does not exist yet.
if not os.path.exists('fbi_codes.csv'):
    temp = pd.read_html('http://gis.chicagopolice.org/clearmap_crime_sums/crime_types.html')
    # Pattern that retrieves the description and the parenthesised code
    pattern = re.compile(r' ([\w& -]*) (\(\d{2}[A-Z]?\))')
    # Retrieve information about National Incident-Based Reporting System (NIBRS) codes.
    # Row 3 of the first table is read as the more serious offenses, row 4 as the less serious ones.
    # Rows are collected in a list and the frame is built once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    records = []
    for i, serious_offense in enumerate([True, False]):
        # Normalise non-breaking spaces and doubled spaces so the pattern can match.
        fbi_str = temp[0].iloc[3+i][1].replace('\xa0', ' ').replace('  ', ' ')
        fbi_list = pattern.findall(fbi_str)
        for desc, code in fbi_list:
            # code[1:-1] strips the surrounding parentheses captured by the pattern.
            records.append({'Code': code[1:-1], 'Description': desc,
                            'Serious offense': serious_offense})
    fbi_df = pd.DataFrame(records, columns=['Code', 'Description', 'Serious offense'])
    fbi_df['Serious offense'] = fbi_df['Serious offense'].astype('bool')
    fbi_df.to_csv('fbi_codes.csv', index=False)
else:
    # Read 'Code' as text so zero-padded codes such as '02' are never parsed as numbers.
    fbi_df = pd.read_csv('fbi_codes.csv', dtype={'Code': str, 'Serious offense': 'bool'})

fbi_df.head()

Unnamed: 0,Code,Description,Serious offense
0,01A,Homicide 1st & 2nd Degree,True
1,02,Criminal Sexual Assault,True
2,03,Robbery,True
3,04A,Aggravated Assault,True
4,04B,Aggravated Battery,True
