# Exploratory Data Analysis for Crimes in Chicago in 2018



## Copy from s3

In [2]:
import pandas as pd
import boto3
import botocore

# S3 location of the raw 2018 crimes extract.
bucket = "sagemaker-chicago-data"
key = "Crimes_-_2018.csv"

# Save the object to the path that pd.read_csv loads further down
# (../Data/crimes_2018.csv) so the notebook runs top to bottom.
# NOTE: make sure the ../Data/ directory exists before running this cell.
s3 = boto3.resource('s3')
s3.Bucket(bucket).download_file(key, "../Data/crimes_2018.csv")

In [4]:
# Load the raw extract, using the ID column as the row index.
raw_csv_path = "../Data/crimes_2018.csv"
df = pd.read_csv(raw_csv_path, index_col="ID")

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,24170,JB429040,09/09/2018 10:30:00 PM,019XX E 74TH ST,110,HOMICIDE,FIRST DEGREE MURDER,STREET,False,False,...,8.0,43.0,01A,1190271.0,1856285.0,2018,09/16/2018 04:05:04 PM,41.760675,-87.578207,"(41.760674759, -87.578206773)"
1,11447764,JB437679,09/09/2018 12:00:00 PM,085XX W HIGGINS RD,1210,DECEPTIVE PRACTICE,THEFT OF LABOR/SERVICES,HOTEL/MOTEL,False,False,...,41.0,76.0,11,,,2018,09/16/2018 04:05:04 PM,,,
2,11446958,JB436362,08/07/2018 09:00:00 AM,017XX N CICERO AVE,1206,DECEPTIVE PRACTICE,"THEFT BY LESSEE,MOTOR VEH",OTHER,False,False,...,37.0,25.0,11,,,2018,09/16/2018 04:05:04 PM,,,
3,11446946,JB436563,09/09/2018 07:00:00 PM,014XX W WINONA ST,820,THEFT,$500 AND UNDER,VEHICLE NON-COMMERCIAL,False,False,...,46.0,3.0,06,,,2018,09/16/2018 04:05:04 PM,,,
4,11446937,JB436514,09/05/2018 12:55:00 PM,010XX N HERMITAGE AVE,820,THEFT,$500 AND UNDER,RESIDENCE,False,False,...,1.0,24.0,06,,,2018,09/16/2018 04:05:04 PM,,,


In [5]:
# (number of rows, number of columns)
df.shape

(183121, 22)

In [10]:
# Number of records per primary crime type, most frequent first.
df["Primary Type"].value_counts()

THEFT                                44086
BATTERY                              34903
CRIMINAL DAMAGE                      19436
ASSAULT                              14203
DECEPTIVE PRACTICE                   12366
OTHER OFFENSE                        11897
NARCOTICS                             8766
BURGLARY                              8027
MOTOR VEHICLE THEFT                   6823
ROBBERY                               6685
CRIMINAL TRESPASS                     4852
WEAPONS VIOLATION                     3815
OFFENSE INVOLVING CHILDREN            1558
CRIM SEXUAL ASSAULT                   1054
INTERFERENCE WITH PUBLIC OFFICER       947
PUBLIC PEACE VIOLATION                 932
SEX OFFENSE                            707
PROSTITUTION                           489
HOMICIDE                               396
ARSON                                  247
LIQUOR LAW VIOLATION                   192
GAMBLING                               165
STALKING                               138
KIDNAPPING 

## Analysis

Based on a few simple lines of code, we can conclude the following. From January through September of 2018, there were over 180,000 criminal events in Chicago. Specifically, they include the following (selected categories):
- 122 kidnappings
- 1,054 sexual assaults
- 396 homicides
- 6,823 motor vehicle thefts
- 3,815 weapons violations
- 34,903 cases of battery
- 44,086 cases of theft
    
Each crime is recorded with 22 columns, listed below:

In [14]:
def print_row_headers(df):
    """Print every column label of ``df``, one per line."""
    for column_name in df.columns:
        print(column_name)
# List the column names of the loaded crimes frame.
print_row_headers(df)

ID
Case Number
Date
Block
IUCR
Primary Type
Description
Location Description
Arrest
Domestic
Beat
District
Ward
Community Area
FBI Code
X Coordinate
Y Coordinate
Year
Updated On
Latitude
Longitude
Location


## For a crime prediction project, we will treat this data set only as the "Y", or the target variable.

That means we only need to keep the fields indicating that a crime occurred. Let's drop everything else.

In [5]:
# Columns retained for the target ("Y") data set; everything else is dropped.
keep_list = [
    "Case Number",
    "Date",
    "Block",
    "Primary Type",
    "Description",
    "Location Description",
    "Arrest",
    "Year",
    "Location",
]

In [6]:
# Column-only selection: all rows, just the columns named in keep_list.
reduced_df = df.loc[:, keep_list]

## Now let's remove the rows with missing values on those reduced columns

In [25]:
# Count missing values per retained column. The counts are taken from
# reduced_df itself, the same frame whose columns are being inspected.
reduced_df.isna().sum()

ID
0
Case Number
1
Date
0
Block
0
Primary Type
0
Description
0
Location Description
426
Arrest
0
Year
0
Location
935


It appears that we have 426 rows missing a location description, and 935 rows missing a location. Before we drop them, we need to make sure the missing values are not correlated with our outcome variable, i.e., the type of crime.

In [37]:
# Rows whose Location Description is missing (isna() already yields a boolean mask).
no_location_description = df["Location Description"].isna()
missing_location = df[no_location_description]

In [38]:
# Inspect the rows that have no Location Description.
missing_location

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
80,11442350,JB430245,09/09/2018 05:20:00 PM,015XX N ST LOUIS AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,...,26.0,23.0,11,1152739.0,1910220.0,2018,09/16/2018 04:05:04 PM,41.909501,-87.714339,"(41.909501004, -87.714339348)"
378,11445264,JB433642,01/01/2018 10:25:00 AM,077XX S ESSEX AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,...,7.0,43.0,11,1194202.0,1854229.0,2018,09/16/2018 03:58:54 PM,41.754937,-87.563867,"(41.754937343, -87.56386716)"
380,11445255,JB434175,01/02/2018 01:55:00 PM,075XX S COLES AVE,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,,False,False,...,7.0,43.0,11,1195760.0,1855844.0,2018,09/16/2018 03:58:54 PM,41.759331,-87.558104,"(41.759330641, -87.558104318)"
719,11441353,JB428919,09/09/2018 03:35:00 PM,079XX S EVANS AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,...,6.0,44.0,11,1182602.0,1852541.0,2018,09/16/2018 04:05:04 PM,41.750582,-87.606430,"(41.750582129, -87.606429594)"
1327,11444396,JB432624,09/03/2018 01:10:00 PM,021XX W ERIE ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,...,26.0,24.0,11,1162047.0,1904321.0,2018,09/15/2018 03:55:51 PM,41.893124,-87.680311,"(41.893124278, -87.680310712)"
1329,11444388,JB432965,06/01/2018 12:00:00 AM,050XX N OTTAWA AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,...,41.0,10.0,11,1124248.0,1932665.0,2018,09/15/2018 03:55:51 PM,41.971609,-87.818510,"(41.971609414, -87.818509745)"
1330,11444386,JB432247,08/31/2018 10:05:00 AM,014XX W Foster Ave,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,...,48.0,77.0,11,1165635.0,1934633.0,2018,09/15/2018 03:55:51 PM,41.976226,-87.666267,"(41.976226339, -87.666267165)"
1654,11445257,JB433554,08/24/2018 05:40:00 AM,047XX S WOODLAWN AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,...,4.0,39.0,11,,,2018,09/14/2018 03:58:39 PM,,,
2368,11438598,JB424349,09/06/2018 03:49:00 AM,075XX N OCONTO AVE,1152,DECEPTIVE PRACTICE,ILLEGAL USE CASH CARD,,False,False,...,41.0,9.0,11,1127062.0,1949488.0,2018,09/13/2018 04:02:18 PM,42.017726,-87.807782,"(42.01772646, -87.807782401)"
2490,11443329,JB431628,08/24/2018 12:00:00 AM,072XX W SUMMERDALE AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,...,41.0,10.0,11,1127207.0,1934716.0,2018,09/14/2018 03:53:41 PM,41.977188,-87.807583,"(41.977188264, -87.807582577)"


Most of the rows missing the location description appear to be about financial crimes, e.g., financial identity theft. This indicates that if we wanted to build a prediction model for financial crimes, we would not be able to simply drop these rows, because a missing location description is closely correlated with the outcome variable. Dropping them would introduce sample bias into our model.

For this demonstration we are only going to model the following criminal activities:
- Kidnapping
- Sexual Assault
- Homicide
- Motor Vehicle Theft
- Weapons Violations
- Battery
- Theft

Because a missing Location Description is not correlated with any of these crime categories, we are good to drop the 426 rows that are missing Location Description. This will allow us to utilize the rest of the information contained in the Location Description column, without introducing bias into our model.

In [49]:
import numpy as np

# Keep only the rows that have a Location Description.
df = df[df["Location Description"].notna()]

In [52]:
# Sanity check: count the remaining missing Location Description values.
# If this returns 0, the row removal step was successful.
df["Location Description"].isna().sum()

0

Moving on to the location column. Effectively we have 935 rows that are missing locations, and we need to decide if we will simply drop them. In order to make that decision, we need to know if they are correlated with the outcome variable, crime.

In [53]:
# Rows whose Location (geo coordinates) is missing.
no_location = df["Location"].isna()
missing_geo = df[no_location]

In [55]:
# Which primary crime types are affected by the missing Location values?
missing_geo["Primary Type"].value_counts()

THEFT                                544
DECEPTIVE PRACTICE                   132
CRIMINAL DAMAGE                       56
BATTERY                               34
OTHER OFFENSE                         23
ASSAULT                               19
MOTOR VEHICLE THEFT                   15
BURGLARY                              14
CRIMINAL TRESPASS                     10
ROBBERY                                8
OFFENSE INVOLVING CHILDREN             6
PUBLIC PEACE VIOLATION                 5
SEX OFFENSE                            3
PROSTITUTION                           2
NARCOTICS                              2
INTIMIDATION                           1
NON-CRIMINAL                           1
STALKING                               1
CRIM SEXUAL ASSAULT                    1
CONCEALED CARRY LICENSE VIOLATION      1
WEAPONS VIOLATION                      1
Name: Primary Type, dtype: int64

In [57]:
# Same breakdown at the finer-grained Description level.
missing_geo["Description"].value_counts()

$500 AND UNDER                                    200
FROM BUILDING                                     139
OVER $500                                         128
CREDIT CARD FRAUD                                  66
POCKET-PICKING                                     54
TO VEHICLE                                         47
SIMPLE                                             33
TO PROPERTY                                        17
RETAIL THEFT                                       17
FRAUD OR CONFIDENCE GAME                           16
AUTOMOBILE                                         15
ILLEGAL USE CASH CARD                              14
DOMESTIC BATTERY SIMPLE                            14
HARASSMENT BY TELEPHONE                             8
FINANCIAL IDENTITY THEFT OVER $ 300                 8
UNLAWFUL ENTRY                                      7
THEFT BY LESSEE,MOTOR VEH                           7
PURSE-SNATCHING                                     6
TELEPHONE THREAT            

It appears that the most common description among our 900+ rows with missing values for location is theft of $500 and under. That also happens to be one of our largest prediction categories, with almost 17,000 records in that category. Given this magnitude, I am not concerned about introducing downward bias into the model against petty theft. We'll drop those rows as well.

In [58]:
# Description counts across df, for comparison with the missing-Location subset.
df["Description"].value_counts()

SIMPLE                                                 20779
DOMESTIC BATTERY SIMPLE                                17027
$500 AND UNDER                                         16929
OVER $500                                              10065
TO VEHICLE                                              9778
TO PROPERTY                                             9215
RETAIL THEFT                                            7433
FROM BUILDING                                           7257
AUTOMOBILE                                              5990
FORCIBLE ENTRY                                          4557
AGGRAVATED: HANDGUN                                     3378
CREDIT CARD FRAUD                                       3248
UNLAWFUL ENTRY                                          2901
TO LAND                                                 2891
TELEPHONE THREAT                                        2808
UNLAWFUL POSS OF HANDGUN                                2619
ARMED: HANDGUN          

In [59]:
# Keep only the rows that have a Location value.
df = df[df["Location"].notna()]

In [60]:
# Sanity check: count the remaining missing Location values.
# If this returns 0, the row removal step was successful.
df["Location"].isna().sum()

0

# Great! We've reduced our data set and removed the rows with missing values. Let's write that to a CSV.

In [62]:
# Write only the columns named in keep_list so the file on disk really is the
# reduced data set; df itself still carries every original column.
df[keep_list].to_csv("../Data/crimes_2018_reduced.csv")

Another very helpful step is wrapping all of these steps as a single Python function, so we can more easily use it later.

In [8]:
def main(f_name, out_name="../Data/crimes_2018_reduced.csv"):
    """Reduce a raw crimes CSV to the target-variable columns and write it out.

    Reads ``f_name`` with the ``ID`` column as the index, keeps only the
    columns needed for the prediction target, drops every row that is missing
    either ``Location Description`` or ``Location``, and writes the result to
    ``out_name``.

    Parameters
    ----------
    f_name : str
        Path of the raw crimes CSV; it must contain an ``ID`` column and all
        of the retained columns.
    out_name : str, optional
        Destination CSV path. Defaults to ``../Data/crimes_2018_reduced.csv``.

    Returns
    -------
    None
    """
    df = pd.read_csv(f_name, index_col="ID")

    # keep a subset of columns
    keep_list = ["Case Number", "Date", "Block", "Primary Type", "Description", "Location Description", "Arrest", "Year", "Location"]

    # Filter the column-reduced frame (not the full one) so the file written
    # below contains only the retained columns.
    reduced_df = df[keep_list].dropna(subset=["Location Description", "Location"])

    # write to disk
    reduced_df.to_csv(out_name)

# Run the whole reduction pipeline on the raw 2018 extract.
main("../Data/crimes_2018.csv")