# Crimes in Chicago 01/01/2023 - 05/11/2023

In [1]:
import pandas as pd

In [2]:
# Notebook display limits: show at most 100 columns and 15 rows per frame.
display_limits = {
    'display.max_columns': 100,
    'display.max_rows': 15,
}
for option_name, limit in display_limits.items():
    pd.set_option(option_name, limit)

# Data Wrangling

## Crimes Data

In [3]:
# Read the raw 2023 crimes export (path is relative to the notebook's working
# directory) and preview five randomly selected rows.
df_crimes = pd.read_csv(filepath_or_buffer=r'Crimes_-_2023.csv')
df_crimes.sample(n=5)

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
50676,13015894,JG192428,03/20/2023 02:37:00 PM,050XX S PRAIRIE AVE,031A,ROBBERY,ARMED - HANDGUN,SIDEWALK,False,False,224,2,3.0,38,03,1178892.0,1871731.0,2023,03/27/2023 04:58:01 PM,41.803327,-87.619441,"(41.803326653, -87.619440895)"
73593,13051674,JG234821,04/23/2023 08:40:00 PM,041XX W 66TH ST,1320,CRIMINAL DAMAGE,TO VEHICLE,DRIVEWAY - RESIDENTIAL,False,False,833,8,13.0,65,14,1149705.0,1860506.0,2023,04/30/2023 04:47:35 PM,41.773139,-87.726775,"(41.773138867, -87.726774789)"
75334,13052817,JG236266,04/25/2023 12:21:00 AM,052XX S HERMITAGE AVE,031A,ROBBERY,ARMED - HANDGUN,ALLEY,False,False,932,9,16.0,61,03,1165568.0,1869928.0,2023,05/02/2023 04:47:56 PM,41.798672,-87.668357,"(41.798672295, -87.668357467)"
9938,12953639,JG117670,01/15/2023 10:44:00 PM,034XX W CARROLL AVE,0486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,1123,11,28.0,27,08B,1153330.0,1902186.0,2023,01/22/2023 03:50:37 PM,41.887443,-87.712382,"(41.887443217, -87.712381872)"
1107,12939661,JG101417,01/02/2023 08:50:00 AM,076XX S ST LAWRENCE AVE,0496,BATTERY,AGGRAVATED DOMESTIC BATTERY - KNIFE / CUTTING ...,APARTMENT,True,True,624,6,6.0,69,04B,1181557.0,1854523.0,2023,02/04/2023 03:46:13 PM,41.756045,-87.610198,"(41.756045113, -87.610197882)"


Let's convert the column names to snake_case so they are readable and easy to access.

In [4]:
# Normalise every header: lower-case it and turn spaces into underscores.
snake_case_headers = [header.lower().replace(' ', '_') for header in df_crimes.columns]
df_crimes.columns = snake_case_headers
df_crimes.columns

Index(['id', 'case_number', 'date', 'block', 'iucr', 'primary_type',
       'description', 'location_description', 'arrest', 'domestic', 'beat',
       'district', 'ward', 'community_area', 'fbi_code', 'x_coordinate',
       'y_coordinate', 'year', 'updated_on', 'latitude', 'longitude',
       'location'],
      dtype='object')

I only need the following columns to answer the questions given to us:
- id: ID of the person.
- iucr: four-digit codes that law enforcement agencies use to classify criminal incidents when taking individual reports.
- primary_type: the type of crime committed by a person.
- community_area: the ID of the area where the person resides.
- case_number: the case number assigned to the person's case.
- description: a detailed description of the crime.
- location_description: a description of the location where the crime occurred.
- date: the date and time recorded for the crime (also kept in the selection below).

In [5]:
# Work on an independent copy of the raw frame, restricted to the columns the
# analysis needs, and preview five random rows of the result.
crime_columns_to_keep = [
    'id',
    'case_number',
    'date',
    'iucr',
    'primary_type',
    'description',
    'location_description',
    'community_area',
]
df_crimes_copy = df_crimes.copy()[crime_columns_to_keep]
df_crimes_copy.sample(n=5)

Unnamed: 0,id,case_number,date,iucr,primary_type,description,location_description,community_area
56316,13024107,JG202121,03/28/2023 12:59:00 PM,1811,NARCOTICS,POSSESS - CANNABIS 30 GRAMS OR LESS,STREET,15
86402,13070092,JG257097,05/11/2023 06:39:00 PM,520,ASSAULT,AGGRAVATED - KNIFE / CUTTING INSTRUMENT,APARTMENT,43
3174,12943505,JG105924,01/05/2023 09:00:00 PM,2014,NARCOTICS,MANUFACTURE / DELIVER - HEROIN (WHITE),SIDEWALK,23
54675,13021673,JG199212,03/25/2023 11:30:00 PM,610,BURGLARY,FORCIBLE ENTRY,RESIDENCE,46
64853,13036923,JG217139,04/10/2023 01:45:00 AM,460,BATTERY,SIMPLE,APARTMENT,30


### Check for null values

#### Null values observation
The dataset contains null values only in the "location_description" column. All of the crimes with a null location description are classified as "DECEPTIVE PRACTICE", with descriptions that are variants of "FINANCIAL IDENTITY THEFT". Furthermore, for these rows the "arrest" column has only one unique value, which is False. Since the exact location of this kind of fraudulent activity cannot be determined, the NaN values in the "location_description" column will be replaced with "UNKNOWN".

Please note that this observation is based on the available data and assumptions made for missing information.

In [6]:
# Number of missing values in each column of the trimmed crimes frame.
df_crimes_copy.isna().sum()

id                        0
case_number               0
date                      0
iucr                      0
primary_type              0
description               0
location_description    384
community_area            0
dtype: int64

In [7]:
# Restrict to rows that have at least one missing value and count the
# distinct values each column takes within that subset.
rows_with_missing = df_crimes_copy.isna().any(axis='columns')
df_crimes_copy.loc[rows_with_missing].nunique()

id                      384
case_number             384
date                    376
iucr                      4
primary_type              1
description               4
location_description      0
community_area           67
dtype: int64

Let's check the original DataFrame for more information.

In [8]:
# Go back to the full-width frame and sample five of the rows whose
# location_description is missing, so all of their other fields are visible.
missing_location = df_crimes['location_description'].isna()
df_crimes.loc[missing_location].sample(n=5)

Unnamed: 0,id,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
34743,12997807,JG170144,02/17/2023 11:05:00 AM,004XX E WATERSIDE DR,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,,False,False,114,1,42.0,32,11,1179704.0,1902210.0,2023,03/03/2023 03:47:46 PM,41.886945,-87.615529,"(41.886944528, -87.615528657)"
19372,12972118,JG139467,01/30/2023 10:32:00 AM,021XX S CALIFORNIA AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,1023,10,12.0,30,11,1158023.0,1889721.0,2023,02/06/2023 03:49:33 PM,41.853144,-87.695488,"(41.853143528, -87.695487845)"
85204,13073928,JG261600,04/24/2023 06:05:00 PM,031XX N TROY ST,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,,False,False,1411,14,35.0,21,11,1154839.0,1920465.0,2023,05/17/2023 04:48:11 PM,41.937572,-87.70635,"(41.937572263, -87.706349641)"
69436,13047560,JG229006,04/17/2023 10:25:00 PM,021XX W RANDOLPH ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,1223,12,27.0,28,11,1162201.0,1901194.0,2023,04/24/2023 04:49:17 PM,41.88454,-87.679833,"(41.884540314, -87.67983259)"
34752,12992849,JG164301,02/24/2023 01:15:00 PM,004XX E OHIO ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,1834,18,42.0,8,11,1179501.0,1904294.0,2023,03/03/2023 03:50:14 PM,41.892668,-87.61621,"(41.892667777, -87.616210158)"


In [9]:
# Number of distinct arrest values among the rows with no location description.
df_crimes.loc[df_crimes['location_description'].isna(), 'arrest'].nunique()

1

In [10]:
# Count the rows that both lack a location description and ended in an arrest.
no_location = df_crimes['location_description'].isna()
was_arrested = df_crimes['arrest'].eq(True)
df_crimes.loc[no_location & was_arrested, 'arrest'].count()

0

In [11]:
# Replace missing location descriptions with the explicit label 'UNKNOWN'.
# The fill is done at DataFrame level and the result is bound back to the name:
# calling fillna(..., inplace=True) on an attribute-accessed column is a
# chained in-place update, which pandas warns about and which leaves the
# parent frame unchanged when Copy-on-Write is enabled.
df_crimes_copy = df_crimes_copy.fillna({'location_description': 'UNKNOWN'})

### Check for duplicates

#### Duplicates observation
I need to ensure that the "id" and "case_number" columns do not have duplicates. While there are no duplicates in the "id" column, I have identified 6 duplicates in the "case_number" column. Since the "case_number" should be unique to each case, I will remove the duplicates and retain the last row (in the file's row order) for each case number.

In [12]:
def _format_duplicate_count(column):
    """Return how many values in `column` repeat an earlier one, with thousands separators."""
    return format(column.duplicated().sum(), ',')


# Apply the counter to each column of the crimes frame.
duplicates_per_column = df_crimes_copy.apply(_format_duplicate_count, axis='index')
duplicates_per_column

id                           0
case_number                  6
date                    44,662
iucr                    86,333
primary_type            86,582
description             86,352
location_description    86,493
community_area          86,536
dtype: object

In [13]:
# Keep a single row per case number (the last occurrence in row order), then
# recompute the per-column duplicate counts to confirm case_number is unique.
df_crimes_copy = df_crimes_copy.drop_duplicates(subset=['case_number'], keep='last')
duplicates_per_column = df_crimes_copy.apply(
    lambda column: '{:,}'.format(column.duplicated().sum()),
    axis='index',
)
duplicates_per_column

id                           0
case_number                  0
date                    44,659
iucr                    86,327
primary_type            86,576
description             86,346
location_description    86,487
community_area          86,530
dtype: object

Saving the cleaned DataFrame to CSV so it can be transferred to SQL.

In [14]:
# Write the cleaned crimes frame to disk. to_csv is called with its defaults,
# so the DataFrame index is written as the first (unnamed) column.
crimes_csv_path = './csv/chicago_crimes_2023.csv'
df_crimes_copy.to_csv(crimes_csv_path)

## Socio-Economics Data

In [15]:
# Load the socio-economic indicators, normalise the headers (lower-case,
# spaces -> underscores), fix the per-capita-income header that ends up with
# a trailing underscore, and preview five random rows.
df_economic = pd.read_csv(
    filepath_or_buffer=r'Census_Data_-_Selected_socioeconomic_indicators_in_Chicago__2008___2012.csv'
)
df_economic.columns = [header.lower().replace(' ', '_') for header in df_economic.columns]
df_economic = df_economic.rename(columns={'per_capita_income_': 'per_capita_income'})
df_economic.sample(n=5)

Unnamed: 0,community_area_number,community_area_name,percent_of_housing_crowded,percent_households_below_poverty,percent_aged_16+_unemployed,percent_aged_25+_without_high_school_diploma,percent_aged_under_18_or_over_64,per_capita_income,hardship_index
16,17.0,Dunning,5.2,10.6,10.0,16.2,33.6,26282,28.0
48,49.0,Roseland,2.5,19.8,20.3,16.9,41.2,17949,52.0
52,53.0,West Pullman,3.3,25.9,19.4,20.5,42.1,16563,62.0
72,73.0,Washington Height,1.1,16.9,20.8,13.7,42.6,19713,48.0
17,18.0,Montclaire,8.1,15.3,13.8,23.5,38.6,22014,50.0


In [16]:
# Show the normalised column names of the socio-economic frame.
df_economic.columns

Index(['community_area_number', 'community_area_name',
       'percent_of_housing_crowded', 'percent_households_below_poverty',
       'percent_aged_16+_unemployed',
       'percent_aged_25+_without_high_school_diploma',
       'percent_aged_under_18_or_over_64', 'per_capita_income',
       'hardship_index'],
      dtype='object')

I only need the following columns from the socio-economic data to answer the questions given to us:
- community_area_number: ID of the area.
- community_area_name: name of the community.
- per_capita_income: income per capita.
- percent_households_below_poverty: percentage of households below the poverty line.
- hardship_index: index of hardship.

In [17]:
# Work on an independent copy of the socio-economic frame, restricted to the
# columns the analysis needs, and preview five random rows.
economic_columns_to_keep = [
    'community_area_number',
    'community_area_name',
    'percent_households_below_poverty',
    'per_capita_income',
    'hardship_index',
]
df_economic_copy = df_economic.copy()[economic_columns_to_keep]
df_economic_copy.sample(n=5)

Unnamed: 0,community_area_number,community_area_name,percent_households_below_poverty,per_capita_income,hardship_index
20,21.0,Avondale,15.3,20039,42.0
26,27.0,East Garfield Park,42.4,12961,83.0
71,72.0,Beverly,5.1,39523,12.0
5,6.0,Lake View,11.4,60058,5.0
36,37.0,Fuller Park,51.2,10432,97.0


### Check for null values

#### Null values observation
The only row with null values is the one named "CHICAGO", which has no "community_area_number" and no "hardship_index". Chicago itself is not a community area; it is divided into 77 distinct community areas, and the describe() output below confirms that the numbered rows run from 1 to 77. To address this, I will simply remove the "CHICAGO" row from the dataset.

In [18]:
# Show every row that has at least one missing value.
has_missing_value = df_economic_copy.isna().any(axis='columns')
df_economic_copy.loc[has_missing_value]

Unnamed: 0,community_area_number,community_area_name,percent_households_below_poverty,per_capita_income,hardship_index
77,,CHICAGO,19.7,28202,


In [19]:
# Summary statistics for the community area identifiers, shown as a one-column frame.
area_number_summary = df_economic_copy['community_area_number'].describe()
area_number_summary.to_frame()

Unnamed: 0,community_area_number
count,77.0
mean,39.0
std,22.371857
min,1.0
25%,20.0
50%,39.0
75%,58.0
max,77.0


In [20]:
# Drop every row that contains at least one NaN; per the null check earlier in
# this notebook, that is only the "CHICAGO" row.
# The result is bound back to the name instead of using inplace=True, so the
# cell does not mutate a frame derived from a column selection and stays chainable.
# NOTE(review): community_area_number remains a float column (e.g. 17.0) after
# this step — confirm that is what the downstream join on community area expects.
df_economic_copy = df_economic_copy.dropna(how='any')

In [21]:
# Confirm that no column of the socio-economic frame has missing values left.
df_economic_copy.isna().sum()

community_area_number               0
community_area_name                 0
percent_households_below_poverty    0
per_capita_income                   0
hardship_index                      0
dtype: int64

### Check for duplicates

#### Duplicates observation
There are 10 duplicate values in the "percent_households_below_poverty" column. However, it is acceptable in this case because different communities can have the same percentage of households below the poverty line. Therefore, I will keep these duplicates as they are without any issues.

In [22]:
# Count rows that are exact repeats of an earlier row across all columns.
repeated_rows = df_economic_copy.duplicated(keep='first')
repeated_rows.sum()

0

In [23]:
# For each column, count the values that repeat an earlier value in that
# column and format the count with thousands separators.
duplicates_per_column = df_economic_copy.apply(
    lambda column: format(column.duplicated().sum(), ','),
    axis='index',
)
duplicates_per_column

community_area_number                0
community_area_name                  0
percent_households_below_poverty    10
per_capita_income                    0
hardship_index                       0
dtype: object

In [24]:
# List every community whose poverty percentage is shared with at least one
# other community, ordered by that percentage.
shares_poverty_rate = df_economic_copy['percent_households_below_poverty'].duplicated(keep=False)
df_economic_copy.loc[shares_poverty_rate].sort_values(by='percent_households_below_poverty')

Unnamed: 0,community_area_number,community_area_name,percent_households_below_poverty,per_capita_income,hardship_index
4,5.0,North Center,7.5,57123,6.0
11,12.0,Forest Glen,7.5,44164,11.0
74,75.0,Morgan Park,13.2,27149,30.0
12,13.0,North Park,13.2,26576,33.0
31,32.0,Loop,14.7,65526,3.0
...,...,...,...,...,...
0,1.0,Rogers Park,23.6,23939,39.0
68,69.0,Greater Grand Crossing,29.6,17285,66.0
34,35.0,Douglas,29.6,23791,47.0
29,30.0,South Lawndale,30.7,10402,96.0


Now that we're finished, let's save the new CSV!

In [25]:
# Write the cleaned socio-economic frame to disk. to_csv is called with its
# defaults, so the DataFrame index is written as the first (unnamed) column.
economics_csv_path = './csv/chicago_economics.csv'
df_economic_copy.to_csv(economics_csv_path)