In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
import scipy.stats as st
from scipy.stats import linregress
import json
from datetime import datetime
import requests
import gmaps
import os
import seaborn as sns

# Import API key
from api_keys import g_key

# Load the yearly Chicago crime extracts (one CSV file per year).
crimes_19 = pd.read_csv('input_data/Chicago_Crimes_2019.csv')
crimes_20 = pd.read_csv('input_data/Chicago_Crimes_2020.csv')
crimes_21 = pd.read_csv('input_data/Chicago_Crimes_2021.csv')

# Stack the three yearly extracts into a single dataframe.
# pd.concat appends the rows directly; the previous nested outer merge joined on
# every shared column just to achieve the same row-wise union, which is far more
# expensive and can silently collapse or multiply rows that happen to match.
crimes_data = pd.concat([crimes_19, crimes_20, crimes_21], ignore_index=True)

print(len(crimes_data))


In [None]:
# Parse the Date column as datetimes, then derive weekday name, day of month,
# month name, year, time of day, hour bucket ("HH:00") and AM/PM columns from it
crimes_data["Date"] = pd.to_datetime(crimes_data["Date"])
date_parts = crimes_data["Date"].dt
crimes_data["Day"] = date_parts.day_name()
crimes_data["Month Day"] = date_parts.day
crimes_data["Month"] = date_parts.month_name()
crimes_data["Year"] = date_parts.year
crimes_data["Time"] = date_parts.time
crimes_data["Hour"] = date_parts.strftime('%H') + ':00'
crimes_data["AM_PM"] = date_parts.strftime("%p")

# shape (rows, columns) before cleansing
print(crimes_data.shape)

# cleanse the dataframe: discard every row that has a NaN in any column
crimes_data = crimes_data.dropna(how="any")

# shape (rows, columns) after cleansing
print(crimes_data.shape)

# # temporarily reducing the size of the dataframe to run it quicker
# crimes_data = crimes_data.head(10000)

crimes_data

In [None]:
# Show the column labels of crimes_data (original columns plus the date-derived
# columns added in the previous cell)
crimes_data.columns

## Section I

In this section we analyzed the types of crimes committed: which crimes were most commonly committed, which were least commonly committed, and how the overall number of crimes trended from 2019 to 2021.

In [None]:
# Count how many incidents fall under each primary crime type (most frequent first)
crime_type_overall = crimes_data.groupby(["Primary Type"])
type_counts = crimes_data["Primary Type"].value_counts()
overall_crime_type_df = type_counts.to_frame()
overall_crime_type_df

In [None]:
# Label the single count column "Crime Count".
# The column produced by value_counts() is named "Primary Type" on pandas < 2.0
# but "count" on pandas >= 2.0, so renaming by the old name silently did nothing
# on newer versions. Setting the label positionally works on both.
overall_crime_type_df = overall_crime_type_df.set_axis(["Crime Count"], axis=1)
overall_crime_type_df


In [None]:
# Build the label/count lists for the ten most common crime types.
# They are derived from the data instead of being typed in by hand, so they
# cannot drift out of sync with crimes_data (the hand-typed list also carried
# the misspelled label "Motor Vehicale Theft").
top_crime_counts = crimes_data["Primary Type"].value_counts().head(10)
# Primary Type values are upper-case in the data; title-case them for display
top_ten_crimes = top_crime_counts.index.str.title().tolist()
crime_count = top_crime_counts.tolist()
x_axis = np.arange(len(crime_count))

In [None]:
# Plot the ten most commonly committed crimes in Chicago
plt.bar(x_axis, crime_count, color="b", align="center")
tick_locations = [value for value in x_axis]
plt.xticks(tick_locations, top_ten_crimes, rotation="vertical")
plt.title("Top Ten Most Common Crimes over the Past Three Years")
plt.xlabel("Crime Categories")
plt.ylabel("Number of Criminal Acts Committed")

# Lay out and save BEFORE showing: after plt.show() the current figure is
# released, so a later savefig() writes an empty image, and tight_layout()
# would no longer affect the saved file (the vertical tick labels get clipped).
plt.tight_layout()
plt.savefig("Images/top_crimes_bar.png")
plt.show()

In [None]:
# Build the label/count lists for the ten least common crime types.
# They are derived from the data instead of being typed in by hand, so they
# always match crimes_data.
least_crime_counts = crimes_data["Primary Type"].value_counts().tail(10)
# Primary Type values are upper-case in the data; title-case them for display
least_ten_crimes = least_crime_counts.index.str.title().tolist()
least_crime_count = least_crime_counts.tolist()
x_axis = np.arange(len(least_crime_count))

In [None]:
# Plot the ten least commonly committed crimes in Chicago
plt.bar(x_axis, least_crime_count, color="r", align="center")
tick_locations = [value for value in x_axis]
plt.xticks(tick_locations, least_ten_crimes, rotation="vertical")
plt.title("Least Common Crimes over the Past Three Years")
plt.xlabel("Crime Categories")
plt.ylabel("Number of Criminal Acts Committed")

# Lay out and save BEFORE showing: after plt.show() the current figure is
# released, so a later savefig() writes an empty image, and tight_layout()
# would no longer affect the saved file (the vertical tick labels get clipped).
plt.tight_layout()
plt.savefig("Images/least_crimes_bar.png")
plt.show()

In [None]:
# Count the distinct incident IDs recorded in each of the three years
number_crimes = crimes_data.groupby("Year")
unique_ids_per_year = number_crimes["ID"].nunique()
yearly_crimes_df = unique_ids_per_year.to_frame()
yearly_crimes_df

In [None]:
# Give the ID count column a descriptive label
column_labels = {"ID": "Crime Count"}
yearly_crimes_df = yearly_crimes_df.rename(columns=column_labels)
yearly_crimes_df


In [None]:
# Plot total crimes committed each year in Chicago.
# The yearly totals are computed from crimes_data (distinct incident IDs per
# year) rather than typed in by hand, so the chart always matches the data.
yearly_totals = crimes_data.groupby("Year")["ID"].nunique()
x = yearly_totals.index.tolist()
y = yearly_totals.tolist()
labels = [str(year) for year in x]

# Plotting x-axis and y-axis
plt.plot(x, y)

# naming of x-axis and y-axis
plt.xlabel("Year")
plt.ylabel("Number of Criminal Acts Committed")

# naming the title of the plot
plt.title("Yearly Reported Crime Totals")

# one tick per year, labelled with the year (avoids fractional-year ticks)
plt.xticks(x, labels)
plt.savefig("Images/yearly_crime_line.png")
plt.show()


In [None]:
# Use groupby to count incidents of each primary crime type within every year
crime_type = crimes_data.groupby("Year")
type_counts_by_year = crime_type["Primary Type"].value_counts()
crime_type_df = type_counts_by_year.to_frame()

crime_type_df


In [None]:
# Label the single count column "Crime Count".
# The column produced by value_counts() is named "Primary Type" on pandas < 2.0
# but "count" on pandas >= 2.0, so renaming by the old name silently did nothing
# on newer versions and the later "Crime Count" lookups failed. Setting the
# label positionally works on both.
crime_type_df = crime_type_df.set_axis(["Crime Count"], axis=1)
crime_type_df


In [None]:
# Grand total of incidents across every year / crime-type combination
crime_type_df.loc[:, "Crime Count"].sum()

In [None]:
# Order the year / crime-type counts from largest to smallest (descending)
most_common_crimes = crime_type_df.sort_values(by="Crime Count", ascending=False)
most_common_crimes.head(30)

## Section II

# Total number of crimes and Arrests
1. Both the total number of crimes and the total number of arrests in Chicago decreased over the last three years.
2. The arrest percentage also decreased over the last three years.
3. Most crimes did not result in an arrest. In aggregate, only about 17% of crimes resulted in an arrest.
4. Between 2020 and 2021, total crimes did not change substantially, but the number of arrests decreased considerably.

In [None]:
# Build one dataframe per year holding only the incidents that led to an arrest
arrest_mask = crimes_data["Arrest"] == True
arrest19_df, arrest20_df, arrest21_df = (
    pd.DataFrame(crimes_data[(crimes_data["Year"] == year) & arrest_mask])
    for year in (2019, 2020, 2021)
)

arrest19_df.head()

In [None]:
# Total arrests per year: number of rows in each year's arrest-only dataframe
total_arrest_19 = len(arrest19_df["ID"])
total_arrest_20 = len(arrest20_df["ID"])
total_arrest_21 = len(arrest21_df["ID"])

# Total crimes per year: number of rows of crimes_data recorded in that year
total_crimes_19 = len(crimes_data[crimes_data["Year"] == 2019]["ID"])
total_crimes_20 = len(crimes_data[crimes_data["Year"] == 2020]["ID"])
total_crimes_21 = len(crimes_data[crimes_data["Year"] == 2021]["ID"])

# Print the number of arrests and crimes for each year.
# The year labels previously printed as " 019", " 020", " 021" (leading "2" missing).
print(f"2019 Arrest = {total_arrest_19}, 2020 Arrest = {total_arrest_20}, 2021 Arrest = {total_arrest_21}")
print(f"2019 crime = {total_crimes_19}, 2020 crime = {total_crimes_20}, 2021 crime = {total_crimes_21}")


In [None]:
# Grouped bar chart: total arrests next to total crimes for each year

# one group of bars per year
labels = ["2019", "2020", "2021"]
x = np.arange(len(labels))

# bar heights, in the same order as the year labels
Arrests = [total_arrest_19, total_arrest_20, total_arrest_21]
Total_crimes = [total_crimes_19, total_crimes_20, total_crimes_21]

width = 0.3  # width of each bar
half_width = width / 2

fig, ax = plt.subplots()
# arrests sit left of the tick, crimes right of it
rects1 = ax.bar(x - half_width, Arrests, width, label='Total Arrests')
rects2 = ax.bar(x + half_width, Total_crimes, width, label='Total Crimes')

# axis label, title, year tick labels and legend
ax.set(ylabel='Crimes', title='Crimes in Chicago')
ax.set_xticks(x, labels)
ax.legend()

# print the value above every bar
for bar_group in (rects1, rects2):
    ax.bar_label(bar_group, padding=3)

fig.tight_layout()
fig.savefig("Images/chicago_crime_arrest.png")
plt.show()

In [None]:
# Creating Bar plot for total crimes and arrests
# NOTE(review): this cell repeats the preceding bar-plot cell statement for
# statement and writes to the same image path, so it redraws and overwrites the
# identical figure. Consider removing one of the two cells.

#define lists of variable values for bar plot
Arrests = [total_arrest_19,total_arrest_20,total_arrest_21]
Total_crimes = [total_crimes_19,total_crimes_20,total_crimes_21]

# one group of bars per year; x holds the group positions 0..2
labels = ["2019", "2020", "2021"]
x = np.arange(len(labels))

width = 0.3  #width of bar plot
fig, ax = plt.subplots()
# arrests are drawn half a bar-width left of each tick, crimes half a bar-width right
rects1 = ax.bar(x - width/2, Arrests, width, label='Total Arrests')
rects2 = ax.bar(x + width/2, Total_crimes, width, label='Total Crimes')

#Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Crimes')
ax.set_title('Crimes in Chicago')
ax.set_xticks(x, labels)
ax.legend()

# annotate every bar with its value
ax.bar_label(rects1, padding=3)
ax.bar_label(rects2, padding=3)

# save before showing so the written image contains the drawn figure
fig.tight_layout()
plt.savefig("Images/chicago_crime_arrest.png")
plt.show()


In [None]:
# Percentage of crimes that resulted in an arrest, per year
p_arrest2019 = 100*(total_arrest_19/total_crimes_19)
p_arrest2020 = 100*(total_arrest_20/total_crimes_20)
p_arrest2021 = 100*(total_arrest_21/total_crimes_21)

# Aggregate arrest percentage over all three years.
# The crime total previously added the 2020 count twice and left 2021 out,
# which skewed the aggregate percentage.
total_arrests = total_arrest_19 + total_arrest_20 + total_arrest_21
total_crimes = total_crimes_19 + total_crimes_20 + total_crimes_21
p_total_arrest = 100*(total_arrests/total_crimes)

# one bar per year plus one for the three-year aggregate
labels = [2019, 2020, 2021, "Aggregate"]

# bar heights, in the same order as the labels
y = [p_arrest2019, p_arrest2020, p_arrest2021, p_total_arrest]

x = np.arange(len(labels))
fig, ax = plt.subplots()
plot1 = ax.bar(x, y, width = 0.4, color = 'r')
ax.set_ylabel("% Arrest")
ax.set_title('Percentage arrest in Chicago')
ax.set_xticks(x, labels)
# annotate every bar with its percentage value
ax.bar_label(plot1)
plt.savefig("Images/Percentage_arrests.png")

In [None]:
# Domestic vs non-domestic: count how many incidents are flagged as domestic

# total number of incidents
total_crimes_no = len(crimes_data.index)

# incidents flagged as domestic
domestic_ids = crimes_data.loc[crimes_data["Domestic"] == True, "ID"]
domestic_crimes_no = len(domestic_ids)

# everything else is non-domestic
non_domestic = total_crimes_no - domestic_crimes_no

In [None]:
# Pie chart: share of domestic vs non-domestic incidents among all crimes

sizes = [domestic_crimes_no, non_domestic]
labels = ["Domestic", "Non-domestic"]

fig, ax = plt.subplots()
# each wedge is annotated with its percentage, one decimal place
ax.pie(sizes, labels=labels, autopct='%1.1f%%')
fig.savefig("Images/Pie_Domestic_vs_nonDomestic.png")

## Chi-square test
1. Null hypothesis: "The total number of crimes/arrests is the same in each year."
2. Alternative hypothesis: "The total number of crimes/arrests is not the same in each year."

In [None]:
# Set up a chi-square goodness-of-fit test on total crimes and total arrests for
# the past three years, to see whether the year-to-year changes are significant.

# Observed values: the actual yearly counts.
# The arrest counts are used as-is. They were previously reduced by 1 so the
# total would divide evenly by 3, but expected frequencies may be fractional,
# so altering the observed data is unnecessary.
crimes_observed = [total_crimes_19, total_crimes_20, total_crimes_21]
arrest_observed = [total_arrest_19, total_arrest_20, total_arrest_21]

# Expected values: under the null hypothesis (no difference between years) every
# year is expected to have the three-year average.
crimes_total = sum(crimes_observed)
crimes_expected = [crimes_total / 3] * 3

arrest_total = sum(arrest_observed)
arrest_expected = [arrest_total / 3] * 3

# Critical value: 3 categories -> 3 - 1 = 2 degrees of freedom; a significance
# level of 0.05 corresponds to the 0.95 quantile of the chi-square distribution.
critical_value = st.chi2.ppf(q = 0.95, df = 2)

print(f"total crimes = {crimes_total}, total arrests = {arrest_total}")
# printed explicitly: a bare expression in the middle of a cell is not displayed
print(f"critical value = {critical_value}")


In [None]:
# Chi-square goodness-of-fit test on the yearly 'total crimes' counts

chi_square = st.chisquare(f_obs=crimes_observed, f_exp=crimes_expected)
chi_square


In [None]:
# Chi-square goodness-of-fit test on the yearly "total arrests" counts
chi_square = st.chisquare(f_obs=arrest_observed, f_exp=arrest_expected)
chi_square


## Chi_square conclusion
1. The chi-square statistics are much larger than the critical value for both the number of crimes and the number of arrests, so the changes in the number of crimes and in the number of arrests are statistically significant.

## Section III

In this section we analyzed which hours are the least dangerous and the most dangerous in Chicago over the three-year period. We also wanted a visual representation of homicides and concealed carry license violations near the Loop and Millennium Park area.

In [None]:
# Count incidents per hour bucket, relabel midnight ("00:00") as "24:00" so it
# sorts to the end of the day, and order the rows by hour label
time_analysis = (
    crimes_data[["ID", 'Hour']]
    .groupby('Hour')['ID']
    .count()
    .reset_index()
    .replace(to_replace=["00:00"], value="24:00")
    .sort_values(by=['Hour'])
    .reset_index(drop=True)
)

# bar chart with one bar per hour of the day
plot_options = dict(
    kind='bar',
    x="Hour",
    y="ID",
    xlabel="Time of the day in military format",
    ylabel="Total Number of Crimes",
    label='Crimes Committed',
    figsize=(10, 10),
)
time_analysis.plot(**plot_options)
plt.xticks(rotation=45)
# chart title
plt.title('Crimes per Time of the Day (3 years)')

plt.show()


In [None]:
# Map every CONCEALED CARRY LICENSE VIOLATION from the last 3 years.
# We are interested in the Loop / Millennium Park area.

# Configure gmaps
gmaps.configure(api_key=g_key)

# keep only the concealed carry license violation incidents
ccl_violations = crimes_data[crimes_data["Primary Type"] == "CONCEALED CARRY LICENSE VIOLATION"]

# one info-box text per incident, built from its description
crime_desc = ccl_violations["Description"].tolist()
info_boxes = [f"Crime Description: {desc}" for desc in crime_desc]

# latitude/longitude pair of each incident
marker_locations = ccl_violations[["Latitude", "Longitude"]]

# drop one marker per incident onto a new map and display it
fig = gmaps.figure()
markers = gmaps.marker_layer(marker_locations, info_box_content=info_boxes)
fig.add_layer(markers)
fig

In [None]:
# Map every HOMICIDE from the last 3 years.
# We are interested in the Loop / Millennium Park area.

# Configure gmaps
gmaps.configure(api_key=g_key)

# keep only the homicide incidents
homicides = crimes_data[crimes_data["Primary Type"] == "HOMICIDE"]

# one info-box text per incident, built from its description
crime_desc = homicides["Description"].tolist()
info_boxes = [f"Crime Description: {desc}" for desc in crime_desc]

# latitude/longitude pair of each incident
marker_locations = homicides[["Latitude", "Longitude"]]

# drop one marker per incident onto a new map and display it
fig = gmaps.figure()
markers = gmaps.marker_layer(marker_locations, info_box_content=info_boxes)
fig.add_layer(markers)
fig

## Section IV

We analyzed the total number of crimes per district from 2019 to 2021. The rough map of Chicago below makes it possible to visualize how Chicago is divided. The district with the most crimes reported during this period was Jefferson Park, on the North Side of Chicago. Comparing 2019 with 2021 in the Jefferson Park district, the number of crimes decreased by 39.9%.

In [None]:
# Bar chart of the number of incidents recorded in each year
sns.countplot(data=crimes_data, x='Year').set_ylabel('No of Crimes')
plt.show()

In [None]:
# Drop rows with NaN values, then keep only incidents whose Y coordinate is at
# least 1,000,000
cleandata = crimes_data.dropna()
y_in_range = cleandata['Y Coordinate'].ge(1000000)
df = cleandata[y_in_range]
df

In [None]:
# Approximate a map of Chicago: scatter every incident by its X/Y coordinates,
# coloured by district, with no regression line
point_style = {"marker": "o", "s": 10}
sns.lmplot(
    data=df,
    x="X Coordinate",
    y="Y Coordinate",
    hue="District_name",
    fit_reg=False,
    palette='colorblind',
    height=5,
    scatter_kws=point_style,
)
ax = plt.gca()
title_font = {'fontsize': 15}
ax.set_title("Map of Chicago\n", fontdict=title_font, weight="bold")
plt.show()

In [None]:
# Incidents per district for 2019-2021, keeping the 15 districts with the most
crimes_per_district = crimes_data.groupby('District_name')['ID'].count()
temp = crimes_per_district.sort_values(ascending=False).head(15)
temp

In [None]:
# Bar chart of the total incidents per district from 2019 to 2021
temp.plot.bar(color='green')
plt.ylabel('No of Crimes')
plt.show()

In [None]:
# Per-district summary for 2019: district population and number of incidents
crimes_in_2019 = crimes_data.loc[crimes_data['Year'] == 2019]
grouped_district_df = crimes_in_2019.groupby(["District_name"])

# first population value seen for each district
totalpopulation = grouped_district_df["District_population"].first()

# number of incident rows per district
totalcrimes2019 = grouped_district_df["ID"].count()

summary_columns = {
    "District Population": totalpopulation,
    "Total Crimes 2019": totalcrimes2019,
}
district_summary_2019 = pd.DataFrame(summary_columns)

district_summary_2019.loc[:, ["District Population", "Total Crimes 2019"]]


In [None]:
# Per-district summary for 2020: district population and number of incidents
crimes_in_2020 = crimes_data.loc[crimes_data['Year'] == 2020]
grouped_district_df = crimes_in_2020.groupby(["District_name"])

# first population value seen for each district
totalpopulation = grouped_district_df["District_population"].first()

# number of incident rows per district
totalcrimes2020 = grouped_district_df["ID"].count()

summary_columns = {
    "District Population": totalpopulation,
    "Total Crimes 2020": totalcrimes2020,
}
district_summary_2020 = pd.DataFrame(summary_columns)

district_summary_2020.loc[:, ["District Population", "Total Crimes 2020"]]

In [None]:
# Per-district summary for 2021: district population and number of incidents
crimes_in_2021 = crimes_data.loc[crimes_data['Year'] == 2021]
grouped_district_df = crimes_in_2021.groupby(["District_name"])

# first population value seen for each district
totalpopulation = grouped_district_df["District_population"].first()

# number of incident rows per district
totalcrimes2021 = grouped_district_df["ID"].count()

summary_columns = {
    "District Population": totalpopulation,
    "Total Crimes 2021": totalcrimes2021,
}
district_summary_2021 = pd.DataFrame(summary_columns)

district_summary_2021.loc[:, ["District Population", "Total Crimes 2021"]]

In [None]:
# Combine the per-district crime totals of the three years into one table,
# joining on the district index (only districts present in all three are kept)
totals_19_20 = pd.merge(
    district_summary_2019["Total Crimes 2019"],
    district_summary_2020["Total Crimes 2020"],
    left_index=True,
    right_index=True,
)
disctrict_summary_by_year = pd.merge(
    totals_19_20,
    district_summary_2021["Total Crimes 2021"],
    left_index=True,
    right_index=True,
)

disctrict_summary_by_year

In [None]:
# Grouped bar chart: incidents per district, one bar per year
district_year_counts = crimes_data.groupby(['District_name', 'Year'])['ID'].count()
# move Year from the row index into columns so each year becomes its own bar series
district_year_counts.unstack().plot.bar()
plt.ylabel('No of crimes')
plt.show()

In [None]:
# Location attributes - the five most frequent crime types in every district

# number of incidents for each (district, crime type) pair
district_type_counts = (
    crimes_data.groupby(['District_name', 'Primary Type'])
    .size()
    .reset_index(name='counts')
)

# Keep the five largest counts per district. Sorting first and taking
# groupby(...).head(5) replaces the previous groupby(...).apply(...), which left
# 'District_name' as both an index level and a column (ambiguous for later
# grouping/sorting) and relies on apply operating on the grouping column, which
# newer pandas versions deprecate. Row order (district, then descending count)
# is the same as before.
top = (
    district_type_counts
    .sort_values(['District_name', 'counts'], ascending=[True, False])
    .groupby('District_name')
    .head(5)
    .reset_index(drop=True)
)

# one bar chart per district, three charts per row
g = sns.catplot(x='Primary Type', y='counts', col="District_name", col_wrap=3,
                data=top, kind='bar')
# show rotated x tick labels on every facet, not just the bottom row
for ax in g.axes.flat:
    ax.tick_params(labelbottom=True, labelrotation=45)
plt.subplots_adjust(hspace=0.4)