In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import collections

from wordcloud import WordCloud, STOPWORDS
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw crime reports export into a DataFrame.
df = pd.read_csv('Crime_Reports_Austin.csv')

In [3]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,Incident Number,Highest Offense Description,Highest Offense Code,Family Violence,Occurred Date Time,Occurred Date,Occurred Time,Report Date Time,Report Date,Report Time,...,Y-coordinate,Latitude,Longitude,Location,Zip Codes,Single Member Council Districts,BOUNDARIES_single_member_districts,Zoning Review Cases_data,Neighborhood Planning Areas,Boundaries: City of Austin Council Districts
0,20121171927,RECKLESS DAMAGE,1401,N,04/26/2012 10:34:00 PM,04/26/2012,2234.0,04/26/2012 11:04:00 PM,04/26/2012,2304.0,...,,,,,,,,,,
1,2006471156,FAMILY DISTURBANCE,3400,N,02/16/2006 02:25:00 PM,02/16/2006,1425.0,02/16/2006 02:25:00 PM,02/16/2006,1425.0,...,,,,,,,,,,
2,20173300229,FAMILY DISTURBANCE,3400,N,11/26/2017 07:43:00 AM,11/26/2017,743.0,11/26/2017 07:43:00 AM,11/26/2017,743.0,...,,,,,,,,,,
3,20045044338,TAMPERING WITH ID NUMBER,2719,N,09/14/2004 03:32:00 PM,09/14/2004,1532.0,09/14/2004 03:32:00 PM,09/14/2004,1532.0,...,,,,,,,,,,
4,2006960811,FAMILY DISTURBANCE,3400,N,04/06/2006 10:29:00 AM,04/06/2006,1029.0,04/06/2006 10:29:00 AM,04/06/2006,1029.0,...,,,,,,,,,,


In [4]:
# Summarize the raw frame: row count, column names and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2377481 entries, 0 to 2377480
Data columns (total 33 columns):
 #   Column                                        Dtype  
---  ------                                        -----  
 0   Incident Number                               int64  
 1   Highest Offense Description                   object 
 2   Highest Offense Code                          int64  
 3   Family Violence                               object 
 4   Occurred Date Time                            object 
 5   Occurred Date                                 object 
 6   Occurred Time                                 float64
 7   Report Date Time                              object 
 8   Report Date                                   object 
 9   Report Time                                   float64
 10  Location Type                                 object 
 11  Address                                       object 
 12  Zip Code                                      float64
 1

**Check for all the missing values and count them.**

In [5]:
# Missing-value count per column of the raw frame, largest first.
df.isna().sum().sort_values(ascending=False)

Category Description                            1504295
UCR Category                                    1504295
Zoning Review Cases_data                        1139950
Neighborhood Planning Areas                      693396
Clearance Status                                 606136
Clearance Date                                   337704
Single Member Council Districts                   45688
BOUNDARIES_single_member_districts                43992
Boundaries: City of Austin Council Districts      43844
Council District                                  41985
Zip Codes                                         37606
Latitude                                          37525
Location                                          37525
Longitude                                         37525
Location Type                                     19007
Census Tract                                      17996
Zip Code                                           9922
PRA                                             

**Let's select only those columns that we need.**

In [6]:
# Columns kept for the analysis: incident identity, offense, time and location fields.
column = [
    'Incident Number',
    'Highest Offense Code',
    'Highest Offense Description',
    'Occurred Date Time',
    'Occurred Date',
    'Address',
    'Zip Code',
    'Location Type',
    'X-coordinate',
    'Y-coordinate',
    'Longitude',
    'Latitude',
    'Council District',
]

# Take an explicit copy: df1 is modified later (rows dropped, columns reassigned),
# and those edits should act on an independent frame rather than on a selection
# of df, which would raise SettingWithCopy warnings.
df1 = df.loc[:, column].copy()
df1.head()

Unnamed: 0,Incident Number,Highest Offense Code,Highest Offense Description,Occurred Date Time,Occurred Date,Address,Zip Code,Location Type,X-coordinate,Y-coordinate,Longitude,Latitude,Council District
0,20121171927,1401,RECKLESS DAMAGE,04/26/2012 10:34:00 PM,04/26/2012,3154 HWY 71 E,,RESIDENCE / HOME,,,,,
1,2006471156,3400,FAMILY DISTURBANCE,02/16/2006 02:25:00 PM,02/16/2006,7000 DECKER 1422,,RESIDENCE / HOME,,,,,
2,20173300229,3400,FAMILY DISTURBANCE,11/26/2017 07:43:00 AM,11/26/2017,13204 LIPTON LP,,RESIDENCE / HOME,,,,,
3,20045044338,2719,TAMPERING WITH ID NUMBER,09/14/2004 03:32:00 PM,09/14/2004,3301 CR 100,,,,,,,
4,2006960811,3400,FAMILY DISTURBANCE,04/06/2006 10:29:00 AM,04/06/2006,5005 W FRANCES PL,,RESIDENCE / HOME,,,,,


In [7]:
# Missing-value count per selected column, largest first.
df1.isna().sum().sort_values(ascending=False)

Council District               41985
Longitude                      37525
Latitude                       37525
Location Type                  19007
Zip Code                        9922
X-coordinate                    5969
Y-coordinate                    5962
Occurred Date Time               137
Address                           13
Incident Number                    0
Highest Offense Code               0
Highest Offense Description        0
Occurred Date                      0
dtype: int64

In [8]:
# (rows, columns) of the column subset.
df1.shape

(2377481, 13)

Compared to the roughly 2.4 million rows in total, the number of rows with missing values is small.
We'll drop every row that has a missing value in any of the selected columns.

In [9]:
# Keep only the rows that have a value in every selected column.
df1 = df1.dropna()

In [10]:
# Percentage of the raw rows that remain in df1, computed from the frames
# themselves so the figure stays correct if the input file changes.
# Assumes `df` is still the raw frame loaded from Crime_Reports_Austin.csv.
retained_pct = round(len(df1) / len(df) * 100, 2)
print(retained_pct)

96.22


We have retained around 96% of the data even after dropping the rows with missing values.

Drop all duplicated entries, identified by 'Incident Number'.

In [11]:
# One row per incident: for repeated incident numbers only the first row is kept.
df1 = df1.drop_duplicates(subset=['Incident Number'])

To compare dates, we need to change the dtype of the columns 'Occurred Date Time' and 'Occurred Date' from object to datetime.
Similarly, we change the dtype of 'Zip Code' and 'Council District' to int.

In [12]:
# Zip code and council district are whole numbers; rows with missing values
# were dropped earlier, so the integer cast cannot hit NaN.
df1['Zip Code'] = df1['Zip Code'].astype(int)
df1['Council District'] = df1['Council District'].astype(int)

# Parse the date strings with explicit formats (e.g. "04/26/2012" and
# "04/26/2012 10:34:00 PM") instead of relying on format inference, which is
# slower on millions of rows and can guess the day/month order.
df1['Occurred Date'] = pd.to_datetime(df1['Occurred Date'], format='%m/%d/%Y')
df1['Occurred Date Time'] = pd.to_datetime(df1['Occurred Date Time'],
                                           format='%m/%d/%Y %I:%M:%S %p')

Let's convert the upper-cased text columns to title case.

In [13]:
# Title-case the free-text columns (e.g. "FAMILY DISTURBANCE" -> "Family Disturbance").
for text_col in ('Highest Offense Description', 'Location Type', 'Address'):
    df1[text_col] = df1[text_col].str.title()

Now, extract the month name, year, weekday name (stored in the 'Week' column) and day of month from the occurrence timestamp.

In [14]:
# Derive calendar parts from the occurrence timestamp.
occurred = df1['Occurred Date Time'].dt
df1['Month'] = occurred.strftime('%B')   # full month name
df1['Year'] = occurred.strftime('%Y')    # four-digit year, stored as text
df1['Week'] = occurred.day_name()        # weekday name, despite the column name
df1["DayOfMonth"] = occurred.day

In [15]:
# Spot-check five random rows; no random_state is passed, so the rows differ on each run.
df1.sample(5)

Unnamed: 0,Incident Number,Highest Offense Code,Highest Offense Description,Occurred Date Time,Occurred Date,Address,Zip Code,Location Type,X-coordinate,Y-coordinate,Longitude,Latitude,Council District,Month,Year,Week,DayOfMonth
1451727,2018111036,600,Theft,2018-01-11 16:12:00,2018-01-11,9300 S Ih 35 Svrd Sb,78748,Department / Discount Store,3100423.0,10033116.0,-97.78952,30.165488,5,January,2018,Thursday,11
449326,20043400525,1000,Forgery And Passing,2004-12-05 02:00:00,2004-12-05,9401 N Ih 35 Svrd Nb,78753,Service/ Gas Station,3131125.0,10103885.0,-97.687071,30.358051,4,December,2004,Sunday,5
1180625,20132131791,3401,Disturbance - Other,2013-08-01 21:29:00,2013-08-01,5203 Stone Gate Dr,78721,Residence / Home,3135260.0,10076707.0,-97.676022,30.283073,1,August,2013,Thursday,1
1047337,2017140673,3400,Family Disturbance,2017-01-14 11:59:00,2017-01-14,11000 Manchaca Rd,78748,Residence / Home,3086499.0,10031033.0,-97.833717,30.160629,5,January,2017,Saturday,14
557605,20193170066,3400,Family Disturbance,2019-11-13 01:14:00,2019-11-13,4912 Pepper Ln,78744,Residence / Home,3111905.0,10047072.0,-97.752173,30.203123,2,November,2019,Wednesday,13


Replace offense descriptions that refer to the same kind of offense with a single common description.

In [65]:
# Canonical offense label -> the raw (title-cased) descriptions folded into it.
canonical_offenses = {
    'Aggravated Assault': ['Agg Assault',
                           'Agg Assault Fam/Date Violence',
                           'Agg Aslt Strangle/Suffocate'],
    'Burglary Of Residence': ['Burg Of Res - Fam/Dating Aslt',
                              'Att Burglary Of Residence'],
    'Theft': ['Theft-No Suspect/Followup'],
    'Burglary Of Vehicle': ['Burglary Of Veh-No Suspect/Fu'],
    'Assault With Injury': ['Assault W/Injury-Fam/Date Viol'],
    'Assault By Threat': ['Assault By Threat Fam/Dating'],
    'Assault By Contact': ['Assault By Contact Fam/Dating'],
    'Forgery And Passing': ['Forgery By Alteration'],
    'Debit/Credit Card Abuse': ['Cred Card Abuse - Other',
                                'Debit Card Abuse'],
    'Criminal Mischief': ['Criminal Mischief-No Suspect'],
    'Narcotic Possession': ['Poss Controlled Sub/Narcotic',
                            'Poss Controlled Sub/Syn Narc',
                            'Poss Controlled Sub/Other',
                            'Del Controlled Sub/Narcotic',
                            'Del Controlled Sub/Syn Narc',
                            'Del Controlled Sub/Other'],
    'DWI': ['Dwi',
            'Dwi 2Nd',
            'Dwi .15 Bac Or Above'],
    'Criminal Trespass': ['Criminal Trespass/Transient',
                          'Criminal Trespass/Hotel'],
    'Terroristic Threat': ['Terroristic Threat-Fam/Dat Vio'],
    'Aggravated Robbery': ['Agg Robbery/Deadly Weapon',
                           'Robbery By Assault'],
    'Violation City Ordinance': ['Viol City Ordinance - Other'],
    'Possession of Drug': ['Poss Of Drug Paraphernalia'],
    'Family Disturbance': ['Family Disturbance/Parental'],
}

# Flatten into the {raw description: canonical label} form that Series.replace expects.
mapping = {
    raw_label: canonical
    for canonical, raw_labels in canonical_offenses.items()
    for raw_label in raw_labels
}

# Descriptions that are not listed above are left unchanged.
df1['Highest Offense Description'] = df1['Highest Offense Description'].replace(mapping)

Let's export the cleaned data to a new csv file.

In [20]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
df1.to_csv("crimedata_cleaned.csv", index=False)

In [21]:
# Reload the cleaned data. This rebinds `df`, which until now held the raw frame.
# NOTE(review): no parse_dates is passed, so the 'Occurred Date' and
# 'Occurred Date Time' columns come back as plain strings rather than datetimes.
df = pd.read_csv("crimedata_cleaned.csv")

In [22]:
# Preview the first rows of the cleaned data.
df.head()

Unnamed: 0,Incident Number,Highest Offense Code,Highest Offense Description,Occurred Date Time,Occurred Date,Address,Zip Code,Location Type,X-coordinate,Y-coordinate,Longitude,Latitude,Council District,Month,Year,Week,DayOfMonth
0,20228022672,601,Burglary Of Vehicle,2022-12-01 11:30:00,2022-12-01,1921 Willow Creek Dr,78741,Residence / Home,3119469.0,10058001.0,-97.727425,30.23268,3,December,2022,Thursday,1
1,20225033150,504,Burglary Of Shed/Detached Garage/Storage Unit,2022-11-07 20:00:00,2022-11-07,4113 Avenue F,78751,Parking /Drop Lot/ Garage,3118205.0,10084145.0,-97.729489,30.304627,9,November,2022,Monday,7
2,20228022553,619,Theft Of Metal,2022-12-01 23:27:00,2022-12-01,300 W 6Th St,78701,Commercial / Office Building,3113404.0,10071112.0,-97.745659,30.26911,9,December,2022,Thursday,1
3,20228022692,2405,Doc Unreasonable Noise,2022-12-03 05:00:00,2022-12-03,11704 N Lamar Blvd,78758,Residence / Home,3131678.0,10114301.0,-97.684531,30.386647,7,December,2022,Saturday,3
4,20225031961,600,Theft,2022-10-29 23:00:00,2022-10-29,708 E 6Th St,78701,Residence / Home,3116608.0,10070123.0,-97.735585,30.266186,9,October,2022,Saturday,29


In [28]:
# Store longitude and latitude as text.
# NOTE(review): numeric consumers will need these cast back to float — confirm this is intended.
for geo_col in ('Longitude', 'Latitude'):
    df[geo_col] = df[geo_col].astype(str)

In [33]:
# Cast the X/Y coordinates to integers.
df = df.astype({'X-coordinate': int, 'Y-coordinate': int})

In [37]:
# Select every row whose (X, Y) coordinate pair occurs more than once, then
# sort so that rows sharing a location sit next to each other.
xy_cols = ['X-coordinate', 'Y-coordinate']
shares_location = df.duplicated(xy_cols, keep=False)
a = df[shares_location].sort_values(by=xy_cols)
a.head()

Unnamed: 0,Incident Number,Highest Offense Code,Highest Offense Description,Occurred Date Time,Occurred Date,Address,Zip Code,Location Type,X-coordinate,Y-coordinate,Longitude,Latitude,Council District,Month,Year,Week,DayOfMonth
152038,20205050355,1199,Fraud - Other,2020-11-27 12:00:00,2020-11-27,13009 On The Lake Rd,78732,Residence / Home,3054600,10091719,-97.93047886,30.32938286,6,November,2020,Friday,27
1243861,20065058050,2704,Terroristic Threat,2006-10-13 12:29:00,2006-10-13,13009 On The Lake Rd,78732,Residence / Home,3054600,10091719,-97.93047886,30.32938286,6,October,2006,Friday,13
68639,20175021917,600,Theft,2017-05-05 16:00:00,2017-05-05,12915 On The Lake Rd,78732,Dock / Wharf / Freight / Modal Terminal,3054861,10091468,-97.92966896,30.32867736,6,May,2017,Friday,5
489911,20033031019,2716,Criminal Trespass,2003-10-30 15:19:00,2003-10-30,12915 On The Lake Rd,78732,Residence / Home,3054861,10091468,-97.92966896,30.32867736,6,October,2003,Thursday,30
1123346,20035010393,500,Burglary Of Residence,2003-08-28 12:00:00,2003-08-28,12915 On The Lake Rd,78732,Residence / Home,3054900,10091633,-97.92953407,30.32912863,6,August,2003,Thursday,28


Coordinates repeat across rows: multiple crimes happened at the same location over time.

We are interested in the crimes that can cause bodily harm. And since there are more than 2 million rows, let's filter for crimes that occurred at least 10,000 times.

In [102]:
# Number of incidents per offense description, one row per description.
# NOTE(review): the name `filter` shadows Python's built-in filter(); it is kept
# here because the cell that builds `crime` reads it under this name.
filter = (
    df.groupby(['Highest Offense Description'])
      .size()
      .reset_index(name='Count')
)
filter

Unnamed: 0,Highest Offense Description,Count
0,Abandoned Refrigerator,5
1,Abuse Of 911,135
2,Abuse Of Corpse,4
3,Abuse Of Official Capacity,23
4,Agg Aslt Enhanc Strangl/Suffoc,1023
...,...,...
365,Voco - Alcohol Consumption,9138
366,Voco Amplified Music/Vehicle,4756
367,Voco Solicitation Prohibit,6506
368,Warrant Arrest Non Traffic,40947


In [103]:
# Minimum number of incidents an offense type needs in order to be kept.
MIN_INCIDENTS = 10000

# Frequent offense types that are deliberately left out of the analysis.
excluded_offenses = [
    'Theft Of Bicycle',
    'Theft Of Service',
    'Viol City Ordinance - Other',
    # The cleaning step renames 'Viol City Ordinance - Other' to this label, so
    # the raw label alone never matches the cleaned data; exclude both.
    'Violation City Ordinance',
    'Warrant Arrest Non Traffic',
    'Theft By Shoplifting',
    'Dating Disturbance',
    'Debit/Credit Card Abuse',
    'Disturbance - Other',
    'Family Disturbance',
    'Forgery And Passing',
    'Fraud - Other',
    'Graffiti',
    'Custody Arrest Traffic Warr',
    'Identity Theft',
]

# Keep offense types that are frequent enough and not on the exclusion list.
is_frequent = filter.Count >= MIN_INCIDENTS
is_excluded = filter['Highest Offense Description'].isin(excluded_offenses)
crime = filter[is_frequent & ~is_excluded]

crime

Unnamed: 0,Highest Offense Description,Count
17,Aggravated Assault,25984
18,Aggravated Robbery,19313
31,Assault By Contact,46788
32,Assault By Threat,23570
37,Assault With Injury,134798
44,Auto Theft,53512
54,Burglary Non Residence,38762
56,Burglary Of Residence,81282
58,Burglary Of Vehicle,251464
85,Criminal Mischief,136269


These are the types of crimes that could cause bodily injury to the people visiting or staying in the neighborhood.