Data Exploration and Preprocessing

In [1]:
import pandas as pd

In [2]:
# Load the raw incident table, then show its dimensions and first three rows
file_path = "Resources/incidentTable.csv"
crime_df = pd.read_csv(filepath_or_buffer=file_path)
print(crime_df.shape)
crime_df.head(n=3)

(2189, 16)


Unnamed: 0,Number,dateReported,startDate,offenseCode,offenseDescription,streetAddress,cityDescription,stateDescription,zipCode,longitude,latitude,Booked,DAComplaint,Cited,burglaryFactor,felonyMisdemeanor
0,MP20000019,"Jan 1, 2020, 8:52:18 AM","Jan 1, 2020, 8:52:18 AM",10851 VC,10851 VC AUTO THEFT 07 Motor Vehicle Theft 44,920 PINE TREE LN,MODESTO,CALIFORNIA,95351.0,-121.0225978,37.61682379,0,0,0,,FELONY
1,MP20000040,"Jan 1, 2020, 2:11:00 PM","Jan 1, 2020, 11:25:00 AM",10851 VC,10851 VC AUTO THEFT 07 Motor Vehicle Theft 44,901 N CARPENTER RD,MO,,,-121.0309447,37.64774644,0,0,0,,FELONY
2,MP20000062,"Jan 1, 2020, 3:28:00 PM","Jan 1, 2020, 3:20:00 PM",10851 VC,10851 VC AUTO THEFT 07 Motor Vehicle Theft 44,1956 EL SERENO ST,MO,,95358.0,-121.0224092,37.60376174,0,0,0,,FELONY


In [3]:
# Keep the first occurrence of every fully identical row and discard the repeats
is_repeat_row = crime_df.duplicated()
crime_df = crime_df[~is_repeat_row]
print(crime_df.shape)

(2189, 16)


In [4]:
# Discard incidents with no offenseCode rather than guessing what the crime was
has_offense_code = crime_df["offenseCode"].notna()
crime_df = crime_df[has_offense_code]
print(crime_df.shape)

(2189, 16)


In [5]:
# Parse both timestamp columns from text into datetime values
for date_col in ("dateReported", "startDate"):
    crime_df[date_col] = pd.to_datetime(crime_df[date_col])
crime_df.head(3)

Unnamed: 0,Number,dateReported,startDate,offenseCode,offenseDescription,streetAddress,cityDescription,stateDescription,zipCode,longitude,latitude,Booked,DAComplaint,Cited,burglaryFactor,felonyMisdemeanor
0,MP20000019,2020-01-01 08:52:18,2020-01-01 08:52:18,10851 VC,10851 VC AUTO THEFT 07 Motor Vehicle Theft 44,920 PINE TREE LN,MODESTO,CALIFORNIA,95351.0,-121.0225978,37.61682379,0,0,0,,FELONY
1,MP20000040,2020-01-01 14:11:00,2020-01-01 11:25:00,10851 VC,10851 VC AUTO THEFT 07 Motor Vehicle Theft 44,901 N CARPENTER RD,MO,,,-121.0309447,37.64774644,0,0,0,,FELONY
2,MP20000062,2020-01-01 15:28:00,2020-01-01 15:20:00,10851 VC,10851 VC AUTO THEFT 07 Motor Vehicle Theft 44,1956 EL SERENO ST,MO,,95358.0,-121.0224092,37.60376174,0,0,0,,FELONY


In [6]:
# Number of incidents recorded under each offense code
crime_df.groupby('offenseCode')[['Number']].count()

Unnamed: 0_level_0,Number
offenseCode,Unnamed: 1_level_1
10851 VC,433
187 (A) PC,4
203 PC,1
211 PC,70
212.5 (A) PC,3
215 (A) PC,2
243 (D) PC,27
243 (E)(1) PC,222
243.3 PC,1
243.4 (A) PC,4


In [7]:
# Group By Week: tag every incident with the ISO week number of its report date.
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# dt.isocalendar().week is its replacement and yields the same ISO-8601 week numbers.
# isocalendar() returns a nullable UInt32 column, so cast to float64 to keep the
# 1.0, 2.0, ... values that the later group-bys and exported CSVs already use
# (missing report dates become NaN).
crime_df['weekNumber'] = (
    pd.to_datetime(crime_df['dateReported']).dt.isocalendar().week.astype('float64')
)
crime_df.head(3)

Unnamed: 0,Number,dateReported,startDate,offenseCode,offenseDescription,streetAddress,cityDescription,stateDescription,zipCode,longitude,latitude,Booked,DAComplaint,Cited,burglaryFactor,felonyMisdemeanor,weekNumber
0,MP20000019,2020-01-01 08:52:18,2020-01-01 08:52:18,10851 VC,10851 VC AUTO THEFT 07 Motor Vehicle Theft 44,920 PINE TREE LN,MODESTO,CALIFORNIA,95351.0,-121.0225978,37.61682379,0,0,0,,FELONY,1.0
1,MP20000040,2020-01-01 14:11:00,2020-01-01 11:25:00,10851 VC,10851 VC AUTO THEFT 07 Motor Vehicle Theft 44,901 N CARPENTER RD,MO,,,-121.0309447,37.64774644,0,0,0,,FELONY,1.0
2,MP20000062,2020-01-01 15:28:00,2020-01-01 15:20:00,10851 VC,10851 VC AUTO THEFT 07 Motor Vehicle Theft 44,1956 EL SERENO ST,MO,,95358.0,-121.0224092,37.60376174,0,0,0,,FELONY,1.0


In [8]:
# Drop 2019 data: keep only incidents whose startDate is on or after the cutoff
start_date = "01-01-2020"
on_or_after_cutoff = crime_df['startDate'] >= start_date
crime_df = crime_df.loc[on_or_after_cutoff]
print(crime_df.shape)

(2138, 17)


In [9]:
# Weekly incident totals across all offense types, used as linear-regression input
crime_countweek_df = crime_df.groupby('weekNumber')[['Number']].count()
crime_countweek_df.head(3)

Unnamed: 0_level_0,Number
weekNumber,Unnamed: 1_level_1
1.0,80
2.0,91
3.0,123


In [10]:
# Domestic-violence subset: incidents coded 273.5 (A) PC, 273.5 (F)(1) PC or 243 (E)(1) PC
dv_offense_codes = ["273.5 (A) PC", "273.5 (F)(1) PC", "243 (E)(1) PC"]
crime_dv_df = crime_df.loc[crime_df['offenseCode'].isin(dv_offense_codes)]
print(crime_dv_df.shape)

(395, 17)


In [11]:
# Weekly totals of the domestic-violence subset, used as linear-regression input
crime_countweek_dv_df = crime_dv_df.groupby('weekNumber')[['Number']].count()
crime_countweek_dv_df.head(3)

Unnamed: 0_level_0,Number
weekNumber,Unnamed: 1_level_1
1.0,16
2.0,17
3.0,17


In [12]:
# Save the all-crime weekly counts for the linear regression analysis;
# the week number is written out as the index column
crime_count_output = "Resources/allCrime_count_wk.csv"
crime_countweek_df.to_csv(path_or_buf=crime_count_output, index=True)

In [13]:
# Save the domestic-violence weekly counts, week number included as the index column
crime_dv_output = "Resources/dvCrime_count_wk.csv"
crime_countweek_dv_df.to_csv(path_or_buf=crime_dv_output, index=True)

Create tables for Felony / Misdemeanor regression

In [14]:
# Felony-only incidents
is_felony = crime_df['felonyMisdemeanor'] == "FELONY"
felony_df = crime_df.loc[is_felony]

# Weekly felony totals, used as linear-regression input
fel_weekCount_df = felony_df.groupby('weekNumber')[['Number']].count()
fel_weekCount_df.head(3)

Unnamed: 0_level_0,Number
weekNumber,Unnamed: 1_level_1
1.0,69
2.0,74
3.0,98


In [15]:
# Save the weekly felony counts, week number included as the index column
fel_output = "Resources/fel_count_wk.csv"
fel_weekCount_df.to_csv(path_or_buf=fel_output, index=True)

In [16]:
# Misdemeanor-only incidents
is_misdemeanor = crime_df['felonyMisdemeanor'] == "MISDEMEANOR"
misd_df = crime_df.loc[is_misdemeanor]

# Weekly misdemeanor totals, used as linear-regression input
misd_weekCount_df = misd_df.groupby('weekNumber')[['Number']].count()
misd_weekCount_df.head(3)

Unnamed: 0_level_0,Number
weekNumber,Unnamed: 1_level_1
1.0,11
2.0,17
3.0,25


In [17]:
# Save the weekly misdemeanor counts, week number included as the index column
misd_output = "Resources/misd_count_wk.csv"
misd_weekCount_df.to_csv(path_or_buf=misd_output, index=True)

Create a table for all aggravated assaults including Domestic Violence

In [18]:
# Select incidents whose offense code begins with "24" or "273".
# The pattern is anchored at the start of the code (str.match) on purpose: an
# unanchored substring search for "24" or "273" also picks up unrelated codes that
# merely contain those digits (a code such as "22450 ..." contains "24"), which
# would inflate the assault counts. na=False treats a missing code as no match.
agg_assault_df = crime_df[crime_df['offenseCode'].str.match(r"(?:24|273)", na=False)]
agg_assault_df.groupby('offenseCode').count()[['Number']]

Unnamed: 0_level_0,Number
offenseCode,Unnamed: 1_level_1
243 (D) PC,27
243 (E)(1) PC,220
243.3 PC,1
243.4 (A) PC,3
243.4 (D) PC,1
244 PC,1
245 (A)(1) PC,83
245 (A)(2) PC,17
245 (A)(4) PC,60
273.5 (A) PC,174


In [19]:
# Weekly totals of the assault / domestic-violence subset
agg_assault_wk_df = agg_assault_df.groupby('weekNumber')[['Number']].count()

# Save to CSV, week number included as the index column
agg_assault_output = "Resources/aggAssault_count_wk.csv"
agg_assault_wk_df.to_csv(path_or_buf=agg_assault_output, index=True)

Create Table for Robbery

In [20]:
# Robbery subset: incidents coded 211 PC, 212.5 (A) PC or 215 (A) PC
robbery_offense_codes = ["211 PC", "212.5 (A) PC", "215 (A) PC"]
crime_rob_df = crime_df.loc[crime_df['offenseCode'].isin(robbery_offense_codes)]
print(crime_rob_df.shape)

(75, 17)


In [23]:
# Group By Week Number and Print to CSV
# This cell rebinds crime_rob_df from incident rows to weekly counts. The guard
# makes it safe to re-run: after the first run 'weekNumber' is the index rather
# than a column, and grouping by it again would silently turn every count into 1
# and overwrite the CSV with those values.
if 'weekNumber' in crime_rob_df.columns:
    crime_rob_df = crime_rob_df.groupby('weekNumber').count()[['Number']]

rob_output = "Resources/rob_count_wk.csv"
crime_rob_df.to_csv(rob_output, index=True)

In [24]:
# Preview the first three rows of the weekly robbery counts
crime_rob_df.head(n=3)

Unnamed: 0_level_0,Number
weekNumber,Unnamed: 1_level_1
1.0,2
2.0,1
3.0,7
