Data Exploration and Preprocessing

In [1]:
import pandas as pd

In [2]:
# Load the cleaned incident table from the Resources folder into a DataFrame
file_path = "Resources/incidentTableClean.csv"
crime_df = pd.read_csv(filepath_or_buffer=file_path)
# Report (rows, columns), then preview the first three records
print(crime_df.shape)
crime_df.head(n=3)

(1778, 16)


Unnamed: 0,Number,dateReported,startDate,offenseCode,offenseDescription,streetAddress,cityDescription,stateDescription,zipCode,longitude,latitude,Booked,DAComplaint,Cited,burglaryFactor,felonyMisdemeanor
0,MP19037568,"Dec 31, 2019, 9:36:26 AM","Dec 31, 2019, 9:30:00 AM",10851 VC,10851 VC AUTO THEFT 07 Motor Vehicle Theft 44,2129 CLAYMONT DR,MODESTO,CALIFORNIA,95350,-121.033225,37.687181,1,0,0,,FELONY
1,MP19037545,"Dec 31, 2019, 2:50:00 AM","Dec 31, 2019, 2:50:00 AM",10851 VC,10851 VC AUTO THEFT 07 Motor Vehicle Theft 44,1120 LILLIAN DR,MO,,95355,-120.931861,37.658732,0,0,0,,FELONY
2,MP19037559,"Dec 31, 2019, 8:50:00 AM","Dec 30, 2019, 8:00:00 PM",10851 VC,10851 VC AUTO THEFT 07 Motor Vehicle Theft 44,2516 STANDIFORD AVE,MO,,95350,-121.042052,37.685322,0,0,0,,FELONY


In [3]:
# De-duplicate: rows identical across every column are collapsed to their
# first occurrence; the shape is printed so the number removed is visible.
crime_df = crime_df.drop_duplicates(keep="first")
print(crime_df.shape)

(1775, 16)


In [4]:
# Discard incidents that have no offenseCode -- no attempt is made to infer
# what the missing offense might have been.
crime_df = crime_df.dropna(axis="index", subset=["offenseCode"])
print(crime_df.shape)

(1775, 16)


In [5]:
# Parse both timestamp columns from text into datetime values
for date_col in ("dateReported", "startDate"):
    crime_df[date_col] = pd.to_datetime(crime_df[date_col])
crime_df.head(3)

Unnamed: 0,Number,dateReported,startDate,offenseCode,offenseDescription,streetAddress,cityDescription,stateDescription,zipCode,longitude,latitude,Booked,DAComplaint,Cited,burglaryFactor,felonyMisdemeanor
0,MP19037568,2019-12-31 09:36:26,2019-12-31 09:30:00,10851 VC,10851 VC AUTO THEFT 07 Motor Vehicle Theft 44,2129 CLAYMONT DR,MODESTO,CALIFORNIA,95350,-121.033225,37.687181,1,0,0,,FELONY
1,MP19037545,2019-12-31 02:50:00,2019-12-31 02:50:00,10851 VC,10851 VC AUTO THEFT 07 Motor Vehicle Theft 44,1120 LILLIAN DR,MO,,95355,-120.931861,37.658732,0,0,0,,FELONY
2,MP19037559,2019-12-31 08:50:00,2019-12-30 20:00:00,10851 VC,10851 VC AUTO THEFT 07 Motor Vehicle Theft 44,2516 STANDIFORD AVE,MO,,95350,-121.042052,37.685322,0,0,0,,FELONY


In [6]:
# Number of incidents per offense code (non-null `Number` values in each group)
crime_df.groupby('offenseCode')[['Number']].count()

Unnamed: 0_level_0,Number
offenseCode,Unnamed: 1_level_1
10851 VC,376
187 (A) PC,4
203 PC,1
211 PC,57
212.5 (A) PC,3
215 (A) PC,1
243 (D) PC,19
243.4 (A) PC,3
243.4 (D) PC,1
244 PC,1


In [7]:
# Tag each incident with the ISO-8601 week number of the date it was reported.
# `dateReported` was already converted to datetime in an earlier cell, so it
# does not need to be parsed again here.
# `Series.dt.week` was deprecated in pandas 1.1 and removed in pandas 2.0;
# `Series.dt.isocalendar().week` is the supported replacement. It yields the
# same week values, but as a nullable UInt32 column.
# NOTE: ISO weeks straddle year boundaries -- e.g. Dec 31, 2019 is assigned to
# week 1 -- so weekNumber on its own does not identify the calendar year.
crime_df['weekNumber'] = crime_df['dateReported'].dt.isocalendar().week
crime_df.head(3)

Unnamed: 0,Number,dateReported,startDate,offenseCode,offenseDescription,streetAddress,cityDescription,stateDescription,zipCode,longitude,latitude,Booked,DAComplaint,Cited,burglaryFactor,felonyMisdemeanor,weekNumber
0,MP19037568,2019-12-31 09:36:26,2019-12-31 09:30:00,10851 VC,10851 VC AUTO THEFT 07 Motor Vehicle Theft 44,2129 CLAYMONT DR,MODESTO,CALIFORNIA,95350,-121.033225,37.687181,1,0,0,,FELONY,1
1,MP19037545,2019-12-31 02:50:00,2019-12-31 02:50:00,10851 VC,10851 VC AUTO THEFT 07 Motor Vehicle Theft 44,1120 LILLIAN DR,MO,,95355,-120.931861,37.658732,0,0,0,,FELONY,1
2,MP19037559,2019-12-31 08:50:00,2019-12-30 20:00:00,10851 VC,10851 VC AUTO THEFT 07 Motor Vehicle Theft 44,2516 STANDIFORD AVE,MO,,95350,-121.042052,37.685322,0,0,0,,FELONY,1


In [8]:
# Keep only incidents whose startDate is on or after the cutoff below,
# which removes the 2019 records.
start_date = "01-01-2020"
on_or_after_start = crime_df['startDate'] >= start_date
crime_df = crime_df.loc[on_or_after_start]
print(crime_df.shape)

(1689, 17)


In [9]:
# Weekly incident totals (all offenses) -- input for the linear regression
crime_countweek_df = crime_df.groupby('weekNumber')[['Number']].count()
crime_countweek_df.head(3)

Unnamed: 0_level_0,Number
weekNumber,Unnamed: 1_level_1
1,79
2,85
3,115


In [10]:
# Domestic violence is charged under 273.5 PC; keep only the rows whose
# offenseCode is one of the two 273.5 variants listed here.
crime_dv_df = crime_df.loc[crime_df['offenseCode'].isin(["273.5 (A) PC", "273.5 (F)(1) PC"])]
print(crime_dv_df.shape)

(150, 17)


In [11]:
# Weekly domestic-violence incident totals -- input for the linear regression
crime_countweek_dv_df = crime_dv_df.groupby('weekNumber')[['Number']].count()
crime_countweek_dv_df.head(3)

Unnamed: 0_level_0,Number
weekNumber,Unnamed: 1_level_1
1,8
2,9
3,6


In [12]:
# Persist the all-crime weekly counts for the linear regression analysis;
# the weekNumber index is written out as the first CSV column.
crime_count_output = "Resources/allCrime_count_wk.csv"
crime_countweek_df.to_csv(path_or_buf=crime_count_output, index=True)

In [13]:
# Persist the domestic-violence weekly counts (weekNumber index included)
crime_dv_output = "Resources/dvCrime_count_wk.csv"
crime_countweek_dv_df.to_csv(path_or_buf=crime_dv_output, index=True)

Create tables for Felony / Misdemeanor regression

In [16]:
# Subset of incidents classified as felonies
felony_df = crime_df.loc[crime_df['felonyMisdemeanor'].eq("FELONY")]

# Weekly felony totals -- input for the linear regression
fel_weekCount_df = felony_df.groupby('weekNumber')[['Number']].count()
fel_weekCount_df.head(3)

Unnamed: 0_level_0,Number
weekNumber,Unnamed: 1_level_1
1,75
2,76
3,100


In [17]:
# Save the weekly felony counts to CSV (weekNumber index included)
fel_output = "Resources/fel_count_wk.csv"
fel_weekCount_df.to_csv(path_or_buf=fel_output, index=True)

In [18]:
# Subset of incidents classified as misdemeanors
misd_df = crime_df.loc[crime_df['felonyMisdemeanor'].eq("MISDEMEANOR")]

# Weekly misdemeanor totals -- input for the linear regression
misd_weekCount_df = misd_df.groupby('weekNumber')[['Number']].count()
misd_weekCount_df.head(3)

Unnamed: 0_level_0,Number
weekNumber,Unnamed: 1_level_1
1,4
2,9
3,15


In [19]:
# Save the weekly misdemeanor counts to CSV (weekNumber index included)
misd_output = "Resources/misd_count_wk.csv"
misd_weekCount_df.to_csv(path_or_buf=misd_output, index=True)