Data Exploration and Preprocessing

In [None]:
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from sqlalchemy import create_engine # for integrating with PostgreSQL

In [None]:
# Load the incident table from the project's Resources folder, then show
# its dimensions and the first three rows as a quick sanity check.
file_path = "Resources/incidentTable.csv"
crime_df = pd.read_csv(filepath_or_buffer=file_path)
print(crime_df.shape)
crime_df.head(n=3)

In [None]:
# Discard rows that are exact duplicates of an earlier row (the first
# occurrence is kept); the printed shape shows how many rows remain.
crime_df = crime_df.drop_duplicates(keep="first")
print(crime_df.shape)

In [None]:
# Rows without an offenseCode are dropped rather than imputed - we won't
# try to guess what these crimes might be.
crime_df = crime_df.dropna(axis="index", how="any", subset=["offenseCode"])
print(crime_df.shape)

In [None]:
# Parse both date columns into pandas datetimes so they can be compared
# and decomposed (e.g. into week numbers) later on.
crime_df = crime_df.assign(
    dateReported=pd.to_datetime(crime_df["dateReported"]),
    startDate=pd.to_datetime(crime_df["startDate"]),
)
crime_df.head(3)

Write to Database

In [None]:
# Build the PostgreSQL connection URL.
#
# Connection settings come from environment variables so that no password is
# stored in the notebook. Every setting except the password falls back to the
# local defaults this notebook has always used:
#   CRIME_DB_USER (postgres), CRIME_DB_HOST (127.0.0.1),
#   CRIME_DB_PORT (5432),     CRIME_DB_NAME (crime_corona),
#   CRIME_DB_PASSWORD (no default).
# When CRIME_DB_PASSWORD is unset the password is left out of the URL, so the
# database driver has to obtain it some other way (e.g. PGPASSWORD / ~/.pgpass
# when the driver is libpq-based).
db_user = os.environ.get("CRIME_DB_USER", "postgres")
db_password = os.environ.get("CRIME_DB_PASSWORD", "")
db_host = os.environ.get("CRIME_DB_HOST", "127.0.0.1")
db_port = os.environ.get("CRIME_DB_PORT", "5432")
db_name = os.environ.get("CRIME_DB_NAME", "crime_corona")

# User and password are percent-encoded so characters such as "@", ":" or "/"
# cannot corrupt the URL.
db_credentials = quote_plus(db_user)
if db_password:
    db_credentials += ":" + quote_plus(db_password)

# "postgresql://" is the dialect name SQLAlchemy expects; the legacy
# "postgres://" alias is rejected by SQLAlchemy 1.4 and later.
db_string = f"postgresql://{db_credentials}@{db_host}:{db_port}/{db_name}"

In [None]:
# Create the SQLAlchemy engine from the connection URL held in db_string;
# the engine is what pandas uses below to write to the database.
engine = create_engine(db_string)

In [None]:
# Write the cleaned incidents to the "crimes" table.
# if_exists="replace" drops and recreates the table on each run; with the
# default ("fail") this cell raises as soon as the table already exists,
# which breaks Restart Kernel -> Run All.
crime_df.to_sql(name='crimes', con=engine, if_exists='replace')

In [None]:
# Number of incidents per offense code (counts non-null 'Number' values
# within each offenseCode group).
crime_df.groupby(by='offenseCode')[['Number']].count()

In [None]:
# Tag every incident with the ISO week number of the date it was reported.
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# dt.isocalendar().week yields the same ISO-8601 week values.
# Note: under ISO numbering, dates in the first or last days of a year can
# belong to week 52/53 or week 1 of the neighbouring year.
crime_df['weekNumber'] = pd.to_datetime(crime_df['dateReported']).dt.isocalendar().week
crime_df.head(3)

In [None]:
# Filtering out 2019 data: keep only incidents whose startDate is on or after
# 1 Jan 2020. Rows with a missing startDate fail the comparison and are
# dropped as well.
# The cutoff is written in ISO format (YYYY-MM-DD) so it cannot be misread as
# day-first or month-first.
start_date = "2020-01-01"
# .copy() makes the result an independent frame; later cells add columns to
# crime_df, which would otherwise raise SettingWithCopyWarning on this slice.
crime_df = crime_df.loc[crime_df['startDate'] >= start_date].copy()
print(crime_df.shape)

In [None]:
# Weekly incident counts across all offenses - input for the linear regression.
crime_countweek_df = crime_df.groupby(by='weekNumber')[['Number']].count()
crime_countweek_df.head(3)

In [None]:
# Domestic violence subset: the two 273.5 PC variants plus 243 (E)(1) PC.
dv_offense_codes = ["273.5 (A) PC", "273.5 (F)(1) PC", "243 (E)(1) PC"]
crime_dv_df = crime_df.loc[crime_df['offenseCode'].isin(dv_offense_codes)]
print(crime_dv_df.shape)

In [None]:
# Weekly counts of domestic violence incidents - input for the linear regression.
crime_countweek_dv_df = crime_dv_df.groupby(by='weekNumber')[['Number']].count()
crime_countweek_dv_df.head(3)

In [None]:
# Save the all-crime weekly counts (weekNumber is written as the index
# column) for the linear regression analysis.
crime_count_output = "Resources/allCrime_count_wk.csv"
crime_countweek_df.to_csv(path_or_buf=crime_count_output)

In [None]:
# Save the domestic violence weekly counts (weekNumber is written as the
# index column).
crime_dv_output = "Resources/dvCrime_count_wk.csv"
crime_countweek_dv_df.to_csv(path_or_buf=crime_dv_output)

Create tables for Felony / Misdemeanor regression

In [None]:
# Keep only the rows classified as FELONY ...
is_felony = crime_df['felonyMisdemeanor'].eq("FELONY")
felony_df = crime_df.loc[is_felony]

# ... and count them per week for the linear regression.
fel_weekCount_df = felony_df.groupby(by='weekNumber')[['Number']].count()
fel_weekCount_df.head(3)

In [None]:
# Save the weekly felony counts (weekNumber is written as the index column).
fel_output = "Resources/fel_count_wk.csv"
fel_weekCount_df.to_csv(path_or_buf=fel_output)

In [None]:
# Keep only the rows classified as MISDEMEANOR ...
is_misdemeanor = crime_df['felonyMisdemeanor'].eq("MISDEMEANOR")
misd_df = crime_df.loc[is_misdemeanor]

# ... and count them per week for the linear regression.
misd_weekCount_df = misd_df.groupby(by='weekNumber')[['Number']].count()
misd_weekCount_df.head(3)

In [None]:
# Save the weekly misdemeanor counts (weekNumber is written as the index column).
misd_output = "Resources/misd_count_wk.csv"
misd_weekCount_df.to_csv(path_or_buf=misd_output)

Create a table for all aggravated assaults including Domestic Violence

In [None]:
# Select the assault-family offenses: codes whose section number is 240-249
# or begins with 273 (which covers the 273.5 domestic violence codes).
# The pattern is anchored at the start of the code (str.match) and requires
# exactly three digits for the 24x case. The previous unanchored
# str.contains("24") also picked up unrelated codes that merely contain "24"
# somewhere, e.g. a five-digit section such as "22450 ...".
# NOTE(review): assumes offenseCode always starts with the section number, as
# in "243 (E)(1) PC" - confirm against the per-code table displayed below.
agg_assault_pattern = r"(?:24\d(?!\d)|273)"
agg_assault_df = crime_df[crime_df['offenseCode'].str.match(agg_assault_pattern, na=False)]
agg_assault_df.groupby('offenseCode').count()[['Number']]

In [None]:
# Weekly counts for the assault subset, saved with weekNumber as the index column.
agg_assault_output = "Resources/aggAssault_count_wk.csv"
agg_assault_wk_df = agg_assault_df.groupby(by='weekNumber')[['Number']].count()
agg_assault_wk_df.to_csv(path_or_buf=agg_assault_output)

Create Table for Robbery

In [None]:
# Robbery subset: 211 PC, 212.5 (A) PC and 215 (A) PC.
robbery_offense_codes = ["211 PC", "212.5 (A) PC", "215 (A) PC"]
crime_rob_df = crime_df.loc[crime_df['offenseCode'].isin(robbery_offense_codes)]
print(crime_rob_df.shape)

In [None]:
# Count robberies per week and save the result (weekNumber as the index column).
# NOTE: crime_rob_df is rebound here from the incident-level rows to the
# per-week counts.
rob_output = "Resources/rob_count_wk.csv"
crime_rob_df = crime_rob_df.groupby(by='weekNumber')[['Number']].count()
crime_rob_df.to_csv(path_or_buf=rob_output)

In [None]:
# Preview the first three rows of crime_rob_df (by this point it holds the
# per-week robbery counts, not the incident-level rows).
crime_rob_df.head(3)

Create a CSV with an Arrest Column

In [None]:
# Preview the first three rows of crime_df before the label columns are added.
crime_df.head(3)

In [None]:
# Add a text label next to each disposition flag: a value equal to 1 gets the
# positive label, anything else (including missing values) gets the negative one.
crime_df["booked_alpha"] = np.where(crime_df["Booked"] == 1, "Booked", "No Booking")
crime_df["DAComp_alpha"] = np.where(crime_df["DAComplaint"] == 1, "DA Complaint", "No DA Complaint")
crime_df["cite_alpha"] = np.where(crime_df["Cited"] == 1, "Citation", "No Citation")
crime_df.head(3)

In [None]:
# The flag columns are now redundant with their *_alpha label columns.
# NOTE: re-running this cell on its own raises KeyError because the columns
# are already gone.
crime_df = crime_df.drop(["Booked", "DAComplaint", "Cited"], axis="columns")

In [None]:
# Save the incident-level table with disposition labels for the dashboard
# files folder; the DataFrame index is written as the first column.
dispo_output = "../Resources - MPD Data/Dashboard Files/MPDIncident_Dispo.csv"
crime_df.to_csv(path_or_buf=dispo_output)