### Crime in the time of Corona - Create DB from raw CSV tables

In [None]:
## Add dependencies: Pandas
import pandas as pd
import os # needed to use the os.path.join method to load the files
from sqlalchemy import create_engine # for integrating with PostgreSQL
from config import db_password

#### Incidents: Load raw csv, create dataframe and clean data.

In [None]:
# Load the CSV Files into a dataframes.
incidents_raw_df = pd.read_csv("../Resources - MPD Data/incidentTableClean.csv")

In [None]:
# Review the incidents table fields and counts of each field.
incidents_raw_df.info()
# 1788 records. Removed MP20009253 from CSV due to very incomplete record. 

In [None]:
# Review Number field for uniqueness. Its our primary field
incidents_raw_df.Number.value_counts(sort=True, dropna=True, ascending=False, bins=None)
# Results show 1660 unique values meaning 127 are possible duplicates.
# Looks like Case Number duplication is valid due to multiple offenses under same case.
# Need to create a primary unique field. 

In [None]:
# Create a primary unique field: Number plus offenseCode. 
# Call it CID for Case Id
incidents_raw_df["CID"] = incidents_raw_df["Number"] + ' - ' + incidents_raw_df["offenseCode"] 
incidents_raw_df.head(3)

In [None]:
# Check uniqueness of new combined field
incidents_raw_df.CID.value_counts(sort=True, dropna=True, ascending=False, bins=None) 
# Visual review of remaining double entries shows they are in fact duplicates.

In [None]:
# Drop the Duplicate Case IDs. Then check counts again.
incidents_raw_df.drop_duplicates(subset ="CID", keep = False, inplace = True)
incidents_raw_df.CID.value_counts(sort=True, ascending=False) 

In [None]:
# Set index to CID.  If it works, we have unique values for our CID field. woohoo!
incidents_raw_df.set_index('CID',inplace=True)
incidents_raw_df.head(3)

In [None]:
# How much data is left in our dataframe?
incidents_raw_df.info()
# Looks like 1778 rows, so we eliminated 10 records.  Lots of work for a unique primary key, but necessary.

In [None]:
## Ok. now we review and clean remaining fields in Incidents table. 
#  Start with the date field. Create a new date field stripped to date without time.
incidents_raw_df["dateIncident"] = pd.to_datetime(incidents_raw_df["dateReported"]).apply(lambda x: x.date())
incidents_raw_df.head(3)

In [None]:
## Create a new week field so we can do stats of crime types over time by week.
incidents_raw_df['weekNumber'] = pd.to_datetime(incidents_raw_df['dateReported']).dt.week

# This works, but since our data starts in 2019, we have five weeks with numbers over 40. Starting with Week 47 with blank week.
incidents_raw_df.weekNumber.value_counts(sort=True, dropna=True, ascending=False, bins=None)

In [None]:
## How to fix to make weekNumber column useful for linear regression analysis?
#  Add six to each week number so that the first six slots in the week order can be allocated to 2019
incidents_raw_df['weekNumber']= incidents_raw_df['weekNumber'] + 6

In [None]:
# Now replace the high values for 2019 weeks with proper lower numbers.
incidents_raw_df["weekNumber"].replace({53:1,54:2,55:3,56:4,57:5,58:6}, inplace=True)

In [None]:
# How does the weekNumber field tally look now? Sweet!
incidents_raw_df.weekNumber.value_counts(sort=True, ascending=False, bins=None)

#### Suspects: Load raw csv, create dataframe and clean data.

In [60]:
# Load the suspects csv into a dataframe.
suspects_raw_df = pd.read_csv("../Resources - MPD Data/suspectTable.csv")
suspects_raw_df.head(3)

Unnamed: 0,caseID,personID,Age,Sex
0,MP20010240,3877,16,MALE
1,MP20007353,2309,54,FEMALE
2,MP20010311,3570,23,MALE


In [None]:
# Review the suspects table structure
suspects_raw_df.info()

#### Victims: Load raw csv, create dataframe and clean data.

In [None]:
# Load the victims csv into a dataframe.
victims_raw_df = pd.read_csv("../Resources - MPD Data/victimTable.csv")

In [None]:
# Review the suspects table structure
victims_raw_df.info()

#### Penal Codes: Load raw csv, create dataframe and clean data.

In [None]:
## Now we look at our offense codes. 
incidents_raw_df.offenseCode.value_counts(sort=True, ascending=False, )
# Need a subject matter expert. This could be ok. Or not.

In [None]:
# Create the Penal Codes dataframe
#penal_codes_df = incidents_raw_df[["Penal_Code","Penal_Desc"]].copy()
penal_codes_df = incidents_raw_df[["offenseCode","offenseDescription"]].copy()
penal_codes_df.head(3)

In [None]:
# Rename the columns
penal_codes_df.rename(columns={"offenseCode": "Penal_Code", "offenseDescription": "Penal_Desc"}, inplace=True)
penal_codes_df.head()

In [None]:
# quick view
penal_codes_df.Penal_Code.value_counts(sort=True, ascending=False)

In [None]:
# Drop Duplicates
penal_codes_df.drop_duplicates(subset ="Penal_Code", keep = "first", inplace = True)
penal_codes_df.Penal_Code.value_counts()

In [None]:
#Reset Indesx
penal_codes_df.set_index('Penal_Code', inplace=True)

In [None]:
penal_codes_df.head()

### Export Files to CSV

In [None]:
# Crimes to CSV
cleaned_incidents = "../Resources/CleanedData/Cleaned_Incidents.csv"
incidents_raw_df.to_csv(cleaned_incidents, index=False)

In [61]:
# Suspects to CSV
output_suspects = "../Resources/CleanedData/Cleaned_Suspects.csv"
suspects_raw_df.to_csv(output_suspects, index=False)

In [62]:
# Victims to CSV
output_victims = "../Resources/CleanedData/Cleaned_Victims.csv"
victims_raw_df.to_csv(output_victims, index=False)

In [63]:
# Penal Codes to CSV
output_penal_codes = "../Resources/CleanedData/Penal_Codes.csv"
penal_codes_df.to_csv(output_penal_codes, index=False)