### Crime in the time of Corona - Create DB from raw CSV tables

In [None]:
## Add dependencies: Pandas
import pandas as pd
import os # needed to use the os.path.join method to load the files
from sqlalchemy import create_engine # for integrating with PostgreSQL
from config import db_password

#### Incidents: Load raw csv, create dataframe and clean data.

In [None]:
# Read the cleaned incidents CSV into a dataframe.
incidents_csv = "../Resources - MPD Data/incidentTableClean.csv"
incidents_raw_df = pd.read_csv(incidents_csv)

In [None]:
# Print the incidents schema: column names, dtypes and non-null counts per column.
incidents_raw_df.info()
# Note from the original run: 2188 records. Record MP20009253 was removed from
# the CSV by hand because it was very incomplete.

In [None]:
# Tally how often each case Number occurs. Number is our candidate primary field.
# value_counts() already sorts descending and ignores NaN by default.
incidents_raw_df["Number"].value_counts()
# Recorded on the original run: 1660 unique values, meaning 127 are possible duplicates.
# Case Number duplication looks valid: one case can carry multiple offenses.
# A separate unique primary field is therefore needed.



In [None]:
# Spot-check the rows recorded under a single case number.
x = incidents_raw_df.loc[incidents_raw_df["Number"].eq('MP2001024070')]
x.head()
    

In [None]:
# Build the unique primary field CID (Case Id) as "<Number> - <offenseCode>".
incidents_raw_df = incidents_raw_df.assign(
    CID=lambda frame: frame["Number"] + ' - ' + frame["offenseCode"]
)
incidents_raw_df.head(3)

In [None]:
# Verify how unique the combined CID field is (defaults: sorted descending, NaN skipped).
incidents_raw_df["CID"].value_counts()
# Note from the original run: a visual review of the remaining double entries
# showed they are in fact duplicates.

In [None]:
# Drop the duplicate Case IDs, keeping one row per CID, then check counts again.
# keep="first" retains the first occurrence of each CID. The previous
# keep=False discarded *every* copy of a duplicated CID, so those cases
# disappeared from the dataset instead of being de-duplicated.
incidents_raw_df.drop_duplicates(subset="CID", keep="first", inplace=True)
incidents_raw_df.CID.value_counts(sort=True, ascending=False)

In [None]:
# Reorder columns. reindex keeps only the columns listed here, in this order.
neworder = [
    'CID',
    'Number',
    'dateReported',
    'startDate',
    'offenseCode',
    'offenseDescription',
    'streetAddress',
    'cityDescription',
    'stateDescription',
    'zipCode',
    'longitude',
    'latitude',
    'Booked',
    'DAComplaint',
    'Cited',
    'burglaryFactor',
    'felonyMisdemeanor',
    'dateIncident',
    'weekNumber',
]
incidents_raw_df = incidents_raw_df.reindex(columns=neworder)
incidents_raw_df.head(3)

In [None]:
# How much data is left in our dataframe after de-duplication and reordering?
incidents_raw_df.info()
# Figure recorded on an earlier run: 2186 rows.

In [None]:
## Review and clean the remaining fields in the Incidents table.
#  Start with the date: derive dateIncident from dateReported, keeping only
#  the calendar date (time of day stripped).
reported_at = pd.to_datetime(incidents_raw_df["dateReported"])
incidents_raw_df["dateIncident"] = reported_at.dt.date
incidents_raw_df.head(3)

In [None]:
## Create a week field so we can do stats of crime types over time by week.
# Series.dt.week was deprecated in pandas 1.1 and removed in pandas 2.0;
# dt.isocalendar().week returns the same ISO-8601 week number.
incidents_raw_df['weekNumber'] = pd.to_datetime(incidents_raw_df['dateReported']).dt.isocalendar().week

# Note from the original run: since the data starts in 2019, five weeks have
# numbers over 40, starting with week 47 (with a blank week).
incidents_raw_df.weekNumber.value_counts(sort=True, dropna=True, ascending=False, bins=None)

In [None]:
## Make weekNumber usable for linear regression analysis:
#  shift every week number up by six so the first six slots in the week
#  order can be allocated to the 2019 weeks.
incidents_raw_df['weekNumber'] += 6

In [None]:
# Replace the shifted 2019 week values (53-58) with the low slots 1-6.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on df["col"] is a chained in-place update, which pandas deprecates and which
# leaves the dataframe unchanged when Copy-on-Write is enabled.
incidents_raw_df["weekNumber"] = incidents_raw_df["weekNumber"].replace(
    {53: 1, 54: 2, 55: 3, 56: 4, 57: 5, 58: 6}
)

In [None]:
# Re-tally weekNumber after the shift and remap (defaults sort descending).
incidents_raw_df["weekNumber"].value_counts()

In [None]:
## Final view of the cleaned incidents before export.
# The index is not reset here; the exports further down pass index=False.
incidents_raw_df.head()

In [None]:
# Review the final incidents schema: columns, dtypes and non-null counts.
incidents_raw_df.info()

In [None]:
### Validate below.

In [None]:
## Build the case lookup table (CID plus case Number) for the RDB.
case_list_df = incidents_raw_df.loc[:, ["CID", "Number"]].copy()
# Print the schema and row count of the new table.
case_list_df.info()

#### People - Suspects and Victims: Load raw CSV, create dataframes and clean data.

In [None]:
# Read the involvement (suspects/victims) CSV into a dataframe and show its schema.
involvement_csv = "../Resources - MPD Data/involvementTable.csv"
involvement_raw_df = pd.read_csv(involvement_csv)
involvement_raw_df.info()

In [None]:
# Drop rows where age or sex is missing. how="any" is dropna's default, so a
# row is removed when either field is NULL, not only when both are.
involvement_raw_df = involvement_raw_df.dropna(subset=['age', 'sex'], how="any")
involvement_raw_df.info()

In [None]:
# Remove fully duplicated rows. drop_duplicates() returns a new dataframe,
# so the result must be assigned back; the bare call discarded it and left
# involvement_raw_df unchanged.
involvement_raw_df = involvement_raw_df.drop_duplicates()
involvement_raw_df.info()
involvement_raw_df.head()

In [None]:
# Keep only the core fields needed for the RDB link table.
core_fields = ['Number', 'involvementType', 'personID']
involvement_core_df = involvement_raw_df.loc[:, core_fields].copy()
involvement_core_df.info()

In [None]:
# Remove repeated (Number, involvementType, personID) rows, keeping the first of each.
involvement_core_df = involvement_core_df.drop_duplicates(keep="first")
involvement_core_df.info()

In [None]:
# Build the people table (personID, age, sex) needed for the RDB.
people_df = involvement_raw_df.loc[:, ["personID", "age", "sex"]].copy()
people_df.info()

In [None]:
# Keep one row per personID (the first one encountered).
people_df = people_df.drop_duplicates(subset=["personID"])
people_df.info()

In [None]:
# Review the de-duplicated people table: peek at the first rows, then print its schema.
people_df.head()
people_df.info()


#### Penal Codes: Create dataframe and clean data.

In [None]:
## Tally the offense codes present in the incidents (defaults sort descending).
incidents_raw_df["offenseCode"].value_counts()
# Needs a subject matter expert to confirm whether these codes are OK as they stand.

In [None]:
# Build the Penal Codes dataframe from the offense code and description columns.
penal_codes_df = incidents_raw_df.loc[:, ["offenseCode", "offenseDescription"]].copy()
penal_codes_df.head(3)

In [None]:
# Rename the columns to the names used by the penal codes table.
penal_codes_df = penal_codes_df.rename(
    columns={
        "offenseCode": "Penal_Code",
        "offenseDescription": "Penal_Desc",
    }
)
penal_codes_df.head()

In [None]:
# Quick view: occurrences of each penal code, most frequent first.
penal_codes_df["Penal_Code"].value_counts()

In [None]:
# Keep a single row per Penal_Code (the first occurrence), then re-tally.
penal_codes_df = penal_codes_df.drop_duplicates(subset="Penal_Code", keep="first")
penal_codes_df["Penal_Code"].value_counts()

In [None]:
## Quick view of the first rows of the de-duplicated penal codes table.
penal_codes_df.head()

### Export Files to CSV

In [None]:
# Write the cleaned incidents to CSV (dataframe index omitted).
cleaned_incidents = "../Resources/CleanedData/Cleaned_Incidents.csv"
incidents_raw_df.to_csv(path_or_buf=cleaned_incidents, index=False)

In [None]:
# Write the case lookup table to CSV (dataframe index omitted).
unique_cases = "../Resources/CleanedData/Unique_Cases.csv"
case_list_df.to_csv(path_or_buf=unique_cases, index=False)

In [None]:
# Write the people table to CSV (dataframe index omitted).
output_people = "../Resources/CleanedData/People.csv"
people_df.to_csv(path_or_buf=output_people, index=False)

In [None]:
# Write the cleaned involvement (case <-> person) table to CSV (index omitted).
output_cases_people = "../Resources/CleanedData/Cleaned_Involvement.csv"
involvement_core_df.to_csv(path_or_buf=output_cases_people, index=False)

In [None]:
# Write the penal codes table to CSV (dataframe index omitted).
output_penal_codes = "../Resources/CleanedData/Penal_Codes.csv"
penal_codes_df.to_csv(path_or_buf=output_penal_codes, index=False)

### Export dataframes to SQL tables in crime_corona db

In [None]:
# Build the connection URL for the local crime_corona PostgreSQL database.
# The dialect name must be "postgresql": SQLAlchemy 1.4+ no longer accepts
# the legacy "postgres://" scheme and fails to load the dialect.
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/crime_corona"

# Create the engine used by the to_sql exports.
engine = create_engine(db_string)

In [None]:
# Append the case lookup rows to the unique_cases table.
case_list_df.to_sql(
    name='unique_cases',
    con=engine,
    if_exists='append',
    index=False,
)

In [None]:
# Write the penal codes to the penal_codes table, replacing any existing table.
penal_codes_df.to_sql(
    name='penal_codes',
    con=engine,
    if_exists='replace',
    index=False,
)

In [None]:
# Append the cleaned incidents, with all detail columns, to the incidents table.
incidents_raw_df.to_sql(
    name='incidents',
    con=engine,
    if_exists='append',
    index=False,
)

In [None]:
# Append the people rows to the people table.
people_df.to_sql(
    name='people',
    con=engine,
    if_exists='append',
    index=False,
)

In [None]:
# Append the involvement (people <-> cases) rows to the case_people table.
involvement_core_df.to_sql(
    name='case_people',
    con=engine,
    if_exists='append',
    index=False,
)