### Crime in the time of Corona - Create DB from raw CSV tables

In [None]:
## Add dependencies: Pandas
import pandas as pd
import os # needed to use the os.path.join method to load the files
from sqlalchemy import create_engine # for integrating with PostgreSQL
from config import db_password

#### Incidents: Load raw csv, create dataframe and clean data.

In [None]:
# Load the CSV Files into a dataframes.
incidents_raw_df = pd.read_csv("../Resources - MPD Data/incidentTableCopyClean.csv")

In [None]:
# Review the incidents table fields and counts of each field.
incidents_raw_df.info()
# 1788 records. Removed MP20009253 from CSV due to very incomplete record. 

In [None]:
# Review Number field for uniqueness. Its our primary field
incidents_raw_df.Number.value_counts(sort=True, dropna=True, ascending=False, bins=None)
# Results show 1661 unique values meaning 127 are possible duplicates.
# Looks like Case Number duplication is valid due to multiple offenses under same case.
# Need to create a primary unique field. 

In [None]:
# Create a primary unique field: Number plus offenseCode. 
# Call it CID for Case Id
incidents_raw_df["CID"] = incidents_raw_df["Number"] + ' - ' + incidents_raw_df["offenseCode"] 
incidents_raw_df.head(3)

In [None]:
# Check uniqueness of new combined field
incidents_raw_df.CID.value_counts(sort=True, dropna=True, ascending=False, bins=None) 
# Visual review of remaining double entries shows they are in fact duplicates.

In [None]:
# Drop the Duplicate Case IDs. Then check counts again.
incidents_raw_df.drop_duplicates(subset ="CID", keep = False, inplace = True)
incidents_raw_df.CID.value_counts(sort=True, ascending=False) 

In [None]:
# Set index to CID.  If it works, we have unique values for our CID field. woohoo!
incidents_raw_df.set_index('CID',inplace=True)
incidents_raw_df.head(3)

In [None]:
# How much data is left in our dataframe?
incidents_raw_df.info()
# Looks like 1778 rows, so we eliminated 10 records.  Lots of work for a unique primary key, but necessary.

In [None]:
## Ok. now we review and clean remaining fields in Incidents table. 

#  Start with the date field. Create a new date field stripped to date without time.

incidents_raw_df["dateIncident"] = pd.to_datetime(incidents_raw_df["dateReported"]).apply(lambda x: x.date())
incidents_raw_df.head(3)

In [None]:
## Create a new week field so we can do stats of crime types over time by week.
incidents_raw_df['weekNumber'] = pd.to_datetime(incidents_raw_df['dateReported']).dt.week

# This works, but since our data starts in 2019, we have five weeks with numbers over 40. Starting with Week 47 with blank week.
incidents_raw_df.weekNumber.value_counts(sort=True, dropna=True, ascending=False, bins=None)

In [None]:
## How to fix to make weekNumber column useful for linear regression analysis?
#  Add six to each week number so that the first six slots in the week order can be allocated to 2019
incidents_raw_df['weekNumber']= incidents_raw_df['weekNumber'] + 6

In [None]:
# Now replace the high values for 2019 weeks with proper lower numbers.
incidents_raw_df["weekNumber"].replace({53:1,54:2,55:3,56:4,57:5,58:6}, inplace=True)

In [None]:
# How does the weekNumber field tally look now? Sweet!
incidents_raw_df.weekNumber.value_counts(sort=True, ascending=False, bins=None)

In [None]:
## Now we look at our offense codes. 
incidents_raw_df.offenseCode.value_counts(sort=True, ascending=False)
# Need a subject matter expert. This could be ok. Or not.

#### FBI Crime National Crime Data, 1980 - 2018. Load and Clean

In [8]:
#### FBICrimes Load and Clean the 1980 to 2018 csv data into a dataframe
FBICrimes_df =  pd.read_csv("../Resources/est_crimes_1980_2018_FBI_UCRdata.csv")
FBICrimes_df.head(3)

Unnamed: 0,year,state_abbr,population,violent_crime,homicide,rape_legacy,rape_revised,robbery,aggravated_assault
0,1980,AK,440142,1919,39,250.0,,360,1270
1,1980,AL,3861466,17320,509,1158.0,,5102,10551
2,1980,AR,2284037,7656,210,609.0,,1848,4989


In [None]:
## Data looks clean. No obvious transformations needed. Ourarget of aggravated assault has a full row counts of 1974.
FBICrimes_df.info(3)
FBICrimes_df.describe()

#### National Unemployment Data,1980- 2018. Load and Clean.

In [13]:
#### Unemployment Data, 1980-2018. Load and Clean.
Unemployment_df =  pd.read_csv("../Resources/unemployment_by_state_1980_2018_BLSdata.csv")
Unemployment_df.head(3)

Unnamed: 0,Year,State,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Yr Avg
0,1980,AK,10.2,10.9,10.8,10.6,9.6,10.2,8.8,8.2,8.5,8.8,9.3,9.6,9.6
1,1981,AK,11.0,11.0,10.3,9.1,8.8,9.5,8.3,8.0,8.4,8.8,9.5,9.9,9.4
2,1982,AK,11.4,11.6,11.1,10.3,9.9,10.1,8.7,8.2,8.6,9.1,10.0,10.3,9.9


In [17]:
## Data looks clean. No obvious transformations needed. Our target of aggravated assault has a full row counts of 1974.
Unemployment_df.info(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1989 entries, 0 to 1988
Data columns (total 15 columns):
Year      1989 non-null int64
State     1989 non-null object
Jan       1989 non-null float64
Feb       1989 non-null float64
Mar       1989 non-null float64
Apr       1989 non-null float64
May       1989 non-null float64
Jun       1989 non-null float64
Jul       1989 non-null float64
Aug       1989 non-null float64
Sep       1989 non-null float64
Oct       1989 non-null float64
Nov       1989 non-null float64
Dec       1989 non-null float64
Yr Avg    1989 non-null float64
dtypes: float64(13), int64(1), object(1)
memory usage: 233.2+ KB


#### Suspects: Load raw csv, create dataframe and clean data.

In [19]:
# Load the suspects csv into a dataframe.
suspects_raw_df = pd.read_csv("../Resources - MPD Data/suspectTable.csv")
suspects_raw_df.head(3)

Unnamed: 0,caseID,personID,Age,Sex
0,MP20010240,3877,16,MALE
1,MP20007353,2309,54,FEMALE
2,MP20010311,3570,23,MALE


In [20]:
# Review the suspects table structure
suspects_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1878 entries, 0 to 1877
Data columns (total 4 columns):
caseID      1878 non-null object
personID    1878 non-null int64
Age         1017 non-null object
Sex         1003 non-null object
dtypes: int64(1), object(3)
memory usage: 58.8+ KB


#### Victims: Load raw csv, create dataframe and clean data.

In [21]:
# Load the victims csv into a dataframe.
victims_raw_df = pd.read_csv("../Resources - MPD Data/victimTable.csv")

In [22]:
# Review the suspects table structure
victims_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1738 entries, 0 to 1737
Data columns (total 4 columns):
caseID      1738 non-null object
personID    1738 non-null int64
Age         1738 non-null object
Sex         1738 non-null object
dtypes: int64(1), object(3)
memory usage: 54.4+ KB
