### Crime in the time of Corona - Create DB from raw CSV tables

In [26]:
## Add dependencies: Pandas
import pandas as pd
import os # needed to use the os.path.join method to load the files
from sqlalchemy import create_engine # for integrating with PostgreSQL
from config import db_password

#### Incidents: Load raw csv, create dataframe and clean data.

In [28]:
# Load the CSV Files into a dataframes.
#incidents_raw_df = pd.read_csv("../Resources - MPD Data/incidentTableClean.csv")
clean_incidents_df = pd.read_csv("../Resources/CleanedData/Cleaned_Incidents.csv")
clean_incidents_df.head()

Unnamed: 0,Number,dateReported,startDate,offenseCode,offenseDescription,streetAddress,cityDescription,stateDescription,zipCode,longitude,latitude,Booked,DAComplaint,Cited,burglaryFactor,felonyMisdemeanor,CID,dateIncident,weekNumber
0,MP19037614,"Dec 31, 2019, 7:10:57 PM","Dec 31, 2019, 7:00:00 PM",245 (A)(1) PC,245 (A)(1) PC FORCE OR ADW NOT FIREARM:GBI LIK...,OAKDALE RD & FLOYD AVE,MODESTO,CALIFORNIA,95355.0,-120.95777,37.67808,0,0,0,,FELONY,MP19037614 - 245 (A)(1) PC,2019-12-31,7
1,MP19037565,"Dec 31, 2019, 9:01:31 AM","Dec 31, 2019, 12:30:00 AM",460(B) PC,460(B) PC BURGLARY: VEHICLE/LARCENY 06 Theft 2190,3120 PELANDALE AVE,MO,CALIFORNIA,,-121.052988,37.699369,0,0,0,,FELONY,MP19037565 - 460(B) PC,2019-12-31,7
2,MP19037555,"Dec 31, 2019, 8:12:40 AM","Dec 31, 2019, 5:00:00 AM",460(B) PC,460(B) PC BURGLARY: VEHICLE/LARCENY 06 Theft 2190,3120 PELANDALE AVE,MODESTO,CALIFORNIA,95356.0,-121.053001,37.699374,0,0,0,,FELONY,MP19037555 - 460(B) PC,2019-12-31,7
3,MP19037568,"Dec 31, 2019, 9:36:26 AM","Dec 31, 2019, 9:30:00 AM",460 (A) PC,460 (A) PC BURGLARY:FIRST DEGREE 05 Burglary -...,2129 CLAYMONT DR,MODESTO,CALIFORNIA,95350.0,-121.033225,37.687181,1,0,0,RESIDENCE - DAY (6AM - 6PM),FELONY,MP19037568 - 460 (A) PC,2019-12-31,7
4,MP19037568,"Dec 31, 2019, 9:36:26 AM","Dec 31, 2019, 9:30:00 AM",10851 VC,10851 VC AUTO THEFT 07 Motor Vehicle Theft 44,2129 CLAYMONT DR,MODESTO,CALIFORNIA,95350.0,-121.033225,37.687181,1,0,0,,FELONY,MP19037568 - 10851 VC,2019-12-31,7


In [8]:
# Review Number field for uniqueness. Its our primary field
incidents_raw_df.Number.value_counts(sort=True, dropna=True, ascending=False, bins=None)
# Results show 1660 unique values meaning 127 are possible duplicates.
# Looks like Case Number duplication is valid due to multiple offenses under same case.
# Need to create a primary unique field. 

MP20008752    5
MP20004466    4
MP20009588    4
MP20008931    3
MP20000095    3
             ..
MP20005344    1
MP20008113    1
MP20001834    1
MP20002734    1
MP20004179    1
Name: Number, Length: 1660, dtype: int64

In [10]:
# Create a primary unique field: Number plus offenseCode. 
# Call it CID for Case Id
incidents_raw_df["CID"] = incidents_raw_df["Number"] + ' - ' + incidents_raw_df["offenseCode"] 
incidents_raw_df.head(3)

Unnamed: 0,Number,dateReported,startDate,offenseCode,offenseDescription,streetAddress,cityDescription,stateDescription,zipCode,longitude,latitude,Booked,DAComplaint,Cited,burglaryFactor,felonyMisdemeanor,CID
0,MP19037614,"Dec 31, 2019, 7:10:57 PM","Dec 31, 2019, 7:00:00 PM",245 (A)(1) PC,245 (A)(1) PC FORCE OR ADW NOT FIREARM:GBI LIK...,OAKDALE RD & FLOYD AVE,MODESTO,CALIFORNIA,95355.0,-120.95777,37.67808,0,0,0,,FELONY,MP19037614 - 245 (A)(1) PC
1,MP19037565,"Dec 31, 2019, 9:01:31 AM","Dec 31, 2019, 12:30:00 AM",460(B) PC,460(B) PC BURGLARY: VEHICLE/LARCENY 06 Theft 2190,3120 PELANDALE AVE,MO,CALIFORNIA,,-121.052988,37.699369,0,0,0,,FELONY,MP19037565 - 460(B) PC
2,MP19037555,"Dec 31, 2019, 8:12:40 AM","Dec 31, 2019, 5:00:00 AM",460(B) PC,460(B) PC BURGLARY: VEHICLE/LARCENY 06 Theft 2190,3120 PELANDALE AVE,MODESTO,CALIFORNIA,95356.0,-121.053001,37.699374,0,0,0,,FELONY,MP19037555 - 460(B) PC


In [11]:
# Check uniqueness of new combined field
incidents_raw_df.CID.value_counts(sort=True, dropna=True, ascending=False, bins=None) 
# Visual review of remaining double entries shows they are in fact duplicates.

MP20008752 - 187 (A) PC       2
MP20008677 - 460(B) PC        2
MP20007431 - 245 (A)(2) PC    2
MP20009588 - 273.5 (A) PC     2
MP20008752 - 273 A(A) PC      2
                             ..
MP20000640 - 273 A(A) PC      1
MP20000893 - 10851 VC         1
MP20008206 - 487 (A) PC       1
MP20004796 - 10851 VC         1
MP20002393 - 10851 VC         1
Name: CID, Length: 1783, dtype: int64

In [12]:
# Drop the Duplicate Case IDs. Then check counts again.
incidents_raw_df.drop_duplicates(subset ="CID", keep = False, inplace = True)
incidents_raw_df.CID.value_counts(sort=True, ascending=False) 

MP20001777 - 460(B) PC       1
MP20002250 - 460 (A) PC      1
MP20001130 - 488 PC          1
MP20800893 - 488 PC          1
MP20009623 - 488 PC          1
                            ..
MP20000893 - 10851 VC        1
MP20008206 - 487 (A) PC      1
MP20004796 - 10851 VC        1
MP20003644 - 273.5 (A) PC    1
MP20002393 - 10851 VC        1
Name: CID, Length: 1778, dtype: int64

In [None]:
# Set index to CID.  If it works, we have unique values for our CID field. woohoo!
incidents_raw_df.set_index('CID',inplace=True)
incidents_raw_df.head(3)

In [13]:
# How much data is left in our dataframe?
incidents_raw_df.info()
# Looks like 1778 rows, so we eliminated 10 records.  Lots of work for a unique primary key, but necessary.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1778 entries, 0 to 1787
Data columns (total 17 columns):
Number                1778 non-null object
dateReported          1778 non-null object
startDate             1778 non-null object
offenseCode           1778 non-null object
offenseDescription    1778 non-null object
streetAddress         1774 non-null object
cityDescription       1729 non-null object
stateDescription      1221 non-null object
zipCode               1509 non-null object
longitude             1754 non-null float64
latitude              1754 non-null float64
Booked                1778 non-null int64
DAComplaint           1778 non-null int64
Cited                 1778 non-null int64
burglaryFactor        350 non-null object
felonyMisdemeanor     1712 non-null object
CID                   1778 non-null object
dtypes: float64(2), int64(3), object(12)
memory usage: 250.0+ KB


In [14]:
## Ok. now we review and clean remaining fields in Incidents table. 
#  Start with the date field. Create a new date field stripped to date without time.
incidents_raw_df["dateIncident"] = pd.to_datetime(incidents_raw_df["dateReported"]).apply(lambda x: x.date())
incidents_raw_df.head(3)

Unnamed: 0,Number,dateReported,startDate,offenseCode,offenseDescription,streetAddress,cityDescription,stateDescription,zipCode,longitude,latitude,Booked,DAComplaint,Cited,burglaryFactor,felonyMisdemeanor,CID,dateIncident
0,MP19037614,"Dec 31, 2019, 7:10:57 PM","Dec 31, 2019, 7:00:00 PM",245 (A)(1) PC,245 (A)(1) PC FORCE OR ADW NOT FIREARM:GBI LIK...,OAKDALE RD & FLOYD AVE,MODESTO,CALIFORNIA,95355.0,-120.95777,37.67808,0,0,0,,FELONY,MP19037614 - 245 (A)(1) PC,2019-12-31
1,MP19037565,"Dec 31, 2019, 9:01:31 AM","Dec 31, 2019, 12:30:00 AM",460(B) PC,460(B) PC BURGLARY: VEHICLE/LARCENY 06 Theft 2190,3120 PELANDALE AVE,MO,CALIFORNIA,,-121.052988,37.699369,0,0,0,,FELONY,MP19037565 - 460(B) PC,2019-12-31
2,MP19037555,"Dec 31, 2019, 8:12:40 AM","Dec 31, 2019, 5:00:00 AM",460(B) PC,460(B) PC BURGLARY: VEHICLE/LARCENY 06 Theft 2190,3120 PELANDALE AVE,MODESTO,CALIFORNIA,95356.0,-121.053001,37.699374,0,0,0,,FELONY,MP19037555 - 460(B) PC,2019-12-31


In [15]:
## Create a new week field so we can do stats of crime types over time by week.
incidents_raw_df['weekNumber'] = pd.to_datetime(incidents_raw_df['dateReported']).dt.week

# This works, but since our data starts in 2019, we have five weeks with numbers over 40. Starting with Week 47 with blank week.
incidents_raw_df.weekNumber.value_counts(sort=True, dropna=True, ascending=False, bins=None)

13    140
4     135
12    134
10    127
8     125
11    122
3     122
9     119
1     111
5     110
15    106
7     104
6     100
14     94
2      93
16     19
52     10
51      3
50      2
47      1
49      1
Name: weekNumber, dtype: int64

In [16]:
## How to fix to make weekNumber column useful for linear regression analysis?
#  Add six to each week number so that the first six slots in the week order can be allocated to 2019
incidents_raw_df['weekNumber']= incidents_raw_df['weekNumber'] + 6

In [17]:
# Now replace the high values for 2019 weeks with proper lower numbers.
incidents_raw_df["weekNumber"].replace({53:1,54:2,55:3,56:4,57:5,58:6}, inplace=True)

In [18]:
# How does the weekNumber field tally look now? Sweet!
incidents_raw_df.weekNumber.value_counts(sort=True, ascending=False, bins=None)

19    140
10    135
18    134
16    127
14    125
17    122
9     122
15    119
7     111
11    110
21    106
13    104
12    100
20     94
8      93
22     19
6      10
5       3
4       2
3       1
1       1
Name: weekNumber, dtype: int64

In [19]:
## Now we look at our offense codes. 
incidents_raw_df.offenseCode.value_counts(sort=True, ascending=False)
# Need a subject matter expert. This could be ok. Or not.

10851 VC             376
460(B) PC            308
488 PC               173
273.5 (A) PC         152
460 (B) PC           147
460 (A) PC           140
487 (A) PC            94
245 (A)(1) PC         74
211 PC                57
245 (A)(4) PC         54
273 A(A) PC           43
664 /460 (B) PC       20
243 (D) PC            19
261 (A)(2) PC         15
245 (A)(2) PC         14
484 (A) PC            12
451 (D) PC             9
664 /211 PC            8
664 /460 (A) PC        6
664 /187 (A) PC        6
487 (D)(2) PC          5
289 (A)(1) PC          4
261 (A)(1) PC          4
243.4 (A) PC           3
187 (A) PC             3
212.5 (A) PC           3
262 (A)(1) PC          2
288.5 (A) PC           2
463 (A) PC             2
664 /460 PC            1
261 (A)(3) PC          1
244 PC                 1
261 PC                 1
664 /212.5 (A) PC      1
288 A(C)(1) PC         1
286 (B)(1) PC          1
463 (C) PC             1
243.4 (D) PC           1
286 (C)(2)(A) PC       1
463 (B) PC             1


In [25]:
# Now export the cleaned Incidents table.

# Crimes to CSV
#cleaned_incidents = os.path.join("CleanData", "Cleaned_Incidents.csv")
cleaned_incidents = "../Resources/CleanedData/Cleaned_Incidents.csv"
incidents_raw_df.to_csv(cleaned_incidents, index=False)

#### FBI Crime Data: Load raw csv, create dataframe and clean data.

In [None]:
#### FBICrimes Load and Clean the 1980 to 2018 csv data into a dataframe
FBICrimes_df =  pd.read_csv("../Resources/est_crimes_1980_2018_FBI_UCRdata.csv")
FBICrimes_df.head(3)

In [None]:
## Data looks clean. No obvious transformations needed. Ourarget of aggravated assault has a full row counts of 1974.
FBICrimes_df.info(3)
FBICrimes_df.describe()

In [None]:
## Exploratory Data Analysis of FBI Crime Data.
for year in FBICrimes_df:
    




#### National Unemployment Data,1980- 2018. Load and Clean.

In [None]:
#### Unemployment Data, 1980-2018. Load and Clean.
Unemployment_df =  pd.read_csv("../Resources/unemployment_by_state_1980_2018_BLSdata.csv")
Unemployment_df.head(3)

In [None]:
## Data looks clean. No obvious transformations needed. Our target of aggravated assault has a full row counts of 1974.
Unemployment_df.info(3)

#### Suspects: Load raw csv, create dataframe and clean data.

In [None]:
# Load the suspects csv into a dataframe.
suspects_raw_df = pd.read_csv("../Resources - MPD Data/suspectTable.csv")
suspects_raw_df.head(3)

In [None]:
# Review the suspects table structure
suspects_raw_df.info()

#### Victims: Load raw csv, create dataframe and clean data.

In [None]:
# Load the victims csv into a dataframe.
victims_raw_df = pd.read_csv("../Resources - MPD Data/victimTable.csv")

In [None]:
# Review the suspects table structure
victims_raw_df.info()