### Crime in the time of Corona - Create DB from raw CSV tables

In [1]:
## Add dependencies: Pandas
import pandas as pd
import os # needed to use the os.path.join method to load the files
from sqlalchemy import create_engine # for integrating with PostgreSQL
from config import db_password

#### Incidents: Load raw csv, create dataframe and clean data.

In [2]:
# Load the incidentTable.csv into a dataframe.
incidents_raw_df = pd.read_csv("../Resources - MPD Data/incidentTableCopy.csv")

In [3]:
# Review the incidents table fields and counts of each field.
incidents_raw_df.info()
# 1788 records. Removed MP20009253 from CSV due to very incomplete record. 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1788 entries, 0 to 1787
Data columns (total 16 columns):
Number                1788 non-null object
dateReported          1788 non-null object
startDate             1788 non-null object
offenseCode           1788 non-null object
offenseDescription    1788 non-null object
streetAddress         1784 non-null object
cityDescription       1739 non-null object
stateDescription      1225 non-null object
zipCode               1517 non-null object
longitude             1764 non-null float64
latitude              1764 non-null float64
Booked                1788 non-null int64
DAComplaint           1788 non-null int64
Cited                 1788 non-null int64
burglaryFactor        350 non-null object
felonyMisdemeanor     1719 non-null object
dtypes: float64(2), int64(3), object(11)
memory usage: 223.6+ KB


In [4]:
# Review Number field for uniqueness. Its our primary field
incidents_raw_df.Number.value_counts(sort=True, dropna=True, ascending=False, bins=None)
# Results show 1661 unique values meaning 127 are possible duplicates.
# Looks like Case Number duplication is valid due to multipe offenses under same case.
# Need to create a primary unique field. 

MP20008752    5
MP20004466    4
MP20009588    4
MP20003358    3
MP20002933    3
             ..
MP20800196    1
MP20002645    1
MP20006556    1
MP20010096    1
MP20009186    1
Name: Number, Length: 1660, dtype: int64

In [5]:
# Create a primary unique field: Number plus offenseCode. 
# Call it CID for Case Id
incidents_raw_df["CID"] = incidents_raw_df["Number"] + ' - ' + incidents_raw_df["offenseCode"] 
incidents_raw_df.head(3)

Unnamed: 0,Number,dateReported,startDate,offenseCode,offenseDescription,streetAddress,cityDescription,stateDescription,zipCode,longitude,latitude,Booked,DAComplaint,Cited,burglaryFactor,felonyMisdemeanor,CID
0,MP19037614,"Dec 31, 2019, 7:10:57 PM","Dec 31, 2019, 7:00:00 PM",245 (A)(1) PC,245 (A)(1) PC FORCE OR ADW NOT FIREARM:GBI LIK...,OAKDALE RD & FLOYD AVE,MODESTO,CALIFORNIA,95355.0,-120.95777,37.67808,0,0,0,,FELONY,MP19037614 - 245 (A)(1) PC
1,MP19037565,"Dec 31, 2019, 9:01:31 AM","Dec 31, 2019, 12:30:00 AM",460(B) PC,460(B) PC BURGLARY: VEHICLE/LARCENY 06 Theft 2190,3120 PELANDALE AVE,MO,CALIFORNIA,,-121.052988,37.699369,0,0,0,,FELONY,MP19037565 - 460(B) PC
2,MP19037555,"Dec 31, 2019, 8:12:40 AM","Dec 31, 2019, 5:00:00 AM",460(B) PC,460(B) PC BURGLARY: VEHICLE/LARCENY 06 Theft 2190,3120 PELANDALE AVE,MODESTO,CALIFORNIA,95356.0,-121.053001,37.699374,0,0,0,,FELONY,MP19037555 - 460(B) PC


In [6]:
# Check uniqueness of new combined field
incidents_raw_df.CID.value_counts(sort=True, dropna=True, ascending=False, bins=None) 
# Visual review of remaining double entries are in fact duplicates.

MP20008677 - 460(B) PC        2
MP20009588 - 273.5 (A) PC     2
MP20008752 - 273 A(A) PC      2
MP20008752 - 187 (A) PC       2
MP20007431 - 245 (A)(2) PC    2
                             ..
MP20009110 - 273.5 (A) PC     1
MP20002541 - 245 (A)(1) PC    1
MP20001839 - 10851 VC         1
MP20010110 - 273.5 (A) PC     1
MP20800442 - 488 PC           1
Name: CID, Length: 1783, dtype: int64

In [7]:
# Drop the Duplicate Case IDs. Then check counts again.
incidents_raw_df.drop_duplicates(subset ="CID", keep = False, inplace = True)
incidents_raw_df.CID.value_counts(sort=True, ascending=False, bins=None) 

MP20002555 - 460(B) PC       1
MP20008132 - 10851 VC        1
MP20007508 - 10851 VC        1
MP20009098 - 460(B) PC       1
MP20004380 - 460(B) PC       1
                            ..
MP20001839 - 10851 VC        1
MP20010110 - 273.5 (A) PC    1
MP20000310 - 273.5 (A) PC    1
MP20006764 - 10851 VC        1
MP20800442 - 488 PC          1
Name: CID, Length: 1778, dtype: int64

In [8]:
# Set index to CID.  If it works, we have unique values for our CID field. woohoo!
incidents_raw_df.set_index('CID',inplace=True)
incidents_raw_df.head(3)

Unnamed: 0_level_0,Number,dateReported,startDate,offenseCode,offenseDescription,streetAddress,cityDescription,stateDescription,zipCode,longitude,latitude,Booked,DAComplaint,Cited,burglaryFactor,felonyMisdemeanor
CID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
MP19037614 - 245 (A)(1) PC,MP19037614,"Dec 31, 2019, 7:10:57 PM","Dec 31, 2019, 7:00:00 PM",245 (A)(1) PC,245 (A)(1) PC FORCE OR ADW NOT FIREARM:GBI LIK...,OAKDALE RD & FLOYD AVE,MODESTO,CALIFORNIA,95355.0,-120.95777,37.67808,0,0,0,,FELONY
MP19037565 - 460(B) PC,MP19037565,"Dec 31, 2019, 9:01:31 AM","Dec 31, 2019, 12:30:00 AM",460(B) PC,460(B) PC BURGLARY: VEHICLE/LARCENY 06 Theft 2190,3120 PELANDALE AVE,MO,CALIFORNIA,,-121.052988,37.699369,0,0,0,,FELONY
MP19037555 - 460(B) PC,MP19037555,"Dec 31, 2019, 8:12:40 AM","Dec 31, 2019, 5:00:00 AM",460(B) PC,460(B) PC BURGLARY: VEHICLE/LARCENY 06 Theft 2190,3120 PELANDALE AVE,MODESTO,CALIFORNIA,95356.0,-121.053001,37.699374,0,0,0,,FELONY
MP19037568 - 460 (A) PC,MP19037568,"Dec 31, 2019, 9:36:26 AM","Dec 31, 2019, 9:30:00 AM",460 (A) PC,460 (A) PC BURGLARY:FIRST DEGREE 05 Burglary -...,2129 CLAYMONT DR,MODESTO,CALIFORNIA,95350.0,-121.033225,37.687181,1,0,0,RESIDENCE - DAY (6AM - 6PM),FELONY
MP19037568 - 10851 VC,MP19037568,"Dec 31, 2019, 9:36:26 AM","Dec 31, 2019, 9:30:00 AM",10851 VC,10851 VC AUTO THEFT 07 Motor Vehicle Theft 44,2129 CLAYMONT DR,MODESTO,CALIFORNIA,95350.0,-121.033225,37.687181,1,0,0,,FELONY
MP19037447 - 245 (A)(1) PC,MP19037447,"Dec 29, 2019, 8:29:35 PM","Dec 29, 2019, 8:29:35 PM",245 (A)(1) PC,245 (A)(1) PC FORCE OR ADW NOT FIREARM:GBI LIK...,1604 PIMLICO DR,MODESTO,CALIFORNIA,95350.0,-121.019973,37.672487,0,0,0,,FELONY
MP19037447 - 245 (A)(4) PC,MP19037447,"Dec 29, 2019, 8:29:35 PM","Dec 29, 2019, 8:29:35 PM",245 (A)(4) PC,245 (A)(4) PC ASSAULT WITH DEADLY WEAPON: POS...,1604 PIMLICO DR,MODESTO,CALIFORNIA,95350.0,-121.019973,37.672487,0,0,0,,FELONY
MP20000042 - 460(B) PC,MP20000042,"Jan 1, 2020, 2:42:00 PM","Dec 31, 2019, 4:00:00 PM",460(B) PC,460(B) PC BURGLARY: VEHICLE/LARCENY 06 Theft 2190,538 HIGH ST,MODESTO,CALIFORNIA,95354.0,-120.987617,37.652351,0,0,0,,FELONY
MP20000045 - 460(B) PC,MP20000045,"Jan 1, 2020, 1:51:55 PM","Jan 1, 2020, 1:45:00 PM",460(B) PC,460(B) PC BURGLARY: VEHICLE/LARCENY 06 Theft 2190,1140 STODDARD AVE,MO,,95350.0,-121.009541,37.649288,0,0,0,,FELONY
MP20000035 - 460 (B) PC,MP20000035,"Jan 1, 2020, 11:32:05 AM","Jan 1, 2020, 5:55:00 AM",460 (B) PC,460 (B) PC BURGLARY:SECOND DEGREE: FELONY 05 B...,142 N 9TH ST,MO,,95350.0,-121.010206,37.64662,0,0,0,NON-RESIDENCE - NIGHT (6PM - 6AM),FELONY


In [9]:
incidents_raw_df.head(50)

Unnamed: 0_level_0,Number,dateReported,startDate,offenseCode,offenseDescription,streetAddress,cityDescription,stateDescription,zipCode,longitude,latitude,Booked,DAComplaint,Cited,burglaryFactor,felonyMisdemeanor
CID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
MP19037614 - 245 (A)(1) PC,MP19037614,"Dec 31, 2019, 7:10:57 PM","Dec 31, 2019, 7:00:00 PM",245 (A)(1) PC,245 (A)(1) PC FORCE OR ADW NOT FIREARM:GBI LIK...,OAKDALE RD & FLOYD AVE,MODESTO,CALIFORNIA,95355.0,-120.95777,37.67808,0,0,0,,FELONY
MP19037565 - 460(B) PC,MP19037565,"Dec 31, 2019, 9:01:31 AM","Dec 31, 2019, 12:30:00 AM",460(B) PC,460(B) PC BURGLARY: VEHICLE/LARCENY 06 Theft 2190,3120 PELANDALE AVE,MO,CALIFORNIA,,-121.052988,37.699369,0,0,0,,FELONY
MP19037555 - 460(B) PC,MP19037555,"Dec 31, 2019, 8:12:40 AM","Dec 31, 2019, 5:00:00 AM",460(B) PC,460(B) PC BURGLARY: VEHICLE/LARCENY 06 Theft 2190,3120 PELANDALE AVE,MODESTO,CALIFORNIA,95356.0,-121.053001,37.699374,0,0,0,,FELONY
MP19037568 - 460 (A) PC,MP19037568,"Dec 31, 2019, 9:36:26 AM","Dec 31, 2019, 9:30:00 AM",460 (A) PC,460 (A) PC BURGLARY:FIRST DEGREE 05 Burglary -...,2129 CLAYMONT DR,MODESTO,CALIFORNIA,95350.0,-121.033225,37.687181,1,0,0,RESIDENCE - DAY (6AM - 6PM),FELONY
MP19037568 - 10851 VC,MP19037568,"Dec 31, 2019, 9:36:26 AM","Dec 31, 2019, 9:30:00 AM",10851 VC,10851 VC AUTO THEFT 07 Motor Vehicle Theft 44,2129 CLAYMONT DR,MODESTO,CALIFORNIA,95350.0,-121.033225,37.687181,1,0,0,,FELONY
MP19037447 - 245 (A)(1) PC,MP19037447,"Dec 29, 2019, 8:29:35 PM","Dec 29, 2019, 8:29:35 PM",245 (A)(1) PC,245 (A)(1) PC FORCE OR ADW NOT FIREARM:GBI LIK...,1604 PIMLICO DR,MODESTO,CALIFORNIA,95350.0,-121.019973,37.672487,0,0,0,,FELONY
MP19037447 - 245 (A)(4) PC,MP19037447,"Dec 29, 2019, 8:29:35 PM","Dec 29, 2019, 8:29:35 PM",245 (A)(4) PC,245 (A)(4) PC ASSAULT WITH DEADLY WEAPON: POS...,1604 PIMLICO DR,MODESTO,CALIFORNIA,95350.0,-121.019973,37.672487,0,0,0,,FELONY
MP20000042 - 460(B) PC,MP20000042,"Jan 1, 2020, 2:42:00 PM","Dec 31, 2019, 4:00:00 PM",460(B) PC,460(B) PC BURGLARY: VEHICLE/LARCENY 06 Theft 2190,538 HIGH ST,MODESTO,CALIFORNIA,95354.0,-120.987617,37.652351,0,0,0,,FELONY
MP20000045 - 460(B) PC,MP20000045,"Jan 1, 2020, 1:51:55 PM","Jan 1, 2020, 1:45:00 PM",460(B) PC,460(B) PC BURGLARY: VEHICLE/LARCENY 06 Theft 2190,1140 STODDARD AVE,MO,,95350.0,-121.009541,37.649288,0,0,0,,FELONY
MP20000035 - 460 (B) PC,MP20000035,"Jan 1, 2020, 11:32:05 AM","Jan 1, 2020, 5:55:00 AM",460 (B) PC,460 (B) PC BURGLARY:SECOND DEGREE: FELONY 05 B...,142 N 9TH ST,MO,,95350.0,-121.010206,37.64662,0,0,0,NON-RESIDENCE - NIGHT (6PM - 6AM),FELONY


In [10]:
# How many data is left if our dataframe?
incidents_raw_df.info()
# Looks like 1778 rows, so we eliminated 10 records.  Lots of work for a unique primary key, but necessary.

<class 'pandas.core.frame.DataFrame'>
Index: 1778 entries, MP19037614 - 245 (A)(1) PC to MP20010418 - 245 (A)(4) PC
Data columns (total 16 columns):
Number                1778 non-null object
dateReported          1778 non-null object
startDate             1778 non-null object
offenseCode           1778 non-null object
offenseDescription    1778 non-null object
streetAddress         1774 non-null object
cityDescription       1729 non-null object
stateDescription      1221 non-null object
zipCode               1509 non-null object
longitude             1754 non-null float64
latitude              1754 non-null float64
Booked                1778 non-null int64
DAComplaint           1778 non-null int64
Cited                 1778 non-null int64
burglaryFactor        350 non-null object
felonyMisdemeanor     1712 non-null object
dtypes: float64(2), int64(3), object(11)
memory usage: 236.1+ KB


In [11]:
## Ok. now we review and clean remaining fields in Incidents table. 

#  Start with the date field. Create a new date field stripped to date without time.
incidents_raw_df["dateIncident"] = pd.to_datetime(incidents_raw_df["dateReported"]).apply(lambda x: x.date())
incidents_raw_df.head(3)

Unnamed: 0_level_0,Number,dateReported,startDate,offenseCode,offenseDescription,streetAddress,cityDescription,stateDescription,zipCode,longitude,latitude,Booked,DAComplaint,Cited,burglaryFactor,felonyMisdemeanor,dateIncident
CID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
MP19037614 - 245 (A)(1) PC,MP19037614,"Dec 31, 2019, 7:10:57 PM","Dec 31, 2019, 7:00:00 PM",245 (A)(1) PC,245 (A)(1) PC FORCE OR ADW NOT FIREARM:GBI LIK...,OAKDALE RD & FLOYD AVE,MODESTO,CALIFORNIA,95355.0,-120.95777,37.67808,0,0,0,,FELONY,2019-12-31
MP19037565 - 460(B) PC,MP19037565,"Dec 31, 2019, 9:01:31 AM","Dec 31, 2019, 12:30:00 AM",460(B) PC,460(B) PC BURGLARY: VEHICLE/LARCENY 06 Theft 2190,3120 PELANDALE AVE,MO,CALIFORNIA,,-121.052988,37.699369,0,0,0,,FELONY,2019-12-31
MP19037555 - 460(B) PC,MP19037555,"Dec 31, 2019, 8:12:40 AM","Dec 31, 2019, 5:00:00 AM",460(B) PC,460(B) PC BURGLARY: VEHICLE/LARCENY 06 Theft 2190,3120 PELANDALE AVE,MODESTO,CALIFORNIA,95356.0,-121.053001,37.699374,0,0,0,,FELONY,2019-12-31


In [12]:
## Create a new week field so we can do stats of crime types over time by week.
incidents_raw_df['weekNumber'] = pd.to_datetime(incidents_raw_df['dateReported']).dt.week

# This works, but since our data starts in 2019, we have five weeks with numbers over 40. Starting with Week 47 with blank week.
incidents_raw_df.weekNumber.value_counts(sort=True, dropna=True, ascending=False, bins=None)

13    140
4     135
12    134
10    127
8     125
11    122
3     122
9     119
1     111
5     110
15    106
7     104
6     100
14     94
2      93
16     19
52     10
51      3
50      2
47      1
49      1
Name: weekNumber, dtype: int64

In [13]:
## How to fix to make weekNumber column useful for linear regression analysis?
#  Add six to each week number so that the first six slots in the week order can be allocated to 2019
incidents_raw_df['weekNumber']= incidents_raw_df['weekNumber'] + 6

In [16]:
# Now replace the high values for 2019 weeks with proper lower numbers.
incidents_raw_df["weekNumber"].replace({53:1,54:2,55:3,56:4,57:5,58:6}, inplace=True)

In [17]:
# How does the weekNumber field tally look now?
incidents_raw_df.weekNumber.value_counts(sort=True, ascending=False, bins=None)

19    140
10    135
18    134
16    127
14    125
17    122
9     122
15    119
7     111
11    110
21    106
13    104
12    100
20     94
8      93
22     19
6      10
5       3
4       2
3       1
1       1
Name: weekNumber, dtype: int64

In [None]:
## Now we look at our offense codes. 
incidents_raw_df.offenseCode.value_counts(sort=True, ascending=False)
# Need a subject matter expert. This could be ok. Or not.

In [19]:
## Now we look at our Felony field
incidents_raw_df.felonyMisdemeanor.value_counts(sort=True, ascending=False, dropna=False)
# We have 66 with NaN.  Need to decide what to do. Some crimes can be classifed Felony or Misdemeanor, so..

FELONY         1521
MISDEMEANOR     191
NaN              66
Name: felonyMisdemeanor, dtype: int64

In [None]:
## Finally, we need to create a domestic violence field.
# We know that offenseCode starting with 273  means domestic violence. Anything else?

#### Suspects: Load raw csv, create dataframe and clean data.

In [None]:
# Load the suspects csv into a dataframe.
suspects_raw_df = pd.read_csv("../Resources - MPD Data/suspectTable.csv")

In [None]:
# Review the suspects table structure
suspects_raw_df.info()

In [None]:
# Review the suspects data
suspects_raw_df.head()

#### Victims: Load raw csv, create dataframe and clean data.

In [None]:
# Load the victims csv into a dataframe.
victims_raw_df = pd.read_csv("../Resources - MPD Data/victimTable.csv")

In [None]:
# Review the suspects table structure
victims_raw_df.info()

In [None]:
# Review the suspects data
victims_raw_df.head()