### Crime in the time of Corona - ETL for Combining FBI Crime and Employment Data into a single dataframe by Year and By State.


In [1]:
## Add dependencies: Pandas
import pandas as pd
import os # needed to use the os.path.join method to load the files
from sqlalchemy import create_engine # for integrating with PostgreSQL
from config import db_password

#### FBI Crime Data:  Load raw csv, create dataframe and clean data.

In [2]:
#### FBI crimes: read the 1980 to 2018 estimated-crimes csv into a raw dataframe
FBICrimesRaw_df = pd.read_csv(
    filepath_or_buffer="../Resources/est_crimes_1980_2018_FBI_UCRdata.csv",
)
# Preview the first three rows to confirm the load.
FBICrimesRaw_df.head(n=3)

Unnamed: 0,year,state_abbr,population,violent_crime,homicide,rape_legacy,rape_revised,robbery,aggravated_assault
0,1980,AK,440142,1919,39,250.0,,360,1270
1,1980,AL,3861466,17320,509,1158.0,,5102,10551
2,1980,AR,2284037,7656,210,609.0,,1848,4989


In [None]:
## Check for missing values. 
#  Found 1974 values in most columns, except the rape_legacy and rape_revised columns.
# No need to drop anything as we are mostly interested in aggravated_assault

#FBICrimesRaw_df.count()    
#FBICrimesRaw_df.isnull()  # Confirms Nulls.
#FBICrimesRaw_df.isnull().sum() # Confirms with NULL count

In [None]:
## Review data types
#  All columns are integers or floats, which is good for stats and ML models; the only exception is the state column, which is text.
#  No data type conversions are needed.
FBICrimesRaw_df.dtypes

### Create a Crimes by Year Dataframe. 

In [20]:
# Collapse the state-level rows into one national total per year.
# numeric_only=True keeps the text column state_abbr out of the aggregation: older pandas
# versions silently dropped it, but current versions no longer do, so the restriction
# to numeric columns is made explicit here.
# NOTE: years in which rape_revised is entirely missing sum to 0.0 rather than NaN
# (the default min_count=0), so a 0.0 in that column means "not reported", not "none".
CrimesByYear = FBICrimesRaw_df.groupby('year').sum(numeric_only=True).reset_index()
# reset_index() already returns a DataFrame; the wrapper is kept so both names stay available.
CrimesByYear_df = pd.DataFrame(CrimesByYear)
CrimesByYear_df.head(3)

Unnamed: 0,year,population,violent_crime,homicide,rape_legacy,rape_revised,robbery,aggravated_assault
0,1980,222877773,1335465,22679,82327.0,0.0,563581,666878
1,1981,226616000,1353533,22197,81788.0,0.0,590570,658978
2,1982,228984000,1314381,20654,78051.0,0.0,550976,664700


In [22]:
# Second look at the first three yearly totals.
CrimesByYear_df.head(n=3)

Unnamed: 0,year,population,violent_crime,homicide,rape_legacy,rape_revised,robbery,aggravated_assault
0,1980,222877773,1335465,22679,82327.0,0.0,563581,666878
1,1981,226616000,1353533,22197,81788.0,0.0,590570,658978
2,1982,228984000,1314381,20654,78051.0,0.0,550976,664700


### Import the Employment Data

In [4]:
# Read the BLS unemployment-by-state csv (one row per state and year, monthly columns plus "Yr Avg").
employment_df = pd.read_csv(
    filepath_or_buffer="../Resources/unemployment_by_state_1980_2018_BLSdata.csv",
)
# Preview the first three rows to confirm the load.
employment_df.head(n=3)

Unnamed: 0,Year,State,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Yr Avg
0,1980,AK,10.2,10.9,10.8,10.6,9.6,10.2,8.8,8.2,8.5,8.8,9.3,9.6,9.6
1,1981,AK,11.0,11.0,10.3,9.1,8.8,9.5,8.3,8.0,8.4,8.8,9.5,9.9,9.4
2,1982,AK,11.4,11.6,11.1,10.3,9.9,10.1,8.7,8.2,8.6,9.1,10.0,10.3,9.9


In [97]:
## Check for missing values. 
#  Found 1989 rows. Nice, tight, complete dataset.
# employment_df.count()    
# employment_df.isnull()  # Confirms Nulls.
# employment_df.isnull().sum() # Confirms with NULL count

In [5]:
# Merge preparation: keep only the year and the yearly-average column.
# The rows are still one per state and year at this point; .copy() detaches the
# result from employment_df.
employment_yearly_df = employment_df.loc[:, ["Year", "Yr Avg"]].copy()
employment_yearly_df.head(n=3)

Unnamed: 0,Year,Yr Avg
0,1980,9.6
1,1981,9.4
2,1982,9.9


In [None]:
# Average the per-state "Yr Avg" values within each year, giving one row per year.
# as_index=False keeps "Year" as an ordinary column instead of the index.
EmploymentYearly = (
    employment_yearly_df
    .groupby('Year', as_index=False)
    .mean()
)
EmploymentYearly_df = pd.DataFrame(EmploymentYearly)
# Show up to 50 rows so that every year is visible.
EmploymentYearly_df.head(n=50)

In [26]:
## Attach the yearly average unemployment figure to each year's crime totals.
#  The join is made on the year value itself rather than by copying the column across by
#  row position: positional assignment silently pairs figures with the wrong year if
#  either frame is missing a year or is ordered differently.
#  The key is named "year" in the crime frame and "Year" in the employment frame, so
#  left_on/right_on is required (on= only works when the name exists in both frames).
CrimesEmploymentByYear_df = (
    CrimesByYear_df
    .merge(
        EmploymentYearly_df[["Year", "Yr Avg"]],
        how="left",             # keep every crime year; NaN where no employment figure exists
        left_on="year",
        right_on="Year",
        validate="one_to_one",  # both frames come from a group-by on year: one row per year
    )
    .drop(columns="Year")       # remove the duplicate key so the column layout is unchanged
)
CrimesEmploymentByYear_df.head()

Unnamed: 0,year,population,violent_crime,homicide,rape_legacy,rape_revised,robbery,aggravated_assault,Yr Avg
0,1980,222877773,1335465,22679,82327.0,0.0,563581,666878,6.811765
1,1981,226616000,1353533,22197,81788.0,0.0,590570,658978,7.32549
2,1982,228984000,1314381,20654,78051.0,0.0,550976,664700,9.217647
3,1983,231393000,1250396,19018,78275.0,0.0,504666,648437,9.162745
4,1984,233563000,1265468,18437,83487.0,0.0,483204,680340,7.309804


In [None]:
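## Original merge attempt, kept for reference.
#  It failed because on= requires every listed column to exist in BOTH dataframes; the key is
#  named "year" in one frame and "Year" in the other, so left_on="year", right_on="Year" is needed instead.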
#crimes_employment_df= pd.merge(CrimesByYear_df, EmploymentYearly_df, on=["year","Year"])
#crimes_employment_df.head(3)

In [27]:
## Export the combined dataframe to a CSV (without the index column).
crimes_employment = "../Resources/CleanedData/CrimeAndEmploymentByYear.csv"
# to_csv does not create missing folders, so make sure the output directory exists first.
os.makedirs(os.path.dirname(crimes_employment), exist_ok=True)
CrimesEmploymentByYear_df.to_csv(crimes_employment, index=False)