Clustering of Unemployment and Crime Data

In [1]:
import pandas as pd

In [2]:
# Load the two raw source tables used throughout the notebook.

# State-level unemployment figures, 1980-2018 (BLS export)
unemp_file_path = "Resources/unemployment_by_state_1980_2018_BLSdata.csv"
unemp_df = pd.read_csv(filepath_or_buffer=unemp_file_path)

# Estimated crime counts, 1980-2018 (FBI UCR export)
crime_file_path = "Resources/est_crimes_1980_2018_FBI_UCRdata.csv"
fbi_crime_df = pd.read_csv(filepath_or_buffer=crime_file_path)

Unemployment EDA

In [3]:
# Peek at the first three rows of the raw unemployment table
unemp_df.head(n=3)

Unnamed: 0,Year,State,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Yr Avg
0,1980,AK,10.2,10.9,10.8,10.6,9.6,10.2,8.8,8.2,8.5,8.8,9.3,9.6,9.6
1,1981,AK,11.0,11.0,10.3,9.1,8.8,9.5,8.3,8.0,8.4,8.8,9.5,9.9,9.4
2,1982,AK,11.4,11.6,11.1,10.3,9.9,10.1,8.7,8.2,8.6,9.1,10.0,10.3,9.9


In [4]:
# Keep only Year, State and the yearly average; the twelve monthly columns are dropped
month_cols = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
              "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
unemp_df = unemp_df.drop(columns=month_cols)
unemp_df.head(n=3)

Unnamed: 0,Year,State,Yr Avg
0,1980,AK,9.6
1,1981,AK,9.4
2,1982,AK,9.9


In [5]:
# Build a single "<year> - <state>" key (e.g. "1980 - AK") and discard the two source columns
unemp_df = (
    unemp_df
    .assign(YrSt_ID=unemp_df["Year"].astype(str) + " - " + unemp_df["State"])
    .drop(columns=["Year", "State"])
)

In [6]:
# Put the key column first and rename the yearly average to Unemp_Yr_Avg
unemp_df = (
    unemp_df
    .loc[:, ["YrSt_ID", "Yr Avg"]]
    .rename(columns={"Yr Avg": "Unemp_Yr_Avg"})
)
unemp_df.head(n=3)

Unnamed: 0,YrSt_ID,Unemp_Yr_Avg
0,1980 - AK,9.6
1,1981 - AK,9.4
2,1982 - AK,9.9


In [7]:
# Use the year/state key as the row index; the merge with the crime data joins on the index
unemp_df = unemp_df.set_index(keys="YrSt_ID")
unemp_df.head(n=3)

Unnamed: 0_level_0,Unemp_Yr_Avg
YrSt_ID,Unnamed: 1_level_1
1980 - AK,9.6
1981 - AK,9.4
1982 - AK,9.9


FBI Crime EDA
    - Create two DataFrames: one for the violent crime field and a second for aggravated assault only

In [8]:
# Peek at the first three rows of the raw FBI crime table
fbi_crime_df.head(n=3)

Unnamed: 0,year,state_abbr,population,violent_crime,homicide,rape_legacy,rape_revised,robbery,aggravated_assault
0,1980,AK,440142,1919,39,250.0,,360,1270
1,1980,AL,3861466,17320,509,1158.0,,5102,10551
2,1980,AR,2284037,7656,210,609.0,,1848,4989


In [10]:
# Trim the crime table to year, state_abbr, population, violent_crime and aggravated_assault.
# NOTE: robbery is removed from fbi_crime_df here, so any later robbery analysis
# cannot be derived from this trimmed frame.
unused_crime_cols = ["homicide", "rape_legacy", "rape_revised", "robbery"]
fbi_crime_df = fbi_crime_df.drop(columns=unused_crime_cols)
fbi_crime_df.head(n=3)

Unnamed: 0,year,state_abbr,population,violent_crime,aggravated_assault
0,1980,AK,440142,1919,1270
1,1980,AL,3861466,17320,10551
2,1980,AR,2284037,7656,4989


In [11]:
# Build the same "<year> - <state>" key used for the unemployment table,
# then discard the year and state_abbr columns it was built from
fbi_crime_df = (
    fbi_crime_df
    .assign(YrSt_ID=fbi_crime_df["year"].astype(str) + " - " + fbi_crime_df["state_abbr"])
    .drop(columns=["year", "state_abbr"])
)

In [12]:
# Move the key column to the front, followed by population and the two crime counts
crime_column_order = ["YrSt_ID", "population", "violent_crime", "aggravated_assault"]
fbi_crime_df = fbi_crime_df.loc[:, crime_column_order]
fbi_crime_df.head(n=3)

Unnamed: 0,YrSt_ID,population,violent_crime,aggravated_assault
0,1980 - AK,440142,1919,1270
1,1980 - AL,3861466,17320,10551
2,1980 - AR,2284037,7656,4989


In [13]:
# Index the crime table by the year/state key so it lines up with unemp_df
fbi_crime_df = fbi_crime_df.set_index(keys="YrSt_ID")
fbi_crime_df.head(n=3)

Unnamed: 0_level_0,population,violent_crime,aggravated_assault
YrSt_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1980 - AK,440142,1919,1270
1980 - AL,3861466,17320,10551
1980 - AR,2284037,7656,4989


Merge the FBI Crime and Unemployment Datasets

In [14]:
# Join crime and unemployment rows that share the same YrSt_ID index value.
# No `how` is given, so pandas' default inner join applies: keys present in
# only one of the two tables are left out of the result.
fbi_ue_df = pd.merge(
    fbi_crime_df,
    unemp_df,
    left_index=True,
    right_index=True,
)
fbi_ue_df.head(n=3)

Unnamed: 0_level_0,population,violent_crime,aggravated_assault,Unemp_Yr_Avg
YrSt_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1980 - AK,440142,1919,1270,9.6
1980 - AL,3861466,17320,10551,8.9
1980 - AR,2284037,7656,4989,7.5


In [15]:
# Show the dtype of every column in the merged frame
fbi_ue_df.dtypes

population              int64
violent_crime           int64
aggravated_assault      int64
Unemp_Yr_Avg          float64
dtype: object

In [16]:
# Report how many missing values each column of the merged frame contains
null_counts = fbi_ue_df.isnull().sum()
for column, n_missing in null_counts.items():
    print(f"Column {column} has {n_missing} null values")

Column population has 0 null values
Column violent_crime has 0 null values
Column aggravated_assault has 0 null values
Column Unemp_Yr_Avg has 0 null values


In [17]:
# Count rows whose values exactly repeat an earlier row
duplicate_rows = fbi_ue_df.duplicated().sum()
print(f"Duplicate entries: {duplicate_rows}")

Duplicate entries: 0


In [18]:
# Unemp_Yr_Avg is a percentage, so population * rate / 100 gives an estimated
# head count of unemployed persons for each state and year (rounded to whole persons).
# NOTE(review): official unemployment rates are normally expressed relative to the
# labor force rather than the total population, so this figure is likely an
# overestimate - confirm against the BLS data definition.
unemployed_est = (fbi_ue_df["population"] * fbi_ue_df["Unemp_Yr_Avg"]) / 100
fbi_ue_df = fbi_ue_df.assign(population_unemp=unemployed_est.round(0))
fbi_ue_df.head(n=5)

Unnamed: 0_level_0,population,violent_crime,aggravated_assault,Unemp_Yr_Avg,population_unemp
YrSt_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1980 - AK,440142,1919,1270,9.6,42254.0
1980 - AL,3861466,17320,10551,8.9,343670.0
1980 - AR,2284037,7656,4989,7.5,171303.0
1980 - AZ,2715357,17673,10909,6.6,179214.0
1980 - CA,23532680,210290,102766,6.8,1600222.0


In [21]:
# Frame for the aggravated-assault machine learning portion:
# population, estimated unemployed persons, the assault count and the unemployment rate
# (violent_crime is left out)
aa_columns = ["population", "population_unemp", "aggravated_assault", "Unemp_Yr_Avg"]
aa_fbi_ue_df = fbi_ue_df[aa_columns]
aa_fbi_ue_df.head(n=5)

# The CSV export for this frame is currently disabled:
#aa_fbi_ue_output = "Resources/aa_fbi_ue.csv"
#aa_fbi_ue_df.to_csv(aa_fbi_ue_output, index=True)

Unnamed: 0_level_0,population,population_unemp,aggravated_assault,Unemp_Yr_Avg
YrSt_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1980 - AK,440142,42254.0,1270,9.6
1980 - AL,3861466,343670.0,10551,8.9
1980 - AR,2284037,171303.0,4989,7.5
1980 - AZ,2715357,179214.0,10909,6.6
1980 - CA,23532680,1600222.0,102766,6.8


In [20]:
# Frame and CSV for the violent-crime machine learning portion:
# population, estimated unemployed persons, the violent crime count and the unemployment rate
# (aggravated_assault is left out)
vc_columns = ["population", "population_unemp", "violent_crime", "Unemp_Yr_Avg"]
vc_fbi_ue_df = fbi_ue_df[vc_columns]

# Write the frame to disk, keeping the YrSt_ID index as a column in the file
vc_fbi_ue_output = "Resources/vc_fbi_ue.csv"
vc_fbi_ue_df.to_csv(path_or_buf=vc_fbi_ue_output, index=True)

Create a data frame for Robbery

In [9]:
# Create a new DataFrame with year, state_abbr, population and robbery.
#
# The table is rebuilt from the raw FBI CSV rather than from the current
# fbi_crime_df: the earlier cells already dropped robbery, homicide, year and
# state_abbr from that frame and re-indexed it by YrSt_ID, so dropping columns
# from it here would raise a KeyError when the notebook is run top to bottom.
non_robbery_cols = ["homicide", "rape_legacy", "rape_revised",
                    "violent_crime", "aggravated_assault"]
fbi_crime_df = pd.read_csv(crime_file_path).drop(columns=non_robbery_cols)
fbi_crime_df.head(3)

Unnamed: 0,year,state_abbr,population,robbery
0,1980,AK,440142,360
1,1980,AL,3861466,5102
2,1980,AR,2284037,1848


In [10]:
# Build the "<year> - <state>" key for the robbery table and discard the
# year and state_abbr columns it was built from
fbi_crime_df = (
    fbi_crime_df
    .assign(YrSt_ID=fbi_crime_df["year"].astype(str) + " - " + fbi_crime_df["state_abbr"])
    .drop(columns=["year", "state_abbr"])
)

In [13]:
# Index by the year/state key and keep population and robbery, in that order
fbi_crime_df = fbi_crime_df.set_index(keys="YrSt_ID")[["population", "robbery"]]
fbi_crime_df.head(n=3)

Unnamed: 0_level_0,population,robbery
YrSt_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1980 - AK,440142,360
1980 - AL,3861466,5102
1980 - AR,2284037,1848


In [14]:
fbi_ue_df = fbi_crime_df.merge(unemp_df, left_index=True, right_index=True)
fbi_ue_df.head(3)

Unnamed: 0_level_0,population,robbery,Unemp_Yr_Avg
YrSt_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1980 - AK,440142,360,9.6
1980 - AL,3861466,5102,8.9
1980 - AR,2284037,1848,7.5


In [16]:
# Frame and CSV for the robbery machine learning portion:
# population, the robbery count and the unemployment rate
rob_columns = ["population", "robbery", "Unemp_Yr_Avg"]
rob_fbi_ue_df = fbi_ue_df[rob_columns]

# Write the frame to disk, keeping the YrSt_ID index as a column in the file
rob_fbi_ue_output = "Resources/rob_fbi_ue.csv"
rob_fbi_ue_df.to_csv(path_or_buf=rob_fbi_ue_output, index=True)