Clustering Algorithms

In [135]:
import pandas as pd
import hvplot.pandas
from pathlib import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np

In [136]:
# Import Data
file_path = Path("Resources/aa_fbi_ue.csv")
aa_fbi_ue_df = pd.read_csv(file_path, index_col=0)
aa_fbi_ue_df.head(3)

Unnamed: 0_level_0,population,population_unemp,aggravated_assault,Unemp_Yr_Avg
YrSt_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1980 - AK,440142,42254.0,1270,9.6
1980 - AL,3861466,343670.0,10551,8.9
1980 - AR,2284037,171303.0,4989,7.5


In [137]:
# Drop population fields
# aa_fbi_ue_df = aa_fbi_ue_df.drop(columns=["population_unemp"])
aa_fbi_ue_df["aa_percap"] = aa_fbi_ue_df['aggravated_assault'] / (aa_fbi_ue_df['population']) * 100
aa_fbi_ue_df.head(3)

Unnamed: 0_level_0,population,population_unemp,aggravated_assault,Unemp_Yr_Avg,aa_percap
YrSt_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1980 - AK,440142,42254.0,1270,9.6,0.288543
1980 - AL,3861466,343670.0,10551,8.9,0.273238
1980 - AR,2284037,171303.0,4989,7.5,0.218429


In [138]:
# Standardize Data
X = aa_fbi_ue_df.copy()
X = StandardScaler().fit_transform(X)
X[:5]

array([[-0.82082277, -0.67397026, -0.65340995,  1.76459072,  0.02197855],
       [-0.2604736 ,  0.00828211, -0.26998359,  1.42870739, -0.0711976 ],
       [-0.5188271 , -0.38186902, -0.49976674,  0.75694072, -0.40487223],
       [-0.44818492, -0.36396255, -0.25519352,  0.32509073,  0.71118593],
       [ 2.96130497,  2.85247613,  3.53969878,  0.42105739,  0.92391594]])

In [139]:
# Use Elbow Curve to find best value for k
inertia = []
k = list(range(1,11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(X)
    inertia.append(km.inertia_)
    
# Create the Elbow Curve using hvplot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")



In [140]:
# Run K-Means with k=5
model = KMeans(n_clusters=5, random_state=0)

# Fit the model
model.fit(X)

# Predict Clusters
predictions = model.predict(X)

# Add Class to dataframe
aa_fbi_ue_df["Class"] = model.labels_
print(aa_fbi_ue_df.shape)
aa_fbi_ue_df.head(3)


(1974, 6)


Unnamed: 0_level_0,population,population_unemp,aggravated_assault,Unemp_Yr_Avg,aa_percap,Class
YrSt_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1980 - AK,440142,42254.0,1270,9.6,0.288543,4
1980 - AL,3861466,343670.0,10551,8.9,0.273238,4
1980 - AR,2284037,171303.0,4989,7.5,0.218429,4


In [141]:
aa_fbi_ue_df = aa_fbi_ue_df.reset_index()

In [142]:
aa_fbi_ue_df.hvplot.scatter(x="Unemp_Yr_Avg", y="aa_percap", by="Class")

In [143]:
fig = px.scatter_3d(
    aa_fbi_ue_df,
    x="Unemp_Yr_Avg",
    y="aa_percap",
    z="aggravated_assault",
    color="Class",
    symbol="Class",
    width=800,
    )
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

Logistic Regression
 - In this model we are using the population and aggravated assault numbers to classify whether unemployment is high (greater than 4.5), normal (between 3.5 and 4.5) and low (under 3.5)

In [144]:
# Import Logisitic Regression Dependencies
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

aa_fbi_ue_df.head(3)

Unnamed: 0,YrSt_ID,population,population_unemp,aggravated_assault,Unemp_Yr_Avg,aa_percap,Class
0,1980 - AK,440142,42254.0,1270,9.6,0.288543,4
1,1980 - AL,3861466,343670.0,10551,8.9,0.273238,4
2,1980 - AR,2284037,171303.0,4989,7.5,0.218429,4


In [145]:
aa_fbi_ue_df = aa_fbi_ue_df.drop(columns=["Class","YrSt_ID"])

In [146]:
aa_fbi_ue_df["unemp_cat"] = aa_fbi_ue_df["Unemp_Yr_Avg"].apply(lambda x: "Normal" if x <= 5.4 else "High")
aa_fbi_ue_df.head(3)

Unnamed: 0,population,population_unemp,aggravated_assault,Unemp_Yr_Avg,aa_percap,unemp_cat
0,440142,42254.0,1270,9.6,0.288543,High
1,3861466,343670.0,10551,8.9,0.273238,High
2,2284037,171303.0,4989,7.5,0.218429,High


In [147]:
aa_fbi_ue_df.groupby("unemp_cat").count()

Unnamed: 0_level_0,population,population_unemp,aggravated_assault,Unemp_Yr_Avg,aa_percap
unemp_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
High,1020,1020,1020,1020,1020
Normal,954,954,954,954,954


In [148]:
aa_fbi_ue_df = aa_fbi_ue_df.drop(columns=["Unemp_Yr_Avg","population_unemp"])

In [149]:
le = LabelEncoder()
aa_df = aa_fbi_ue_df.copy()
aa_df['unemp_cat'] = le.fit_transform(aa_df['unemp_cat'])
aa_df.head(3)

Unnamed: 0,population,aggravated_assault,aa_percap,unemp_cat
0,440142,1270,0.288543,0
1,3861466,10551,0.273238,0
2,2284037,4989,0.218429,0


In [150]:
aa_df.groupby("unemp_cat").count()

Unnamed: 0_level_0,population,aggravated_assault,aa_percap
unemp_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1020,1020,1020
1,954,954,954


In [151]:
# Define Features
X = aa_df.copy()
X = X.drop("unemp_cat", axis=1)

y = aa_df["unemp_cat"].values

In [152]:
# Split Train / Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [153]:
# StandardScaler
scaler = StandardScaler()

X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [154]:
# Fit the model
model = tree.DecisionTreeClassifier()

# Fitting the model
model = model.fit(X_train_scaled, y_train)

In [155]:
# Preditictions
predictions = model.predict(X_test_scaled)

In [156]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Accuracy Score : 0.7165991902834008
Classification Report
              precision    recall  f1-score   support

           0       0.74      0.70      0.72       261
           1       0.69      0.73      0.71       233

    accuracy                           0.72       494
   macro avg       0.72      0.72      0.72       494
weighted avg       0.72      0.72      0.72       494



Logistic Regression - Violent Crimes

In [157]:
# Import Data
file_path = Path("Resources/vc_fbi_ue.csv")
vc_fbi_ue_df = pd.read_csv(file_path)
vc_fbi_ue_df.head(3)

Unnamed: 0,YrSt_ID,population,population_unemp,violent_crime,Unemp_Yr_Avg
0,1980 - AK,440142,42254.0,1919,9.6
1,1980 - AL,3861466,343670.0,17320,8.9
2,1980 - AR,2284037,171303.0,7656,7.5


In [158]:
vc_fbi_ue_df = vc_fbi_ue_df.drop(columns=["YrSt_ID", "population_unemp"])

In [159]:
vc_fbi_ue_df["unemp_cat"] = vc_fbi_ue_df["Unemp_Yr_Avg"].apply(lambda x: "Normal" if x <= 4.9 else "High")
vc_fbi_ue_df.head(3)

Unnamed: 0,population,violent_crime,Unemp_Yr_Avg,unemp_cat
0,440142,1919,9.6,High
1,3861466,17320,8.9,High
2,2284037,7656,7.5,High


In [160]:
vc_fbi_ue_df = vc_fbi_ue_df.drop(columns="Unemp_Yr_Avg")

In [161]:
vc_fbi_ue_df.groupby("unemp_cat").count()

Unnamed: 0_level_0,population,violent_crime
unemp_cat,Unnamed: 1_level_1,Unnamed: 2_level_1
High,1254,1254
Normal,720,720


In [116]:
le = LabelEncoder()
vc_df = vc_fbi_ue_df.copy()
vc_df['unemp_cat'] = le.fit_transform(vc_df['unemp_cat'])
vc_df.head(3)

Unnamed: 0,population,violent_crime,unemp_cat
0,440142,1919,0
1,3861466,17320,0
2,2284037,7656,0


In [117]:
vc_df.groupby("unemp_cat").count()

Unnamed: 0_level_0,population,violent_crime
unemp_cat,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1254,1254
1,720,720


In [118]:
# Define Features
X = vc_df.copy()
X = X.drop("unemp_cat", axis=1)

y = vc_df["unemp_cat"].values

In [119]:
# Split Train / Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [120]:
# StandardScaler
scaler = StandardScaler()

X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [121]:
# Fit the model
model = tree.DecisionTreeClassifier()

# Fitting the model
model = model.fit(X_train_scaled, y_train)

In [122]:
# Preditictions
predictions = model.predict(X_test_scaled)

In [123]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Accuracy Score : 0.7024291497975709
Classification Report
              precision    recall  f1-score   support

           0       0.74      0.81      0.77       308
           1       0.63      0.52      0.57       186

    accuracy                           0.70       494
   macro avg       0.68      0.67      0.67       494
weighted avg       0.70      0.70      0.70       494



Logistic Regression on Robbery

In [124]:
# Import Data
file_path = Path("Resources/rob_fbi_ue.csv")
rob_fbi_ue_df = pd.read_csv(file_path)
rob_fbi_ue_df.head(3)

Unnamed: 0,YrSt_ID,population,robbery,Unemp_Yr_Avg
0,1980 - AK,440142,360,9.6
1,1980 - AL,3861466,5102,8.9
2,1980 - AR,2284037,1848,7.5


In [125]:
rob_fbi_ue_df = rob_fbi_ue_df.drop(columns=["YrSt_ID"])

In [126]:
rob_fbi_ue_df["unemp_cat"] = rob_fbi_ue_df["Unemp_Yr_Avg"].apply(lambda x: "Normal" if x <= 4.9 else "High")
rob_fbi_ue_df.head(3)

Unnamed: 0,population,robbery,Unemp_Yr_Avg,unemp_cat
0,440142,360,9.6,High
1,3861466,5102,8.9,High
2,2284037,1848,7.5,High


In [127]:
rob_fbi_ue_df = rob_fbi_ue_df.drop(columns="Unemp_Yr_Avg")

In [128]:
le = LabelEncoder()
rob_df = rob_fbi_ue_df.copy()
rob_df['unemp_cat'] = le.fit_transform(rob_df['unemp_cat'])
rob_df.head(3)

Unnamed: 0,population,robbery,unemp_cat
0,440142,360,0
1,3861466,5102,0
2,2284037,1848,0


In [129]:
# Define Features
X = vc_df.copy()
X = X.drop("unemp_cat", axis=1)

y = vc_df["unemp_cat"].values

In [130]:
# Split Train / Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [131]:
# StandardScaler
scaler = StandardScaler()

X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [132]:
# Fit the model
model = tree.DecisionTreeClassifier()

# Fitting the model
model = model.fit(X_train_scaled, y_train)

In [133]:
# Preditictions
predictions = model.predict(X_test_scaled)

In [134]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Accuracy Score : 0.7064777327935222
Classification Report
              precision    recall  f1-score   support

           0       0.74      0.82      0.78       308
           1       0.64      0.52      0.57       186

    accuracy                           0.71       494
   macro avg       0.69      0.67      0.67       494
weighted avg       0.70      0.71      0.70       494

