Clustering Algorithms

In [46]:
import pandas as pd
import hvplot.pandas
from pathlib import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [47]:
# Read the source CSV; its first column is used as the row index.
file_path = Path("Resources/aa_fbi_ue.csv")
aa_fbi_ue_df = pd.read_csv(
    file_path,
    index_col=0,
)
aa_fbi_ue_df.head(3)

Unnamed: 0_level_0,population,population_unemp,aggravated_assault,Unemp_Yr_Avg
YrSt_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1980 - AK,440142,42254.0,1270,9.6
1980 - AL,3861466,343670.0,10551,8.9
1980 - AR,2284037,171303.0,4989,7.5


In [48]:
# Drop the unemployed-population count; the total `population` column is kept
# because it is used as a clustering feature further down.
# errors="ignore" keeps this cell idempotent: because the frame is reassigned
# in place, a second run would otherwise raise KeyError on the missing column.
aa_fbi_ue_df = aa_fbi_ue_df.drop(columns=["population_unemp"], errors="ignore")
aa_fbi_ue_df.head(3)

Unnamed: 0_level_0,population,aggravated_assault,Unemp_Yr_Avg
YrSt_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1980 - AK,440142,1270,9.6
1980 - AL,3861466,10551,8.9
1980 - AR,2284037,4989,7.5


In [49]:
# Standardize the clustering features (zero mean, unit variance per column).
# The feature columns are selected explicitly instead of scaling the whole
# frame: later cells add a "Class" column and move the index into a string
# column, so re-running this cell on the mutated frame would otherwise leak the
# cluster labels into the features or fail on the non-numeric column.
feature_cols = ["population", "aggravated_assault", "Unemp_Yr_Avg"]
X = StandardScaler().fit_transform(aa_fbi_ue_df[feature_cols])
X[:5]

array([[-0.82082277, -0.65340995,  1.76459072],
       [-0.2604736 , -0.26998359,  1.42870739],
       [-0.5188271 , -0.49976674,  0.75694072],
       [-0.44818492, -0.25519352,  0.32509073],
       [ 2.96130497,  3.53969878,  0.42105739]])

In [50]:
# Elbow method: fit K-Means once for each k in 1..10 and record the inertia
# of every fitted model.
k = list(range(1, 11))
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(X).inertia_
    for n_clusters in k
]

# Plot inertia against k with hvplot to look for the elbow
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")



In [51]:
# Run K-Means with k=5
model = KMeans(n_clusters=5, random_state=0)

# Fit the model and obtain the cluster label of every row in a single pass.
# fit_predict returns the same array that ends up in `model.labels_`, so the
# `predictions` variable and the "Class" column can no longer disagree, and the
# separate predict() pass over the training data is not needed.
predictions = model.fit_predict(X)

# Add Class to dataframe (positional assignment: X rows are in frame order)
aa_fbi_ue_df["Class"] = predictions
print(aa_fbi_ue_df.shape)
aa_fbi_ue_df.head(3)


(1974, 4)


Unnamed: 0_level_0,population,aggravated_assault,Unemp_Yr_Avg,Class
YrSt_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1980 - AK,440142,1270,9.6,0
1980 - AL,3861466,10551,8.9,0
1980 - AR,2284037,4989,7.5,0


In [52]:
# Move the row index into a regular column.
# Guarded so the cell is idempotent: once the frame already has a default
# RangeIndex, running reset_index() again would insert a spurious "index" column.
if not isinstance(aa_fbi_ue_df.index, pd.RangeIndex):
    aa_fbi_ue_df = aa_fbi_ue_df.reset_index()

In [53]:
# 2-D scatter of unemployment rate against aggravated assaults, grouped by Class.
aa_fbi_ue_df.hvplot.scatter(
    x="Unemp_Yr_Avg",
    y="aggravated_assault",
    by="Class",
)

In [54]:
# 3-D scatter of the three clustering features, one colour/symbol per cluster.
# Cluster ids are categorical, but "Class" is stored as integers, which Plotly
# Express colours on a continuous scale with a colorbar. Casting to str on a
# plotting copy yields one discrete colour per cluster and leaves the original
# frame untouched.
plot_df = aa_fbi_ue_df.assign(Class=aa_fbi_ue_df["Class"].astype(str))
# Keep the legend in numeric cluster order rather than order of appearance.
class_order = sorted(plot_df["Class"].unique(), key=int)

fig = px.scatter_3d(
    plot_df,
    x="Unemp_Yr_Avg",
    y="aggravated_assault",
    z="population",
    color="Class",
    symbol="Class",
    category_orders={"Class": class_order},
    width=800,
    )
# Anchor the legend in the top-left corner of the figure
fig.update_layout(legend=dict(x=0, y=1))
fig.show()