In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

# Notebook-wide pandas display settings: wide console output, never truncate
# columns, and render floats with five decimal places.
for option_name, option_value in (
    ("display.width", 1000),
    ("display.max_columns", None),
):
    pd.set_option(option_name, option_value)
pd.set_option("display.float_format", "{:.5f}".format)

In [None]:
# Load the two processed frames used throughout the notebook.
# NOTE(review): pickle files can execute arbitrary code when loaded; only read
# files produced by this project's own pipeline.
scaled_pickle_path = "../data/processed/master_df_scaled.pkl"
master_pickle_path = "../data/processed/master_df.pkl"

master_df_scaled = pd.read_pickle(scaled_pickle_path)
master_df = pd.read_pickle(master_pickle_path)

In [None]:
# Quick look at the first five rows of the scaled frame.
master_df_scaled.head(n=5)

In [None]:
# Elbow method: fit KMeans for k = 1..10 and record the inertia of each fit so
# the "elbow" in the curve can guide the choice of cluster count.
#
# Fit on feature columns only. The last cell of this notebook writes
# master_df_scaled back to the same pickle it was loaded from, with a
# "clusters" column added, so on a re-run the loaded frame may already carry
# labels from a previous run; they must not be treated as a feature.
elbow_features = master_df_scaled.drop(columns="clusters", errors="ignore")
cluster_range = range(1, 11)

inertia = []
for n in cluster_range:
    kmeans = KMeans(n_clusters=n, random_state=42)
    kmeans.fit(elbow_features)
    inertia.append(kmeans.inertia_)

plt.plot(cluster_range, inertia)
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia")
plt.title("Elbow Method for Optimal Number of Clusters")
# Call show(); the bare attribute `plt.show` only echoes the function's repr.
plt.show()

In [None]:
# Fit the final KMeans model and attach the resulting label to both frames.
optimal_clusters = 4

# Cluster on feature columns only: drop any "clusters" column left over from a
# previous run (the saved pickle is overwritten with labels at the end of this
# notebook) or from re-executing this cell, so old labels never act as a feature.
cluster_features = master_df_scaled.drop(columns="clusters", errors="ignore")

kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
clusters = kmeans.fit_predict(cluster_features)

# Labels are assigned positionally.
# NOTE(review): assumes master_df rows are in the same order as
# master_df_scaled - TODO confirm against the preprocessing step.
master_df_scaled["clusters"] = clusters
master_df["clusters"] = clusters

In [None]:
# Silhouette score of the clustering. By this point master_df_scaled holds the
# labels in a "clusters" column; scoring on that frame as-is would use the
# labels themselves as a feature and inflate the score, so drop the column.
silhouette_features = master_df_scaled.drop(columns="clusters", errors="ignore")
sil_score = silhouette_score(silhouette_features, clusters)
print(f"Silhoutte Score: {sil_score}")

In [None]:
# Project the scaled features onto two principal components and colour each
# point by its cluster label.
#
# The "clusters" column is excluded from the projection: it is a label, not a
# feature, and including it would pull the clusters apart artificially.
pca_input = master_df_scaled.drop(columns="clusters", errors="ignore")

# random_state only matters when PCA selects a randomized solver; fixing it
# keeps the plot reproducible in that case.
pca = PCA(n_components=2, random_state=42)
pca_features = pca.fit_transform(pca_input)

plt.scatter(pca_features[:, 0], pca_features[:, 1], c=clusters, cmap="viridis")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.title("Clusters of Counties")
plt.colorbar()
# Call show(); the bare attribute `plt.show` only echoes the function's repr.
plt.show()

In [None]:
# Per-cluster mean of every column in the unscaled frame.
grouped_by_cluster = master_df.groupby(by="clusters")
cluster_analysis = grouped_by_cluster.mean()
print(cluster_analysis)

# Persist the scaled frame, which now carries the "clusters" column.
# NOTE(review): this is the same path the notebook loads master_df_scaled from,
# so the input file is overwritten with the label column included - consider
# writing to a separate file so the notebook stays re-runnable.
master_df_scaled.to_pickle("../data/processed/master_df_scaled.pkl")