In [None]:
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
import seaborn as sns

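# If the seaborn import above fails, uncomment the line below, run it once,
# and then re-run this cell (%pip installs into the running kernel's environment).
# %pip install seaborn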


# Notebook-wide pandas display settings: display.width is set to 1000 and
# display.max_columns to None so printed frames are not cut off column-wise.
for option_name, option_value in {
    "display.width": 1000,
    "display.max_columns": None,
}.items():
    pd.set_option(option_name, option_value)

In [None]:
# Load the pre-built master frame.
# NOTE(review): pickle.load can execute arbitrary code; only use it on files
# produced by this project.
with open("../data/processed/master_frame.pkl", "rb") as file:
    master_df = pickle.load(file)

# Show the raw frame before any rows are removed.
print(master_df)

# Work on a copy with every row containing a missing value dropped.
master_df_clean = master_df.dropna().copy()

# Columns fed to the clustering models.
feature_columns = [
    "gdp",
    "emp_total_county_naics",
    "emp_occupation",
    "ap",
    "est",
    "gdp_fips",
    "gdp_naics",
]
features = master_df_clean[feature_columns]

# Standardize the selected columns with StandardScaler before clustering.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

In [None]:
# Elbow method: fit K-Means for k = 1..10 on the scaled features and record
# each model's inertia (within-cluster sum of squares, WCSS).
cluster_counts = range(1, 11)
wcss = []

for n_clusters in cluster_counts:
    kmeans = KMeans(
        n_clusters=n_clusters,
        init="k-means++",
        max_iter=300,
        n_init=10,
        random_state=0,
    )
    # fit() returns the estimator itself, so inertia_ can be read directly.
    wcss.append(kmeans.fit(scaled_features).inertia_)

# Draw WCSS against k; the bend in the curve suggests the cluster count.
plt.plot(cluster_counts, wcss)
plt.xlabel("Number of clusters")
plt.ylabel("WCSS")
plt.title("Elbow Method")
plt.show()

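# From the elbow plot we take the optimal number of clusters to be 3.
# NOTE(review): the K-Means cell below is fitted with n_clusters=4 — confirm which value is intended.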

In [None]:
# Fit the final K-Means model on the scaled features and store each row's
# cluster id in a new "Cluster" column of the cleaned frame.
# NOTE(review): k is 4 here although the elbow-plot comment in the previous
# cell mentions 3 — confirm which value is intended.
kmeans = KMeans(
    n_clusters=4,
    init="k-means++",
    max_iter=300,
    n_init=10,
    random_state=0,
)
cluster_labels = kmeans.fit_predict(scaled_features)
master_df_clean["Cluster"] = cluster_labels

In [None]:
# Print the cleaned frame, which now carries the "Cluster" column.
print(master_df_clean)

# Only numeric columns can be averaged, so pick those out first.
numeric_frame = master_df_clean.select_dtypes(include=["number"])
numeric_cols = numeric_frame.columns

# Per-cluster mean of every numeric column.
grouped_by_cluster = master_df_clean.groupby("Cluster")
cluster_summary = grouped_by_cluster[numeric_cols].mean()
print(cluster_summary)

In [None]:
# Project the standardized features onto two principal components so the
# K-Means clusters can be drawn in 2-D.
# Requires `scaled_features` and `master_df_clean["Cluster"]` from earlier cells.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_features)

# One row per row of `scaled_features`, on a fresh 0..n-1 index.
pca_df = pd.DataFrame(data=principal_components, columns=["PC1", "PC2"])

# Attach the cluster labels by position, not by index label.
# master_df_clean keeps the row labels that survived dropna(), while pca_df has
# a new 0..n-1 index; assigning the Series directly would align on labels and
# put NaN or another row's cluster on the points. The rows of `scaled_features`
# are in master_df_clean order, so a plain array lines up one-to-one.
pca_df["Cluster"] = master_df_clean["Cluster"].to_numpy()

In [None]:
# Scatter the two principal components, coloured by the "Cluster" column of pca_df.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    data=pca_df,
    x="PC1",
    y="PC2",
    hue="Cluster",
    palette="viridis",
    s=100,
    alpha=0.7,
    ax=ax,
)
ax.set_title("Clusters Visualization with PCA")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.legend(title="Cluster")
plt.show()

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
import seaborn as sns


# Re-run the 2-component PCA on the scaled features so this cell only needs
# `scaled_features` from earlier cells.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_features)

# Density-based clustering on the 2-D projection.
# eps and min_samples are starting values; tune them for the data.
# NOTE(review): a label of -1, if present, presumably marks DBSCAN noise points
# and not a real cluster — confirm against the scikit-learn docs.
dbscan = DBSCAN(eps=0.5, min_samples=5)
clusters = dbscan.fit_predict(principal_components)

# Components plus DBSCAN labels in one frame (this overwrites the earlier pca_df).
pca_df = pd.DataFrame(principal_components, columns=["PC1", "PC2"]).assign(
    Cluster=clusters
)

# Scatter the projection coloured by DBSCAN label.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    data=pca_df,
    x="PC1",
    y="PC2",
    hue="Cluster",
    palette="viridis",
    s=100,
    alpha=0.7,
    ax=ax,
)
ax.set_title("Clusters Visualization with PCA (DBSCAN)")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.legend(title="Cluster")
plt.show()