# In [None]:
# --- Import Libraries ---
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# --- Load Dataset ---
# The path is relative to the current working directory.
df = pd.read_csv("FINAL_USO_cleaned.csv")
print("Data Preview:")
print(df.head())

# --- Prepare Data ---
# One-hot encode any non-numeric columns so that every feature is numeric.
# NOTE(review): a high-cardinality text column (e.g. a raw date or ID string)
# would expand into one dummy column per distinct value here -- confirm the
# cleaned CSV contains none, or drop such columns before this step.
df_numeric = pd.get_dummies(df, drop_first=True)

# Capture the feature names now, before the "Cluster" label column is
# appended below, so later code never has to rely on column position.
feature_columns = list(df_numeric.columns)

# Standardize features: K-Means is distance based, so unscaled features with
# large ranges would otherwise dominate the clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_numeric)

# --- Apply K-Means Clustering ---
k = 3  # You can change this to try different numbers of clusters
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps results comparable across versions.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Add cluster labels to dataset
df_numeric["Cluster"] = clusters

# --- Apply PCA for 2D Visualization ---
# PCA is fitted on the same scaled matrix the clustering used, so the plot is
# a 2D projection of the space in which the clusters were found.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

df_pca = pd.DataFrame(X_pca, columns=["PC1", "PC2"])
df_pca["Cluster"] = clusters

# --- Visualization ---
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=df_pca,
    x="PC1", y="PC2",
    hue="Cluster",
    palette="Set2",
    s=80,
)
plt.title("K-Means Clustering Results (2D PCA Projection)")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend(title="Cluster")
plt.show()

# --- Optional: Check cluster centers ---
# kmeans.cluster_centers_ lives in the standardized space the model was fitted
# on; undo the scaling so the printed values really are in original units.
centers_original = scaler.inverse_transform(kmeans.cluster_centers_)
print("Cluster Centers (original feature space):")
print(pd.DataFrame(centers_original, columns=feature_columns))