In [None]:
# Core
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Unsupervised Learning
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Display
from IPython.display import display

# Global plotting defaults: seaborn's white-grid theme first, then the default
# figure size (set afterwards so the theme call cannot overwrite it).
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (8, 5)})

# Signal that imports and plot configuration finished without error.
print("Environment ready.")


In [None]:
# Read the urinalysis table from the CSV located in the working directory.
df = pd.read_csv(filepath_or_buffer="urinalysis_cleaned.csv")
print("Dataset loaded.")

# Preview the first five rows as a quick sanity check of the columns.
display(df.head(n=5))


In [None]:
# Measurements we would like to cluster on. The list is filtered against the
# columns actually present in `df` below.
requested_features = [
    "pH",
    "Specific Gravity",
    "Protein",
    "Glucose",
    "Ketones",
    "Leukocytes",
    "Blood",
    "Nitrite",
    "Bacteria",
    "Crystals"
]

cluster_features = [c for c in requested_features if c in df.columns]
missing_features = [c for c in requested_features if c not in df.columns]

# Report skipped columns instead of dropping them silently, so a renamed or
# misspelled column does not quietly change what the clustering is based on.
if missing_features:
    print("Requested features not found in dataset (skipped):", missing_features)

# Fail early with a clear message rather than passing an empty frame onwards.
if not cluster_features:
    raise ValueError(
        "None of the requested clustering features are present in the dataset."
    )

# Keep only rows with a value for every selected feature. X retains df's index
# labels for the surviving rows.
# NOTE(review): assumes the selected columns are already numeric in the cleaned
# file -- confirm before scaling.
X = df[cluster_features].dropna()

if X.empty:
    raise ValueError(
        "No complete rows remain after dropping missing values; cannot cluster."
    )

print("Features used for clustering:")
print(cluster_features)
print("Shape:", X.shape)
print("Rows dropped due to missing values:", len(df) - len(X))


In [None]:
# Standardise the clustering features: learn the per-column statistics from X,
# then apply them to those same rows.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


In [None]:
# Elbow method: fit k-means for every candidate k and record the inertia of
# each fit, then plot inertia against k.
inertia = []
K = range(2, 9)

for k in K:
    # n_init is passed explicitly: scikit-learn changed its default from 10 to
    # 'auto' in 1.4 (and 1.2-1.3 warn when it is omitted), so pinning it keeps
    # the curve the same across library versions.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

plt.plot(K, inertia, marker="o")
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia")
plt.title("Elbow Method for Optimal k")
plt.show()


In [None]:
# Silhouette analysis: for every candidate k, cluster the scaled data and store
# the mean silhouette score of the resulting labels, keyed by k.
silhouette_scores = {}

for k in range(2, 9):
    # n_init is passed explicitly: scikit-learn changed its default from 10 to
    # 'auto' in 1.4 (and 1.2-1.3 warn when it is omitted), so pinning it keeps
    # the scores the same across library versions.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(X_scaled)
    silhouette_scores[k] = silhouette_score(X_scaled, labels)

# One row per k (as the index), one column holding its score.
silhouette_df = pd.DataFrame.from_dict(
    silhouette_scores, orient="index", columns=["Silhouette Score"]
)

display(silhouette_df)


In [None]:
# Choose the k with the highest silhouette score. idxmax() returns the index
# label of that row; cast it to a plain Python int before passing it on.
optimal_k = int(silhouette_df["Silhouette Score"].idxmax())

print("Selected number of clusters:", optimal_k)

# Final model. n_init is passed explicitly: scikit-learn changed its default
# from 10 to 'auto' in 1.4 (and 1.2-1.3 warn when it is omitted), so pinning it
# keeps the clustering the same across library versions.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Attach the labels to a copy of the unscaled features so that X itself stays
# untouched.
X_clustered = X.copy()
X_clustered["Cluster"] = clusters

display(X_clustered.head())


In [None]:
# Profile each cluster by the mean of every (unscaled) feature within it.
cluster_profiles = X_clustered.groupby(by="Cluster").agg("mean")

display(cluster_profiles)


In [None]:
# Reduce the scaled features to two principal components so the cluster
# assignments can be inspected on a 2-D scatter plot.
pca = PCA(n_components=2)
components = pca.fit_transform(X_scaled)

# One row per sample: its two component scores plus the assigned cluster label.
pca_df = pd.DataFrame(components, columns=["PC1", "PC2"]).assign(Cluster=clusters)

sns.scatterplot(data=pca_df, x="PC1", y="PC2", hue="Cluster", palette="tab10")
plt.title("PCA Projection of Urinalysis Clusters")
plt.show()


In [None]:
# Bar chart of how many samples were assigned to each cluster.
sns.countplot(x=clusters).set(
    title="Cluster Membership Distribution",
    xlabel="Cluster",
    ylabel="Count",
)
plt.show()


In [None]:
# Show the per-cluster feature means again, rounded to two decimals for reading.
display(cluster_profiles.round(decimals=2))


In [None]:
# Take the rows of the full dataset that were actually clustered (those kept in
# X) and append their cluster label as a new column.
clustered_df = df.loc[X.index].assign(Cluster=clusters)

# Write the labelled rows to disk without the index column.
clustered_df.to_csv(path_or_buf="urinalysis_clustered.csv", index=False)
print("Clustered dataset saved.")
