# In [None]:
# IMPORT LIBRARIES
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# LOAD DATASET
df = pd.read_csv("penguins.csv")

# Keep only numeric columns ("number" matches every integer/float dtype,
# not just the 64-bit ones).
# NOTE(review): any numeric ID/year-like column in the CSV will also be used
# as a clustering feature -- confirm that is intended.
numeric_cols = df.select_dtypes(include="number").columns.tolist()

# StandardScaler lets NaN pass through, but KMeans refuses input containing
# NaN, so rows with a missing numeric value are discarded here. They are
# removed from df itself (and the index renumbered) so that df, X and
# X_scaled stay row-for-row aligned for everything computed afterwards.
df = df.dropna(subset=numeric_cols).reset_index(drop=True)
X = df[numeric_cols]

# DATA SCALING
# Standardize each feature so that no single column dominates the
# distance computations purely because of its scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ELBOW METHOD (INERTIA)
# Fit one K-Means model per candidate K and record its inertia_ so the
# values can be compared on a single curve.
inertia = []
K_range = range(1, 11)

for k in K_range:
    # n_init is set explicitly: scikit-learn 1.4 changed its default from 10
    # to "auto" (and 1.2/1.3 emit a FutureWarning when it is omitted), so
    # pinning it keeps the curve reproducible across library versions.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Plot elbow curve
plt.figure()
plt.plot(K_range, inertia, marker="o")
plt.xlabel("Number of clusters (K)")
plt.ylabel("Inertia")
plt.title("Elbow Method")
plt.show()

# RUN K-MEANS WITH OPTIMAL K
k_optimal = 3   # Change based on elbow graph

# n_init is set explicitly because its default differs between scikit-learn
# releases (10 before 1.4, "auto" from 1.4 on); pinning it keeps the final
# clustering reproducible regardless of the installed version.
kmeans = KMeans(n_clusters=k_optimal, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Add cluster labels (one label per row of X_scaled, assigned positionally;
# pandas raises if the lengths differ).
df["Cluster"] = clusters

# VISUALIZE CLUSTERS (2 FEATURES)
# Scatter the first two columns of the scaled feature matrix, colouring each
# point by its cluster label. The plotted coordinates are the scaled values,
# while the axis labels carry the corresponding column names from X.
plt.figure()
first_feature = X_scaled[:, 0]
second_feature = X_scaled[:, 1]
plt.scatter(first_feature, second_feature, c=clusters, cmap="viridis")
plt.title("K-Means Clusters")
x_label, y_label = X.columns[0], X.columns[1]
plt.xlabel(x_label)
plt.ylabel(y_label)
plt.show()

# FINAL STATISTICAL DATAFRAME
# Count the rows carrying each cluster label, then order the counts by
# cluster label rather than by frequency.
label_counts = df["Cluster"].value_counts()
cluster_stats = label_counts.sort_index()
print(
    "\nCluster Statistics (number of points in each cluster):",
    cluster_stats,
    sep="\n",
)

# FINAL CHARACTERISTIC TABLE
# Per-cluster mean of every numeric column of df.
rows_by_cluster = df.groupby("Cluster")
stat_penguins = rows_by_cluster.mean(numeric_only=True)
print("\nFinal Cluster Characteristics Table:", stat_penguins, sep="\n")