In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [10]:
# Load the Boston housing dataset.
# The path is a raw string, so each backslash is written once; combining the
# r-prefix with doubled backslashes would put literal double backslashes
# into the path.
# NOTE(review): this is an absolute, machine-specific location - consider a
# path relative to a configurable data directory so the notebook runs elsewhere.
file_path = r"C:\Users\ZhuanZ\Desktop\Boston.csv"
# index_col=0 uses the first CSV column as the row index instead of a feature.
df = pd.read_csv(file_path, index_col=0)

In [11]:
# Sanity check: show the leading rows so a bad load is spotted right away
preview = df.head()
print("Data preview:\n", preview)

Data preview:
       crim    zn  indus  chas    nox     rm   age     dis  rad  tax  ptratio  \
1  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3   
2  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8   
3  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242     17.8   
4  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7   
5  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222     18.7   

        b  lstat  medv  
1  396.90   4.98  24.0  
2  396.90   9.14  21.6  
3  392.83   4.03  34.7  
4  394.63   2.94  33.4  
5  396.90   5.33  36.2  


In [12]:
# Build the feature matrix: every column except the target column 'medv'.
# A later cell adds a 'cluster' column to df; it is dropped here as well when
# present (errors='ignore' makes that a no-op on a fresh run), so re-running
# this cell after clustering does not feed old cluster labels back in as a
# feature. A missing 'medv' column still raises KeyError, as before.
X = df.drop(columns=['medv']).drop(columns=['cluster'], errors='ignore')

In [13]:
# Standardize the features: learn the scaling statistics from X, then apply them.
# The fitted scaler is kept so results can be mapped back to the original scale later.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [14]:
# Sweep k = 2..6: fit one K-means model per k and record its quality metrics
k_values = range(2, 7)
silhouette_scores = []     # silhouette score per k, in k_values order
inertias = []              # model inertia (sum of squared errors) per k, in k_values order
cluster_labels = {}        # maps k -> array holding one cluster label per sample

for k in k_values:
    # The fixed random_state keeps the fit repeatable between runs
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
    labels = kmeans.labels_
    cluster_labels[k] = labels
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, labels))



In [15]:
# Report the silhouette score obtained for every candidate k
print("Silhouette scores for different k values:")
for n_clusters, score in zip(k_values, silhouette_scores):
    print(f"k = {n_clusters}: Silhouette Score = {score:.4f}")

Silhouette scores for different k values:
k = 2: Silhouette Score = 0.3601
k = 3: Silhouette Score = 0.2448
k = 4: Silhouette Score = 0.2275
k = 5: Silhouette Score = 0.2389
k = 6: Silhouette Score = 0.2291


In [16]:
# Choose the k with the highest silhouette score (argmax keeps the first one on ties)
best_idx = int(np.argmax(silhouette_scores))
optimal_k = k_values[best_idx]
best_score = silhouette_scores[best_idx]
print(f"\nOptimal number of clusters: {optimal_k} (Silhouette Score: {best_score:.4f})")


Optimal number of clusters: 2 (Silhouette Score: 0.3601)


In [17]:
# Look up the labels produced for the best k and attach them to the data
# as a 'cluster' column (one label per row, in row order)
optimal_labels = cluster_labels[optimal_k]
df = df.assign(cluster=optimal_labels)

In [18]:
# Average every feature within each cluster, keeping two decimal places for display
feature_cols = X.columns
cluster_means = df.groupby('cluster')[feature_cols].mean().round(2)
means_header = "\nFeature means for the optimal clustering (k={}):\n".format(optimal_k)
print(means_header, cluster_means)


Feature means for the optimal clustering (k=2):
          crim     zn  indus  chas   nox    rm    age   dis    rad     tax  \
cluster                                                                     
0        0.26  17.48   6.89  0.07  0.49  6.46  56.34  4.76   4.47  301.92   
1        9.84   0.00  19.04  0.07  0.68  5.97  91.32  2.01  18.99  605.86   

         ptratio       b  lstat  
cluster                          
0          17.84  386.45   9.47  
1          19.60  301.33  18.57  


In [19]:
# Fit a K-means model for the chosen k and keep its centroids.
# The model is fitted on X_scaled, so the centroids are in standardized units.
# The random_state matches the one used in the k sweep.
optimal_kmeans = KMeans(n_clusters=optimal_k, random_state=42).fit(X_scaled)
centroids_scaled = optimal_kmeans.cluster_centers_



In [20]:
# Label the standardized centroid coordinates with the feature names (two decimals)
centroids_df = pd.DataFrame(data=centroids_scaled, columns=X.columns)
centroids_df = centroids_df.round(2)
scaled_header = "\nStandardized centroids of the optimal clustering (k={}):\n".format(optimal_k)
print(scaled_header, centroids_df)


Standardized centroids of the optimal clustering (k=2):
    crim    zn  indus  chas   nox    rm   age   dis   rad   tax  ptratio     b  \
0 -0.39  0.26  -0.62  0.00 -0.58  0.24 -0.44  0.46 -0.58 -0.63    -0.29  0.33   
1  0.73 -0.49   1.15 -0.01  1.09 -0.45  0.81 -0.85  1.09  1.17     0.53 -0.61   

   lstat  
0  -0.45  
1   0.83  


In [21]:
# Undo the standardization so the centroids read in the features' original scale.
# centroids_original keeps full precision; the DataFrame is rounded to 2 decimals for display.
centroids_original = scaler.inverse_transform(centroids_scaled)
centroids_original_df = pd.DataFrame(data=centroids_original, columns=X.columns)
centroids_original_df = centroids_original_df.round(2)
original_header = "\nCentroids in original scale for the optimal clustering (k={}):\n".format(optimal_k)
print(original_header, centroids_original_df)


Centroids in original scale for the optimal clustering (k=2):
    crim     zn  indus  chas   nox    rm    age   dis    rad     tax  ptratio  \
0  0.26  17.48   6.89  0.07  0.49  6.46  56.34  4.76   4.47  301.92    17.84   
1  9.84   0.00  19.04  0.07  0.68  5.97  91.32  2.01  18.99  605.86    19.60   

        b  lstat  
0  386.45   9.47  
1  301.33  18.57  


In [22]:
# Compare per-cluster feature means with the centroid coordinates (original scale).
# The comparison is made on unrounded values. cluster_means and
# centroids_original_df were both rounded to 2 decimals for display, so
# subtracting those tables can hide real differences below 0.005 or report
# rounding artefacts of 0.01.
exact_means = df.groupby('cluster')[X.columns].mean()
exact_centroids = pd.DataFrame(centroids_original, columns=X.columns)

print("\nDifference between cluster feature means and centroids (original scale):")
for cluster in range(optimal_k):
    print(f"\nCluster {cluster}:")
    diff = exact_means.loc[cluster] - exact_centroids.loc[cluster]
    print("Difference between feature means and centroids:")
    # Adding 0.0 turns a rounded -0.0 into 0.0 so the printout stays clean.
    print((diff.round(4) + 0.0).to_string())


Difference between cluster feature means and centroids (original scale):

Cluster 0:
Difference between feature means and centroids:
crim       0.0
zn         0.0
indus      0.0
chas       0.0
nox        0.0
rm         0.0
age        0.0
dis        0.0
rad        0.0
tax        0.0
ptratio    0.0
b          0.0
lstat      0.0

Cluster 1:
Difference between feature means and centroids:
crim       0.0
zn         0.0
indus      0.0
chas       0.0
nox        0.0
rm         0.0
age        0.0
dis        0.0
rad        0.0
tax        0.0
ptratio    0.0
b          0.0
lstat      0.0
