In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import matplotlib.pyplot as plt


In [12]:
# Read the CSV and keep the single column used as the clustering feature.
df = pd.read_csv('./datasets/cities_r2.csv')
X = df[['effective_literacy_rate_total']]

# Standardise the feature, then build the Ward-linkage merge tree from it.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
linkage_matrix = linkage(X_scaled, method='ward')

# Render the dendrogram on an explicit Axes; leaf labels are hidden.
fig, ax = plt.subplots(figsize=(12, 6))
dendrogram(linkage_matrix, ax=ax, no_labels=True, leaf_font_size=10)

# NOTE(review): the cut-off height of 10 is hard-coded — confirm it corresponds
# to the number of clusters actually extracted from this linkage.
ax.axhline(y=10, c='red', linestyle='--', label='Cut-off line')

ax.set_title('Hierarchical Clustering Dendrogram')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Distance')
ax.legend()
fig.tight_layout()
plt.show()

In [17]:
# Cut the linkage tree into at most `n_clusters` flat clusters and store the
# resulting integer labels on the DataFrame.
n_clusters = 3
cluster_labels = fcluster(linkage_matrix, t=n_clusters, criterion='maxclust')
df['Cluster'] = cluster_labels

# Report how many rows fall in each cluster, ordered by cluster label.
cluster_sizes = df['Cluster'].value_counts().sort_index()
print("Cluster distribution:")
print(cluster_sizes)

# Plot the literacy rate against the assigned cluster, coloured by cluster.
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    df['effective_literacy_rate_total'],
    df['Cluster'],
    c=df['Cluster'],
    cmap='viridis',
    s=100,
)
ax.set_xlabel('Effective Literacy Rate')
ax.set_ylabel('Cluster')
ax.set_title('Hierarchical Clustering Results')
fig.colorbar(points, ax=ax, label='Cluster')
plt.show()