In [None]:
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn import metrics
import matplotlib.patches as mpatches

In [None]:
# Plot configuration: font fallback chain for labels, and render the minus
# sign as a plain hyphen so negative tick labels display with any font.
plt.rcParams["font.family"] = ["DejaVu Sans", "Arial", "sans-serif"]
plt.rcParams["axes.unicode_minus"] = False

# Read data
file_path = 'merged_ecological_wdi_normalized.csv'
data = pd.read_csv(file_path)

# Separate identifier columns from feature columns.
# .copy() makes `labels` an independent frame, so columns can be added to it
# later without pandas raising SettingWithCopyWarning.
labels = data[['COUNTRY', 'ISO']].copy()
# Everything from the third column onward is treated as a feature
# (assumes COUNTRY and ISO are the first two columns — TODO confirm).
features = data.iloc[:, 2:]

# Handle missing values: fill missing values with 0.
# NOTE(review): this happens before standardization, so a missing entry is
# treated as the literal value 0 — confirm that is appropriate for this data.
features = features.fillna(0)

# Standardize data (zero mean, unit variance per column)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Calculate the linkage matrix with Ward's method on Euclidean distances
Z = linkage(scaled_features, method='ward', metric='euclidean')

In [None]:
# Distance threshold at which the tree is cut into flat clusters.
# Try different max_d values
max_d = 150

# Generate flat clusters by cutting the linkage tree at max_d
clusters = fcluster(Z, max_d, criterion='distance')
unique_clusters = np.unique(clusters)

# Calculate silhouette score.
# silhouette_score raises ValueError unless 2 <= n_clusters <= n_samples - 1,
# which is easy to hit while experimenting with max_d, so guard it.
n_clusters = len(unique_clusters)
if 1 < n_clusters < len(clusters):
    silhouette_avg = metrics.silhouette_score(scaled_features, clusters)
    print(f"Silhouette Score: {silhouette_avg:.4f}")
else:
    silhouette_avg = np.nan
    print(f"Silhouette Score: undefined for {n_clusters} cluster(s) at max_d={max_d}")

# Plot dendrogram, colouring branches by the same threshold used for fcluster
# so the picture reflects the flat clusters that were actually generated.
plt.figure(figsize=(20, 10))  # Large figure so country labels stay legible
dn = dendrogram(
    Z,
    labels=labels['COUNTRY'].values,
    color_threshold=max_d,
    leaf_rotation=90,   # Rotate x-axis labels 90 degrees
    leaf_font_size=6,   # Reduce label font size
)
plt.axhline(y=max_d, color='grey', linestyle='--', linewidth=1)  # Mark the cut height
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Countries')
plt.ylabel('Distance')

# Map each flat cluster to the colour its leaves were drawn with, so the
# legend matches the plot. `leaves_color_list` is only present in newer SciPy
# releases; when it is missing, fall back to the tab20 colormap.
# NOTE(review): the dendrogram palette is finite, so with many clusters
# (or singleton clusters) several legend entries may share a colour.
cluster_colors = {}
leaf_colors = dn.get('leaves_color_list')
if leaf_colors is not None:
    for leaf_idx, leaf_color in zip(dn['leaves'], leaf_colors):
        cluster_colors.setdefault(clusters[leaf_idx], leaf_color)

# Create legend: one entry per cluster
legend_handles = []
for cluster in unique_clusters:
    color = cluster_colors.get(cluster, plt.cm.tab20(cluster % 20))
    legend_handles.append(mpatches.Patch(color=color, label=f'Cluster {cluster}'))

# Add legend
plt.legend(handles=legend_handles, title='Clusters', loc='upper right', fontsize=10)

plt.tight_layout()  # Adjust layout
plt.show()

# Transform standardized data back to original scale.
# These are the values of the zero-filled `features` (up to floating-point
# round-off), not the raw CSV values with their missing entries.
original_features = scaler.inverse_transform(scaled_features)

# Add clustering results to the identifier columns.
# .assign returns a new frame instead of mutating a slice of `data`, which
# avoids SettingWithCopyWarning and keeps this cell safe to re-run.
labels = labels.assign(Cluster=clusters)
result = pd.concat(
    [
        labels,
        # Share labels' index so the column-wise concat aligns row for row.
        pd.DataFrame(original_features, columns=features.columns, index=labels.index),
    ],
    axis=1,
)

# Save results
result.to_csv('hierarchical_cluster_results3.csv', index=False)

# Print number of countries in each cluster (largest cluster first).
# Series.value_counts replaces the deprecated top-level pd.value_counts.
cluster_counts = pd.Series(clusters).value_counts()
print("\nCluster Membership Statistics:")
for cluster, count in cluster_counts.items():
    print(f"Cluster {cluster}: {count} countries")