In [41]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import warnings
# warnings.filterwarnings('ignore')

from sklearn.ensemble import IsolationForest
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage

In [2]:
# Read the source Excel workbook into a DataFrame
source_workbook = "World_development_mesurement.xlsx"
data = pd.read_excel(source_workbook)

In [3]:
# These columns contain currency symbols ('$') and comma separators, so they
# are cleaned as text and then cast to float.
currency_cols = ['GDP', 'Health Exp/Capita','Tourism Inbound', 'Tourism Outbound']

for money_col in currency_cols:
    # astype(str) turns missing values into the text 'nan', which the final
    # float cast parses back to NaN, so missing entries stay missing.
    as_text = data[money_col].astype(str)
    without_dollar = as_text.str.replace('$', '', regex=False)
    without_commas = without_dollar.str.replace(',', '', regex=False)
    data[money_col] = without_commas.astype(float)

In [4]:
# 'Business Tax Rate' contains a '%' character; remove it and cast the column to float
tax_rate_text = data['Business Tax Rate'].astype(str)
data['Business Tax Rate'] = tax_rate_text.str.replace('%', '', regex=False).astype(float)

In [5]:
# The country name is not used as a clustering feature, so it is removed from
# the feature table; the column is kept separately for interpreting clusters.
# DataFrame.pop returns the column and deletes it from `data` in place.
countries = data.pop('Country')

In [6]:
# Drop columns that are too sparse: compute the fraction of missing entries
# per column and keep only columns with less than 45% missing.
missing_value_ratio = data.isna().mean()
sufficiently_filled = missing_value_ratio.lt(0.45)
data = data.loc[:, sufficiently_filled]

In [7]:
# Fill remaining nulls with each numeric column's median, which is robust to outliers.
# Only numeric columns get a median, so nulls in non-numeric columns (if any) are left as they are.
column_medians = data.median(numeric_only=True)
data.fillna(value=column_medians, inplace=True)

In [8]:
# 'Number of Records' has the same value in every row; a zero-variance
# feature carries no information for clustering, so it is removed in place.
data.drop('Number of Records', axis='columns', inplace=True)

In [9]:
# Detect outliers with an Isolation Forest (contamination set to 5%, fixed seed)
iso = IsolationForest(contamination=0.05, random_state=42)
iso.fit(data)
# predict() returns -1 for rows flagged as outliers and 1 for inliers.
# NOTE(review): `labels` is not read by any later cell visible in this notebook.
labels = iso.predict(data)

In [10]:
# Cap extreme values instead of removing rows: each column is limited to
# its own 5th-95th percentile range.
df_treated = data.copy()

for feature in data.columns:
    lower, upper = data[feature].quantile([0.05, 0.95])
    df_treated[feature] = data[feature].clip(lower=lower, upper=upper)

In [11]:
# The data is skewed and contains extreme economic values (GDP, population, ...),
# so RobustScaler is used: it centres on the median and scales by the IQR
# instead of using the mean and standard deviation.
scaler = RobustScaler()
scaled_data = scaler.fit_transform(df_treated)
# fit_transform returns a plain NumPy array, which has neither column names nor
# a row index. Rebuild the DataFrame from the frame that was actually scaled
# (`df_treated`), restoring both its columns and its index so the rows stay
# aligned with `countries` even if the index is not the default 0..n-1 range.
scaled_df = pd.DataFrame(scaled_data, columns=df_treated.columns, index=df_treated.index)

In [30]:
# Work on an independent (deep) copy so that columns added to `df` later do not touch `scaled_df`
df = scaled_df.copy(deep=True)

In [40]:
# Plot a dendrogram to choose the number of clusters.

# Build the linkage matrix with Ward's method on the feature columns only.
# A later cell writes an 'AG_Cluster' label column into `df`, and this cell's
# execution count (In [40]) is higher than that cell's (In [33]), so `df` may
# already carry the labels when this runs. Dropping the column when present
# keeps cluster labels out of the distance computation; on a fresh run the
# column does not exist yet and errors='ignore' makes the drop a no-op.
linkage_features = df.drop(columns=['AG_Cluster'], errors='ignore')
linkage_matrix = linkage(linkage_features, method='ward')

# Plot dendrogram
plt.figure(figsize=(12,6))
dendrogram(linkage_matrix)
plt.title("Dendrogram for Hierarchical Clustering")
plt.xlabel("Data Points")
plt.ylabel("Euclidean Distance")
plt.show()

In [32]:
# Cutting the dendrogram at a Euclidean distance of 175 gives 3 clusters.
# Build the Agglomerative model.

ag_model = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='ward')
# Fit on the feature columns only. A later cell adds an 'AG_Cluster' label
# column to `df`; if this cell is re-run after that, fitting on `df` directly
# would cluster on the previous labels as well. errors='ignore' makes the drop
# a no-op on a fresh run, when the column does not exist yet.
ag_model.fit(df.drop(columns=['AG_Cluster'], errors='ignore'))

In [33]:
# Add cluster labels to df.
# This cell writes 'AG_Cluster' into `df`, so on a second run `df` already
# contains the labels. Exclude that column before clustering so a re-run gives
# the same assignment as the first run (errors='ignore' covers the first run,
# when the column is not there yet).
cluster_features = df.drop(columns=['AG_Cluster'], errors='ignore')
ag_labels = ag_model.fit_predict(cluster_features)
df['AG_Cluster'] = ag_labels

In [34]:
# Project the features onto two principal components so the clusters can be
# drawn in 2-D; the label column is excluded from the projection.
pca = PCA(n_components=2)
feature_matrix = df.drop(columns=['AG_Cluster'])
pca_data = pca.fit_transform(feature_matrix)
pca_df = pd.DataFrame(pca_data, columns=['PC1', 'PC2']).assign(Cluster=ag_labels)

In [35]:
# Scatter the two principal components, coloured by cluster label
fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='Cluster', palette='tab10', ax=ax)
ax.set_title("Agglomerative Clusters Visualization")
plt.show()

In [36]:
# Profile each cluster by its per-feature mean.
# `df` holds the scaled features, so these means are in scaled units rather than original units.
cluster_profile = df.groupby(by='AG_Cluster').agg('mean')

In [37]:
# Feature matrix without the cluster label column
X = df.drop(columns=['AG_Cluster'])

In [38]:
# Silhouette score of the agglomerative clustering (ranges from -1 to 1; higher means better-separated clusters)
cluster_assignments = df['AG_Cluster']
aglomerative_score = silhouette_score(X, cluster_assignments)
aglomerative_score

0.477674302986937