In [63]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
from keras.layers import Input, Dense
from keras.models import Model
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Make "browser" the default Plotly renderer for every figure shown in this notebook.
pio.renderers.default = "browser"

In [64]:
# Read the complete dataset, then derive the working frame: 2024 rows only,
# with `population` stored as an integer column.
# NOTE(review): astype(int) raises on missing population values - confirm the
# 2024 rows are complete.
dff = pd.read_csv('data.csv')
df = (
    dff.loc[dff['year'].eq(2024)]
    .assign(population=lambda frame: frame['population'].astype(int))
)

In [65]:
def evaluate_model(X, labels):
    """Score a clustering with the silhouette coefficient.

    Parameters
    ----------
    X : array-like
        Feature matrix the labels were computed for.
    labels : sequence
        Cluster label assigned to each row of ``X``.

    Returns
    -------
    float or int
        The silhouette score, or ``-1`` when the score is undefined for the
        given labelling (fewer than two clusters, or every sample in its own
        cluster).
    """
    n_clusters = len(set(labels))
    # silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1 and
    # raises ValueError outside that range, so report the failure sentinel
    # instead of crashing the caller.
    if 1 < n_clusters < len(labels):
        return silhouette_score(X, labels)
    return -1

In [66]:
def plot_clusters_interactive(data, labels, column_name):
    """Open an interactive scatter plot of one indicator, coloured by cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'name'`` column (used as the x axis, labelled "Country")
        and the ``column_name`` column.
    labels : array-like
        Cluster label for every row of ``data``.
    column_name : str
        Column plotted on the y axis and named in the title.

    The figure is shown with the "browser" renderer; nothing is returned.
    """
    # Plot from a copy so the caller's frame is left untouched.
    plot_data = data.copy()
    plot_data['Assigned Cluster'] = labels
    # Cluster ids are categories, not quantities: as strings Plotly draws one
    # discrete colour and legend entry per cluster instead of a colour bar.
    plot_data['Assigned Cluster'] = plot_data['Assigned Cluster'].astype(str)

    fig = px.scatter(plot_data, x='name', y=column_name, color='Assigned Cluster',
                     title=f"Interactive Clustering of Countries by {column_name}",
                     labels={'name': 'Country', column_name: column_name},
                     template='plotly_white')
    fig.update_layout(showlegend=True)
    fig.show(renderer="browser")

In [67]:
def autoencoder_clustering(X_scaled, latent_dim=3, epochs=100, batch_size=32, seed=None):
    """Train a small dense autoencoder on ``X_scaled`` and return its encoding.

    Despite the name, no clustering is done here: the function only returns
    the activations of the bottleneck layer, which the caller can cluster.

    Parameters
    ----------
    X_scaled : numpy.ndarray
        2-D feature matrix; its second dimension sets the input/output width.
    latent_dim : int, default 3
        Number of units in the bottleneck layer.
    epochs : int, default 100
        Training epochs.
    batch_size : int, default 32
        Training batch size.
    seed : int or None, default None
        When given, seeds the random generators before the model is built so
        repeated runs give the same encoding. ``None`` leaves seeding as is.

    Returns
    -------
    numpy.ndarray
        Encoded features with ``latent_dim`` columns, one row per input row.
    """
    if seed is not None:
        # Imported here so the function stays self-contained.
        from keras.utils import set_random_seed
        set_random_seed(seed)

    input_dim = X_scaled.shape[1]
    input_layer = Input(shape=(input_dim,))

    # Encoder: 16 -> 8 -> latent_dim
    encoder = Dense(16, activation='relu')(input_layer)
    encoder = Dense(8, activation='relu')(encoder)
    encoder_output = Dense(latent_dim, activation='relu')(encoder)

    # Decoder mirrors the encoder; the last layer is linear.
    decoder = Dense(8, activation='relu')(encoder_output)
    decoder = Dense(16, activation='relu')(decoder)
    decoder_output = Dense(input_dim)(decoder)

    autoencoder = Model(inputs=input_layer, outputs=decoder_output)
    autoencoder.compile(optimizer="adam", loss='mse')
    # The network learns to reproduce its own input.
    autoencoder.fit(X_scaled, X_scaled, epochs=epochs, batch_size=batch_size, verbose=0)

    # Reuse the trained encoder layers to map the data to the bottleneck;
    # silent like the fit call above.
    encoder_model = Model(inputs=input_layer, outputs=encoder_output)
    encoded_X = encoder_model.predict(X_scaled, verbose=0)
    return encoded_X

In [68]:
# Indicator columns to process, in this order.
target_columns = [
    'Number of new HIV infections',
    'Tobacco use%',
    'Alcohol consumption',
    'population',
    'Prevalence of hypertension%',
]


In [69]:
# Starts empty; the clustering loop stores one entry per indicator here.
results = dict()


In [None]:
# Cluster every indicator on its own with three approaches and keep the one
# with the highest silhouette score.
for target in target_columns:
    # Only rows that have a value for this indicator take part in the clustering.
    data = df.dropna(subset=[target]).copy()

    if target == 'population':
        # Log-transform due to outliers. Built from `data` (not `df`) so the
        # feature rows always match the frame the labels are written back to.
        X = np.log(data[['population']] + 1)
    else:
        X = data[[target]]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Clustering with autoencoder + KMeans (KMeans runs on the encoded features,
    # the silhouette score is computed on the scaled input like the others).
    encoded_X = autoencoder_clustering(X_scaled)
    kmeans = KMeans(n_clusters=3, random_state=42)
    autoencoder_labels = kmeans.fit_predict(encoded_X)
    autoencoder_score = evaluate_model(X_scaled, autoencoder_labels)
    print(f'For {target} autoencoders achived {autoencoder_score}')

    # Clustering with DBSCAN
    # NOTE(review): DBSCAN labels noise points -1 and evaluate_model scores
    # them as one more cluster - confirm that is intended.
    dbscan = DBSCAN(eps=0.8, min_samples=3)
    dbscan_labels = dbscan.fit_predict(X_scaled)
    dbscan_score = evaluate_model(X_scaled, dbscan_labels)
    print(f'For {target} DBSCAN achived {dbscan_score}')

    # Clustering with KMeans directly on scaled data
    kmeans_labels = KMeans(n_clusters=3, random_state=42).fit_predict(X_scaled)
    kmeans_score = evaluate_model(X_scaled, kmeans_labels)
    print(f'For {target} kmeans achived {kmeans_score}')

    # Choose the best model based on silhouette score (ties go to the first
    # entry in this order).
    scores = {'AutoencoderKMeans': autoencoder_score, 'DBSCAN': dbscan_score, 'KMeans': kmeans_score}
    labels_by_model = {
        'AutoencoderKMeans': autoencoder_labels,
        'DBSCAN': dbscan_labels,
        'KMeans': kmeans_labels,
    }
    best_model_name = max(scores, key=scores.get)
    best_score = scores[best_model_name]
    best_labels = labels_by_model[best_model_name]

    # Save results if valid clustering
    if best_score > 0:
        data['Assigned Cluster'] = best_labels
        # 'scores' keeps all three silhouette scores so the models can be
        # compared later, not just the winner.
        results[target] = {
            'model': best_model_name,
            'score': best_score,
            'labels': best_labels,
            'scores': scores,
        }

        # Save assigned clusters to a new CSV file
        data.to_csv(f'clusters_{target}.csv', index=False)

        plot_clusters_interactive(data, best_labels, target)
    else:
        print(f"Clustering failed for {target} (Silhouette Score = {best_score}).")


## Why these models?

The clustering models applied to the dataset were chosen because each of them can identify inherent groupings within complex health and demographic data in a different way:
1. **KMeans** is highly effective for partitioning data based on numerical attributes like population, health expenditure, and lifestyle factors, which are continuous and varied across countries. 
2. **DBSCAN**, being density-based, efficiently handled noise and outliers in regions with sparse data points, making it suitable for unevenly distributed health data. 
3. **Autoencoders**, as unsupervised neural networks, learned a non-linear encoding of the data and highlighted latent structures in the dataset, which is useful for analyzing non-linear relationships; KMeans was then run on the encoded features to obtain the clusters.
<br><br><br>
Together, these models provided a comprehensive understanding of how countries cluster based on health indicators and socio-economic factors, enhancing the accuracy of subsequent analyses and predictions.

## Results

In [4]:
# Report the winning model and its silhouette score for every stored indicator.
for target in results:
    result = results[target]
    print(f"Best model for {target}: {result['model']} with silhouette score: {result['score']}")

Best model for Number of new HIV infections: Autoencoders with silhouette score: 0.8572
Best model for Tobacco use%: DBSCAN with silhouette score: 0.837
Best model for Alcohol consumption: DBSCAN with silhouette score: 0.7947
Best model for Population: Autoencoders with silhouette score: 0.7818
Best model for Prevalence of hypertension%: Autoencoders with silhouette score: 0.8084


In [10]:
# Write the score table in the layout the plotting cell reads back: a "Feature"
# column followed by one score column per model, one row per indicator.
score_rows = []
for indicator, outcome in results.items():
    # Use the full per-model comparison when a result carries a 'scores'
    # mapping; otherwise only the winning model's score is known.
    model_scores = outcome.get('scores', {outcome['model']: outcome['score']})
    score_rows.append({'Feature': indicator, **model_scores})

# Model columns in first-seen order; "Feature" always comes first, even when
# there are no rows.
model_columns = list(dict.fromkeys(
    name for row in score_rows for name in row if name != 'Feature'
))

scores_path = Path('clusteringModelscomparison/clustering_model_scores.csv')
# to_csv does not create missing folders, so make sure the target one exists.
scores_path.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame(score_rows, columns=['Feature', *model_columns]).to_csv(scores_path, index=False)

'clustering_model_scores.csv'

In [15]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Read the model-score table from disk.
file_path = "clusteringModelscomparison/clustering_model_scores.csv"
data = pd.read_csv(file_path)

# Every column after the first is treated as one model's scores
# (assumes "Feature" is the first column - TODO confirm).
models = data.columns[1:]
features = data["Feature"].tolist()
# Rows follow `features`, columns follow `models`.
scores = data.loc[:, models].to_numpy()

# One standalone bar chart per feature, written to disk as a PNG.
for i, feature in enumerate(features):
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(models, scores[i], color=["#FF6F61", "#6B5B95", "#88B04B"])
    ax.set_title(f"Model Scores for {feature}")
    ax.set_ylabel("Score")
    ax.set_ylim(0, 1.2)
    # '%' and spaces are replaced in the feature name used for the file name.
    fig.savefig(f"clusteringModelscomparison/barplot_{feature.replace('%', 'pct').replace(' ', '_')}.png")
    # Close each figure so they do not pile up across iterations.
    plt.close(fig)




# Grouped bar chart: one group per feature, one bar per model.
x = np.arange(len(features))
width = 0.2
fig, ax = plt.subplots(figsize=(12, 8))
for i, model in enumerate(models):
    # Shift each model's bars by one bar width so they sit side by side.
    ax.bar(x + i * width, scores[:, i], width, label=model)
ax.set_title("Model Scores for Each Feature")
# Put each tick under the middle of its group for any number of models
# (equals x + width for three models).
ax.set_xticks(x + width * (len(models) - 1) / 2)
ax.set_xticklabels(features, rotation=45, ha="right")
ax.set_ylabel("Score")
ax.set_ylim(0, 1.2)
ax.legend(title="Models")
fig.savefig("clusteringModelscomparison/grouped_barplot.png")
plt.close(fig)




# Line chart: one line per model, showing its score for every feature.
fig, ax = plt.subplots(figsize=(10, 6))
for i, model in enumerate(models):
    ax.plot(features, scores[:, i], marker="o", label=model)

ax.set_title("Trend of Model Performance Across Features")
# Applied through pyplot to the current axes, which is `ax`.
plt.xticks(rotation=45)
ax.set_ylabel("Score")
ax.set_ylim(0, 1.2)
ax.legend(title="Models")
ax.grid(axis="y", linestyle="--", alpha=0.7)
fig.savefig("clusteringModelscomparison/lineplot_trends.png")
plt.close(fig)
