In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from keras.layers import Input, Dense
from keras.models import Model
import plotly.express as px
import plotly.io as pio

pio.renderers.default = "browser"

In [64]:
dff = pd.read_csv('data.csv')
df = dff[dff['year']==2024].copy()
df['population'] = df['population'].astype(int)

In [65]:
def evaluate_model(X, labels):
    if len(set(labels)) > 1:  
        score = silhouette_score(X, labels)
    else:
        score = -1  
    return score

In [66]:
def plot_clusters_interactive(data, labels, column_name):
    data['Assigned Cluster'] = labels
    fig = px.scatter(data, x='name', y=column_name, color='Assigned Cluster',
                     title=f"Interactive Clustering of Countries by {column_name}",
                     labels={'name': 'Country', column_name: column_name},
                     template='plotly_white')
    fig.update_layout(showlegend=True)
    fig.show(renderer="browser")  

In [67]:
def autoencoder_clustering(X_scaled):
    input_dim = X_scaled.shape[1]
    input_layer = Input(shape=(input_dim,))
    encoder = Dense(16, activation='relu')(input_layer)
    encoder = Dense(8, activation='relu')(encoder)
    encoder_output = Dense(3, activation='relu')(encoder)
    decoder = Dense(8, activation='relu')(encoder_output)
    decoder = Dense(16, activation='relu')(decoder)
    decoder_output = Dense(input_dim)(decoder)
    autoencoder = Model(inputs=input_layer, outputs=decoder_output)
    autoencoder.compile(optimizer="adam", loss='mse')
    autoencoder.fit(X_scaled, X_scaled, epochs=100, batch_size=32, verbose=0)
    encoded_X = Model(inputs=input_layer, outputs=encoder_output).predict(X_scaled)
    return encoded_X

In [68]:
target_columns = [ 'Number of new HIV infections', 'Tobacco use%', 'Alcohol consumption','population', 'Prevalence of hypertension%']


In [69]:
results = {}


In [70]:
for target in target_columns:
    data = df.dropna(subset=[target]).copy()
    
    if target == 'population':
        X = np.log(df[['population']] + 1)  # due to outliers 
    else:
       X = data[[target]]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Clustering with autoencoder + KMeans
    encoded_X = autoencoder_clustering(X_scaled)
    kmeans = KMeans(n_clusters=3, random_state=42)
    autoencoder_labels = kmeans.fit_predict(encoded_X)
    autoencoder_score = evaluate_model(X_scaled, autoencoder_labels)
    print(f'For {target} autoencoders achived {autoencoder_score}')
    
    # Clustering with DBSCAN
    dbscan = DBSCAN(eps=0.8, min_samples=3)
    dbscan_labels = dbscan.fit_predict(X_scaled)
    dbscan_score = evaluate_model(X_scaled, dbscan_labels)
    print(f'For {target} DBSCAN achived {autoencoder_score}')

    
    # Clustering with KMeans directly on scaled data
    kmeans_labels = KMeans(n_clusters=3, random_state=42).fit_predict(X_scaled)
    kmeans_score = evaluate_model(X_scaled, kmeans_labels)
    print(f'For {target} kmeans achived {autoencoder_score}')
    
    # Choose the best model based on silhouette score
    scores = {'AutoencoderKMeans': autoencoder_score, 'DBSCAN': dbscan_score, 'KMeans': kmeans_score}
    best_model_name = max(scores, key=scores.get)
    best_score = scores[best_model_name]
     # Get the corresponding best model labels
    if best_model_name == 'AutoencoderKMeans':
        best_labels = autoencoder_labels
    elif best_model_name == 'DBSCAN':
        best_labels = dbscan_labels
    elif best_model_name == 'KMeans':
        best_labels = kmeans_labels

    # Save results if valid clustering
    if best_score > 0:
        data['Assigned Cluster'] = best_labels
        results[target] = {'model': best_model_name, 'score': best_score, 'labels': best_labels}
        
        # Save assigned clusters to a new CSV file
        data.to_csv(f'clusters_{target}.csv', index=False)
        
        plot_clusters_interactive(data, best_labels, target)
    else:
        print(f"Clustering failed for {target} (Silhouette Score = {best_score}).")


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step
For Number of new HIV infections autoencoders achived 0.7572727032070447
For Number of new HIV infections DBSCAN achived 0.7572727032070447
For Number of new HIV infections kmeans achived 0.7572727032070447
Opening in existing browser session.
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
For Tobacco use% autoencoders achived 0.49430171844731846
For Tobacco use% DBSCAN achived 0.49430171844731846
For Tobacco use% kmeans achived 0.49430171844731846
Opening in existing browser session.
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
For Alcohol consumption autoencoders achived 0.5642770787064374
For Alcohol consumption DBSCAN achived 0.5642770787064374
For Alcohol consumption kmeans achived 0.5642770787064374
Opening in existing browser session.
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
For population autoencoders achived 0.38184319739709

In [71]:
for target, result in results.items():
    print(f"Best model for {target}: {result['model']} with silhouette score: {result['score']}")


Best model for Number of new HIV infections: KMeans with silhouette score: 0.757864284860595
Best model for Tobacco use%: KMeans with silhouette score: 0.6251692130126828
Best model for Alcohol consumption: KMeans with silhouette score: 0.6003105278626577
Best model for population: KMeans with silhouette score: 0.5626040952314414
Best model for Prevalence of hypertension%: KMeans with silhouette score: 0.6186705491740943
