In [1]:
pip install pycountry

Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycountry
Successfully installed pycountry-24.6.1


In [2]:
pip install plotly



In [3]:
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import pycountry
from scipy.stats import spearmanr


# Load the dataset
dataset = pd.read_csv('2022.csv', decimal=',')


# We assume that 'dataset' is already loaded and clean.
# We also assume that 'Happiness score' and 'RANK' are in 'dataset'.



# Define a function that performs the clustering process and displays the map
def cluster_and_plot(data, feature_x, feature_y):
    # Preprocessing: remove NaNs and filter the necessary columns
    data = data.dropna(subset=[feature_x, feature_y])
    X = data[[feature_x, feature_y]].values

    # Train the K-Means model with the optimal number of clusters already determined
    kmeans = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=42)
    clusters = kmeans.fit_predict(X)
    data = data.copy()  # Make an explicit copy to suppress SettingWithCopyWarning
    data.loc[:, 'Cluster'] = clusters


    # Apply the function to the 'Country' column to create a new column 'iso_alpha'
    # Ignore countries with misspelled or unrecognized names
    def safe_get_iso_alpha_3(country):
        try:
            return pycountry.countries.lookup(country).alpha_3
        except LookupError:
            return None

    data['iso_alpha'] = data['Country'].apply(safe_get_iso_alpha_3)

    # Remove rows where 'iso_alpha' is None
    data = data.dropna(subset=['iso_alpha'])

    # Create the map with Plotly using the new 'iso_alpha' column
    fig = px.choropleth(data,
                        locations='iso_alpha',
                        color='Cluster',
                        hover_name='Country',
                        color_continuous_scale=px.colors.qualitative.Set1,
                        projection='natural earth')
    fig.update_layout(title=f'Clustering of Countries by {feature_x} and {feature_y}',
                      geo=dict(showframe=False, showcoastlines=False))
    fig.show()

# Call the function with different combinations of features
cluster_and_plot(dataset, 'Explained by: GDP per capita', 'Explained by: Social support')
cluster_and_plot(dataset, 'Explained by: Healthy life expectancy', 'Explained by: Freedom to make life choices')
cluster_and_plot(dataset, 'Explained by: Generosity', 'Explained by: Perceptions of corruption')

# Function to perform clustering and evaluate the correlation with the happiness ranking
def analyze_clusters(data, feature_x, feature_y):
    data = data.dropna(subset=[feature_x, feature_y])
    X = data[[feature_x, feature_y]].values
    # Clustering
    kmeans = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=42)
    clusters = kmeans.fit_predict(X)
    data = data.copy()  # Make an explicit copy to suppress SettingWithCopyWarning
    data.loc[:, 'Cluster'] = clusters

    # Spearman correlation between the happiness ranking and cluster labels
    correlation, p_value = spearmanr(data['RANK'], data['Cluster'])

    print(f"Spearman correlation for {feature_x} and {feature_y}: {correlation} (p-value: {p_value})")

# Example of how to call the function
analyze_clusters(dataset, 'Explained by: GDP per capita', 'Explained by: Social support')
analyze_clusters(dataset, 'Explained by: Healthy life expectancy', 'Explained by: Freedom to make life choices')
analyze_clusters(dataset, 'Explained by: Generosity', 'Explained by: Perceptions of corruption')

Spearman correlation for Explained by: GDP per capita and Explained by: Social support: -0.25247296879210923 (p-value: 0.0021093707678739547)
Spearman correlation for Explained by: Healthy life expectancy and Explained by: Freedom to make life choices: 0.44344826188021386 (p-value: 2.0750974820595874e-08)
Spearman correlation for Explained by: Generosity and Explained by: Perceptions of corruption: 0.07400801417008816 (p-value: 0.37466234910799623)
