# **Analyzing Mortality Rates in Africa through EDA and Machine Learning**

# Introduction

Africa continues to face significant public health challenges, with mortality rates influenced by factors such as disease burden, healthcare access, and economic disparities. Understanding these patterns is essential for driving targeted interventions and effective policymaking.

In this analysis, I will explore mortality rates across African countries using data on causes of death, age group distributions, population statistics, economic indicators, and access to healthcare. The goal is to uncover meaningful patterns and relationships that help explain variations in mortality rates across the continent.

Additionally, I will apply machine learning techniques to cluster countries based on their mortality profiles, with the aim of identifying high-risk nations. This can inform strategic health interventions and guide policy recommendations to improve health outcomes and reduce preventable deaths in the region. 

## Importing libraries and Loading Datasets

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd #for data manipulation
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px #for interactive visualization
from sklearn.cluster import KMeans




# Define file paths for datasets (CSV or Excel).
# NOTE(review): these are absolute paths on a single Windows machine; adjust
# them before running the notebook anywhere else.
files = {
    'MortalityCause': r"C:\Users\CSC 2020\Downloads\1. annual-number-of-deaths-by-cause.csv",
    'MortalitybyAge': r"C:\Users\CSC 2020\Downloads\2. number-of-deaths-by-age-group.csv",
    'Country': r"C:\Users\CSC 2020\Downloads\4. ISO 3166_country-and-continent-codes-list-csv.csv",
    'Worldpop': r"C:\Users\CSC 2020\Downloads\5. World Population.csv",
    'Medicals': r"C:\Users\CSC 2020\Downloads\3. Medical Doctors Per 10000 population.xlsx",
    'GDP': r"C:\Users\CSC 2020\Downloads\6. Current health expenditure (% of GDP).xlsx"
}

# Map each supported file extension to the pandas reader that parses it
readers = {
    '.csv': pd.read_csv,
    '.xlsx': pd.read_excel,
}

# Dictionary of loaded dataframes, keyed by the short dataset names above
df = {}

# Load every file with the reader matching its extension. An unsupported
# extension raises immediately instead of being skipped silently; a skipped
# file would otherwise only surface later as a KeyError on df[name].
for name, path in files.items():
    for extension, reader in readers.items():
        if path.endswith(extension):
            df[name] = reader(path)
            break
    else:
        raise ValueError(f"Unsupported file type for {name}: {path}")

# Print the shape of each dataframe to check that it loaded correctly
for name, data in df.items():
    print(f"Loaded {name} with shape: {data.shape}")

        

# Data Cleaning

### Mortality by Cause

In [None]:
# Cause-of-death table taken from the loaded datasets; show its first five rows
df1 = df['MortalityCause']
df1.head(n=5)

In [None]:
# Columns at positions 3..36 are unpivoted into rows below
columns_to_transpose = df1.columns[3:37]

# Wide -> long: one row per (Entity, Code, Year, Cause)
melted_df1 = df1.melt(
    id_vars=['Entity', 'Code', 'Year'],
    value_vars=columns_to_transpose,
    var_name='Cause',
    value_name='Value',
)

# Count of fully duplicated rows (not displayed: this is not the cell's last expression)
melted_df1.duplicated().sum()

# Column dtypes and non-null counts before any conversion
melted_df1.info()

# Force 'Value' to a numeric dtype; anything unparseable becomes NaN
melted_df1['Value'] = pd.to_numeric(melted_df1['Value'], errors='coerce')

# Null counts per column after the conversion (also not displayed)
melted_df1.isnull().sum()

# Confirm the dtype of 'Value' after the conversion
print(melted_df1['Value'].dtype)

# Drop rows without a usable value, then name the entity column 'Country'
# for consistency with the other tables
melted_df1 = melted_df1.dropna(subset=['Value']).rename(columns={'Entity': 'Country'})

# Rows whose country code is missing
null_code_rows = melted_df1[melted_df1['Code'].isna()]

Mortality_cause = melted_df1

In [None]:
Mortality_cause.head(n=5)  # first five rows of the cleaned cause-of-death table

### Mortality by Age Group

In [None]:
# Deaths-by-age-group table taken from the loaded datasets; show its first five rows
df2 = df['MortalitybyAge']
df2.head(n=5)

In [None]:
# Columns at positions 3..7 are folded into a single 'Age' column
columns_to_transpose = df2.columns[3:8]

# Wide -> long, then name the entity column 'Country' for uniformity
melted_df2 = df2.melt(
    id_vars=['Entity', 'Code', 'Year'],
    value_vars=columns_to_transpose,
    var_name='Age',
    value_name='Value',
).rename(columns={'Entity': 'Country'})

# Count of fully duplicated rows (not displayed: not the cell's last expression)
melted_df2.duplicated().sum()

# Null counts per column (not displayed either)
melted_df2.isnull().sum()

# Rows whose country code is missing
null_code_rows = melted_df2[melted_df2['Code'].isna()]

# Number of such rows per entity, to see which entities lack a code
grouped_null_code = null_code_rows.groupby('Country').size().reset_index(name='Value')
grouped_null_code.head()  # the author noted that no African country appears among these

# Drop every row that has a null in any column
Mortality_age_group = melted_df2.dropna()
Mortality_age_group.info()

In [None]:
Mortality_age_group.head(n=5)  # first five rows of the cleaned age-group table

### Medical Doctors per 10000 

In [None]:
# Medical-doctors table taken from the loaded datasets; show its first five rows
df3 = df['Medicals']
df3.head(n=5)

In [None]:
# The real header is stored in the row at position 1: promote it to the column names
df3.columns = df3.iloc[1]

# Discard the two leading rows (labels 0 and 1) and renumber from zero
df3 = df3.drop(index=[0, 1]).reset_index(drop=True)

# Inspect the column dtypes
df3.info()

# Convert the measurement and period columns to numeric; unparseable entries become NaN
numeric_columns = ['FactValueNumeric', 'Value', 'Period']
df3[numeric_columns] = df3[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Name the location column 'Country' for uniformity with the other tables
Medical_doctors = df3.rename(columns={'Location': 'Country'})

# Count of fully duplicated rows (not displayed: not the cell's last expression)
Medical_doctors.duplicated().sum()

Medical_doctors.head()

### Country Code

In [None]:
# Country/continent code table taken from the loaded datasets; show its first five rows
df4 = df['Country']
df4.head(n=5)

In [None]:
# Drop the redundant columns at positions 3 and 5
columns_to_drop = df4.columns[[3, 5]]
df4 = df4.drop(columns=columns_to_drop)

# Rename columns for uniformity with the other tables.
# rename(..., inplace=True) returns None, so its result must never be assigned;
# both renames below return a new frame instead.
df4 = df4.rename(columns={'Three_Letter_Country_Code': 'Code'})
renamed_column = df4.rename(columns={'Country_Name': 'Country'})

# Keep only the text before the first comma of each country name
renamed_column['Country'] = renamed_column['Country'].str.split(',').str[0]
renamed_column.head()

# Keep only the rows whose continent is Africa; .copy() makes the result an
# independent frame rather than a slice of renamed_column
df4_africa = renamed_column[renamed_column['Continent_Name'] == 'Africa'].copy()

df4_africa.head()

### World Population

In [None]:
# World-population table taken from the loaded datasets; show its first five rows
df5 = df['Worldpop']
df5.head(n=5)

In [None]:
# Keep the years 1990 through 2019 (both ends included)
year_in_range = df5['Year'].between(1990, 2019)

# Apply the filter and name the entity column 'Country' for uniformity
World_Population = df5[year_in_range].rename(columns={'Entity': 'Country'})

World_Population.info()

# Count of fully duplicated rows (not displayed: not the cell's last expression)
World_Population.duplicated().sum()

World_Population.head()

### Percentage of GDP

In [None]:
# Health-expenditure (% of GDP) table taken from the loaded datasets; show its first five rows
df6 = df['GDP']
df6.head(n=5)

# Analysis and Visualisation

## Visualisation of Mortality Trend and Correlation Analysis

In [None]:
# The real header is stored in the row at position 3: promote it to the column names
df6.columns = df6.iloc[3]

# Remove the four leading rows (labels 0-3) and renumber from zero
df6 = df6.drop([0, 1, 2, 3]).reset_index(drop=True)

# Reshape from wide to long: the columns at positions 3..24 become
# (Years, Count) pairs, one row per country and year
Percent_GDP = pd.melt(
    df6,
    id_vars=['Country Name', 'Country Code', 'Indicator Name'],
    value_vars=df6.columns[3:25],
    var_name='Years',
    value_name='Count',
)

# Rename columns for consistency with the other tables (single pass)
Percent_GDP = Percent_GDP.rename(columns={
    'Country Name': 'Country',
    'Country Code': 'Code',
    'Years': 'Year',
})
Percent_GDP['Year'] = Percent_GDP['Year'].astype(int)

# The value columns shared rows with header text before the cleanup above, so
# they are likely not numeric yet. 'Count' is summed, correlated and regressed
# later, so convert it explicitly (unparseable entries become NaN), as is done
# for the medical-doctors table.
Percent_GDP['Count'] = pd.to_numeric(Percent_GDP['Count'], errors='coerce')

# Inspect the final dtypes
Percent_GDP.info()

# Count of fully duplicated rows (not displayed: not the cell's last expression)
Percent_GDP.duplicated().sum()

Percent_GDP.head()

### Trend of Mortality over the Years

In [None]:
# Keep only the cause-of-death rows whose country code belongs to an African country
filtered_mortality = Mortality_cause[Mortality_cause['Code'].isin(df4_africa['Code'])]

# Sum the deaths over all causes and countries for each year
total_mortality_africa = filtered_mortality.groupby('Year')['Value'].sum().reset_index()

# Visualise the mortality trend over time. The y-axis is a summed death count,
# not a rate, so it is labelled accordingly.
mortality_trend = px.line(
    total_mortality_africa,
    x='Year',
    y='Value',
    title='Total Mortality in African countries over the years',
    labels={'Value': 'Total deaths'},
)

mortality_trend.show()

The annual mortality trend shows a sharp increase in deaths in 1994, followed by a steady rise between 2003 and 2005, peaking at 9.3 million deaths.

### Population vs Mortality

In [None]:
# Filter World_Population to include only African countries
filtered_population = World_Population[World_Population['Code'].isin(df4_africa['Code'])]

# Attach the population of each (Code, Year) to every cause-of-death row
Population_mortality = pd.merge(filtered_mortality, filtered_population, on=['Code', 'Year'])

# Aggregate to one row per country and year.
# Deaths are summed across causes. After the merge the population is repeated
# on every cause row of a country-year, so summing it would multiply it by the
# number of causes; take the first value instead.
# NOTE(review): assumes one population row per (Code, Year) - confirm.
aggregated_data = Population_mortality.groupby(['Year', 'Country_x']).agg({
    'Value': 'sum',  # total deaths for the country and year
    'Population (historical estimates)': 'first',  # population, not multiplied per cause
}).reset_index()

aggregated_data.head()

In [None]:
# Scatter of deaths against population, one colour per country, with a single
# OLS trendline fitted over all points
population_column = 'Population (historical estimates)'
population_vs_mortality = px.scatter(
    aggregated_data,
    x=population_column,
    y='Value',
    color='Country_x',
    hover_data=['Year'],
    trendline="ols",
    trendline_scope="overall",
    labels={population_column: 'Population', 'Value': 'Mortality'},
    title='Population vs Mortality',
)

population_vs_mortality.show()

The visualization above reveals a strong correlation between mortality and population across all African countries, with an R² value of 0.87. Additionally, we observe an outlier in **Rwanda**, which aligns with the sharp increase in mortality observed in 1994.


### Visualizing Outliers and Hidden Trends

The goal of this analysis is to identify countries with a negative correlation between population and mortality, highlighting the potential influence of other factors on mortality rates.

In [None]:
def _population_mortality_corr(group):
    """Correlate deaths with population for one country; None if either series has no spread."""
    deaths = group['Value']
    population = group['Population (historical estimates)']
    if deaths.std() > 0 and population.std() > 0:
        return deaths.corr(population)
    return None


def _correlation_group(coefficient):
    """Label a coefficient by sign; anything neither below nor above zero is 'No Correlation'."""
    if coefficient < 0:
        return 'Negative Correlation'
    if coefficient > 0:
        return 'Positive Correlation'
    return 'No Correlation'


# One correlation coefficient per country (the grouping column is excluded from each group)
country_correlations = (
    aggregated_data
    .groupby('Country_x')
    .apply(_population_mortality_corr, include_groups=False)
    .reset_index(name='Corr_coef')
)

# Categorise each country by the sign of its coefficient
country_correlations['Correlation_Group'] = country_correlations['Corr_coef'].map(_correlation_group)

# Attach the coefficient and its category to every country-year row
merged_data = aggregated_data.merge(
    country_correlations[['Country_x', 'Corr_coef', 'Correlation_Group']],
    on='Country_x',
    how='left',
)

# Keep only the columns needed for plotting, in a fixed order
plot_columns = [
    'Year',
    'Country_x',
    'Value',
    'Population (historical estimates)',
    'Corr_coef',
    'Correlation_Group',
]
merged_data = merged_data[plot_columns]

merged_data.head()

In [None]:
# Same scatter as before, coloured by the sign of each country's correlation
group_colours = {
    'Negative Correlation': 'red',
    'Positive Correlation': 'blue',
    'No Correlation': 'grey',
}
population_vs_mortality = px.scatter(
    merged_data,
    x='Population (historical estimates)',
    y='Value',
    color='Correlation_Group',
    color_discrete_map=group_colours,
    hover_data=['Year', 'Country_x', 'Corr_coef'],  # coefficient shown in the tooltip
    title='Population vs Mortality with Correlation Group',
)

population_vs_mortality.show()

From the visualization, we can observe countries that show a negative correlation between population and mortality, such as Ethiopia, which has a correlation coefficient of -0.9. Despite its high population, Ethiopia exhibits a low mortality rate. Other countries with similar patterns include Uganda, Tanzania, Rwanda, and Sudan. This observation highlights the importance of distinguishing correlation from causation—while population size may be correlated with mortality, other factors, such as healthcare access, infrastructure, and socioeconomic conditions, may be influencing mortality rates independently of population size.

## GDP Vs Mortality

In [None]:
# Filter the health-expenditure (% of GDP) data to include only African countries
filtered_GDP = Percent_GDP[Percent_GDP['Code'].isin(df4_africa['Code'])]

# Attach the expenditure figure of each (Code, Year) to every cause-of-death row
GDP_mortality = pd.merge(filtered_mortality, filtered_GDP, on=['Code', 'Year'])

# Make sure the expenditure figure is numeric before aggregating
# (unparseable entries become NaN)
GDP_mortality['Count'] = pd.to_numeric(GDP_mortality['Count'], errors='coerce')

# Aggregate to one row per country and year.
# Deaths are summed across causes. After the merge the expenditure percentage
# is repeated on every cause row, so summing it would multiply it by the number
# of causes; take the first non-null value instead. A country-year without
# data stays NaN rather than becoming 0.
aggregated_data = GDP_mortality.groupby(['Year', 'Country_x']).agg({
    'Value': 'sum',  # total deaths for the country and year
    'Count': 'first',  # expenditure (% of GDP), not multiplied per cause
}).reset_index()

aggregated_data.head()  # preview of the aggregated data

In [None]:
# Scatter of deaths against the 'Count' column (loaded from the
# "Current health expenditure (% of GDP)" file), one colour per country,
# with a single OLS trendline fitted over all points
axis_labels = {'Count': 'GDP', 'Value': 'Mortality'}
GDP_vs_mortality = px.scatter(
    aggregated_data,
    x='Count',
    y='Value',
    color='Country_x',
    hover_data=['Year'],
    trendline="ols",
    trendline_scope="overall",
    labels=axis_labels,
    title='GDP vs Mortality',
)

GDP_vs_mortality.show()

From the above visual, we can observe that there is generally no correlation between GDP and mortality.

### Visualising Outliers

In [None]:
def _gdp_mortality_corr(group):
    """Correlate deaths with 'Count' for one country; None if either series has no spread."""
    deaths = group['Value']
    expenditure = group['Count']
    if deaths.std() > 0 and expenditure.std() > 0:
        return deaths.corr(expenditure)
    return None


def _gdp_correlation_group(coefficient):
    """Label a coefficient by sign; anything neither below nor above zero is 'No Correlation'."""
    if coefficient < 0:
        return 'Negative Correlation'
    if coefficient > 0:
        return 'Positive Correlation'
    return 'No Correlation'


# One correlation coefficient per country (the grouping column is excluded from each group)
country_correlations = (
    aggregated_data
    .groupby('Country_x')
    .apply(_gdp_mortality_corr, include_groups=False)
    .reset_index(name='Correlation')
)

# Categorise each country by the sign of its coefficient, stored directly
# under the column name used by the merge and the plot below
country_correlations['Correlation_Group_from_correlations'] = (
    country_correlations['Correlation'].map(_gdp_correlation_group)
)

# Attach the coefficient and its category to every country-year row
correlation_columns = ['Country_x', 'Correlation', 'Correlation_Group_from_correlations']
aggregated_data = aggregated_data.merge(
    country_correlations[correlation_columns],
    on='Country_x',
    how='left',
    suffixes=('', '_from_correlations'),
)

# Plot 'Count' against deaths, coloured by correlation group, with the coefficient in the tooltip
GDP_vs_mortality = px.scatter(
    aggregated_data,
    x='Count',
    y='Value',
    color='Correlation_Group_from_correlations',
    color_discrete_map={'Negative Correlation': 'red', 'Positive Correlation': 'blue', 'No Correlation': 'grey'},
    hover_data=['Year', 'Country_x', 'Correlation'],
    labels={'Count': 'GDP', 'Value': 'Mortality'},
    title='GDP vs Mortality with Correlation Group',
)

GDP_vs_mortality.show()


From the visual, a country like South Africa shows a strong negative correlation between GDP and mortality, with a correlation coefficient of -0.8.

## Joining All Dataframes

In [None]:
# Keep only the rows of the doctors table whose 'Indicator' is
# 'Medical doctors (per 10,000)', then align its key column names with the other tables
is_doctor_density = Medical_doctors['Indicator'] == 'Medical doctors (per 10,000)'
filtered_medical_doctors = Medical_doctors[is_doctor_density].rename(
    columns={'ThreeLocCode': 'Code', 'Period': 'Year'}
)

# Start from the African countries joined to the doctors table (there is no
# 'Year' column on the country side yet, so the key is Code + Country)
df_merged = pd.merge(df4_africa, filtered_medical_doctors,
                     on=['Code', 'Country'], how='inner')

# Inner-join the remaining tables one after another on Code + Year + Country.
# NOTE(review): 'Country' is part of every key, so rows are presumably dropped
# wherever a country is spelled differently between sources - confirm.
# NOTE(review): the cause table and the age-group table are both long-format,
# so joining both on the same key likely repeats each cause row once per age
# group - confirm this is intended.
# Later cells read 'Value_x' as the doctors figure and 'Value_y' as the
# cause-of-death figure.
for table in (Percent_GDP, Mortality_cause, Mortality_age_group, World_Population):
    df_merged = pd.merge(df_merged, table,
                         on=['Code', 'Year', 'Country'], how='inner')

df_merged.head()

## Visualising Correlation Matrix 

In [None]:
# Source column -> display name for the variables in the correlation matrix
display_names = {
    'FactValueNumeric': 'Healthcare access',
    'Count': 'Percent_GDP',
    'Population (historical estimates)': 'Population',
    'Value_y': 'mortality count',
}

# Select those columns (in that order) and give them their display names
df_selected = df_merged[list(display_names)].rename(columns=display_names)

# Pairwise correlations between the selected variables
correlation_matrix = df_selected.corr()

# Annotated heatmap of the matrix
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix of Selected Variables')
plt.show()

This matrix serves as a tool to identify key relationships that warrant further exploration using models or visualizations. For instance, the relationship between Population and Mortality (with a correlation coefficient of 0.46) may be a suitable candidate for K-means clustering, as it suggests a moderate positive association. This means that countries with larger populations tend to have higher mortality counts. Squaring the correlation coefficient (0.46² ≈ 0.21) indicates that roughly 21% of the variance in mortality is associated with population size. This suggests that population is a significant, but not sole, factor influencing mortality. The remaining variance may be driven by other factors such as healthcare access, GDP, or socioeconomic conditions, which could warrant further analysis through more complex models or visualizations.

# Machine Learning Algorithm (k-Means)

K-means clustering is a powerful method for grouping data based on similarities. In this analysis, we used K-means clustering to categorize countries based on three different pairs of variables:

* Population vs Mortality

* GDP vs Mortality

* Health Access vs Mortality

I applied the Elbow Method for each pair of variables to determine the optimal number of clusters. This method involves plotting the Within-Cluster Sum of Squares (WCSS) against the number of clusters, $k$, and observing where the curve shows a sharp "elbow."

### 1. Clustering African Countries by Population and Mortality Rates

#### Determining Optimum number of Clusters (Elbow Method)

In [None]:
# Feature matrix: population and the cause-of-death figure (renamed to 'Mortality')
# NOTE(review): the two features are clustered on their raw scales - confirm
# that no standardisation is intended.
X = df_merged[['Population (historical estimates)', 'Value_y']].rename(
    columns={'Value_y': 'Mortality'}
)

# Fit K-means for k = 1..10 and record each model's inertia (the WCSS)
cluster_counts = range(1, 11)
wcss = []
for k in cluster_counts:
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=42).fit(X)
    wcss.append(kmeans.inertia_)

# Plot WCSS against k to look for the elbow
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(cluster_counts, wcss, marker='o', linestyle='--', color='b')
ax.set_title('Elbow Method For Optimal k')
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('WCSS (Within-Cluster Sum of Squares)')
plt.show()


In [None]:
# Locate the elbow of the convex, decreasing WCSS curve with KneeLocator
from kneed import KneeLocator

kl = KneeLocator(
    range(1, 11),
    wcss,
    curve="convex",
    direction="decreasing",
)
print(f'The optimum number of clusters is: {kl.elbow}')

In [None]:
# Final model for this feature pair: three clusters, fixed seed
kmeans = KMeans(
    n_clusters=3,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=42,
).fit(X)

# Store each row's cluster label on the merged dataframe
df_merged['Cluster'] = kmeans.labels_

# Show where the three cluster centres lie
print("Cluster Centers:")
print(kmeans.cluster_centers_)


In [None]:
# Mean of each clustering feature within every cluster
# ('Value_y' is the mortality count)
summary_features = ['Population (historical estimates)', 'Value_y']
cluster_summary = df_merged.groupby('Cluster')[summary_features].agg('mean')

cluster_summary

In [None]:
# Descriptive name for each cluster label
# NOTE(review): presumably chosen by reading the cluster summary of one run -
# re-check if the data or the model changes.
cluster_labels = {
    0: "Low Population, Low Mortality",
    1: "High Population, High Mortality",
    2: "Low Population, Medium Mortality"
}

# Independent copy of the country/cluster pairs with the description attached
df_sunburst = df_merged[['Country', 'Cluster']].copy()
df_sunburst['Cluster_Description'] = df_sunburst['Cluster'].map(cluster_labels)

# Sunburst chart: cluster description on the inner ring, countries on the outer ring
fig = px.sunburst(
    df_sunburst,
    path=['Cluster_Description', 'Country'],
    title='Country Clusters by Population and Mortality',
    color='Cluster',
    color_continuous_scale='Viridis',
)

fig.show()


### 2. Clustering African Countries by Economic Strength and Mortality Rates

#### Determining Optimum number of Clusters (Elbow Method)

In [None]:
# Feature matrix: the 'Count' column (health expenditure, % of GDP) and the
# mortality figure, with missing values replaced by 0
# NOTE(review): the features are clustered on their raw scales, and a missing
# value becomes a literal 0 - confirm both are intended.
X = df_merged[['Count', 'Value_y']].fillna(0)

# Fit K-means for k = 1..10 and record each model's inertia (the WCSS)
cluster_counts = range(1, 11)
wcss = []
for k in cluster_counts:
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=42).fit(X)
    wcss.append(kmeans.inertia_)

# Plot WCSS against k to look for the elbow
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(cluster_counts, wcss, marker='o', linestyle='--', color='b')
ax.set_title('Elbow Method For Optimal k')
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('WCSS (Within-Cluster Sum of Squares)')
plt.show()


In [None]:
# Locate the elbow of the WCSS curve computed for the GDP vs mortality features
k2 = KneeLocator(range(1, 11),
                 wcss,
                 curve="convex",
                 direction="decreasing")
# Report this locator's elbow (k2), not the one from the population clustering (kl)
print('The optimum number of clusters is: ' + str(k2.elbow))

In [None]:
# Final model for this feature pair: three clusters, fixed seed
kmeans = KMeans(
    n_clusters=3,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=42,
).fit(X)

# Store each row's cluster label on the merged dataframe (replaces any earlier labels)
df_merged['Cluster'] = kmeans.labels_

# Show where the three cluster centres lie
print("Cluster Centers:")
print(kmeans.cluster_centers_)

In [None]:
# Mean of each clustering feature within every cluster
# ('Value_y' is the mortality figure)
summary_features = ['Count', 'Value_y']
cluster_summary = df_merged.groupby('Cluster')[summary_features].agg('mean')

cluster_summary

In [None]:
# Descriptive name for each cluster label
# NOTE(review): presumably chosen by reading the cluster summary of one run -
# re-check if the data or the model changes.
cluster_labels = {
    0: " High GDP, Low Mortality",
    1:  "Medium GDP, Medium Mortality",
    2: "Medium GDP, Very High Mortality"
}

# Independent copy of the country/cluster pairs with the description attached
df_sunburst = df_merged[['Country', 'Cluster']].copy()
df_sunburst['Cluster_Description'] = df_sunburst['Cluster'].map(cluster_labels)

# Sunburst chart: cluster description on the inner ring, countries on the outer ring
import plotly.express as px
fig = px.sunburst(
    df_sunburst,
    path=['Cluster_Description', 'Country'],
    title='Country Clusters by GDP and Mortality',
    color='Cluster',
    color_continuous_scale='Viridis',
)

fig.show()

### 3. Clustering African Countries by Healthcare Access and Mortality Rates

#### Determining Optimum number of Clusters (Elbow Method)

In [None]:
# Feature matrix: 'Value_x' (healthcare access in each country) and
# 'Value_y' (mortality count)
# NOTE(review): the two features are clustered on their raw scales - confirm
# that no standardisation is intended.
feature_columns = ['Value_x', 'Value_y']
X = df_merged[feature_columns].copy()

# Fit K-means for k = 1..10 and record each model's inertia (the WCSS)
cluster_counts = range(1, 11)
wcss = []
for k in cluster_counts:
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=42).fit(X)
    wcss.append(kmeans.inertia_)

# Plot WCSS against k to look for the elbow
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(cluster_counts, wcss, marker='o', linestyle='--', color='b')
ax.set_title('Elbow Method For Optimal k')
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('WCSS (Within-Cluster Sum of Squares)')
plt.show()


In [None]:
# Locate the elbow of the WCSS curve computed for the healthcare access vs mortality features
k3 = KneeLocator(range(1, 11),
                 wcss,
                 curve="convex",
                 direction="decreasing")
# Report this locator's elbow (k3), not the one from the population clustering (kl)
print('The optimum number of clusters is: ' + str(k3.elbow))

In [None]:
# Final model for this feature pair: three clusters, fixed seed
kmeans = KMeans(
    n_clusters=3,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=42,
).fit(X)

# Store each row's cluster label on the merged dataframe (replaces any earlier labels)
df_merged['Cluster'] = kmeans.labels_

# Show where the three cluster centres lie
print("Cluster Centers:")
print(kmeans.cluster_centers_)

In [None]:
# Mean of each clustering feature within every cluster
# ('Value_x' is healthcare access, 'Value_y' the mortality count)
summary_features = ['Value_x', 'Value_y']
cluster_summary = df_merged.groupby('Cluster')[summary_features].agg('mean')

cluster_summary

In [None]:
# Descriptive name for each cluster label
# NOTE(review): presumably chosen by reading the cluster summary of one run -
# re-check if the data or the model changes.
cluster_labels = {
    0: " Medium access, Low Mortality",
    1:  "Low access, Medium Mortality",
    2: "Medium Access, Very High Mortality"
}

# Independent copy of the country/cluster pairs with the description attached
df_sunburst = df_merged[['Country', 'Cluster']].copy()
df_sunburst['Cluster_Description'] = df_sunburst['Cluster'].map(cluster_labels)

# Sunburst chart: cluster description on the inner ring, countries on the outer ring
import plotly.express as px
fig = px.sunburst(
    df_sunburst,
    path=['Cluster_Description', 'Country'],
    title='Country Clusters by Medical Access and Mortality',
    color='Cluster',
    color_continuous_scale='Viridis',
)

fig.show()