# COMP5152 World Happiness Report

Mount Google Drive. Import data (Put two .csv files under /content/drive/MyDrive/Colab Notebooks directory)

In [None]:
# Mount Google Drive so files stored there are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the data folder; the relative CSV paths used
# when loading the data depend on it.
%cd '/content/drive/MyDrive/Colab Notebooks'

# Libraries and Utilities

The lines below install the required libraries:

*pycountry* provides the ISO databases for the country name standards

*geopandas* to add support for geographic data to pandas objects

*kneed* is a repository to implement the kneedle algorithm

*pandas-visual-analysis* is a widget for dataset visualization

In [None]:
# Install the third-party packages described above.
# NOTE(review): versions are not pinned, so results may differ between runs.
!pip install pycountry
!pip install geopandas
!pip install kneed
!pip install pandas-visual-analysis

In [None]:
# For data processing
import pandas as pd
import numpy as np

# For data visualization
from pandas_visual_analysis import VisualAnalysis
import matplotlib.pyplot as plt
import matplotlib.colors
import seaborn as sns
import os
import pycountry
import geopandas

# For Exploratory Data Analysis
from scipy.stats import ttest_ind
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

# For Modelling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import tensorflow as tf
from kneed import KneeLocator

In [None]:
# File names are relative to the current working directory.
happiness_path = "world-happiness-report.csv"
happiness_2021_path = "world-happiness-report-2021.csv"

# Multi-year history first, then the 2021 snapshot.
happiness_df = pd.read_csv(happiness_path)
happiness_2021_df = pd.read_csv(happiness_2021_path)

# Data Preprocessing

## Data Validation and Filtering





In [None]:
# For world-happiness-report-2021.csv
# Interactive exploration widget for the 2021 snapshot.
# NOTE(review): the first call below is not the cell's last expression, so its
# widget is presumably never displayed — confirm and consider removing it.
VisualAnalysis(happiness_2021_df)
# Columns passed to the widget as categorical.
categorical = ["Country name", "Regional indicator"]
VisualAnalysis(happiness_2021_df, categorical_columns=categorical)

In [None]:
# For world-happiness-report.csv
# Interactive exploration widget for the multi-year dataset.
# NOTE(review): the first call below is not the cell's last expression, so its
# widget is presumably never displayed — confirm and consider removing it.
VisualAnalysis(happiness_df)
# Columns passed to the widget as categorical; rebinds the `categorical` name.
categorical = ["Country name"]
VisualAnalysis(happiness_df, categorical_columns=categorical)

In [None]:
# Drop the columns that are not used by the rest of the notebook.

# 2021 snapshot: the per-factor "Explained by: ..." decomposition columns.
explained_factors = (
    "Log GDP per capita",
    "Social support",
    "Healthy life expectancy",
    "Freedom to make life choices",
    "Generosity",
    "Perceptions of corruption",
)
step_1_drops = ["Explained by: " + factor for factor in explained_factors]

# 2021 snapshot: error bounds and Dystopia reference columns.
step_2_drops = [
    "Standard error of ladder score",
    "upperwhisker",
    "lowerwhisker",
    "Ladder score in Dystopia",
    "Dystopia + residual",
]
happiness_2021_df.drop(columns=step_1_drops + step_2_drops, inplace=True)

# Multi-year history: the two affect columns.
step_3_drops = ["Positive affect", "Negative affect"]
happiness_df.drop(columns=step_3_drops, inplace=True)

## Data Organizing and Combination

In "world-happiness-report-2021.csv":
1. Rename "Logged GDP per capita" to "Log GDP per capita", "Healthy life expectancy" to "Healthy life expectancy at birth"
2. Add "Year" column with value 2021

In "world-happiness-report.csv":
1. Rename "Life Ladder" to "Ladder score", "year" to "Year"
2. Add "Regional indicator" column

Merge the 2 dataset to obtain dataset year range from 2005-2021, sort by country name and year


In [None]:
# Align the 2021 snapshot with the historical schema and tag it with its year.
happiness_2021_df.rename(
    columns={
        "Logged GDP per capita": "Log GDP per capita",
        "Healthy life expectancy": "Healthy life expectancy at birth",
    },
    inplace=True,
)
happiness_2021_df.insert(loc=2, column="Year", value=2021, allow_duplicates=True)

# Align the historical data with the 2021 schema; regions are filled in later.
happiness_df.rename(columns={"Life Ladder": "Ladder score", "year": "Year"}, inplace=True)
happiness_df.insert(loc=1, column="Regional indicator", value="")

# Stack both sources, then order by country (A-Z) and year (newest first).
df_merged = pd.concat([happiness_df, happiness_2021_df])
df_merged.sort_values(by=["Country name", "Year"], ascending=[True, False], inplace=True)
df_merged.head()

## Missing Values Handling

I. Missing categorical data:

1. Replace missing regional indicators value by looking up 2021 record data
2. For the unavailable 'regional indicator' values in 17 countries, we find the real region value from internet and fill in the dataset

In [None]:
# Back-fill "Regional indicator" for every year of a country from that
# country's 2021 row. The region is looked up by column name instead of by
# position, so the result does not depend on the column order after the concat.
uni_country_list = df_merged['Country name'].unique()

# One region per country, taken from the first 2021 row of that country.
region_by_country = (
    df_merged.loc[df_merged['Year'] == 2021]
    .drop_duplicates(subset='Country name')
    .set_index('Country name')['Regional indicator']
)

# Only countries that have a 2021 row are touched; the others keep their value.
has_2021_row = df_merged['Country name'].isin(region_by_country.index)
# .to_numpy() avoids index alignment: df_merged keeps the (duplicated) row
# labels of its two source frames after pd.concat.
df_merged.loc[has_2021_row, 'Regional indicator'] = (
    df_merged.loc[has_2021_row, 'Country name'].map(region_by_country).to_numpy()
)

In [None]:
# Regions for the countries that have no 2021 record, looked up manually.
# Cuba is assigned to 'Latin America and Caribbean'; it was previously filed
# under 'North America and ANZ'.
# NOTE(review): Somalia and 'Somaliland region' are kept as
# 'Middle East and North Africa' as before — confirm against the report's own
# regional grouping.
manual_regions = {
    'Angola': 'Sub-Saharan Africa',
    'Belize': 'Latin America and Caribbean',
    'Bhutan': 'South Asia',
    'Central African Republic': 'Sub-Saharan Africa',
    'Congo (Kinshasa)': 'Sub-Saharan Africa',
    'Cuba': 'Latin America and Caribbean',
    'Djibouti': 'Sub-Saharan Africa',
    'Guyana': 'Latin America and Caribbean',
    'Oman': 'Middle East and North Africa',
    'Qatar': 'Middle East and North Africa',
    'Somalia': 'Middle East and North Africa',
    'Somaliland region': 'Middle East and North Africa',
    'South Sudan': 'Sub-Saharan Africa',
    'Sudan': 'Sub-Saharan Africa',
    'Suriname': 'Latin America and Caribbean',
    'Syria': 'Middle East and North Africa',
    'Trinidad and Tobago': 'Latin America and Caribbean',
}
for country_name, region_name in manual_regions.items():
    df_merged.loc[df_merged['Country name'] == country_name, 'Regional indicator'] = region_name
df_merged.head()

II. Missing numerical data:

1. Replace 6 factors' individual missing values using linear regression with their corresponding country's column values among 2005-2021
2. If available records for training the model <=2, replace the missing values by the corresponding country's column mean instead


In [None]:
# Fill missing values of the six factor columns, one country and one column at
# a time: linear regression on Year when at least three observations exist,
# otherwise the country's mean for that column.
factor_columns = list(df_merged.columns[4:10])
uni_country_list = df_merged['Country name'].unique()
for country in uni_country_list:
    is_country = df_merged['Country name'] == country
    # Snapshot of this country's rows, taken before any value is filled in.
    df_this_country = df_merged[is_country]
    if not df_this_country[df_this_country.columns[-6:]].isnull().values.any():
        continue
    for col_name in factor_columns:
        is_missing = df_this_country[col_name].isnull()
        # Skip columns with nothing to fill or nothing to learn from.
        if not is_missing.any() or is_missing.all():
            continue
        temp_train = df_this_country.loc[~is_missing]
        temp_test = df_this_country.loc[is_missing]
        if len(temp_train.index) < 3:
            # Too few points for a trend line: use the country mean instead.
            country_mean = round(temp_train[col_name].sum() / len(temp_train.index), 3)
            df_merged.loc[is_country & df_merged[col_name].isnull(), [col_name]] = country_mean
            continue
        X_train = np.array(temp_train['Year']).reshape(-1, 1)
        Y_train = np.array(temp_train[col_name]).reshape(-1, 1)
        X_test = np.array(temp_test['Year']).reshape(-1, 1)
        yf_reg = LinearRegression().fit(X_train, Y_train)
        Y_test = yf_reg.predict(X_test)
        # Plot graph: black = observed values, red = predicted values for the
        # missing years, blue = line through the predictions.
        plt.figure(figsize=(8, 4))
        plt.scatter(X_train, Y_train, color='black')
        plt.scatter(X_test, Y_test, color='red')
        plt.plot(X_test, Y_test, color='blue', linewidth=3)
        plt.xlabel("Year", fontsize=10)
        plt.ylabel(col_name, fontsize=10)
        plt.xticks(fontsize=10)
        plt.yticks(fontsize=10)
        plt.title("Predict ''" + col_name + "'' in " + country)
        plt.show()
        # Update missing value: write each prediction into the matching year.
        for missing_year, predicted in zip(temp_test['Year'], Y_test[:, 0]):
            df_merged.loc[is_country & (df_merged['Year'] == missing_year), [col_name]] = round(predicted, 3)

III. Dataset review and record removal

1. Analyse the countries and features for which a whole column is missing
2. Remove from the dataset the records of countries that are missing >= 2 of the six evaluation factor columns



In [None]:
# List every country that still has nulls after imputation, together with the
# factor columns concerned, and collect those countries for removal.
print("Countries with missing values for the whole column(s):")
print("------------------------------------------------------")
missing_country_list = []
for country in uni_country_list:
    df_this_country = df_merged[df_merged['Country name'] == country]
    null_flags = df_this_country.isnull()
    if not null_flags[df_this_country.columns[-6:]].values.any():
        continue
    missing_country_list.append(country)
    temp_col = [name for name in df_this_country.columns[4:10] if null_flags[name].values.any()]
    print(f"{country:<20}{', '.join(temp_col):<10}")

# Show the dataset with missing columns
df_merged[df_merged.isnull().any(axis=1)]

In [None]:
# Drop every record of the countries collected in missing_country_list, i.e.
# all countries that still have at least one null factor value.
is_incomplete = df_merged['Country name'].isin(missing_country_list)
df_merged = df_merged[~is_incomplete]

# Persist the cleaned, merged dataset.
merged_happiness_path = "merged-world-happiness-dataset.csv"
df_merged.to_csv(merged_happiness_path, index=False)

# Null count per column after preprocessing, followed by the frame itself.
print(df_merged.isnull().sum())
df_merged

# Insight

## Top 5 countries by ladder score mean during 2005-2021

In [None]:
# Mean ladder score per country (with its region), highest five first.
(
    df_merged
    .groupby(['Country name', 'Regional indicator'])[['Ladder score']]
    .mean()
    .sort_values(by='Ladder score', ascending=False)
    .head(5)
)

## Bottom 5 countries by ladder score mean during 2005-2021

In [None]:
# Mean ladder score per country (with its region), lowest five first.
(
    df_merged
    .groupby(['Country name', 'Regional indicator'])[['Ladder score']]
    .mean()
    .sort_values(by='Ladder score', ascending=True)
    .head(5)
)

In [None]:
# inspiration ; https://www.kaggle.com/gaetanlopez/how-to-make-clean-visualizations
# changed code signif.
import matplotlib.lines as lines

# Text-only "stand-out facts" banner: a single empty axes used as a canvas.
fig = plt.figure(figsize=(6,3),dpi=150)
gs = fig.add_gridspec(1, 1)
gs.update(wspace=0.2, hspace=0.4)
ax0 = fig.add_subplot(gs[0, 0])

background_color = "#fafafa"
# Highlight colours; later cells reuse (and some rebind) these names.
high_c = 'red'
low_c ='green'
fig.patch.set_facecolor(background_color) # figure background color
ax0.set_facecolor(background_color)

# 'sans-serif' is the valid generic family name; 'sanserif' is not a known
# family and would not select the intended font.
ax0.text(1.167,0.85,"World Happiness Index from 2005 to 2021",color='#323232',fontsize=28, fontweight='bold', fontfamily='sans-serif',ha='center')
ax0.text(1.13,-0.35,"stand-out facts",color='lightgray',fontsize=28, fontweight='bold', fontfamily='monospace',ha='center')

ax0.text(0,0.4,"Denmark",color=high_c,fontsize=25, fontweight='bold', fontfamily='monospace',ha='center')
ax0.text(0,0.1,"Happiest",color='gray',fontsize=15, fontfamily='monospace',ha='center')

ax0.text(0.77,0.4,"7 of top 10",color=high_c,fontsize=25, fontweight='bold', fontfamily='monospace',ha='center')
ax0.text(0.75,0.1,"in Europe",color='gray',fontsize=15, fontfamily='monospace',ha='center')

ax0.text(1.5,0.4,"8 of bottom 10",color=low_c,fontsize=25, fontweight='bold', fontfamily='monospace',ha='center')
ax0.text(1.5,0.1,"in Africa",color='gray',fontsize=15, fontfamily='monospace',ha='center')

ax0.text(2.25,0.4,"Afghanistan",color=low_c,fontsize=25, fontweight='bold', fontfamily='monospace',ha='center')
ax0.text(2.25,0.1,"Unhappiest",color='gray',fontsize=15, fontfamily='monospace',ha='center')

# Hide ticks and tick labels; the axes only serves as a text canvas.
ax0.set_xticks([])
ax0.set_yticks([])
ax0.tick_params(axis='both',length=0)

for s in ['top','right','left','bottom']:
    ax0.spines[s].set_visible(False)

# Two horizontal rules in figure coordinates, registered through add_artist
# rather than by mutating fig.lines directly.
for rule_y in (0.67, 0.07):
    rule = lines.Line2D([0.15, 1.95], [rule_y, rule_y], transform=fig.transFigure,
                        color='gray', linestyle='-', linewidth=1.1, alpha=.5)
    fig.add_artist(rule)

plt.show()

In [None]:
# Horizontal bar chart of the ten happiest and ten unhappiest countries by
# mean ladder score; the three bars at each extreme are highlighted.
fig = plt.figure(figsize=(15,15),dpi=150)
gs = fig.add_gridspec(1, 1)
gs.update(wspace=0.05, hspace=0.27)
ax0 = fig.add_subplot(gs[0, 0])

background_color = "#fafafa"
fig.patch.set_facecolor(background_color) # figure background color
ax0.set_facecolor(background_color)

# Mean score per country, computed once and sliced from both ends.
country_means = df_merged.groupby('Country name')['Ladder score'].mean()
hap = country_means.sort_values(ascending=False)[:10]
unhap = country_means.sort_values(ascending=True)[:10]
# pd.concat replaces Series.append, which no longer exists in pandas 2.x.
top_bottom = pd.concat([hap, unhap]).sort_values(ascending=True)
data = top_bottom

# low_c / high_c come from the notebook-level highlight colours.
n_bars = len(data)
n_highlight = 3
color_map = ['#e7e9e7' for _ in range(n_bars)]
color_map[:n_highlight] = [low_c] * n_highlight
color_map[-n_highlight:] = [high_c] * n_highlight

# base: faint full-width bars as a backdrop
ax0.barh(data.index, 10,
       edgecolor='darkgray',color='lightgray',alpha=0.1)
# actual
ax0.barh(data.index, data,
       edgecolor='darkgray',color=color_map)

# One label per bar, drawn once: white on the highlighted bars, gray elsewhere.
# Scores are read positionally via items() because the index holds country names.
for i, (country, score) in enumerate(data.items()):
    is_highlighted = i < n_highlight or i >= n_bars - n_highlight
    ax0.annotate(country,
                   xy=(score-(score*0.01), i),
                   va = 'center', ha='right',fontweight='light', fontfamily='monospace',fontsize=15,
                   color='white' if is_highlighted else 'gray',rotation=0)

ax0.axes.get_xaxis().set_ticks([])
ax0.axes.get_yaxis().set_ticks([])

for s in ['top', 'bottom', 'right']:
    ax0.spines[s].set_visible(False)

ax0.text(0,22.5,'The Happiest & Unhappiest Countries in the World',fontfamily='sans-serif',fontsize=20,fontweight='bold',color='#323232')
ax0.text(0,21.3,'As per observations, countries from Scandinavia are the happiest whereas countries from Africa\nare the unhappiest. We will investigate how these countries differ with six factors provided.',fontfamily='monospace',fontsize=15,fontweight='light',color='gray')

plt.show()

# Data Visualization

In [None]:
# Load the low-resolution Natural Earth world map that geopandas exposes.
# NOTE(review): `geopandas.datasets` is reportedly gone in geopandas >= 1.0 —
# confirm against the installed version.
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
print(world)
# Country lookup table fetched over the network; its "name" column is renamed
# so it can be joined on "Country name".
location = pd.read_csv('https://raw.githubusercontent.com/melanieshi0120/COVID-19_global_time_series_panel_data/master/data/countries_latitude_longitude.csv')
location.rename(columns={"name": "Country name"}, inplace=True)
#print(df_merged, location)
# Left join: every happiness row is kept and gains the columns of `location`.
# NOTE(review): df_merged is rebound here, so re-running this cell merges the
# location columns a second time.
df_merged = pd.merge(df_merged, location, on="Country name", how="left")
#description = df_merged.describe().drop(columns=["Year"])
#df_merged.plot(column="Ladder score", scheme="quantiles", fig_size=(25,20), legend=True, cmap="coolwarm")
# Write the three frames to CSV files in the working directory.
world.to_csv("world.csv")
location.to_csv("location.csv")
df_merged.to_csv("merged.csv")

## Data Distribution Visualization

### Countries and observation distribution by regional indicator

In [None]:
# Per region: number of observations (count) and of distinct countries (nunique).
(
    df_merged
    .groupby('Regional indicator')[['Country name']]
    .agg(['count', 'nunique'])
)

### Bar Charts

In [None]:
# Histogram grid for df_merged, drawn on a 20x20 inch figure.
df_merged.hist(figsize=(20,20))

In [None]:
# Summary statistics, transposed so that each row is one column of df_merged.
summary_stats = df_merged.describe()
summary_stats.transpose()

In [None]:
# Ladder-score distribution per region, with the overall mean as a dashed line.
fig, ax = plt.subplots(figsize=(15, 8))
overall_mean = df_merged['Ladder score'].mean()
ax.set_title("Data distribution by Regional indicator", family='Serif', weight='bold', size=20)
sns.boxplot(x=df_merged['Ladder score'], y=df_merged['Regional indicator'], ax=ax)
ax.axvline(overall_mean, c='black', ls='--')
ax.text(x=overall_mean, y=6.5, s='Population mean', size=15, horizontalalignment='center')

## Over the time plots

In [None]:
# Dot plot of the 20 countries with the highest mean ladder score: every yearly
# score in gray, the all-years mean in gold and the 2021 score in green.
background = "#fbfbfb"
# These rebind the notebook-level low_c / high_c highlight colours.
low_c = '#dd4124'
high_c = '#009473'
fig, ax = plt.subplots(1,1, figsize=(10, 5),dpi=150)
fig.patch.set_facecolor(background) # figure background color
ax.set_facecolor(background)

# Reduced list as too many to show all at once 
# Top 20 by mean score; reset_index() numbers them 0..19 from best to worst,
# and the final sort puts the lowest of the 20 first.
top_list_ = df_merged.groupby('Country name')['Ladder score'].mean().sort_values(ascending=False).reset_index()[:20].sort_values(by='Ladder score',ascending=True)


# `plot` is the y position of the current country's row (1 = lowest of the 20).
plot = 1
for country in top_list_['Country name']:
    country_df = df_merged[df_merged['Country name'] == country]
    # One-element Series indexed by country name.
    mean = country_df.groupby('Country name')['Ladder score'].mean()
    # Empty when the country has no 2021 row.
    mean_2021 = country_df[country_df['Year'] == 2021].groupby('Country name')['Ladder score'].mean()
    # historic scores
    sns.scatterplot(data=df_merged[df_merged['Country name'] == country], y=plot, x='Ladder score',color='lightgray',s=50,ax=ax)
    # # mean score
    sns.scatterplot(data=df_merged[df_merged['Country name'] == country], y=plot, x=mean,color='gold',ec='black',linewidth=1,s=75,ax=ax)
    #2021 score
    sns.scatterplot(data=df_merged[df_merged['Country name'] == country], y=plot, x=mean_2021,color=high_c,ec='black',linewidth=1,s=75,ax=ax)   
    plot += 1


# top_list_.index runs 19..0 here, so the ticks are 20..1; the country names
# are reversed to pair the highest tick with the highest-scoring country.
ax.set_yticks(top_list_.index+1)
ax.set_yticklabels(top_list_['Country name'][::-1], fontdict={'horizontalalignment': 'right'}, alpha=0.7)
ax.tick_params(axis=u'both', which=u'both',length=0)
ax.set_xlabel("Happiness Index Score",fontfamily='monospace',color='gray')


for s in ['top','right','bottom','left']:
    ax.spines[s].set_visible(False)
    
# Current axis limits, used to span the guide lines and place the title text.
Xstart, Xend = ax.get_xlim()
Ystart, Yend = ax.get_ylim()

# Faint dashed guide line behind each country's row.
ax.hlines(y=top_list_.index+1, xmin=Xstart, xmax=Xend, color='gray', alpha=0.5, linewidth=.3, linestyles='--')
ax.set_axisbelow(True)
ax.text(6.25, Yend+4.3, 'Happiness Index Scores through the years', fontsize=17, fontweight='bold', fontfamily='DejaVu Sans',color='#323232')
ax.text(6.25, Yend+0.75,
'''
Countries scores do vary over time, however they remain  farily consistent
It is interesting that Finland's 2021 score is amongst the highest of all time
''', fontsize=12, fontweight='light', fontfamily='monospace',color='gray')

# The arrow targets below are hard-coded data coordinates.
# NOTE(review): they presumably point at the markers of the top rows — confirm
# they still line up if the underlying data changes.
plt.annotate('2021\nscore', xy=(7.842, 19), xytext=(8.2, 11),
             arrowprops=dict(facecolor='steelblue',arrowstyle="->",connectionstyle="arc3,rad=.3"), fontsize=10,fontfamily='monospace',ha='center', color=high_c)

plt.annotate('Mean\nscore', xy=(7.6804, 20), xytext=(8.2, 16),
             arrowprops=dict(facecolor='steelblue',arrowstyle="->",connectionstyle="arc3,rad=.5"), fontsize=10,fontfamily='monospace',ha='center', color='gold')


plt.show()

In [None]:
# Score and factor columns that are rescaled to [0, 1] for comparison.
attribute_list = [
    'Ladder score',
    'Log GDP per capita',
    'Social support',
    'Healthy life expectancy at birth',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]

# Work on a separate frame without the two label columns.
X_transformed = df_merged.drop(columns=['Country name', 'Regional indicator'])

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X_transformed[attribute_list])
X_transformed[attribute_list] = scaled_values
print(X_transformed)

Normalized attributes over the time

In [None]:
# One line per normalized attribute, plotted against Year on shared axes.
fig, ax = plt.subplots(figsize=(12, 7))
for column in attribute_list:
    sns.lineplot(data=X_transformed, x="Year", y=column, marker='o', label=column, ax=ax)
ax.set_title("Normalized attributes over the time")
# The y values are normalized, so the tick marks are hidden.
ax.set_yticks([])
plt.show()

Ladder score over time with Regional indicator

In [None]:
# Ladder score per year, one line per region.
fig, ax = plt.subplots(figsize=(20, 7))
sns.lineplot(data=df_merged, x="Year", y="Ladder score", hue='Regional indicator', marker='o', ax=ax)
ax.set_title("Ladder score")
plt.show()

## Map visualization

Generate ladder scores on map by year

In [None]:
# The world map is joined on ISO 3166-1 alpha-3 codes instead of country names
# (e.g. the code for Hong Kong is 'HKG').
# Some country names in the happiness report differ from the names pycountry
# knows, so they are translated before the lookup.
_REPORT_TO_PYCOUNTRY_NAME = {
    # Brazzaville is the Republic of the Congo; Kinshasa is the Democratic Republic.
    'Congo (Brazzaville)': 'Congo',
    'Congo (Kinshasa)': 'Congo, The Democratic Republic of the',
    'Czech Republic': 'Czechia',
    'South Korea': 'Korea, Republic of',
    'Taiwan Province of China': 'Taiwan, Province of China',
    'Vietnam': 'Viet Nam',
    'Hong Kong S.A.R. of China': 'Hong Kong',
    'Laos': "Lao People's Democratic Republic",
    'North Cyprus': 'Cyprus',
    'Palestinian Territories': 'Palestine, State of',
    'Ivory Coast': "Côte d'Ivoire",
    'Swaziland': 'Eswatini',
    'Bolivia': 'Bolivia, Plurinational State of',
    'Iran': 'Iran, Islamic Republic of',
    'Moldova': 'Moldova, Republic of',
    'Russia': 'Russian Federation',
    'Syria': 'Syrian Arab Republic',
    'Tanzania': 'Tanzania, United Republic of',
    'Venezuela': 'Venezuela, Bolivarian Republic of',
}

# Codes assigned directly, bypassing pycountry. 'RKS' is the code the world map
# is patched with for Kosovo.
_REPORT_TO_CODE = {
    'Kosovo': 'RKS',
}


def countryToCode(countries):
    """Translate happiness-report country names into alpha-3 codes.

    Args:
        countries: iterable of country names as written in the report.

    Returns:
        A list with one entry per input name, in the same order: the alpha-3
        code, or the string 'None' when no code can be found.
    """
    code_list = []
    for country in countries:
        if country in _REPORT_TO_CODE:
            code_list.append(_REPORT_TO_CODE[country])
            continue
        lookup_name = _REPORT_TO_PYCOUNTRY_NAME.get(country, country)
        # Depending on the pycountry version an unknown name either raises a
        # lookup error or yields None; both mean "no code available".
        try:
            entry = pycountry.countries.get(name=lookup_name)
        except LookupError:
            entry = None
        code_list.append(entry.alpha_3 if entry is not None else 'None')
    return code_list


# Load the world map and patch the ISO codes of a few shapes so they can be
# joined with the codes produced for the happiness data.
# NOTE(review): presumably the bundled dataset carries placeholder codes for
# these shapes — confirm.
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
iso_fixes = {
    'France': 'FRA',
    'Norway': 'NOR',
    'N. Cyprus': 'CYP',
    'Somaliland': 'SOM',
    'Kosovo': 'RKS',
}
for shape_name, iso_code in iso_fixes.items():
    world.loc[world['name'] == shape_name, 'iso_a3'] = iso_code
# Rename by name rather than overwriting all column labels positionally, so an
# extra or reordered column in the map data cannot be mislabelled.
world = world.rename(columns={'iso_a3': 'CODE'})

# set the color scale: red (low) -> yellow -> green (high)
cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", ['red', 'yellow', 'green'])

def plot_map(dataset, column, subtitle_year, save=False):
    """Draw a choropleth world map of one attribute.

    Args:
        dataset: frame with a 'CODE' column (matched against the notebook-level
            `world` map) and the attribute column to colour by.
        column: name of the attribute column in `dataset`.
        subtitle_year: year or label shown in the subtitle and used in the
            file name.
        save: when True, the figure is also written to
            'image/<column>_<subtitle_year>.png'.
    """
    merge = pd.merge(world, dataset, on='CODE')
    # Gray base layer: countries without data remain visible.
    ax = world.plot(figsize=(20,10), linewidth=0.25, edgecolor='white', color='lightgray')
    ax.axis('off')
    ax.set_facecolor('white')
    ax.text(-175,112,'The happiest & unhappiest countries',fontsize=30,fontweight='bold')
    ax.text(-175,102, '{} in {}'.format(column, subtitle_year), color='gray',fontsize=20)
    # Coloured layer, drawn on the same axes as the base layer.
    merge.plot(column=column, legend=True, cmap=cmap, ax=ax)
    if save:
        # exist_ok avoids the separate check-then-create step.
        os.makedirs('image', exist_ok=True)
        # Save the figure that owns `ax` rather than whichever figure happens
        # to be pyplot's current one.
        ax.figure.savefig('image/{}_{}.png'.format(column, subtitle_year))
# Average every numeric attribute per country over all years.
# numeric_only=True keeps this working on pandas 2.x, where text columns such
# as 'Regional indicator' are no longer dropped automatically.
df_merged_grouped = df_merged.groupby('Country name').mean(numeric_only=True).reset_index()
df_geo = df_merged_grouped.copy()
df_geo['CODE'] = countryToCode(df_geo['Country name'])
for column in attribute_list:
    plot_map(df_geo, column, '2005-2021', save=True)

# One map per year and attribute; these are the frames for the animation.
for year in range(2005, 2022):
    # Filter and encode once per year; .copy() makes the CODE assignment act on
    # an independent frame instead of a slice of df_merged.
    df_merged_temp = df_merged[df_merged['Year'] == year].copy()
    df_merged_temp['CODE'] = countryToCode(df_merged_temp['Country name'])
    for column in attribute_list:
        plot_map(df_merged_temp, column, year, save=True)
        # The PNG is already on disk; close the figure so that over a hundred
        # large figures do not stay open at the same time.
        plt.close()



Animate ladder scores on map by year

In [None]:
# to animate the plots versus time
import cv2
import os

fps = 1
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
print(os.getcwd())
# os.chdir('image')
files = os.listdir()
files.sort()

for column in attribute_list:
  temp_files = []
  for file in files:
    if column in file and file.endswith('.png'):
      temp_files.append(file)
  if not os.path.isdir('video'):
    os.mkdir('video')
  video_name = 'video/' + column + '.mp4'
  videoWrite = cv2.VideoWriter(video_name, fourcc, fps, (1440, 720))
  for file in temp_files:
    img = cv2.imread(file)
    videoWrite.write(img)
  videoWrite.release()
%cd '/content/drive/MyDrive/Colab Notebooks'

# Data Analysis

## Columns Relationship study - Pearson’s correlation coefficient

In [None]:
# Correlations are only defined for numeric columns; selecting them explicitly
# keeps this working on pandas 2.x, where corr() no longer skips text columns.
numeric_columns_df = df_merged.select_dtypes(include='number')
corrPearson = numeric_columns_df.corr(method="pearson")
corrSpearman = numeric_columns_df.corr(method="spearman")

# Heat map of the Pearson coefficients on a fixed -1..1 scale centred on 0.
figure = plt.figure(figsize=(25,10))
sns.heatmap(corrPearson,annot=True,vmin=-1,center=0,vmax=1)
plt.title("PEARSON")
plt.xlabel("COLUMNS")
plt.xticks(fontsize=10, rotation=45)
plt.ylabel("COLUMNS")
plt.show()

## Pairwise Relationship Plots

In [None]:
# Pairwise relationships between the score and the six factors for the ten
# happiest and ten unhappiest countries (by mean ladder score).
pairplot_columns = [
    'Ladder score',
    'Log GDP per capita',
    'Social support',
    'Healthy life expectancy at birth',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]
# The columns are selected with a list; selecting several groupby columns with
# a bare tuple is not accepted by pandas 2.x.
country_factor_means = df_merged.groupby('Country name')[pairplot_columns].mean()

# .copy() so the label column is inserted into independent frames.
# Note: `hap` / `unhap` are rebound here from Series to DataFrames.
hap = country_factor_means.sort_values(by='Ladder score', ascending=False)[:10].copy()
hap.insert(0, 'Happiness', 'Happiest')
unhap = country_factor_means.sort_values(by='Ladder score', ascending=True)[:10].copy()
unhap.insert(0, 'Happiness', 'Unhappiest')

extreme_df_merge = pd.concat([hap, unhap])

sns.pairplot(extreme_df_merge, hue="Happiness")


## Pairwise comparison by regions - Hypothesis T-Test

In [None]:
def hypo_test(dataset,reg1,reg2):
    """Return the p-value of a t-test (unequal variances) between two regions.

    Args:
        dataset: frame with 'Regional indicator' and 'Ladder score' columns.
        reg1: name of the first region.
        reg2: name of the second region.

    Returns:
        The p-value reported by ttest_ind for the two regions' ladder scores.
    """
    samples = [
        dataset.loc[dataset['Regional indicator'] == region, 'Ladder score']
        for region in (reg1, reg2)
    ]
    _, p_value = ttest_ind(samples[0], samples[1], equal_var=False)
    return p_value

# p-value of the t-test for every ordered pair of regions (the diagonal
# compares a region with itself).
regions = list(df_merged['Regional indicator'].unique())
list1 = [[hypo_test(df_merged, i, j) for j in regions] for i in regions]

p_value_matrix = pd.DataFrame(list1, columns=regions, index=regions)
figure = plt.figure(figsize=(25,10))
# p-values lie in [0, 1], so the colour scale covers exactly that range
# instead of the -1..1 scale used for correlations.
sns.heatmap(p_value_matrix,annot=True,vmin=0,vmax=1)
plt.title("Hypothesis t-test p-value")
plt.xlabel("Regional indicator")
plt.xticks(fontsize=10, rotation=45)
plt.ylabel("Regional indicator")
plt.show()

In [None]:
# Data for the 2007 -> 2020 slope chart.
background = "#fbfbfb"
low_c = '#dd4124'
high_c = '#009473'

sample = df_merged[['Country name','Ladder score','Year']]

# rename() returns a new frame here, so nothing is modified in place on a
# filtered slice of `sample`.
seven = (
    sample[sample['Year'] == 2007]
    .rename(columns={'Ladder score':'07_score'})
    .set_index('Country name')
)
twenty = (
    sample[sample['Year'] == 2020]
    .rename(columns={'Ladder score':'20_score'})
    .set_index('Country name')
)

# Countries present in both years; the two Year columns receive the suffixes.
combined = seven.merge(twenty, on='Country name', how='inner', suffixes=('_1', '_2'))
combined['change'] = combined['20_score'] - combined['07_score']

change_by_country = combined.groupby('Country name')['change'].mean()
top_inc = change_by_country.sort_values(ascending=False)[:10]
top_dec = change_by_country.sort_values(ascending=True)[:10]
# pd.concat replaces Series.append, which no longer exists in pandas 2.x.
top_changes = pd.concat([top_inc, top_dec]).sort_values(ascending=True)

# The columns are selected with a list; a bare tuple is not accepted by pandas 2.x.
temp = (
    combined.groupby('Country name')[['07_score','20_score','change']]
    .mean()
    .sort_values(by='07_score',ascending=False)
    .reset_index()
)

# Hand-picked countries shown in the chart; `sample` is rebound to them.
sample_countries = ['Bulgaria', 'Latvia', 'Kyrgyzstan', 'Jordan', 'Georgia',
                    'Mongolia', 'Serbia', 'Kosovo', 'Estonia', 'Hungary']
sample = temp[temp['Country name'].isin(sample_countries)]

sample = sample.set_index('Country name')


# https://www.machinelearningplus.com/plots/top-50-matplotlib-visualizations-the-master-plots-python/#18.-Slope-Chart

import matplotlib.lines as mlines

# Label strings and per-country colours taken over from the linked example.
# NOTE(review): none of these three lists is referenced in the visible part of
# this cell — confirm whether they can be removed.
left_label = [str(c) + ', '+ str(round(y)) for c, y in zip(sample.index, sample['07_score'])]
right_label = [str(c) + ', '+ str(round(y)) for c, y in zip(sample.index, sample['20_score'])]
klass = [low_c if (y1-y2) < 0 else high_c for y1, y2 in zip(sample['07_score'], sample['20_score'])]

# draw line
# https://stackoverflow.com/questions/36470343/how-to-draw-a-line-with-matplotlib/36479941
def newline(p1, p2, color='black'):
    """Add a line with circle markers from p1 to p2 to the current axes.

    p1 and p2 are [x, y] pairs. The line is drawn in low_c when the y value
    falls from p1 to p2 and in high_c otherwise; the `color` argument is not
    used by the body. Returns the created Line2D.
    """
    ax = plt.gca()
    l = mlines.Line2D([p1[0],p2[0]], [p1[1],p2[1]], color=low_c if p1[1]-p2[1] > 0 else high_c, marker='o', markersize=6)
    ax.add_line(l)
    return l

# Use the `background` colour defined at the top of this cell; the name
# `background_color` used previously is not defined anywhere in this cell.
fig, ax = plt.subplots(1,1,figsize=(10,14), dpi= 150, facecolor=background)

ax.set_facecolor(background)

# Vertical Lines (one guide per year column)
ax.vlines(x=1, ymin=0, ymax=9, color='black', alpha=0.7, linewidth=1, linestyles='dotted')
ax.vlines(x=3, ymin=0, ymax=9, color='black', alpha=0.7, linewidth=1, linestyles='dotted')

# Points: 2007 scores at x=1, 2020 scores at x=3
ax.scatter(y=sample['07_score'], x=np.repeat(1, sample.shape[0]), s=75,linewidth=1.5, color='black', alpha=0.7)
ax.scatter(y=sample['20_score'], x=np.repeat(3, sample.shape[0]), s=75, linewidth=1.5,color='black', alpha=0.7)

# Line Segments and Annotation
for p1, p2, c in zip(sample['07_score'], sample['20_score'], sample.index):
    newline([1,p1], [3,p2])
    ax.text(1-0.05, p1, c + ', ' + str(round(p1,1)), horizontalalignment='right', verticalalignment='center', fontdict={'size':14})
    ax.text(3+0.05, p2, c + ', ' + str(round(p2,1)), horizontalalignment='left', verticalalignment='center', fontdict={'size':14})


# 'sans-serif' is the valid generic family name; 'sansserif' is not a font
# family matplotlib knows, so it warns and falls back to the default.
ax.text(0,8.75,"Happiness Index scores 2007 - 2020", fontsize=20, fontfamily='sans-serif',fontweight='bold',color='#323232')
ax.text(0,8.3, 
'''
Here we see how a random sample performed over the years, 
6 out of 10 biggest ascents seen from Central and Eastern Europe
(Bulgaria, Latvia, Serbia, Kosovo, Estonia, Hungary) ''', fontsize=12, fontweight='light', fontfamily='monospace',color='gray')


ax.set(xlim=(0,4), ylim=(3,8.2), ylabel='')
ax.set_xticks([1,3])
ax.set_xticklabels(["2007", "2020"],fontsize=20)
plt.yticks(np.arange(3, 8.2, 2), fontsize=12)

# Hide the frame and the y tick labels; scores are annotated next to each point.
for s in ["top","right","left","bottom"]:
    ax.spines[s].set_visible(False)
    
ax.tick_params(axis='both',which='both',left=False,bottom=False,labelleft=False) 
# NOTE: this changes the global default font family for every later figure too.
plt.rcParams["font.family"] = "monospace"

plt.show()

# Simple Linear Regression

In [None]:
# Try to separate into different year and animate the plots
df_merged = pd.read_csv("merged-world-happiness-dataset.csv", engine='python')
regression_df = df_merged.copy()

# Loop invariants, hoisted out of the per-year loop.
attributes = list(regression_df.columns)[4:]  # every column from the fifth onwards
plot_columns = ['Log GDP per capita', 'Social support', 'Healthy life expectancy at birth', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption']
min_rows = 10  # with an 80/20 split this leaves at least two test points for R^2
os.makedirs('image_linear_regression', exist_ok=True)

for year in range(2005, 2022):
  # Fit on this year's rows only. Previously the year filter was computed but
  # never used, so every "yearly" plot was fitted on the full dataset.
  temp_regression_df = regression_df[regression_df['Year'] == year]
  trainset = []
  testset = []
  fitset = {}

  for item in attributes:
    train_df = temp_regression_df[["Ladder score", item]].dropna()
    if len(train_df) < min_rows:
      # Too few observations this year to split and score meaningfully.
      continue
    temp_trainset, temp_testset = train_test_split(train_df, train_size=0.8)
    trainset.append(temp_trainset)
    testset.append(temp_testset)

    # Plain float arrays; `dtype=pd.Series` is not a NumPy dtype and would
    # silently produce object arrays.
    X_train = temp_trainset[item].to_numpy(dtype=float).reshape(-1, 1)
    Y_train = temp_trainset["Ladder score"].to_numpy(dtype=float)
    X_test = temp_testset[item].to_numpy(dtype=float).reshape(-1, 1)
    Y_test = temp_testset["Ladder score"].to_numpy(dtype=float)
    lr = LinearRegression()
    lr.fit(X_train, Y_train)
    fitset.update({item: [X_train, Y_train, X_test, Y_test, lr]})

  for column in plot_columns:
    if column not in fitset:
      # No model for this column/year (skipped above for lack of data).
      continue
    X_train, Y_train, X_test, Y_test, lr = fitset[column]
    sns.set_style(style="whitegrid")
    plt.figure(figsize=(12,6))
    plt.scatter(X_train, Y_train, color="blue", label="Train set", s=12)
    plt.scatter(X_test, Y_test, color="orange", label="Test set", s=12)
    plt.plot(X_test, lr.predict(X_test), color="red", label="Linear Regression Model")
    plt.xlabel(column, fontsize=15)
    plt.ylabel("Ladder score", fontsize=15)
    R_sq = lr.score(X_test, Y_test)
    plt.annotate('R-sqaure = {}'.format(round(R_sq, 3)), xy=(0.5, 0.05) , xycoords='axes fraction')
    plt.xticks(fontsize=13)
    plt.yticks(fontsize=13)
    plt.legend()
    plt.title("Linear regression of Ladder score against {} in {}".format(column, year))
    plt.savefig('image_linear_regression/{}_{}.png'.format(column, year))
    # Display, then release the figure so that ~100 figures do not pile up in memory.
    plt.show()
    plt.close()



In [None]:
# OpenCV is not among the imports in the notebook's import cell, so import it
# here to keep this cell runnable on a fresh kernel.
import cv2

fps = 1
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
print(os.getcwd())

# Work with explicit paths instead of os.chdir(): the working directory is
# never changed, so there is nothing to restore afterwards (and no hard-coded
# absolute path is needed), even if an error occurs part-way through.
image_dir = 'image_linear_regression'
video_dir = os.path.join(image_dir, 'video')
files = sorted(os.listdir(image_dir))
print(files)

for column in ['Log GDP per capita', 'Social support', 'Healthy life expectancy at birth', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption']:
  # Frames for this column, in file-name (i.e. year) order.
  temp_files = [file for file in files if column in file and file.endswith('.png')]
  print(temp_files)
  os.makedirs(video_dir, exist_ok=True)

  # cv2.imread returns None for unreadable files; drop those instead of
  # passing None to the writer.
  frames = [cv2.imread(os.path.join(image_dir, file)) for file in temp_files]
  frames = [img for img in frames if img is not None]
  if not frames:
    continue

  # Take the frame size from the images themselves: VideoWriter silently
  # drops frames whose size differs from the size it was opened with, and the
  # saved PNG size depends on the figure dpi in effect when they were written.
  height, width = frames[0].shape[:2]
  video_name = os.path.join(video_dir, column + '.mp4')
  videoWrite = cv2.VideoWriter(video_name, fourcc, fps, (width, height))
  try:
    for img in frames:
      if img.shape[:2] != (height, width):
        img = cv2.resize(img, (width, height))
      videoWrite.write(img)
  finally:
    # Always finalise the file, even if writing a frame fails.
    videoWrite.release()

In [None]:
# Reload the merged dataset from disk and preview its first five rows.
df_merged = pd.read_csv(
    filepath_or_buffer="merged-world-happiness-dataset.csv",
    engine='python',
)
df_merged.head(n=5)

In [None]:
regression_df = df_merged.copy()
# Every column from the fifth onwards is treated as a predictor.
attributes = list(regression_df.columns)[4:]

# One independent 80/20 train/test split per predictor, in column order.
splits = [
  train_test_split(df_merged[["Ladder score", item]], train_size=0.8)
  for item in attributes
]
trainset = [train_part for train_part, _test_part in splits]
testset = [test_part for _train_part, test_part in splits]

In [None]:
# Fit one single-predictor linear regression per attribute and keep the data
# and the model together: fitset[attribute] = [X_train, Y_train, X_test, Y_test, lr].
fitset = {}
for k, item in enumerate(attributes):
  # Convert straight to float arrays. `dtype=pd.Series` is not a NumPy dtype;
  # NumPy treats it as a generic Python type and builds object arrays.
  X_train = trainset[k][item].to_numpy(dtype=float).reshape(-1, 1)
  Y_train = trainset[k]["Ladder score"].to_numpy(dtype=float)
  X_test = testset[k][item].to_numpy(dtype=float).reshape(-1, 1)
  Y_test = testset[k]["Ladder score"].to_numpy(dtype=float)
  lr = LinearRegression()
  lr.fit(X_train, Y_train)
  fitset[item] = [X_train, Y_train, X_test, Y_test, lr]

## Log GDP per capita

In [None]:
X_train, Y_train, X_test, Y_test, lr = fitset["Log GDP per capita"]

sns.set_style(style="whitegrid")

# Train/test scatter with the fitted line evaluated on the test inputs.
reg_ax = plt.figure(figsize=(12,6)).gca()
reg_ax.scatter(X_train, Y_train, color="blue", label="Train set", s=12)
reg_ax.scatter(X_test, Y_test, color="orange", label="Test set", s=12)
reg_ax.plot(X_test, lr.predict(X_test), color="red", label="Linear Regression Model")
reg_ax.set_xlabel("Log GDP per capita", fontsize=15)
reg_ax.set_ylabel("Ladder score", fontsize=15)
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=13)
reg_ax.legend()
reg_ax.set_title("Linear regression of Ladder score against Log GDP per capita")

In [None]:
# Review the linear regression model

# Print the fitted intercept and coefficient, then the model's score on the test split.
equation = "The Linear model is: Ladder Score = {} + {} x Log GDP per capita"
print(equation.format(lr.intercept_, lr.coef_))
R_sq = lr.score(X_test, Y_test)
print("The R square = {}".format(R_sq))

## Social support

In [None]:
X_train, Y_train, X_test, Y_test, lr = fitset["Social support"]

sns.set_style(style="whitegrid")

# Train/test scatter with the fitted line evaluated on the test inputs.
reg_ax = plt.figure(figsize=(12,6)).gca()
reg_ax.scatter(X_train, Y_train, color="blue", label="Train set", s=12)
reg_ax.scatter(X_test, Y_test, color="orange", label="Test set", s=12)
reg_ax.plot(X_test, lr.predict(X_test), color="red", label="Linear Regression Model")
reg_ax.set_xlabel("Social support", fontsize=15)
reg_ax.set_ylabel("Ladder score", fontsize=15)
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=13)
reg_ax.legend()
reg_ax.set_title("Linear regression of Ladder score against Social support")

In [None]:
# Review the linear regression model

# Print the fitted intercept and coefficient, then the model's score on the test split.
equation = "The Linear model is: Ladder Score = {} + {} x Social support"
print(equation.format(lr.intercept_, lr.coef_))
R_sq = lr.score(X_test, Y_test)
print("The R square = {}".format(R_sq))

## Healthy life expectancy at birth

In [None]:
X_train, Y_train, X_test, Y_test, lr = fitset["Healthy life expectancy at birth"]

sns.set_style(style="whitegrid")

# Train/test scatter with the fitted line evaluated on the test inputs.
reg_ax = plt.figure(figsize=(12,6)).gca()
reg_ax.scatter(X_train, Y_train, color="blue", label="Train set", s=12)
reg_ax.scatter(X_test, Y_test, color="orange", label="Test set", s=12)
reg_ax.plot(X_test, lr.predict(X_test), color="red", label="Linear Regression Model")
reg_ax.set_xlabel("Healthy life expectancy at birth", fontsize=15)
reg_ax.set_ylabel("Ladder score", fontsize=15)
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=13)
reg_ax.legend()
reg_ax.set_title("Linear regression of Ladder score against Healthy life expectancy at birth")

In [None]:
# Review the linear regression model

# Print the fitted intercept and coefficient, then the model's score on the test split.
equation = "The Linear model is: Ladder Score = {} + {} x Healthy life expectancy at birth"
print(equation.format(lr.intercept_, lr.coef_))
R_sq = lr.score(X_test, Y_test)
print("The R square = {}".format(R_sq))

## Freedom to make life choices

In [None]:
X_train, Y_train, X_test, Y_test, lr = fitset["Freedom to make life choices"]

sns.set_style(style="whitegrid")

# Train/test scatter with the fitted line evaluated on the test inputs.
reg_ax = plt.figure(figsize=(12,6)).gca()
reg_ax.scatter(X_train, Y_train, color="blue", label="Train set", s=12)
reg_ax.scatter(X_test, Y_test, color="orange", label="Test set", s=12)
reg_ax.plot(X_test, lr.predict(X_test), color="red", label="Linear Regression Model")
reg_ax.set_xlabel("Freedom to make life choices", fontsize=15)
reg_ax.set_ylabel("Ladder score", fontsize=15)
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=13)
reg_ax.legend()
reg_ax.set_title("Linear regression of Ladder score against Freedom to make life choices")

In [None]:
# Review the linear regression model

# Print the fitted intercept and coefficient, then the model's score on the test split.
equation = "The Linear model is: Ladder Score = {} + {} x Freedom to make life choices"
print(equation.format(lr.intercept_, lr.coef_))
R_sq = lr.score(X_test, Y_test)
print("The R square = {}".format(R_sq))

## Generosity

In [None]:
X_train, Y_train, X_test, Y_test, lr = fitset["Generosity"]

sns.set_style(style="whitegrid")

# Train/test scatter with the fitted line evaluated on the test inputs.
reg_ax = plt.figure(figsize=(12,6)).gca()
reg_ax.scatter(X_train, Y_train, color="blue", label="Train set", s=12)
reg_ax.scatter(X_test, Y_test, color="orange", label="Test set", s=12)
reg_ax.plot(X_test, lr.predict(X_test), color="red", label="Linear Regression Model")
reg_ax.set_xlabel("Generosity", fontsize=15)
reg_ax.set_ylabel("Ladder score", fontsize=15)
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=13)
reg_ax.legend()
reg_ax.set_title("Linear regression of Ladder score against Generosity")

In [None]:
# Review the linear regression model

# Print the fitted intercept and coefficient, then the model's score on the test split.
equation = "The Linear model is: Ladder Score = {} + {} x Generosity"
print(equation.format(lr.intercept_, lr.coef_))
R_sq = lr.score(X_test, Y_test)
print("The R square = {}".format(R_sq))

## Perceptions of corruption

In [None]:
X_train, Y_train, X_test, Y_test, lr = fitset["Perceptions of corruption"]

sns.set_style(style="whitegrid")

# Train/test scatter with the fitted line evaluated on the test inputs.
reg_ax = plt.figure(figsize=(12,6)).gca()
reg_ax.scatter(X_train, Y_train, color="blue", label="Train set", s=12)
reg_ax.scatter(X_test, Y_test, color="orange", label="Test set", s=12)
reg_ax.plot(X_test, lr.predict(X_test), color="red", label="Linear Regression Model")
reg_ax.set_xlabel("Perceptions of corruption", fontsize=15)
reg_ax.set_ylabel("Ladder score", fontsize=15)
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=13)
reg_ax.legend()
reg_ax.set_title("Linear regression of Ladder score against Perceptions of corruption")

In [None]:
# Review the linear regression model

# Print the fitted intercept and coefficient, then the model's score on the test split.
equation = "The Linear model is: Ladder Score = {} + {} x Perceptions of corruption"
print(equation.format(lr.intercept_, lr.coef_))
R_sq = lr.score(X_test, Y_test)
print("The R square = {}".format(R_sq))

# Multiple Linear Regression

In [None]:
# Reload the merged dataset from disk and preview its first five rows.
df_merged = pd.read_csv(
    filepath_or_buffer='merged-world-happiness-dataset.csv',
    engine='python',
)
df_merged.head(n=5)

In [None]:
regression_df = df_merged.copy()
# Every column from the fifth onwards is treated as a predictor.
attributes = list(regression_df.columns)[4:]
train_df = regression_df[["Ladder score"] + attributes]
trainset, testset = train_test_split(train_df, train_size=0.8)

# Feature matrices and target vectors as float arrays (converted via an
# object array first, then cast).
X_train = np.asarray(trainset[attributes], dtype=object).astype(float)
Y_train = np.asarray(trainset["Ladder score"], dtype=object).astype(float)
X_test = np.asarray(testset[attributes], dtype=object).astype(float)
Y_test = np.asarray(testset["Ladder score"], dtype=object).astype(float)

## Model Building

In [None]:
def build_model(n_features=6):
  """Build and compile a linear model: one Dense(1) unit on the raw inputs.

  With no hidden layer and no activation, the network computes
  ``bias + sum(w_i * x_i)``, i.e. a multiple linear regression. It is trained
  with mean absolute error as the loss and reports MSE as a metric.

  Parameters
  ----------
  n_features : int, default 6
      Number of input features (columns of the design matrix).

  Returns
  -------
  tf.keras.Model
      The compiled model. Its summary is printed as a side effect.
  """
  # `shape` must be a tuple and `metrics` a list to work across Keras versions;
  # a bare int / bare string is only tolerated by some releases.
  input_layer = tf.keras.layers.Input(shape=(n_features,))
  y_pred = tf.keras.layers.Dense(1)(input_layer)
  model = tf.keras.models.Model(inputs=input_layer, outputs=y_pred)
  model.compile(loss='mean_absolute_error',
                optimizer=tf.keras.optimizers.Adam(1e-3),
                metrics=['mse'])
  model.summary()
  return model

def plot_loss(history):
  """Draw the training and validation loss curves on the current axes.

  `history` is expected to expose a `history` mapping with 'loss' and
  'val_loss' entries (one value per epoch). The y axis is fixed to [0, 2].
  """
  ax = plt.gca()
  for key in ('loss', 'val_loss'):
    ax.plot(history.history[key], label=key)
  ax.set_xlabel('Epoch')
  ax.set_ylabel('Error [Score]')
  ax.set_ylim((0,2))
  ax.legend()
  ax.grid(True)

In [None]:
# Build and compile the regression model; build_model() also prints its summary.
model = build_model()

## Fitting Data into Model

In [None]:
# Stop training once `val_loss` has gone 10 epochs without improving.
# NOTE(review): the test split is also passed as validation data, so early
# stopping is tuned on the same data the model is evaluated on afterwards;
# confirm this is intended.
earlystop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, verbose=2)
callback = [earlystop]
fit_options = dict(
    batch_size=8,
    epochs=1000,
    callbacks=callback,
    verbose=2,
    validation_data=(X_test, Y_test),
)
history = model.fit(X_train, Y_train, **fit_options)

## Regression Result

In [None]:
# Plot the training ('loss') and validation ('val_loss') curves recorded in `history`.
plot_loss(history)

In [None]:
weights, bias = model.get_weights()
# The Dense(1) kernel holds one row per input feature; flatten it so each
# coefficient prints as a scalar instead of a one-element array.
coefs = np.round(np.ravel(weights), 4)
print('Happiness score = ',np.round(bias[0],4),
      '+',coefs[0],'∗ GDP',
      '+',coefs[1],'* Suport', 
      '+',coefs[2],'* Health',
      '+',coefs[3],'* Freedom',
       '+',coefs[4],'* Generosity',
      '+',coefs[5],'* Corrption')
# Flatten the (n, 1) predictions so they line up element-wise with Y_test.
y_pred = np.ravel(model.predict(X_test))

# Compute R square value
# R^2 = 1 - SS_res / SS_tot. The previous ratio (spread of the predictions
# around the mean over total spread) ignores the residuals, so it is not the
# coefficient of determination on held-out data and can look good even when
# the predictions are far from the targets.
SS_res = np.sum(np.square(Y_test - y_pred))
SST = np.sum(np.square(Y_test - np.mean(Y_test)))
R_sq = 1 - SS_res / SST
print("The R square value: ", R_sq)

# Clustering

In [None]:
# Reload the merged dataset from disk and preview its first five rows.
df_merged = pd.read_csv(
    filepath_or_buffer='merged-world-happiness-dataset.csv',
    engine='python',
)
df_merged.head(n=5)

In [None]:
# Columns kept for clustering: the country label, the ladder score and the six factors.
clustering_columns = ['Country name', 'Ladder score', 'Log GDP per capita','Social support',
                      'Healthy life expectancy at birth', 'Freedom to make life choices',
                      'Generosity','Perceptions of corruption']
clustering_df = df_merged.copy()[clustering_columns]

# Numeric feature matrix: everything except the country label.
X = clustering_df.drop('Country name', axis=1)
X.head(n=5)

## Standardization

In [None]:
# Standardize the features with StandardScaler and wrap the result back into
# a DataFrame with the original column names.
scaler = StandardScaler()
X_transformed = pd.DataFrame(data=scaler.fit_transform(X), columns=X.columns)
X_transformed.describe()

In [None]:
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12,4))

# Left: raw feature distributions; right: the same features after scaling
# (legend hidden on the right panel).
hist_panels = (
    (X, ax1, 'Before Standardization', {}),
    (X_transformed, ax2, 'After Standardization', {'legend': False}),
)
for frame, panel_ax, panel_title, extra_kwargs in hist_panels:
    sns.histplot(frame, kde=True, ax=panel_ax, **extra_kwargs)
    panel_ax.set_title(panel_title)

## Hyperparameter Tuning

In [None]:
# Fit K-Means for k = 2..9 and record the inertia of each fit.
ssd = []

for i in range(2,10):
  model = KMeans(n_clusters=i, random_state=4).fit(X_transformed)
  ssd.append(model.inertia_)

elbow_ax = sns.lineplot(x=range(2,10), y=ssd, marker="o")
elbow_ax.set_title('Sum of squared distances of samples to their closest cluster center')
elbow_ax.set_xlabel('Number of clusters')
elbow_ax.set_ylabel('SSD');

From the elbow plot above, the optimal number of clusters k appears to be 4 or 5.

In [None]:
# Locate the elbow of the decreasing, convex SSD curve programmatically.
kl = KneeLocator(x=range(2,10), y=ssd, curve="convex", direction="decreasing")
kl.elbow

## Clustering Result

In [None]:
# Final clustering with k = 4; attach each row's cluster label to the data.
model = KMeans(n_clusters=4, random_state=4)
cluster = model.fit(X_transformed).labels_

clustering_df['cluster'] = cluster
clustering_df.head(n=5)

In [None]:
fig = plt.figure(figsize=(15,10))

# Top row: ladder score density per cluster (with legend).
sns.kdeplot(clustering_df['Ladder score'], fill=True, hue=clustering_df.cluster,
            legend=True, ax=fig.add_subplot(3, 1, 1))

# Rows two and three: columns 2..7 of clustering_df, one per cell of a 3x3
# grid starting at position 4.
for position, column in zip(range(4, 10), clustering_df.columns[2:8]):
    sns.kdeplot(clustering_df[column], fill=True, hue=clustering_df.cluster,
                legend=False, ax=fig.add_subplot(3, 3, position))

fig.tight_layout()
fig.subplots_adjust(top=0.85)
fig.suptitle('K-Means Clustering Result', fontsize=20, fontweight='bold', y=0.9);

In [None]:
# for visualisation purpose
# Every row gets the same 'Constant' value so the column can serve as the
# single shared x category in the strip and swarm plots that follow.
clustering_df['Constant'] = 'Data'

In [None]:
fig = plt.figure(figsize=(18,5))
fig.suptitle('Strip plots for ladder score and each factors (K-means Clustering)', fontsize=20, fontweight='bold')

# Seven panels in one row: the ladder score first, then columns 2..7 of
# clustering_df. Only the last panel keeps its legend.
strip_columns = ['Ladder score', *clustering_df.columns[2:8]]
for i, column in enumerate(strip_columns, 1):
    ax = sns.stripplot(x=clustering_df['Constant'], y=clustering_df[column].values,
                       hue=clustering_df['cluster'], jitter=True,
                       ax=fig.add_subplot(1, 7, i))
    ax.set_title(column)
    ax.set(xlabel=None)
    ax.set(xticklabels=[])
    if i != 7:
      ax.legend_.remove()

fig.tight_layout()
fig.subplots_adjust(top=0.85)

In [None]:
fig = plt.figure(figsize=(18,10))
fig.suptitle('Swarm plots for ladder score and each factors (K-Means Clustering)', fontsize=20, fontweight='bold', y=0.9)

# Shared x category and hue for every panel.
swarm_x = clustering_df['Constant']
swarm_hue = clustering_df['cluster']

# Top row: ladder score (this panel keeps its legend).
ax = sns.swarmplot(x=swarm_x, y=clustering_df['Ladder score'].values, hue=swarm_hue,
                   ax=fig.add_subplot(3, 1, 1))
ax.set_title('Ladder score')
ax.set(xlabel=None)
ax.set(xticklabels=[])

# Rows two and three: columns 2..7 of clustering_df in a 3x3 grid starting at
# position 4, legends removed.
for i, column in enumerate(clustering_df.columns[2:8], 4):
    ax = sns.swarmplot(x=swarm_x, y=clustering_df[column].values, hue=swarm_hue,
                       ax=fig.add_subplot(3, 3, i))
    ax.set_title(column)
    ax.set(xlabel=None)
    ax.set(xticklabels=[])
    ax.legend_.remove()

fig.tight_layout()
fig.subplots_adjust(top=0.85)

In [None]:
# NOTE(review): geopandas is already installed in the setup cell at the top of
# the notebook, so this repeat install looks redundant; confirm before removing.
!pip install geopandas

In [None]:
import geopandas as gpd

# Natural Earth 1:110m countries, the source of the former bundled
# 'naturalearth_lowres' dataset.
NATURAL_EARTH_URL = "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip"

try:
  # Bundled dataset: available up to GeoPandas 0.14 (deprecated there).
  world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
except AttributeError:
  # GeoPandas >= 1.0 removed `geopandas.datasets`; download the same layer
  # instead. Its attribute columns are upper-case (e.g. NAME), so lower-case
  # them to keep the 'name' column that the later merge relies on.
  world = gpd.read_file(NATURAL_EARTH_URL).rename(columns=str.lower)
world.head(5)

In [None]:
# Remove the plotting helper column. Rebinding with errors='ignore' (rather
# than dropping in place) keeps this cell idempotent: running it a second time
# no longer raises KeyError because 'Constant' is already gone.
clustering_df = clustering_df.drop(columns='Constant', errors='ignore')
clustering_df.head(5)

In [None]:
# Align the country column with the world map's 'name' column, then join the
# cluster labels onto the country geometries (default inner join).
# NOTE(review): the merged dataset has a 'Year' column, so a country may
# appear in several rows with different cluster labels; confirm which label
# the map is meant to show.
clustering_df = clustering_df.rename(columns={'Country name': 'name'})
clustering_map_df = pd.merge(left=world, right=clustering_df, on='name')

fig, ax = plt.subplots(figsize = (15, 5))
ax.set_title("Clusters of Countries (K Means Clustering Model)")
map_legend_options = {'label': "cluster"}
clustering_map_df.plot(column='cluster', ax=ax, legend=True, legend_kwds=map_legend_options);