In [1]:
!pip install kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download daniboy370/world-data-by-country-2020
! unzip "world-data-by-country-2020.zip" -d ./world-data
!pip install wpca
!pip install hvplot
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Downloading world-data-by-country-2020.zip to /content
  0% 0.00/20.8k [00:00<?, ?B/s]
100% 20.8k/20.8k [00:00<00:00, 19.2MB/s]
Archive:  world-data-by-country-2020.zip
  inflating: ./world-data/Fertility.csv  
  inflating: ./world-data/GDP per capita.csv  
  inflating: ./world-data/Life expectancy.csv  
  inflating: ./world-data/Meat consumption.csv  
  inflating: ./world-data/Median age.csv  
  inflating: ./world-data/Population growth.csv  
  inflating: ./world-data/Sex-ratio.csv  
  inflating: ./world-data/Suicide rate.csv  
  inflating: ./world-data/Urbanization rate.csv  
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wpca
  Downloading wpca-0.1.tar.gz (900 kB)
[K     |████████████████████████████████| 900 kB 4.1 MB/s 
[?25hBuilding wheels for collected packages: wpca
  Building wheel for wpca (setup.py) ... [?25l[?2

In [2]:
from scipy.special import hyp2f1

class Mean_pop_one_over_x:
  """Weighted fit of the hyperbola ``y = a_1 / x`` plus a position index along it.

  ``fit`` adjusts the single coefficient ``a_1`` with sign-based update steps
  that are weighted by ``pop``. Afterwards every sample can be projected onto
  the curve and described by

  * an *index*: the arc-length position of its projection, rescaled so that
    the smallest projection seen during fitting maps to 0 and the largest to 1;
  * a *residue*: the Euclidean distance between the sample and its projection.

  Parameters
  ----------
  lrate : float, default 0.05
      Step-size factor of the update rule.
  niter : int, default 3
      Number of passes over the training samples.

  Attributes set by ``fit``
  -------------------------
  a_1 : fitted coefficient of the curve.
  xmax_, xmin_, ymax_, ymin_, popmax_ : extrema of the training inputs.
  index_zero_, index_one_ : arc-length offset and span used by ``get_index_res``.
  """

  # A root of the projection quartic counts as real when the magnitude of its
  # imaginary part is below this tolerance.
  IMAG_TOL = 0.001

  def __init__(self, lrate = 0.05, niter = 3):
    self.lrate = lrate
    self.niter = niter

  def fit(self,X,y,pop):
    """Estimate ``a_1`` from samples ``(X, y)`` weighted by ``pop``.

    ``X``, ``y`` and ``pop`` must be equally long iterables that provide
    ``.max()`` / ``.min()`` (e.g. NumPy arrays or pandas Series).
    Returns None; results are stored on the instance.
    """
    self.a_1 = 0
    self.xmax_ = X.max()
    self.xmin_ = X.min()
    self.ymax_ = y.max()
    self.ymin_ = y.min()
    self.popmax_ = pop.max()

    # Loop-invariant part of the step: learning rate times the mid-range of y.
    base_step = self.lrate*(self.ymax_+self.ymin_)/2

    for _ in range(self.niter):
      for x, target, weight in zip(X,y,pop):
        closest_x = self.rootfinder(x,target)
        step = base_step*closest_x*weight/self.popmax_
        # Raise the curve when the sample lies above it at x, lower it otherwise.
        if target > self.a_1/x:
          self.a_1 += step
        else:
          self.a_1 -= step
    self.scale_index(X,y)
    return None

  def predict(self,x):
    """Return the curve value ``a_1 / x``."""
    return self.a_1/x

  def net_input(self,x):
    """Return the curve value ``a_1 / x`` (same computation as ``predict``)."""
    return self.a_1/x

  def scale_index(self,X,y):
    """Store the arc-length offset and span covered by the projections of ``(X, y)``.

    Sets ``index_zero_`` (arc length at the smallest projection) and
    ``index_one_`` (arc length between the smallest and largest projection).
    With empty input nothing is set. Returns ``self``.
    """
    median_list = [self.rootfinder(x,target) for x, target in zip(X,y)]
    if median_list:
      # Computed once after all projections are known rather than per sample.
      lowest = self.arclength(np.min(median_list))
      highest = self.arclength(np.max(median_list))
      self.index_one_ = highest - lowest
      self.index_zero_ = lowest
    return self

  # you need to scale the data to get the correct residues
  def get_index_res(self,x,y):
    """Return ``(index, res)`` for the point ``(x, y)``.

    ``index`` is the rescaled arc-length position of the point's projection on
    the curve; ``res`` is the Euclidean distance between the point and that
    projection. Inputs should be on the same scale as the data used in ``fit``.
    """
    closest_x = self.rootfinder(x,y)
    closest_y = self.a_1/closest_x
    distance_x = (x - closest_x)
    distance_y = (y - closest_y)
    res = np.sqrt(distance_x**2+distance_y**2)
    index = (self.arclength(closest_x) - self.index_zero_)/self.index_one_
    return index, res

  def arclength(self,g):
    """Closed-form antiderivative of the arc-length element of ``y = a_1/x`` at ``g``.

    Only differences between two return values are arc lengths. The expression
    takes ``sqrt(a_1)`` and divides by ``g``, so it is not meaningful unless
    ``a_1 > 0`` and ``g != 0``.
    """
    gg = g/np.sqrt(self.a_1)
    return -np.sqrt(self.a_1)/gg*(1+gg**4)**1.5*hyp2f1(1,5/4,3/4,-gg**4)

  def rootfinder(self,x,y):
    """Return the real stationary abscissa of the point-to-curve distance nearest to ``x``.

    The stationary points of the squared distance between ``(x, y)`` and
    ``(t, a_1/t)`` are the roots of ``t**4 - x*t**3 + a_1*y*t - a_1**2``.
    Among the roots that are real within ``IMAG_TOL`` the one closest to ``x``
    is returned.

    Raises
    ------
    ValueError
        If no root is real within the tolerance.
    """
    coeff = [1,-x,0,self.a_1*y,-self.a_1**2]
    best = None
    delta = np.inf
    for root in np.roots(coeff):
      # Reject complex roots on either side of the real axis (the sign of the
      # imaginary part must not matter); NaN parts fail the comparison too.
      if not abs(root.imag) < self.IMAG_TOL:
        continue
      distance = np.abs(x - root.real)
      if distance < delta:
        delta = distance
        best = root.real
    if best is None:
      raise ValueError("no real root found for x=%r, y=%r, a_1=%r" % (x, y, self.a_1))
    return best

In [3]:
import pandas as pd
import numpy as np
import panel as pn
pn.extension('tabulator')
import hvplot.pandas
import matplotlib.pyplot as plt
import holoviews as hv
import seaborn as sns
from  matplotlib.ticker import PercentFormatter
from scipy.stats import iqr
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.pipeline import make_pipeline

import glob
# Read every CSV extracted into ./world-data into its own DataFrame.
# The paths are sorted because glob returns them in filesystem order, and the
# merge below keeps the 'Country' column of the last table it sees.
files = sorted(glob.glob("./world-data/*.csv"))

content = [pd.read_csv(filename, index_col = None) for filename in files]

from functools import reduce
def _merge_on_iso(left, right):
    """Outer-join two tables on 'ISO-code' and keep a single 'Country' column.

    The right table's name is preferred; where a row exists only in the left
    table its name is kept instead of being lost.
    """
    merged = pd.merge(left, right, on=['ISO-code'], how='outer')
    merged['Country_y'] = merged['Country_y'].fillna(merged['Country_x'])
    return merged.drop(columns=['Country_x']).rename(columns={'Country_y': 'Country'})


# Fold all per-indicator tables into one wide table keyed by ISO code.
df_merged = reduce(_merge_on_iso, content)

# NOTE(review): the reason for excluding 'GIN' is not recorded here - confirm.
df_merged = df_merged.loc[df_merged['ISO-code'] != "GIN"].drop_duplicates()

# Drop the 'RUS' row(s) carrying Fertility == 1.82. Filtering with a mask built
# on the de-duplicated frame avoids dropping index labels that may no longer exist.
_rus_fertility_182 = (df_merged['ISO-code'] == 'RUS') & (df_merged['Fertility'] == 1.82)
df_merged = df_merged.loc[~_rus_fertility_182]

WORLD_BANK_URL = "http://api.worldbank.org/v2/country/all/indicator/{indicator}?per_page=25000"


def fetch_latest_indicator(indicator, column):
    """Download one World Bank indicator and keep one row per country code.

    Parameters
    ----------
    indicator : str
        Indicator code inserted into the World Bank API URL.
    column : str
        Name given to the 'value' column in the result.

    Returns
    -------
    pandas.DataFrame
        Columns 'ISO-code' and ``column``; for every 'countryiso3code' the
        non-missing row with the largest 'date' is kept.
    """
    raw = pd.read_xml(WORLD_BANK_URL.format(indicator=indicator))
    raw = raw.dropna(subset=['value'])
    # Row label of the most recent observation within each country group.
    latest = raw.groupby('countryiso3code').date.transform('idxmax').unique()
    # Drop NaN labels (presumably produced by rows without a country code - TODO confirm).
    latest = pd.Series(latest).dropna().values
    return raw.loc[latest][['countryiso3code','value']]\
              .rename(columns = {'value': column, 'countryiso3code': 'ISO-code'})


pop = fetch_latest_indicator("SP.POP.TOTL", 'population')
gdp = fetch_latest_indicator("NY.GDP.PCAP.PP.CD", 'GDPc')
mort = fetch_latest_indicator("SH.DYN.NMRT", 'mort')
pop_grow = fetch_latest_indicator("SP.POP.GROW", 'Population growth')
fert = fetch_latest_indicator("SP.DYN.TFRT.IN", 'Fertility')
life_exp = fetch_latest_indicator("SP.DYN.LE00.IN", 'Life expectancy')
suicide = fetch_latest_indicator("SH.STA.SUIC.P5", 'Suicide rate')

In [4]:
N_CLUSTERS = 5
SEED = 42  # fixed so that the k-means clustering is reproducible across runs

# Optional indicators: outer joins keep countries that lack some of them; the
# final inner join restricts the table to countries present in the Kaggle data.
indicators = pop_grow.merge(fert,on='ISO-code', how='outer')\
                     .merge(life_exp,on='ISO-code', how='outer')\
                     .merge(suicide,on='ISO-code', how='outer')\
                     .merge(df_merged[['ISO-code','Meat consumption','Urbanization rate','Sex-ratio','Country']],on=['ISO-code'], how='inner')

# Population, GDPc and mort are required, hence inner joins throughout.
data = pop.merge(gdp,on='ISO-code', how='inner')\
          .merge(mort,on='ISO-code', how='inner')\
          .merge(indicators,on='ISO-code', how='inner')\
          .reset_index(drop=True)

# getting the dev_index: position of each country along the fitted a_1/x curve
# in min-max scaled (GDPc, mort) space
min_max = MinMaxScaler()
two_col_scaled = pd.DataFrame(min_max.fit_transform(data[['GDPc','mort']]),columns=['GDPc','mort'])
one_over_x = Mean_pop_one_over_x()
one_over_x.fit(two_col_scaled.GDPc,two_col_scaled.mort,data.population)
# get_index_res returns (index, residue); only the index (column 0) is kept.
data['dev_index'] = two_col_scaled.apply(lambda row: one_over_x.get_index_res(row.GDPc, row.mort), axis='columns', result_type='expand')[0]

pipe_kmeans = make_pipeline(
    MinMaxScaler(),
    KNNImputer(),
    KMeans(n_clusters = N_CLUSTERS, random_state = SEED)
)
clusters = pipe_kmeans.fit_predict(data[['GDPc','mort']],kmeans__sample_weight = data['population'])
data['clusters'] = pd.Series(clusters, index = data.index)

# Relabel clusters by ascending mean dev_index: 0 = lowest mean, N_CLUSTERS-1 = highest.
cluster_order = data.groupby('clusters').dev_index.mean().sort_values().index
cluster_rank = {label: rank for rank, label in enumerate(cluster_order)}
data["clusters"] = data["clusters"].map(cluster_rank).astype(int)



In [5]:
# Persist the assembled table to data.csv in the working directory; with the
# default arguments the row index is written as the first (unnamed) column.
data.to_csv('data.csv')