In [1]:
import os
import pandas as pd
import numpy as np
import kagglehub

import plotly.express as px

from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# path = kagglehub.dataset_download("rohan0301/unsupervised-learning-on-country-data")
# os.rename(path, './datasets')

In [2]:
# Load the country indicators table from the local datasets folder.
df = pd.read_csv(filepath_or_buffer='./datasets/Country-data.csv')

In [3]:
# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,country,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp
0,Afghanistan,90.2,10.0,7.58,44.9,1610,9.44,56.2,5.82,553
1,Albania,16.6,28.0,6.55,48.6,9930,4.49,76.3,1.65,4090
2,Algeria,27.3,38.4,4.17,31.4,12900,16.1,76.5,2.89,4460
3,Angola,119.0,62.3,2.85,42.9,5900,22.4,60.1,6.16,3530
4,Antigua and Barbuda,10.3,45.5,6.03,58.9,19100,1.44,76.8,2.13,12200


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp
count,167.0,167.0,167.0,167.0,167.0,167.0,167.0,167.0,167.0
mean,38.27006,41.108976,6.815689,46.890215,17144.688623,7.781832,70.555689,2.947964,12964.155689
std,40.328931,27.41201,2.746837,24.209589,19278.067698,10.570704,8.893172,1.513848,18328.704809
min,2.6,0.109,1.81,0.0659,609.0,-4.21,32.1,1.15,231.0
25%,8.25,23.8,4.92,30.2,3355.0,1.81,65.3,1.795,1330.0
50%,19.3,35.0,6.32,43.3,9960.0,5.39,73.1,2.41,4660.0
75%,62.1,51.35,8.6,58.75,22800.0,10.75,76.8,3.88,14050.0
max,208.0,200.0,17.9,174.0,125000.0,104.0,82.8,7.49,105000.0


In [None]:
# corr = df.corr()

# import seaborn as sns

# plt.figure()

# sns.heatmap(corr, cmap='coolwarm')

# plt.show()

In [5]:
# Feature matrix: every column of df except the 'country' label column.
X = df.drop(columns=['country'])

In [6]:
# Standardise the features before PCA / clustering.
# The scaler is kept in `scal`; fit and transform are chained explicitly on the same data.
scal = StandardScaler()
X_scal = scal.fit(X).transform(X)

In [7]:
# Fit a PCA with no component limit on the scaled features so that the
# variance contribution of every component can be inspected below.
pca = PCA().fit(X_scal)
pca

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [8]:
# Inspect the shape of the fitted component matrix.
pca.components_.shape

(9, 9)

In [9]:
# Explained-variance ratio of each principal component, as reported by the fitted PCA.
variances = pca.explained_variance_ratio_
variances

array([0.4595174 , 0.17181626, 0.13004259, 0.11053162, 0.07340211,
       0.02484235, 0.0126043 , 0.00981282, 0.00743056])

In [10]:
# Running total of the per-component explained-variance ratios.
cum_variance = variances.cumsum()
cum_variance

array([0.4595174 , 0.63133365, 0.76137624, 0.87190786, 0.94530998,
       0.97015232, 0.98275663, 0.99256944, 1.        ])

In [11]:
# Bar chart of cumulative explained variance versus number of components.
# The x-axis is 1-based so bar k reads as "variance explained by the first k components";
# explicit labels and a title let the saved PNG stand on its own.
fig = px.bar(
    x=np.arange(1, len(cum_variance) + 1),
    y=cum_variance,
    labels={
        'x': 'Number of principal components',
        'y': 'Cumulative explained variance ratio',
    },
    title='Cumulative explained variance by number of PCA components',
)

fig.write_image('./cum_var.png')

In [12]:
# Refit PCA keeping only two components and wrap the 2-D projection in a DataFrame
# (columns are the default integer labels 0 and 1).
# NOTE(review): this rebinds `pca`, replacing the all-components model fitted earlier.
pca = PCA(n_components=2)
X_red = pca.fit_transform(X_scal)
df_red = pd.DataFrame(data=X_red)

In [13]:
# Density-based clustering on the full scaled feature matrix (not on the 2-D projection);
# the resulting labels are stored next to the PCA coordinates for plotting.
scn = DBSCAN(eps=1, min_samples=2).fit(X_scal)
df_red['scn_cluster'] = scn.labels_

In [14]:
# Show the per-sample cluster labels produced by the DBSCAN fit.
scn.labels_

array([ 0,  1, -1, -1,  1, -1,  1,  2,  2, -1,  1, -1,  3,  1, -1,  4,  1,
        5,  1,  1,  1, -1,  1,  6,  1,  5, -1,  1,  5,  2,  1, -1,  7,  1,
        1,  1,  8, -1, -1,  1,  0,  1,  1,  1,  2,  1,  1,  1,  1, -1, -1,
        1,  1,  2,  2, -1,  8,  1,  2, -1,  2,  1,  1,  0,  5,  1, -1,  1,
        2,  3, -1,  1, -1, -1, -1,  2,  1,  2, -1, -1,  8, -1,  6,  1, -1,
        1,  1, -1, -1, -1,  1, -1,  1,  8,  0, -1,  1,  7, -1, -1,  1, -1,
       -1, -1,  1,  1,  0, -1, -1,  9,  4,  2,  7, -1, -1, 10, -1,  1,  1,
        1,  1,  1,  2, -1,  1,  1, -1, 11, 10,  8,  1, -1, -1, -1,  1,  1,
       -1, -1,  1,  2, -1,  1, -1,  1,  2, -1, -1,  0, -1, -1, -1, 11,  1,
        1, -1,  0, -1, -1,  2, -1,  1,  9,  1, -1, -1, -1,  0])

In [15]:
# K-means on the full scaled feature matrix; labels are stored next to the PCA coordinates.
# random_state pins the centroid initialisation so cluster labels are reproducible across
# runs, and n_init is set explicitly because its default differs between scikit-learn versions.
km = KMeans(n_clusters=10, n_init=10, random_state=42)
km.fit(X_scal)
df_red['km_cluster'] = km.labels_

In [16]:
# Scatter of the two PCA components coloured by DBSCAN cluster.
# The labels are integers, which Plotly Express would map onto a continuous colour scale;
# converting them to strings on a plotting copy gives each cluster its own discrete colour
# without modifying df_red.
scn_plot_df = df_red.assign(scn_cluster=df_red['scn_cluster'].astype(str))
# Keep the legend in numeric label order rather than order of first appearance.
cluster_order = [str(label) for label in sorted(df_red['scn_cluster'].unique())]

fig = px.scatter(
    scn_plot_df,
    x=0,
    y=1,
    color='scn_cluster',
    category_orders={'scn_cluster': cluster_order},
    title='DBSCAN clusters on the first two principal components',
)

fig.write_image('./clustered.png')

In [17]:
# t-SNE embedding of the scaled features.
# t-SNE is stochastic, so random_state is fixed to make the embedding (and the saved plot)
# reproducible on a fresh kernel.
tsn = TSNE(random_state=42)

Xt = tsn.fit_transform(X_scal)

In [18]:
# Transpose the embedding rows into one tuple of values per embedding dimension.
Xtz = [coords for coords in zip(*Xt)]

In [19]:
# Attach the first two embedding dimensions to X as columns 'f1' and 'f2'.
# NOTE(review): this adds columns to X in place, so re-running the scaling cell afterwards
# would feed f1/f2 back in as features.
f1_coords, f2_coords = Xtz[0], Xtz[1]
X['f1'] = list(f1_coords)
X['f2'] = list(f2_coords)

In [20]:
# Scatter of the two t-SNE coordinates stored in X; axis labels and a title are set so
# the saved PNG is readable on its own.
fig = px.scatter(
    X,
    x='f1',
    y='f2',
    labels={'f1': 't-SNE dimension 1', 'f2': 't-SNE dimension 2'},
    title='t-SNE embedding of the scaled country indicators',
)

fig.write_image('./tsne.png')