In [78]:
import os

import pandas as pd
import numpy as np

# plotting packages
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as clrs

# Kmeans algorithm from scikit-learn
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

In [79]:
raw = pd.read_csv(r'C:\Users\fcars\Desktop\RSM338\python assignment 2\Country Risk 2019 Data.csv')

# check the raw data
print("Size of the dataset (row, col): ", raw.shape)
print("\nFirst 5 rows\n", raw.head(n=5))

Size of the dataset (row, col):  (121, 6)

First 5 rows
      Country Abbrev  Corruption  Peace  Legal  GDP Growth
0    Albania     AL          35  1.821  4.546       2.983
1    Algeria     DZ          35  2.219  4.435       2.553
2  Argentina     AR          45  1.989  5.087      -3.061
3    Armenia     AM          42  2.294  4.812       6.000
4  Australia     AU          77  1.419  8.363       1.713


In [80]:
print("\nSummary statistics\n", raw.describe())
print("\nCorrelation matrix\n", raw.corr())


Summary statistics
        Corruption       Peace       Legal  GDP Growth
count  121.000000  121.000000  121.000000  121.000000
mean    46.842975    2.001017    5.752529    2.657529
std     18.702499    0.461485    1.373932    2.563741
min     15.000000    1.072000    2.671000   -9.459000
25%     33.000000    1.699000    4.785000    1.249000
50%     41.000000    1.939000    5.455000    2.600000
75%     60.000000    2.294000    6.488000    4.000000
max     87.000000    3.369000    8.712000    7.800000

Correlation matrix
             Corruption     Peace     Legal  GDP Growth
Corruption    1.000000 -0.705002  0.938512   -0.123545
Peace        -0.705002  1.000000 -0.662233   -0.004428
Legal         0.938512 -0.662233  1.000000   -0.150369
GDP Growth   -0.123545 -0.004428 -0.150369    1.000000


In [81]:
X = raw[['Peace', 'Legal', 'GDP Growth']]
X = (X - X.mean()) / X.std()
print(X.head(5))

      Peace     Legal  GDP Growth
0 -0.390081 -0.878158    0.126952
1  0.472352 -0.958948   -0.040772
2 -0.026039 -0.484397   -2.230541
3  0.634871 -0.684553    1.303747
4 -1.261182  1.900001   -0.368418


In [96]:
# using default n_init (=10)
k = 3
kmeans = KMeans(n_clusters=k, random_state=1)
kmeans.fit(X)

# print inertia & cluster center
print("inertia for k=3 is", kmeans.inertia_)
print("cluster centers: ", kmeans.cluster_centers_)

# take a quick look at the result
y = kmeans.labels_
print("cluster labels: ", y)

inertia for k=3 is 161.13338710052557
cluster centers:  [[ 1.22506036 -0.83385901 -1.07842464]
 [-0.85097477  1.02149992 -0.23897931]
 [ 0.23006626 -0.54045468  0.65506397]]
cluster labels:  [2 2 0 2 1 1 2 2 2 1 2 2 2 1 0 2 0 2 1 0 1 2 2 1 2 1 1 0 1 2 0 2 2 1 2 1 1
 2 2 1 2 2 2 2 1 1 2 2 0 1 2 1 1 1 1 2 2 1 1 1 0 0 1 2 2 1 2 2 1 0 2 2 2 2
 2 1 1 0 0 1 1 0 2 0 2 2 1 1 1 1 0 2 0 2 2 2 1 1 1 0 1 2 1 1 1 2 2 2 0 2 0
 2 0 1 1 1 1 2 0 2 0]


In [97]:
result = pd.DataFrame({'Country':raw['Country'], 'Abbrev':raw['Abbrev'], 'Label':y})
with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
    print(result.sort_values('Label'))

                          Country Abbrev  Label
60                        Lebanon     LB      0
30                        Ecuador     EC      0
48                           Iran     IR      0
61                        Liberia     LR      0
69                         Mexico     MX      0
77                      Nicaragua     NI      0
78                        Nigeria     NG      0
81                       Pakistan     PK      0
83                       Paraguay     PY      0
90                         Russia     RU      0
92                   Saudi Arabia     SA      0
99                   South Africa     ZA      0
108           Trinidad and Tobago     TT      0
110                        Turkey     TR      0
112                       Ukraine     UA      0
118                         Yemen     YE      0
27   Democratic Republic of Congo     CD      0
19                           Chad     TD      0
120                      Zimbabwe     ZW      0
14                         Brazil     BR

In [98]:
#count labels when n_init is in default 
n_init_default = result.groupby("Label")[["Label"]].agg(['count'])
print(n_init_default)

      Label
      count
Label      
0        22
1        46
2        53


# (B) Changing n_init to 2, 5, 11, 12

# When n_init = 2

In [99]:
# using n_init = 2
k = 3
kmeans = KMeans(n_clusters=k, n_init=2, random_state=1)
kmeans.fit(X)

# print inertia & cluster center
print("inertia for k=3 is", kmeans.inertia_)
print("cluster centers: ", kmeans.cluster_centers_)

# take a quick look at the result
y = kmeans.labels_
print("cluster labels: ", y)

inertia for k=3 is 169.24242908631018
cluster centers:  [[ 0.53110654 -0.61456608  0.34774502]
 [-0.85103491  0.99692377 -0.22524313]
 [ 0.70529573 -0.95894794 -3.43893096]]
cluster labels:  [0 0 2 0 1 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 0 0 1 0 1 1
 0 0 1 0 0 0 0 1 1 0 0 2 1 0 1 1 1 1 0 0 1 1 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0
 0 1 1 2 0 1 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 1 1 0 1 0 1 1 1 0 0 0 0 0 0
 0 0 1 1 1 1 0 0 0 2]


In [102]:
result = pd.DataFrame({'Country':raw['Country'], 'Abbrev':raw['Abbrev'], 'Label':y})
with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
    print(result.sort_values('Label'))

                          Country Abbrev  Label
0                         Albania     AL      0
81                       Pakistan     PK      0
78                        Nigeria     NG      0
74                          Nepal     NP      0
73                     Mozambique     MZ      0
72                        Morocco     MA      0
71                     Montenegro     ME      0
70                        Moldova     FM      0
69                         Mexico     MX      0
82                         Panama     PA      0
67                     Mauritania     MR      0
64                         Malawi     MW      0
63                     Madagascar     MG      0
61                        Liberia     LR      0
119                        Zambia     ZM      0
56                          Kenya     KE      0
55                     Kazakhstan     KZ      0
50                         Israel     IL      0
47                      Indonesia     ID      0
66                           Mali     ML

In [103]:
n_init_two = result.groupby("Label")[["Label"]].agg(['count'])
print(n_init_two)

      Label
      count
Label      
0        70
1        47
2         4


# When n_init = 5

In [104]:
# using n_init = 5
k = 3
kmeans = KMeans(n_clusters=k, n_init=5, random_state=1)
kmeans.fit(X)

# print inertia & cluster center
print("inertia for k=3 is", kmeans.inertia_)
print("cluster centers: ", kmeans.cluster_centers_)

# take a quick look at the result
y = kmeans.labels_
print("cluster labels: ", y)

inertia for k=3 is 161.2553773419662
cluster centers:  [[ 1.21777164 -0.67632275 -0.90441375]
 [-0.85097477  1.02149992 -0.23897931]
 [ 0.15270973 -0.60009398  0.70424093]]
cluster labels:  [2 2 0 2 1 1 0 0 2 1 2 2 2 1 0 2 0 2 1 0 1 2 0 1 2 1 1 0 1 2 0 2 2 1 2 1 1
 2 2 1 2 2 2 2 1 1 2 2 0 1 0 1 1 1 1 2 2 1 1 1 0 0 1 2 2 1 2 2 1 0 2 2 2 2
 2 1 1 0 0 1 1 0 2 0 2 2 1 1 1 1 0 2 0 2 2 2 1 1 1 0 1 2 1 1 1 2 2 2 0 2 0
 2 0 1 1 1 1 2 0 2 0]


In [105]:
result = pd.DataFrame({'Country':raw['Country'], 'Abbrev':raw['Abbrev'], 'Label':y})
with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
    print(result.sort_values('Label'))

                          Country Abbrev  Label
60                        Lebanon     LB      0
30                        Ecuador     EC      0
48                           Iran     IR      0
50                         Israel     IL      0
61                        Liberia     LR      0
69                         Mexico     MX      0
77                      Nicaragua     NI      0
78                        Nigeria     NG      0
27   Democratic Republic of Congo     CD      0
81                       Pakistan     PK      0
90                         Russia     RU      0
92                   Saudi Arabia     SA      0
99                   South Africa     ZA      0
108           Trinidad and Tobago     TT      0
110                        Turkey     TR      0
112                       Ukraine     UA      0
118                         Yemen     YE      0
83                       Paraguay     PY      0
22                       Colombia     CO      0
120                      Zimbabwe     ZW

In [106]:
n_init_five = result.groupby("Label")[["Label"]].agg(['count'])
print(n_init_five)

      Label
      count
Label      
0        26
1        46
2        49


# When n_init = 11

In [107]:
# using n_init = 12
k = 3
kmeans = KMeans(n_clusters=k, n_init=11, random_state=1)
kmeans.fit(X)

# print inertia & cluster center
print("inertia for k=3 is", kmeans.inertia_)
print("cluster centers: ", kmeans.cluster_centers_)

# take a quick look at the result
y = kmeans.labels_
print("cluster labels: ", y)

inertia for k=3 is 161.13338710052557
cluster centers:  [[ 1.22506036 -0.83385901 -1.07842464]
 [-0.85097477  1.02149992 -0.23897931]
 [ 0.23006626 -0.54045468  0.65506397]]
cluster labels:  [2 2 0 2 1 1 2 2 2 1 2 2 2 1 0 2 0 2 1 0 1 2 2 1 2 1 1 0 1 2 0 2 2 1 2 1 1
 2 2 1 2 2 2 2 1 1 2 2 0 1 2 1 1 1 1 2 2 1 1 1 0 0 1 2 2 1 2 2 1 0 2 2 2 2
 2 1 1 0 0 1 1 0 2 0 2 2 1 1 1 1 0 2 0 2 2 2 1 1 1 0 1 2 1 1 1 2 2 2 0 2 0
 2 0 1 1 1 1 2 0 2 0]


In [108]:
result = pd.DataFrame({'Country':raw['Country'], 'Abbrev':raw['Abbrev'], 'Label':y})
with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
    print(result.sort_values('Label'))

                          Country Abbrev  Label
60                        Lebanon     LB      0
30                        Ecuador     EC      0
48                           Iran     IR      0
61                        Liberia     LR      0
69                         Mexico     MX      0
77                      Nicaragua     NI      0
78                        Nigeria     NG      0
81                       Pakistan     PK      0
83                       Paraguay     PY      0
90                         Russia     RU      0
92                   Saudi Arabia     SA      0
99                   South Africa     ZA      0
108           Trinidad and Tobago     TT      0
110                        Turkey     TR      0
112                       Ukraine     UA      0
118                         Yemen     YE      0
27   Democratic Republic of Congo     CD      0
19                           Chad     TD      0
120                      Zimbabwe     ZW      0
14                         Brazil     BR

In [110]:
n_init_eleven = result.groupby("Label")[["Label"]].agg(['count'])
print(n_init_eleven)

      Label
      count
Label      
0        22
1        46
2        53


# When n_init = 12

In [111]:
# using n_init = 12
k = 3
kmeans = KMeans(n_clusters=k, n_init=12, random_state=1)
kmeans.fit(X)

# print inertia & cluster center
print("inertia for k=3 is", kmeans.inertia_)
print("cluster centers: ", kmeans.cluster_centers_)

# take a quick look at the result
y = kmeans.labels_
print("cluster labels: ", y)

inertia for k=3 is 161.13338710052557
cluster centers:  [[ 1.22506036 -0.83385901 -1.07842464]
 [-0.85097477  1.02149992 -0.23897931]
 [ 0.23006626 -0.54045468  0.65506397]]
cluster labels:  [2 2 0 2 1 1 2 2 2 1 2 2 2 1 0 2 0 2 1 0 1 2 2 1 2 1 1 0 1 2 0 2 2 1 2 1 1
 2 2 1 2 2 2 2 1 1 2 2 0 1 2 1 1 1 1 2 2 1 1 1 0 0 1 2 2 1 2 2 1 0 2 2 2 2
 2 1 1 0 0 1 1 0 2 0 2 2 1 1 1 1 0 2 0 2 2 2 1 1 1 0 1 2 1 1 1 2 2 2 0 2 0
 2 0 1 1 1 1 2 0 2 0]


In [112]:
result = pd.DataFrame({'Country':raw['Country'], 'Abbrev':raw['Abbrev'], 'Label':y})
with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
    print(result.sort_values('Label'))

                          Country Abbrev  Label
60                        Lebanon     LB      0
30                        Ecuador     EC      0
48                           Iran     IR      0
61                        Liberia     LR      0
69                         Mexico     MX      0
77                      Nicaragua     NI      0
78                        Nigeria     NG      0
81                       Pakistan     PK      0
83                       Paraguay     PY      0
90                         Russia     RU      0
92                   Saudi Arabia     SA      0
99                   South Africa     ZA      0
108           Trinidad and Tobago     TT      0
110                        Turkey     TR      0
112                       Ukraine     UA      0
118                         Yemen     YE      0
27   Democratic Republic of Congo     CD      0
19                           Chad     TD      0
120                      Zimbabwe     ZW      0
14                         Brazil     BR

In [113]:
n_init_twelve = result.groupby("Label")[["Label"]].agg(['count'])
print(n_init_twelve)

      Label
      count
Label      
0        22
1        46
2        53


# (C) Comparing 4 features with 3 features both with n_init = default 

In [115]:
# setting back n to default
X = raw[['Peace', 'Legal', 'GDP Growth','Corruption']]
X = (X - X.mean()) / X.std()

k = 3
kmeans = KMeans(n_clusters=k, random_state=1)
kmeans.fit(X)

# print inertia & cluster center
print("inertia for k=3 is", kmeans.inertia_)
print("cluster centers: ", kmeans.cluster_centers_)

# take a quick look at the result
y = kmeans.labels_
print("cluster labels: ", y)



inertia for k=3 is 194.4046655009297
cluster centers:  [[-0.89877793  1.12417837 -0.26007806  1.17949284]
 [ 0.17066495 -0.47838646  0.5929059  -0.49863571]
 [ 1.22506036 -0.83385901 -1.07842464 -0.88356071]]
cluster labels:  [1 1 2 1 0 0 1 1 1 0 1 1 1 0 2 1 2 1 0 2 0 1 1 0 1 0 0 2 0 1 2 1 1 0 1 0 0
 1 1 0 1 1 1 1 1 0 1 1 2 0 1 0 1 0 1 1 1 0 1 0 2 2 0 1 1 0 1 1 0 2 1 1 1 1
 1 0 0 2 2 0 0 2 1 2 1 1 0 0 0 1 2 1 2 1 1 1 0 0 0 2 0 1 0 0 0 1 1 1 2 1 2
 1 2 0 0 0 0 1 2 1 2]


In [116]:
result = pd.DataFrame({'Country':raw['Country'], 'Abbrev':raw['Abbrev'], 'Label':y})
with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
    print(result.sort_values('Label'))

                          Country Abbrev  Label
51                          Italy     IT      0
33                        Estonia     EE      0
35                        Finland     FI      0
36                         France     FR      0
39                        Germany     DE      0
45                        Iceland     IS      0
49                        Ireland     IE      0
53                          Japan     JP      0
57                  Korea (South)     KI      0
59                         Latvia     LV      0
62                      Lithuania     LT      0
65                       Malaysia     MY      0
68                      Mauritius     MU      0
75                    Netherlands     NL      0
76                    New Zealand     NZ      0
79                         Norway     NO      0
80                           Oman     OM      0
86                         Poland     PL      0
87                       Portugal     PT      0
88                          Qatar     QA

In [117]:
n_init_default_four = result.groupby("Label")[["Label"]].agg(['count'])
print(n_init_default_four)

      Label
      count
Label      
0        41
1        58
2        22


# (D) Using hierarchical clustering and compare results with k means 

In [118]:
from sklearn.cluster import AgglomerativeClustering as alc
import scipy.cluster.hierarchy as sch

# When linkage = ward

In [147]:
with_ward = alc(n_clusters = 3,affinity = 'euclidean',linkage = 'ward' )
with_ward.fit(X)
raw ["label"] = with_ward.labels_
group_by_ward = raw.groupby("label")[['Legal','Peace','GDP Growth']].agg(['mean','count'])
print(group_by_ward)

          Legal           Peace       GDP Growth      
           mean count      mean count       mean count
label                                                 
0      6.922200    55  1.699200    55   1.852145    55
1      4.179357    14  2.812429    14  -0.140500    14
2      4.938923    52  2.101788    52   4.262692    52


# When linkage = complete

In [145]:
with_ward = alc(n_clusters = 3,affinity = 'euclidean',linkage = 'complete' )
with_ward.fit(X)
raw ["label"] = with_ward.labels_
group_by_ward = raw.groupby("label")[['Legal','Peace','GDP Growth']].agg(['mean','count'])
print(group_by_ward)

          Legal           Peace       GDP Growth      
           mean count      mean count       mean count
label                                                 
0      4.762949    59  2.243814    59   4.005712    59
1      6.820153    59  1.735949    59   1.810153    59
2      4.217667     3  2.439000     3  -7.191667     3


# When linkage = average

In [146]:
with_ward = alc(n_clusters = 3,affinity = 'euclidean',linkage = 'average' )
with_ward.fit(X)
raw ["label"] = with_ward.labels_
group_by_ward = raw.groupby("label")[['Legal','Peace','GDP Growth']].agg(['mean','count'])
print(group_by_ward)

          Legal           Peace       GDP Growth      
           mean count      mean count       mean count
label                                                 
0      7.159522    46  1.611413    46   2.175761    46
1      4.435000     4  2.326500     4  -6.159000     4
2      4.915183    71  2.235099    71   3.466366    71


# (E) Insert Venezueala's index information and analyze outliers 

In [121]:
ve = pd.DataFrame({'Country':['Venezuela'],
                  'Abbrev':['VE'],
                  'Corruption':[16],
                  'Peace':[2.671],
                  'Legal':[2.895],
                  'GDP Growth':[-35],},index = [121])
print(ve)

       Country Abbrev  Corruption  Peace  Legal  GDP Growth
121  Venezuela     VE          16  2.671  2.895         -35


In [138]:
raw_with_ve = raw.append(ve)
print(raw_with_ve)

       Country Abbrev  Corruption  Peace  Legal  GDP Growth  label
0      Albania     AL          35  1.821  4.546       2.983    2.0
1      Algeria     DZ          35  2.219  4.435       2.553    2.0
2    Argentina     AR          45  1.989  5.087      -3.061    1.0
3      Armenia     AM          42  2.294  4.812       6.000    2.0
4    Australia     AU          77  1.419  8.363       1.713    0.0
..         ...    ...         ...    ...    ...         ...    ...
117    Vietnam     VI          37  1.877  5.084       6.500    2.0
118      Yemen     YE          15  3.369  2.671       2.113    2.0
119     Zambia     ZM          34  1.805  4.592       2.021    2.0
120   Zimbabwe     ZW          24  2.463  3.738      -7.077    1.0
121  Venezuela     VE          16  2.671  2.895     -35.000    NaN

[122 rows x 7 columns]


In [139]:
Y = raw_with_ve[['Peace', 'Legal', 'GDP Growth']]
Y = (Y - Y.mean()) / Y.std()
print(X.head(5))

      Peace     Legal  GDP Growth  Corruption
0 -0.390081 -0.878158    0.126952   -0.633230
1  0.472352 -0.958948   -0.040772   -0.633230
2 -0.026039 -0.484397   -2.230541   -0.098542
3  0.634871 -0.684553    1.303747   -0.258948
4 -1.261182  1.900001   -0.368418    1.612460


In [140]:
k = 3
kmeans = KMeans(n_clusters=k, random_state=1)
kmeans.fit(Y)

# print inertia & cluster center
print("inertia for k=3 is", kmeans.inertia_)
print("cluster centers: ", kmeans.cluster_centers_)

# take a quick look at the result
y = kmeans.labels_
print("cluster labels: ", y)

inertia for k=3 is 147.3581596115833
cluster centers:  [[ 0.50083526 -0.57899064  0.14262782]
 [ 1.4334541  -2.03528534 -8.76865913]
 [-0.9090402   1.05949011 -0.05031098]]
cluster labels:  [0 0 0 0 2 2 0 0 0 2 0 0 0 2 0 2 0 0 2 0 2 0 0 2 0 2 2 0 2 0 0 0 0 2 0 2 2
 0 0 2 0 0 0 0 2 2 0 0 0 2 0 2 0 2 0 0 0 2 0 2 0 0 2 0 0 2 0 0 2 0 0 0 0 0
 0 2 2 0 0 2 2 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 2 2 2 0 2 0 2 2 2 0 0 0 0 0 0
 0 0 2 2 2 2 0 0 0 0 1]


In [142]:
result_with_ve = pd.DataFrame({'Country':raw_with_ve['Country'], 'Abbrev':raw_with_ve['Abbrev'], 'Label':y})
with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
    print(result_with_ve.sort_values('Label'))

                          Country Abbrev  Label
0                         Albania     AL      0
78                        Nigeria     NG      0
77                      Nicaragua     NI      0
74                          Nepal     NP      0
73                     Mozambique     MZ      0
72                        Morocco     MA      0
71                     Montenegro     ME      0
70                        Moldova     FM      0
69                         Mexico     MX      0
67                     Mauritania     MR      0
81                       Pakistan     PK      0
66                           Mali     ML      0
63                     Madagascar     MG      0
61                        Liberia     LR      0
120                      Zimbabwe     ZW      0
58                         Kuwait     KW      0
56                          Kenya     KE      0
55                     Kazakhstan     KZ      0
54                         Jordan     JO      0
52                        Jamaica     JM

In [143]:
count_with_ve = result_with_ve.groupby("Label")[["Label"]].agg(['count'])
print(count_with_ve)

      Label
      count
Label      
0        77
1         1
2        44
