In [1]:
from sklearn import * 
import pandas as pd 
import numpy as np 

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans 

In [2]:
def print_full(x):
    """Print a pandas object with every row shown (no row truncation).

    The ``display.max_rows`` limit is lifted only for the duration of the
    call. The value that was in effect beforehand is restored afterwards,
    even when printing raises, so a limit configured elsewhere in the
    notebook is not clobbered.

    Parameters
    ----------
    x : object
        The object to print, typically a DataFrame or Series.

    Returns
    -------
    None
    """
    # option_context restores the caller's setting on exit; None means
    # "no row limit", which also covers empty inputs.
    with pd.option_context('display.max_rows', None):
        print(x)
    
# Load the raw table; the 'Id' column is split off into its own one-column
# frame and removed from `data`, leaving only the columns to be scaled.
data = pd.read_csv('Kaggle.csv')
ids = pd.DataFrame(data.pop('Id'))

# Rescale every remaining column with a min-max scaler fitted on `data`.
min_max_scaler = preprocessing.MinMaxScaler().fit(data)
original_data = min_max_scaler.transform(data)
original_data

array([[1.00000000e+00, 4.90842491e-02, 3.53476481e-02, ...,
        4.64763791e-01, 9.79020979e-04, 4.83675937e-03],
       [9.85025710e-01, 2.24420024e-01, 5.60409851e-02, ...,
        3.63749932e-01, 1.32867133e-03, 1.20918984e-02],
       [9.76051726e-01, 1.83882784e-01, 6.28391748e-03, ...,
        1.66000924e-01, 1.53846154e-03, 1.33010883e-02],
       ...,
       [7.15967010e-02, 2.91330891e-01, 3.16708461e-01, ...,
        2.25768189e-01, 3.21678322e-02, 2.89600967e-01],
       [3.15078273e-03, 7.68742369e-01, 4.78601375e-01, ...,
        1.60820056e-01, 3.49650350e-01, 8.29504232e-01],
       [0.00000000e+00, 1.54822955e-01, 1.00000000e+00, ...,
        1.92908165e-01, 1.11888112e-01, 6.17896010e-01]])

We can cluster the countries based on the original (min-max scaled) attributes.

In [3]:
# Cluster the countries on the min-max scaled attributes.
# random_state pins the centroid initialisation so the cluster labels are the
# same on every run; n_init is spelled out because its default is not the
# same across scikit-learn releases.
ds = KMeans(n_clusters=3, n_init=10, random_state=0).fit(original_data)

# Work on a copy of the id frame so that the columns added here (and in later
# cells) can never write through to `ids`.
clustered = ids.copy()

# labels_ already holds one label per row, so it is assigned directly.
clustered['clusters'] = ds.labels_
print_full(clustered)

                                   Id  clusters
0                              Norway         2
1                           Australia         2
2                         Switzerland         2
3                             Denmark         2
4                         Netherlands         2
5                             Germany         2
6                             Ireland         2
7                       United States         2
8                              Canada         2
9                         New Zealand         2
10                          Singapore         2
11                         Hong Kong          2
12                      Liechtenstein         2
13                             Sweden         2
14                     United Kingdom         2
15                            Iceland         2
16                        South Korea         2
17                             Israel         2
18                         Luxembourg         2
19                              Japan   

# We can run PCA, extract a single principal component, and then cluster the countries on that component

In [4]:
# Fit a one-component PCA on the scaled attributes, then project the same
# rows onto that component.
pca = PCA(n_components=1).fit(original_data)
transf_dataset = pca.transform(original_data)

# Fraction of the total variance explained by the retained component.
print(pca.explained_variance_ratio_)

[0.46397101]


In [5]:
# Q keeps the 2-D shape returned by pca.transform, which is what KMeans and
# the silhouette functions take as input.
Q = transf_dataset

# PCA was fitted with a single component, so column 0 is the whole
# projection; store it as a plain 1-D column.
clustered['PCA_1'] = Q[:, 0]

# `ds` is rebound to the model fitted on the PCA projection; the labels of the
# earlier model remain available in clustered['clusters'].
# random_state makes the labels reproducible; n_init is spelled out because
# its default is not the same across scikit-learn releases.
ds = KMeans(n_clusters=3, n_init=10, random_state=0).fit(Q)
clustered['clusters_PCA'] = ds.labels_
print_full(clustered)

                                   Id  clusters     PCA_1  clusters_PCA
0                              Norway         2 -1.387436             1
1                           Australia         2 -1.500972             1
2                         Switzerland         2 -1.329357             1
3                             Denmark         2 -1.480184             1
4                         Netherlands         2 -1.445481             1
5                             Germany         2 -1.385511             1
6                             Ireland         2 -1.259065             1
7                       United States         2 -1.400846             1
8                              Canada         2 -1.258965             1
9                         New Zealand         2 -1.238265             1
10                          Singapore         2 -1.295481             1
11                         Hong Kong          2 -1.378178             1
12                      Liechtenstein         2 -1.159481       

187                             Niger         1  1.879859             2


In [6]:
# Per-sample silhouette values for the PCA-based clustering. They are measured
# on Q, the data the model behind ds.labels_ was fitted on, so they agree with
# the aggregate score printed on the next line (the raw, unscaled `data` frame
# is a different space from the one that was clustered).
siho = metrics.silhouette_samples(Q, ds.labels_)

# First line printed: clustering on the single PCA component.
print('Silhouette Coefficient : %0.3f'% metrics.silhouette_score(Q,clustered['clusters_PCA']))
# Second line printed: clustering on the scaled original attributes.
print('Silhouette Coefficient : %0.3f'% metrics.silhouette_score(original_data,
                                                                 clustered['clusters']))

Silhouette Coefficient : 0.604
Silhouette Coefficient : 0.197


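Comparing the silhouette coefficients (0.604 for the PCA-based clustering versus 0.197 for the clustering on the original attributes), I will use the PCA clustering results.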

In [7]:
# Put the unscaled attributes next to the ids, cluster labels and PCA score;
# the two frames are aligned on their row index.
reporting_table = pd.concat((data, clustered), axis=1)

# Mean of every numeric column within each PCA-based cluster. The 'Id' column
# holds country names and cannot be averaged; numeric_only=True skips it
# explicitly, which recent pandas versions require (they raise instead of
# silently dropping non-numeric columns).
grouped = reporting_table.groupby('clusters_PCA').mean(numeric_only=True)
print(grouped)

              Human Development Index HDI-2014  Gini coefficient 2005-2013  \
clusters_PCA                                                                 
0                                     0.698451                   41.952480   
1                                     0.847652                   32.441144   
2                                     0.484478                   40.951154   

              Adolescent birth rate 15-19 per 100k 20102015  \
clusters_PCA                                                  
0                                                 47.915652   
1                                                 17.845308   
2                                                 91.752558   

              Birth registration funder age 5 2005-2013  \
clusters_PCA                                              
0                                             91.821256   
1                                             99.636816   
2                                             55.185897  

In [8]:
# Widen the pandas display limits so the three cluster columns fit on a
# single line of output.
for option_name, option_value in (('display.max_columns', 500),
                                  ('display.width', 1000)):
    pd.set_option(option_name, option_value)

# One row per attribute, one column per cluster.
print_full(grouped.T)

clusters_PCA                                                   0             1             2
Human Development Index HDI-2014                        0.698451      0.847652      0.484478
Gini coefficient 2005-2013                             41.952480     32.441144     40.951154
Adolescent birth rate 15-19 per 100k 20102015          47.915652     17.845308     91.752558
Birth registration funder age 5 2005-2013              91.821256     99.636816     55.185897
Carbon dioxide emissionsAverage annual growth           3.254157      0.860438      1.203288
Carbon dioxide emissions per capita 2011 Tones          3.498879      9.203675      0.369870
Change forest percentable 1900 to 2012                  0.380995     15.774132    -15.684736
Change mobile usage 2009 2014                          41.562029     17.779254    130.407500
Consumer price index 2013                             119.226886    111.669963    125.935747
Domestic credit provided by financial sector 2013      59.074095    11