In [1]:
from time import time
import pandas as pd
import numpy as np
import sklearn
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn import metrics
import itertools
from pandas import MultiIndex


## Importing the cleaned data

In [2]:
# Load the cleaned data set; rows containing +/-inf or NaN are discarded.
data = (
    pd.read_csv("data/clean_data.csv", sep=',')
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# k-means needs numeric inputs, so 'position' is mapped to integer codes.
lep = preprocessing.LabelEncoder()
data['position'] = lep.fit_transform(data['position'])

print("data shape = ")
print(data.shape)


data shape = 
(1564, 19)


In [3]:
# Drop birthdate (birthday is not related to the clustering task).
# errors='ignore' keeps this cell re-runnable: without it, executing the cell
# a second time raises KeyError because 'birthday' has already been removed.
data = data.drop('birthday', axis=1, errors='ignore')
data.head()


Unnamed: 0.1,Unnamed: 0,playerShort,yellowReds,goals,ties,leagueCountry,black,defeats,weight,victories,height,meanExp,yellowCards,games,redCards,position,meanIAT,club
0,0,0,0,9,179,0,0,228,71.0,247,182.0,0.494575,19,654,0,1,0.346459,34
1,1,1,0,62,73,2,0,122,73.0,141,183.0,0.44922,42,336,1,0,0.348818,91
2,2,2,0,31,97,0,0,115,63.0,200,165.0,0.491482,11,412,0,10,0.345893,83
3,3,3,0,39,42,0,0,68,76.0,150,178.0,0.514693,31,260,1,3,0.346821,6
4,4,4,4,1,40,1,0,43,73.0,41,180.0,0.335587,8,124,2,1,0.3316,51


Because this is unsupervised learning, we assume that we don't have the labels: we remove the `black` labels from the main data and keep them aside for comparison with the final result.

In [4]:
# Save the real labels and remove them from the feature data.
# The guard keeps this cell re-runnable: once 'black' has been dropped, a
# second execution would otherwise raise KeyError; in that case the
# previously saved `labels` and the already-stripped `data` are left as is.
if 'black' in data.columns:
    labels = np.array(data['black'])
    data = data.drop('black', axis=1)


The `bench_k_means` function takes the data, fits the k-means estimator that we pass to it, and calculates the silhouette_score, homogeneity_score, and completeness_score.

In [5]:
def bench_k_means(estimator, name, data, true_labels=None):
    """Fit a clustering estimator and score the resulting partition.

    Parameters
    ----------
    estimator : object
        Unfitted clusterer (e.g. ``KMeans``). It is fitted in place on
        ``data`` and must expose ``labels_`` afterwards.
    name : str
        Display name of the configuration. Kept so existing callers keep
        working; it does not influence the computation.
    data : DataFrame or array
        Feature matrix handed to ``estimator.fit`` and to the silhouette score.
    true_labels : array-like, optional
        Reference labels used for the homogeneity and completeness scores.
        When omitted, the notebook-level ``labels`` variable is used.

    Returns
    -------
    tuple
        ``(silhouette_score, homogeneity_score, completeness_score)``.
    """
    if true_labels is None:
        # Original behaviour: compare against the labels saved earlier in the
        # notebook. Passing them explicitly avoids relying on hidden state.
        true_labels = labels

    estimator.fit(data)
    predicted = estimator.labels_

    si = metrics.silhouette_score(data, predicted, metric='euclidean')
    hom = metrics.homogeneity_score(true_labels, predicted)
    com = metrics.completeness_score(true_labels, predicted)
    return si, hom, com


We compare k-means++ initialization with random initialization to see the difference. As the identical silhouette, homogeneity, and completeness scores show, there is no difference between them on this data.

In [6]:
# Both runs use a fixed random_state so that the comparison between the two
# initialisation strategies is reproducible after Restart & Run All.
kmeans_plus = bench_k_means(KMeans(init='k-means++', n_clusters=2, n_init=10, random_state=0),
              name="k-means++", data=data)
kmeans = bench_k_means(KMeans(init='random', n_clusters=2, n_init=10, random_state=0),
              name="k-means-random", data=data)

# Each result is a (silhouette, homogeneity, completeness) tuple.
print(kmeans_plus)
print(kmeans)

(0.56094157753082929, 0.00069558954719075477, 0.00045456376787869455)
(0.56094157753082929, 0.00069558954719075477, 0.00045456376787869455)


In [7]:
def drop_features(data, cols, n_clusters=2, random_state=None):
    """Cluster ``data`` without the given columns and return the scores.

    Used to see the effect of removing different columns on the clustering.

    Parameters
    ----------
    data : DataFrame
        Feature table; it is not modified (``drop`` returns a new frame).
    cols : list
        Column labels to remove before clustering.
    n_clusters : int, optional
        Number of clusters for k-means. Defaults to 2, the value that was
        previously hard-coded.
    random_state : int or None, optional
        Seed forwarded to ``KMeans``. The default ``None`` keeps the original
        unseeded behaviour; pass an integer for reproducible results.

    Returns
    -------
    tuple
        ``(silhouette_score, homogeneity_score, completeness_score)`` as
        returned by ``bench_k_means``.
    """
    dropped_data = data.drop(cols, axis=1)
    estimator = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10,
                       random_state=random_state)
    return bench_k_means(estimator, name="k-means++", data=dropped_data)

We drop every possible combination of 2 features in turn and report each combination that improves on the best silhouette score seen so far.

In [18]:
# Baseline to beat: the silhouette score obtained with every feature kept.
max_si = kmeans_plus[0]

# Try every pair of columns; report a pair only when removing it beats the
# best silhouette score found so far.
for subset in itertools.combinations(data.columns, 2):
    si, hom, com = drop_features(data, list(subset))
    if si > max_si:
        max_si = si
        print('dropped: ', subset)
        print('silhouette_score: ', si)
        print('homogeneity_score:', hom)
        print('completeness_score', com)
    

dropped:  ('Unnamed: 0', 'games')
silhouette_score:  0.571057786853
homogeneity_score: 0.000695589547191
completeness_score 0.000454563767879
dropped:  ('yellowReds', 'games')
silhouette_score:  0.592333999881
homogeneity_score: 0.000792775006818
completeness_score 0.000518061073138
dropped:  ('goals', 'games')
silhouette_score:  0.595896698215
homogeneity_score: 0.000792775006818
completeness_score 0.000518061073138
dropped:  ('defeats', 'games')
silhouette_score:  0.596267631583
homogeneity_score: 0.000792775006818
completeness_score 0.000518061073138
dropped:  ('victories', 'games')
silhouette_score:  0.605126224991
homogeneity_score: 0.000792775006818
completeness_score 0.000518061073138


As we can see in the results, the homogeneity and completeness scores are really low. This shows that clustering the players by color based on these features doesn't work. Moreover, the maximum silhouette_score we achieved was ~0.6.
We did the same thing for all combinations of 3 features.

In [None]:
# Same search with three columns removed at a time. `max_si` carries over
# from the pair search, so only triples that beat the best pair are printed.
for subset in itertools.combinations(data.columns, 3):
    si, hom, com = drop_features(data, [column for column in subset])
    if not (max_si < si):
        continue
    max_si = si
    print('dropped: ', subset)
    print('silhouette_score: ', si)
    print('homogeneity_score:', hom)
    print('completeness_score', com)

dropped:  ('yellowReds', 'victories', 'games')
silhouette_score:  0.605132313149
homogeneity_score: 0.000792775006818
completeness_score 0.000518061073138
dropped:  ('goals', 'victories', 'games')
silhouette_score:  0.609583311005
homogeneity_score: 0.000792775006818
completeness_score 0.000518061073138
dropped:  ('defeats', 'victories', 'games')
silhouette_score:  0.609851624132
homogeneity_score: 0.000792775006818
completeness_score 0.000518061073138


But as we can see, we couldn't get a significantly better result: clustering on these features does not recover the `black` labels.