## Import Dependencies

In [1]:
from sklearn import cluster
import pandas as pd
import numpy as np
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics.cluster import adjusted_rand_score

In [2]:
df = pd.read_csv('CompleteDataset.csv').fillna(0)

  interactivity=interactivity, compiler=compiler, result=result)


## Build dataset

#### Take a look at the initial dataset

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,...,RB,RCB,RCM,RDM,RF,RM,RS,RW,RWB,ST
0,0,Cristiano Ronaldo,32,https://cdn.sofifa.org/48/18/players/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Real Madrid CF,https://cdn.sofifa.org/24/18/teams/243.png,...,61.0,53.0,82.0,62.0,91.0,89.0,92.0,91.0,66.0,92.0
1,1,L. Messi,30,https://cdn.sofifa.org/48/18/players/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,93,93,FC Barcelona,https://cdn.sofifa.org/24/18/teams/241.png,...,57.0,45.0,84.0,59.0,92.0,90.0,88.0,91.0,62.0,88.0
2,2,Neymar,25,https://cdn.sofifa.org/48/18/players/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,94,Paris Saint-Germain,https://cdn.sofifa.org/24/18/teams/73.png,...,59.0,46.0,79.0,59.0,88.0,87.0,84.0,89.0,64.0,84.0
3,3,L. Suárez,30,https://cdn.sofifa.org/48/18/players/176580.png,Uruguay,https://cdn.sofifa.org/flags/60.png,92,92,FC Barcelona,https://cdn.sofifa.org/24/18/teams/241.png,...,64.0,58.0,80.0,65.0,88.0,85.0,88.0,87.0,68.0,88.0
4,4,M. Neuer,31,https://cdn.sofifa.org/48/18/players/167495.png,Germany,https://cdn.sofifa.org/flags/21.png,92,92,FC Bayern Munich,https://cdn.sofifa.org/24/18/teams/21.png,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Check for duplicates

In [4]:
df['ID'].value_counts().head()

210679    2
212153    2
201878    2
225199    2
204923    2
Name: ID, dtype: int64

#### Remove duplicates

In [5]:
df.drop(df.columns[0],axis=1,inplace=True)

In [6]:
df.shape

(17981, 74)

In [7]:
df = df.drop_duplicates()

In [9]:
df['ID'].value_counts().head()

231423    1
213661    1
237430    1
230037    1
223896    1
Name: ID, dtype: int64

In [8]:
df.shape

(17929, 74)

In [10]:
df.columns

Index(['Name', 'Age', 'Photo', 'Nationality', 'Flag', 'Overall', 'Potential',
       'Club', 'Club Logo', 'Value', 'Wage', 'Special', 'Acceleration',
       'Aggression', 'Agility', 'Balance', 'Ball control', 'Composure',
       'Crossing', 'Curve', 'Dribbling', 'Finishing', 'Free kick accuracy',
       'GK diving', 'GK handling', 'GK kicking', 'GK positioning',
       'GK reflexes', 'Heading accuracy', 'Interceptions', 'Jumping',
       'Long passing', 'Long shots', 'Marking', 'Penalties', 'Positioning',
       'Reactions', 'Short passing', 'Shot power', 'Sliding tackle',
       'Sprint speed', 'Stamina', 'Standing tackle', 'Strength', 'Vision',
       'Volleys', 'CAM', 'CB', 'CDM', 'CF', 'CM', 'ID', 'LAM', 'LB', 'LCB',
       'LCM', 'LDM', 'LF', 'LM', 'LS', 'LW', 'LWB', 'Preferred Positions',
       'RAM', 'RB', 'RCB', 'RCM', 'RDM', 'RF', 'RM', 'RS', 'RW', 'RWB', 'ST'],
      dtype='object')

In [11]:
df['pref_pos'] = df['Preferred Positions'].str.split()

#### Names + id

In [12]:
players_df = df[['Name', 'ID']]

#### Create positions dictionary

In [13]:
pos_dict = {
    'fw': ['ST', 'LW', 'RW', 'SS'],
    'gk': ['GK'],
    'mf': ['DM', 'CM', 'CDM', 'CAM', 'RM', 'LM',],
    'def': ['CB', 'RB', 'LB', 'RWB', 'LWB']
}

#### Insert positions

In [14]:
def get_positions(pos_list, pos_dict_list):
    for pos in pos_dict_list:
        if pos in pos_list:
            return 1

In [15]:
def insert_positions(df, pos_dict):
    for pos, pos_dict_list in pos_dict.items():
        df[pos] = df['pref_pos'].apply(lambda x: get_positions(x, pos_dict_list)).fillna(0)

In [16]:
insert_positions(df, pos_dict)

In [17]:
df[['pref_pos', 'fw', 'gk', 'mf', 'def']].head()

Unnamed: 0,pref_pos,fw,gk,mf,def
0,"[ST, LW]",1.0,0.0,0.0,0.0
1,[RW],1.0,0.0,0.0,0.0
2,[LW],1.0,0.0,0.0,0.0
3,[ST],1.0,0.0,0.0,0.0
4,[GK],0.0,1.0,0.0,0.0


#### Define stats for each position

In [18]:
global_stats = ['CAM', 'CB', 'CDM', 'CF', 'CM',
       'LAM', 'LB', 'LCB', 'LCM', 'LDM', 'LF', 'LM', 'LS', 'LW', 'LWB',
       'RAM', 'RB', 'RCB', 'RCM', 'RDM', 'RF', 'RM',
       'RS', 'RW', 'RWB', 'ST']

In [19]:
stats_dict = {}

stats_dict['fw'] = [
    'Acceleration', 'Aggression', 'Agility', 'Balance', 'Ball control', 'Composure', 'Curve', 'Dribbling', 
    'Finishing', 'Heading accuracy', 'Jumping', 'Long shots','Positioning', 'Crossing', 'Long passing', 
    'Shot power', 'Sprint speed', 'Stamina', 'Strength', 'Vision'
]

stats_dict['mf'] = [
    'Acceleration', 'Aggression', 'Agility', 'Balance', 'Ball control', 'Composure', 'Curve', 'Dribbling',
    'Long shots', 'Positioning', 'Crossing', 'Long passing', 'Shot power', 'Sprint speed', 'Stamina', 
    'Strength', 'Vision', 'Sliding tackle', 'Standing tackle', 'Marking'
]

stats_dict['def'] = [
    'Acceleration', 'Aggression', 'Agility', 'Balance', 'Ball control', 'Positioning', 'Long passing',
    'Sprint speed', 'Stamina', 'Strength', 'Vision', 'Sliding tackle', 'Standing tackle', 'Marking',
    'Jumping', 'Heading accuracy'
]

stats_dict['gk'] = [col for col in df.columns if 'GK' in col] + ['Reactions', 'Volleys']

#### Get dict with positions dfs

In [20]:
df_dict = {}

for pos, pos_stats in stats_dict.items():
    cols = ['ID'] + pos_stats + global_stats
    df_dict[pos] = df[df[pos] > 0][cols].set_index('ID').fillna(0)

#### Clean dataset

In [21]:
def remove_signs(val):
    if isinstance(val, int) or isinstance(val, float):
        return val
    if '+' in val:
        vals = val.split('+')
        return int(vals[0]) + int(vals[1])
    elif '-' in val:
        vals = val.split('-')
        return int(vals[0]) + int(vals[1])
    else:
        return int(val)

In [22]:
for pos, pos_df in df_dict.items():
    for col in pos_df.columns:
        pos_df[col] = pos_df[col].apply(remove_signs)

## Model

In [37]:
algorithms = {}

###### K-means

In [38]:
k = 400

In [39]:
algorithms['kmeans_{}'.format(k)] = cluster.KMeans(n_clusters=k)

In [40]:
#for k in range(10, 110, 10):
#    algorithms['kmeans_{}'.format(k)] = cluster.KMeans(n_clusters=k)
#    algorithms['agglom_{}'.format(k)] = cluster.AgglomerativeClustering(n_clusters=k)
#    algorithms['spectral_{}'.format(k)] = cluster.SpectralClustering(n_clusters=k)

In [41]:
#for d in np.arange(0.5, 1.05, 0.1):
#    algorithms['affinity_{}'.format(d)] = cluster.AffinityPropagation(damping=0.6)

#### Models

In [47]:
def run_models(df_dict):
    results_dict = {}
    for pos, pos_df in df_dict.items():
        print('\nPosition: {}'.format(pos))
        mat = pos_df.values
        results_dict[pos] = {}
        for model, clf in algorithms.items():
            print(model)
            clf.fit(mat)
            res_df = pd.DataFrame([pos_df.index, clf.labels_]).T
            res_df.columns = ['ID', 'cat']
            final_df = pd.merge(res_df, players_df)
            results_dict[pos][model] = final_df
    return results_dict

In [55]:
results_dict = run_models(df_dict)


Position: fw
kmeans_400

Position: mf
kmeans_400

Position: def
kmeans_400

Position: gk
kmeans_400


## Results

In [56]:
def show_clusters(results_dict, max_cat=15):
    for pos, model_dict in results_dict.items():
        print('\nPosition: {}'.format(pos))
        for model, model_df in model_dict.items():
            print('Model: {}'.format(model))
            main_cats = model_df.head(max_cat)['cat'].unique()
            for c in main_cats:
                print('\nCat: {}'.format(c))
                print(model_df[model_df['cat'] == c])

In [57]:
show_clusters(results_dict)


Position: fw
Model: kmeans_400

Cat: 206
       ID  cat               Name
0   20801  206  Cristiano Ronaldo
3  176580  206          L. Suárez
8  173731  206            G. Bale

Cat: 216
        ID  cat       Name
1   158023  216   L. Messi
2   190871  216     Neymar
5   183277  216  E. Hazard
10  211110  216  P. Dybala
15    9014  216  A. Robben
21  188350  216    M. Reus

Cat: 29
         ID  cat            Name
4    188545   29  R. Lewandowski
14    41236   29  Z. Ibrahimović
16   179813   29       E. Cavani
17   202126   29         H. Kane
18   192505   29       R. Lukaku
22   179844   29     Diego Costa
112  153244   29       A. Gignac

Cat: 220
         ID  cat             Name
6    167664  220       G. Higuaín
24   165153  220       K. Benzema
34   201153  220           Morata
77   186627  220     M. Balotelli
82   171833  220     D. Sturridge
90   216354  220      A. Kramarić
114   49369  220  Fernando Torres
131  187491  220        F. Smolov
165  198683  220    M. Gabbiadini
