## Import Dependencies

In [3]:
from sklearn import cluster
import pandas as pd
import numpy as np
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics.cluster import adjusted_rand_score

In [4]:
df = pd.read_csv('CompleteDataset.csv').fillna(0)

  interactivity=interactivity, compiler=compiler, result=result)


## Build dataset

#### Take a look at the initial dataset

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,...,RB,RCB,RCM,RDM,RF,RM,RS,RW,RWB,ST
0,0,Cristiano Ronaldo,32,https://cdn.sofifa.org/48/18/players/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Real Madrid CF,https://cdn.sofifa.org/24/18/teams/243.png,...,61.0,53.0,82.0,62.0,91.0,89.0,92.0,91.0,66.0,92.0
1,1,L. Messi,30,https://cdn.sofifa.org/48/18/players/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,93,93,FC Barcelona,https://cdn.sofifa.org/24/18/teams/241.png,...,57.0,45.0,84.0,59.0,92.0,90.0,88.0,91.0,62.0,88.0
2,2,Neymar,25,https://cdn.sofifa.org/48/18/players/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,94,Paris Saint-Germain,https://cdn.sofifa.org/24/18/teams/73.png,...,59.0,46.0,79.0,59.0,88.0,87.0,84.0,89.0,64.0,84.0
3,3,L. Suárez,30,https://cdn.sofifa.org/48/18/players/176580.png,Uruguay,https://cdn.sofifa.org/flags/60.png,92,92,FC Barcelona,https://cdn.sofifa.org/24/18/teams/241.png,...,64.0,58.0,80.0,65.0,88.0,85.0,88.0,87.0,68.0,88.0
4,4,M. Neuer,31,https://cdn.sofifa.org/48/18/players/167495.png,Germany,https://cdn.sofifa.org/flags/21.png,92,92,FC Bayern Munich,https://cdn.sofifa.org/24/18/teams/21.png,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Check for duplicates

In [6]:
df['ID'].value_counts().head()

210679    2
212153    2
201878    2
225199    2
204923    2
Name: ID, dtype: int64

#### Remove duplicates

In [7]:
df.drop(df.columns[0],axis=1,inplace=True)

In [8]:
df.shape

(17981, 74)

In [9]:
df = df.drop_duplicates()

In [10]:
df['ID'].value_counts().head()

231423    1
213661    1
237430    1
230037    1
223896    1
Name: ID, dtype: int64

In [11]:
df.shape

(17929, 74)

In [12]:
df.columns

Index(['Name', 'Age', 'Photo', 'Nationality', 'Flag', 'Overall', 'Potential',
       'Club', 'Club Logo', 'Value', 'Wage', 'Special', 'Acceleration',
       'Aggression', 'Agility', 'Balance', 'Ball control', 'Composure',
       'Crossing', 'Curve', 'Dribbling', 'Finishing', 'Free kick accuracy',
       'GK diving', 'GK handling', 'GK kicking', 'GK positioning',
       'GK reflexes', 'Heading accuracy', 'Interceptions', 'Jumping',
       'Long passing', 'Long shots', 'Marking', 'Penalties', 'Positioning',
       'Reactions', 'Short passing', 'Shot power', 'Sliding tackle',
       'Sprint speed', 'Stamina', 'Standing tackle', 'Strength', 'Vision',
       'Volleys', 'CAM', 'CB', 'CDM', 'CF', 'CM', 'ID', 'LAM', 'LB', 'LCB',
       'LCM', 'LDM', 'LF', 'LM', 'LS', 'LW', 'LWB', 'Preferred Positions',
       'RAM', 'RB', 'RCB', 'RCM', 'RDM', 'RF', 'RM', 'RS', 'RW', 'RWB', 'ST'],
      dtype='object')

In [13]:
df['pref_pos'] = df['Preferred Positions'].str.split()

#### Names + id

In [14]:
players_df = df[['Name', 'ID', 'Club']]

#### Create positions dictionary

In [15]:
pos_dict = {
    'fw': ['ST', 'LW', 'RW', 'SS'],
    'gk': ['GK'],
    'mf': ['DM', 'CM', 'CDM', 'CAM', 'RM', 'LM',],
    'def': ['CB', 'RB', 'LB', 'RWB', 'LWB']
}

#### Insert positions

In [16]:
def get_positions(pos_list, pos_dict_list):
    for pos in pos_dict_list:
        if pos in pos_list:
            return 1

In [17]:
def insert_positions(df, pos_dict):
    for pos, pos_dict_list in pos_dict.items():
        df[pos] = df['pref_pos'].apply(lambda x: get_positions(x, pos_dict_list)).fillna(0)

In [18]:
insert_positions(df, pos_dict)

In [19]:
df[['pref_pos', 'fw', 'gk', 'mf', 'def']].head()

Unnamed: 0,pref_pos,fw,gk,mf,def
0,"[ST, LW]",1.0,0.0,0.0,0.0
1,[RW],1.0,0.0,0.0,0.0
2,[LW],1.0,0.0,0.0,0.0
3,[ST],1.0,0.0,0.0,0.0
4,[GK],0.0,1.0,0.0,0.0


#### Define stats for each position

In [20]:
global_stats = ['CAM', 'CB', 'CDM', 'CF', 'CM',
       'LAM', 'LB', 'LCB', 'LCM', 'LDM', 'LF', 'LM', 'LS', 'LW', 'LWB',
       'RAM', 'RB', 'RCB', 'RCM', 'RDM', 'RF', 'RM',
       'RS', 'RW', 'RWB', 'ST']

In [21]:
stats_dict = {}

stats_dict['fw'] = [
    'Acceleration', 'Aggression', 'Agility', 'Balance', 'Ball control', 'Composure', 'Curve', 'Dribbling', 
    'Finishing', 'Heading accuracy', 'Jumping', 'Long shots','Positioning', 'Crossing', 'Long passing', 
    'Shot power', 'Sprint speed', 'Stamina', 'Strength', 'Vision'
]

stats_dict['mf'] = [
    'Acceleration', 'Aggression', 'Agility', 'Balance', 'Ball control', 'Composure', 'Curve', 'Dribbling',
    'Long shots', 'Positioning', 'Crossing', 'Long passing', 'Shot power', 'Sprint speed', 'Stamina', 
    'Strength', 'Vision', 'Sliding tackle', 'Standing tackle', 'Marking'
]

stats_dict['def'] = [
    'Acceleration', 'Aggression', 'Agility', 'Balance', 'Ball control', 'Positioning', 'Long passing',
    'Sprint speed', 'Stamina', 'Strength', 'Vision', 'Sliding tackle', 'Standing tackle', 'Marking',
    'Jumping', 'Heading accuracy'
]

stats_dict['gk'] = [col for col in df.columns if 'GK' in col] + ['Reactions', 'Volleys']

#### Get dict with positions dfs

In [22]:
df_dict = {}

for pos, pos_stats in stats_dict.items():
    cols = ['ID'] + pos_stats + global_stats
    df_dict[pos] = df[df[pos] > 0][cols].set_index('ID').fillna(0)

#### Clean dataset

In [23]:
def remove_signs(val):
    if isinstance(val, int) or isinstance(val, float):
        return val
    if '+' in val:
        vals = val.split('+')
        return int(vals[0]) + int(vals[1])
    elif '-' in val:
        vals = val.split('-')
        return int(vals[0]) + int(vals[1])
    else:
        return int(val)

In [24]:
for pos, pos_df in df_dict.items():
    print(pos, pos_df.shape[0])
    for col in pos_df.columns:
        pos_df[col] = pos_df[col].apply(remove_signs)

fw 4059
mf 8490
def 6454
gk 2024


## Model

In [25]:
algorithms = {}

###### K-means

In [26]:
k = 400

In [27]:
algorithms['kmeans_{}'.format(k)] = cluster.KMeans(n_clusters=k)

In [28]:
#for k in range(10, 110, 10):
#    algorithms['kmeans_{}'.format(k)] = cluster.KMeans(n_clusters=k)
#    algorithms['agglom_{}'.format(k)] = cluster.AgglomerativeClustering(n_clusters=k)
#    algorithms['spectral_{}'.format(k)] = cluster.SpectralClustering(n_clusters=k)

In [29]:
#for d in np.arange(0.5, 1.05, 0.1):
#    algorithms['affinity_{}'.format(d)] = cluster.AffinityPropagation(damping=0.6)

#### Models

In [30]:
def run_models(df_dict):
    results_dict = {}
    for pos, pos_df in df_dict.items():
        print('\nPosition: {}'.format(pos))
        mat = pos_df.values
        results_dict[pos] = {}
        for model, clf in algorithms.items():
            print(model)
            clf.fit(mat)
            res_df = pd.DataFrame([pos_df.index, clf.labels_]).T
            res_df.columns = ['ID', 'cat']
            final_df = pd.merge(res_df, players_df)
            results_dict[pos][model] = final_df
    return results_dict

In [31]:
results_dict = run_models(df_dict)


Position: fw
kmeans_400

Position: mf
kmeans_400

Position: def
kmeans_400

Position: gk
kmeans_400


## Results

#### Specific Player

In [32]:
def get_player_cluster(player_name):
    for pos, model_dict in results_dict.items():
            for model, model_df in model_dict.items():
                player_df = model_df[model_df['Name'] == player_name]
                if not player_df.empty:
                    print('\n' + pos)
                    sel_cat = player_df['cat'].iloc[0]
                    print(model_df[model_df['cat'] == sel_cat])

In [37]:
player_name = 'F. Mussis'

In [38]:
get_player_cluster(player_name)


mf
          ID  cat            Name                              Club
688   216706  397       F. Mussis            San Lorenzo de Almagro
750   208620  397  Omar Mascarell               Eintracht Frankfurt
996   230417  397  Edercinho Sepa  Grêmio Foot-Ball Porto Alegrense
1073  160078  397      N. Fuenzas      CD Universidad de Concepción
1080  223058  397      D. Kuzyaev              Zenit St. Petersburg
1115  230250  397   Emilio Piodão                          Cruzeiro
1194  168883  397    T. Al Jassam                           Al Ahli
1230  159964  397    G. Suleziano                  Deportes Iquique
1237  190432  397            Oier                        CA Osasuna
1257  224242  397    D. Luckassen                               PSV
1290  156683  397       S. Holmén                       IF Elfsborg
1299  193550  397        B. Kayal            Brighton & Hove Albion
1316  175896  397        S. Siani                       KV Oostende
1324  214047  397        M. Uribe           

#### Best Players

In [None]:
def show_clusters(results_dict, max_cat=15):
    for pos, model_dict in results_dict.items():
        print('\nPosition: {}'.format(pos))
        for model, model_df in model_dict.items():
            print('Model: {}'.format(model))
            main_cats = model_df.head(max_cat)['cat'].unique()
            for c in main_cats:
                print('\nCat: {}'.format(c))
                print(model_df[model_df['cat'] == c])

In [None]:
show_clusters(results_dict)