## Import Dependencies

In [None]:
from sklearn import cluster
import pandas as pd
import numpy as np
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics.cluster import adjusted_rand_score

In [None]:
# Load the player dataset and replace every missing value with 0.
df = pd.read_csv('CompleteDataset.csv').fillna(0)

## Build dataset

#### Take a look at the initial dataset

In [None]:
# Preview the first rows of the loaded frame.
df.head()

#### Check for duplicates

In [None]:
# Most frequent ID values; a count above 1 means that ID occurs on several rows.
df['ID'].value_counts().head()

#### Remove duplicates

In [None]:
# Drop the first column in place.
# NOTE(review): presumably an exported row-index column that would keep
# otherwise identical rows distinct for drop_duplicates — confirm.
df.drop(df.columns[0],axis=1,inplace=True)

In [None]:
# Shape before de-duplication.
df.shape

In [None]:
# Remove rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()

In [None]:
# Re-check the ID frequencies after de-duplication.
df['ID'].value_counts().head()

In [None]:
# Shape after de-duplication.
df.shape

In [None]:
# List the available column names.
df.columns

In [None]:
# Split the whitespace-separated 'Preferred Positions' string into a list of
# position codes, stored in a new 'pref_pos' column.
df['pref_pos'] = df['Preferred Positions'].str.split()

#### Player names and IDs

In [None]:
# Name/ID lookup table; run_models merges it onto the cluster assignments.
players_df = df[['Name', 'ID']]

#### Create positions dictionary

In [None]:
# Maps each position group (presumably forwards, goalkeepers, midfielders and
# defenders) to the position codes that place a player in that group.
# NOTE(review): 'CF' is listed in global_stats but belongs to no group here —
# confirm whether players whose only preferred position is CF should be 'fw'.
pos_dict = {
    'fw': ['ST', 'LW', 'RW', 'SS'],
    'gk': ['GK'],
    'mf': ['DM', 'CM', 'CDM', 'CAM', 'RM', 'LM',],
    'def': ['CB', 'RB', 'LB', 'RWB', 'LWB']
}

#### Insert positions

In [None]:
def get_positions(pos_list, pos_dict_list):
    """Return 1 if any code in ``pos_dict_list`` occurs in ``pos_list``.

    Args:
        pos_list: The player's preferred position codes (normally a list of
            strings). A non-iterable value is treated as "no positions".
        pos_dict_list: Position codes that make up one position group.

    Returns:
        1 when at least one group code is present in ``pos_list``, otherwise
        None (callers fill the missing result with 0).
    """
    # Rows without a usable position string do not hold a list here
    # (presumably NaN coming out of `.str.split()` — verify against the data);
    # a membership test on such a scalar would raise TypeError.
    if not hasattr(pos_list, '__iter__'):
        return None
    if any(pos in pos_list for pos in pos_dict_list):
        return 1
    return None

In [None]:
def insert_positions(df, pos_dict):
    """Add one indicator column per position group to ``df`` in place.

    For every ``group -> codes`` entry of ``pos_dict`` a column named after
    the group is written. Each row receives the result of ``get_positions``
    for its ``'pref_pos'`` value, with missing results replaced by 0.
    """
    for group, codes in pos_dict.items():
        flags = df['pref_pos'].apply(get_positions, args=(codes,))
        df[group] = flags.fillna(0)

In [None]:
# Add one indicator column per key of pos_dict to df (in place).
insert_positions(df, pos_dict)

In [None]:
# Spot-check the new indicator columns against the parsed positions.
df[['pref_pos', 'fw', 'gk', 'mf', 'def']].head()

#### Define stats for each position

In [None]:
# Columns appended to every position group's own stat list when the
# per-position frames in df_dict are built.
global_stats = ['CAM', 'CB', 'CDM', 'CF', 'CM',
       'LAM', 'LB', 'LCB', 'LCM', 'LDM', 'LF', 'LM', 'LS', 'LW', 'LWB',
       'RAM', 'RB', 'RCB', 'RCM', 'RDM', 'RF', 'RM',
       'RS', 'RW', 'RWB', 'ST']

In [None]:
# Attribute columns used as features for each position group; keys match
# those of pos_dict.
stats_dict = {}

stats_dict['fw'] = [
    'Acceleration', 'Aggression', 'Agility', 'Balance', 'Ball control', 'Composure', 'Curve', 'Dribbling', 
    'Finishing', 'Heading accuracy', 'Jumping', 'Long shots','Positioning', 'Crossing', 'Long passing', 
    'Shot power', 'Sprint speed', 'Stamina', 'Strength', 'Vision'
]

stats_dict['mf'] = [
    'Acceleration', 'Aggression', 'Agility', 'Balance', 'Ball control', 'Composure', 'Curve', 'Dribbling',
    'Long shots', 'Positioning', 'Crossing', 'Long passing', 'Shot power', 'Sprint speed', 'Stamina', 
    'Strength', 'Vision', 'Sliding tackle', 'Standing tackle', 'Marking'
]

stats_dict['def'] = [
    'Acceleration', 'Aggression', 'Agility', 'Balance', 'Ball control', 'Positioning', 'Long passing',
    'Sprint speed', 'Stamina', 'Strength', 'Vision', 'Sliding tackle', 'Standing tackle', 'Marking',
    'Jumping', 'Heading accuracy'
]

# Goalkeepers: every column whose name contains 'GK', plus two extra attributes.
stats_dict['gk'] = [col for col in df.columns if 'GK' in col] + ['Reactions', 'Volleys']

#### Build a dict of per-position DataFrames

In [None]:
# One feature frame per position group, indexed by player ID.
df_dict = {}

for pos, pos_stats in stats_dict.items():
    # Group-specific attributes followed by the shared global_stats columns.
    cols = ['ID'] + pos_stats + global_stats
    # Keep only the rows flagged for this group; any remaining NaN becomes 0.
    df_dict[pos] = df[df[pos] > 0][cols].set_index('ID').fillna(0)

#### Clean dataset

In [None]:
def remove_signs(val):
    """Convert a rating cell to a number, applying any inline modifier.

    Numeric input (Python or NumPy scalar) is returned unchanged. Strings are
    parsed as integers: ``"<base>+<n>"`` yields ``base + n`` and
    ``"<base>-<n>"`` yields ``base - n``. A string that only carries a
    leading sign (``"-5"``, ``"+5"``) is read as that signed integer.

    Args:
        val: A number, or a string holding an integer with an optional
            ``+n`` / ``-n`` modifier.

    Returns:
        The numeric value of the cell.

    Raises:
        ValueError: If a part of the string is not an integer literal.
    """
    # np.number covers NumPy integer/float scalars, which are not all
    # subclasses of the built-in int/float.
    if isinstance(val, (int, float, np.number)):
        return val
    for sign, direction in (('+', 1), ('-', -1)):
        if sign in val:
            base, modifier = val.split(sign)[:2]
            # An empty base means the sign was a leading one ("-5"), so the
            # value is just the signed modifier.
            base_value = int(base) if base.strip() else 0
            return base_value + direction * int(modifier)
    return int(val)

In [None]:
# Pass every cell of every feature column through remove_signs.
for pos, pos_df in df_dict.items():
    print(pos, pos_df.shape[0])  # group name and number of rows in it
    for col in pos_df.columns:
        # Assigning back mutates the frame stored in df_dict.
        pos_df[col] = pos_df[col].apply(remove_signs)

## Model

In [None]:
# Registry of clustering estimators to run, keyed by a descriptive name;
# run_models iterates over it.
algorithms = {}

###### K-means

In [None]:
# Number of clusters requested from K-means.
k = 400

In [None]:
# Register a K-means estimator with k clusters under the key 'kmeans_<k>'.
algorithms['kmeans_{}'.format(k)] = cluster.KMeans(n_clusters=k)

In [None]:
#for k in range(10, 110, 10):
#    algorithms['kmeans_{}'.format(k)] = cluster.KMeans(n_clusters=k)
#    algorithms['agglom_{}'.format(k)] = cluster.AgglomerativeClustering(n_clusters=k)
#    algorithms['spectral_{}'.format(k)] = cluster.SpectralClustering(n_clusters=k)

In [None]:
#for d in np.arange(0.5, 1.05, 0.1):
#    algorithms['affinity_{}'.format(d)] = cluster.AffinityPropagation(damping=d)

#### Models

In [None]:
def run_models(df_dict):
    """Fit every registered estimator on every position group's features.

    Args:
        df_dict: Mapping of position name to a feature DataFrame whose index
            holds the player IDs.

    Returns:
        A nested dict ``{position: {model_name: DataFrame}}``. Each frame
        holds the 'ID' and 'cat' (cluster label) columns merged with the
        module-level ``players_df``.

    Reads the module-level ``algorithms`` registry and prints progress.
    """
    clustered = {}
    for position, stats_df in df_dict.items():
        print('\nPosition: {}'.format(position))
        features = stats_df.values
        per_model = clustered.setdefault(position, {})
        for label, estimator in algorithms.items():
            print(label)
            estimator.fit(features)
            # Two rows (IDs, labels) transposed into two columns.
            assignments = pd.DataFrame([stats_df.index, estimator.labels_]).T
            assignments.columns = ['ID', 'cat']
            per_model[label] = pd.merge(assignments, players_df)
    return clustered

In [None]:
# Cluster every position group with every registered estimator.
results_dict = run_models(df_dict)

## Results

In [None]:
def show_clusters(results_dict, max_cat=15):
    """Print the members of the clusters found among the first rows.

    Args:
        results_dict: Nested dict ``{position: {model_name: DataFrame}}``
            where each frame has a 'cat' column.
        max_cat: Number of leading rows whose distinct 'cat' values select
            the clusters to print.
    """
    for position in results_dict:
        print('\nPosition: {}'.format(position))
        for model_name in results_dict[position]:
            clustered = results_dict[position][model_name]
            print('Model: {}'.format(model_name))
            leading_rows = clustered.head(max_cat)
            for cluster_id in leading_rows['cat'].unique():
                is_member = clustered['cat'] == cluster_id
                print('\nCat: {}'.format(cluster_id))
                print(clustered[is_member])

In [None]:
# Print the clusters using the default max_cat.
show_clusters(results_dict)