## Import Dependencies

In [None]:
from sklearn import cluster
import pandas as pd
import numpy as np
from math import exp

In [None]:
# Load the player table from 'players_season_stats.csv'.
df = pd.read_csv('players_season_stats.csv')

In [None]:
# Load the team table from 'teams_season_stats.csv'.
teams_df = pd.read_csv('teams_season_stats.csv')

In [None]:
def get_possession_df(teams_df):
    """Return the mean possession per team, scaled by 100.

    The input frame is NOT modified: the scaling is applied to a derived
    copy, so calling this function repeatedly on the same frame always
    yields the same result.

    Args:
        teams_df: DataFrame with a 'team' column and a 'possession' column.
            Presumably one row per team per game with possession stored as
            a fraction -- confirm against the CSV.

    Returns:
        DataFrame with columns 'team_name', 'sum', 'count' and 'possession',
        where 'possession' is sum / count of the scaled values for the team.
    """
    # assign() builds a new frame instead of writing into the caller's one.
    games_pos_df = teams_df[['team', 'possession']].assign(
        possession=lambda frame: frame['possession'] * 100
    )
    teams_pos_df = games_pos_df.groupby('team').agg(['sum', 'count']).reset_index()
    # Flatten the (column, aggregation) MultiIndex produced by agg().
    teams_pos_df.columns = ['team_name', 'sum', 'count']
    teams_pos_df['possession'] = teams_pos_df['sum'] / teams_pos_df['count']
    return teams_pos_df

In [None]:
# Per-team possession table; joined onto the player table later on.
teams_pos_df = get_possession_df(teams_df)

## Build dataset

#### Take a look at the initial dataset

In [None]:
# Preview the first rows of the player table as loaded.
df.head()

#### Filter by minutes played

In [None]:
# Lower bound on 'mins_played'; players below it are dropped from the analysis.
MINIMUM_MINUTES_PLAYED = 300

In [None]:
# Keep only players who reached the minutes threshold. The explicit copy makes
# the result an independent frame, so the column assignments performed later
# do not write into a slice of the original (no SettingWithCopyWarning).
df = df[df['mins_played'] >= MINIMUM_MINUTES_PLAYED].copy()

#### Names + id

In [None]:
# Identity columns only; used to attach names/teams to cluster labels by 'id'.
players_df = df[['name', 'id', 'team_name']]

#### Define stats for each position

In [None]:
# Maps a group key to the list of stat column names used for that group.
stats_dict = {}

In [None]:
# Stat columns per group. 'total_stats' is the full list; 'F', 'M' and 'D'
# are the subsets used when clustering players whose 'position' equals that
# key. Key order and list order are significant (they drive column order).
stats_dict.update({
    'total_stats': [
        # scoring
        'goals', 'headed_goals', 'kicked_goals', 'game_winning_goals',
        'pk_goals',
        # chance creation / shooting
        'assists', 'game_winning_assist', 'shots', 'shots_on_goal',
        'crosses',
        # ball involvement
        'passes', 'touches',
        # defending
        'interceptions', 'blocks', 'tackles', 'clears', 'goal_mouth_blocks',
        # fouls
        'fouls_committed', 'fouls_suffered',
        # derived columns
        'shot_conversion', 'shot_accuracy',
        'adj_tackles', 'adj_interceptions', 'adj_blocks',
    ],
    'F': [
        'goals', 'headed_goals', 'kicked_goals',
        'assists', 'shots', 'shots_on_goal', 'crosses',
        'passes', 'touches',
        'fouls_suffered',
        'shot_conversion', 'shot_accuracy',
    ],
    'M': [
        'goals', 'kicked_goals',
        'assists', 'shots', 'shots_on_goal', 'crosses',
        'passes', 'touches',
        'goal_mouth_blocks',
        'fouls_committed', 'fouls_suffered',
        'shot_conversion', 'shot_accuracy',
        'adj_tackles', 'adj_interceptions',
    ],
    'D': [
        'goals', 'headed_goals',
        'crosses',
        'passes', 'touches',
        'clears', 'goal_mouth_blocks',
        'fouls_committed',
        'adj_tackles', 'adj_interceptions', 'adj_blocks',
    ],
})

#### Build stats

In [None]:
def adjust_tackles(tack, pos):
    """Scale a defensive count by a logistic function of possession.

    The multiplier is 2 / (1 + e^(-0.1 * (pos - 50))): exactly 1 when
    ``pos`` is 50, approaching 2 for large ``pos`` and 0 for small ``pos``.

    Args:
        tack: Raw count to adjust.
        pos: Possession value; the curve is centred on 50, so this is
            presumably a percentage -- confirm against the caller.

    Returns:
        The adjusted value as a float.
    """
    denominator = 1 + exp(-0.1 * (pos - 50))
    doubled = tack * 2
    return doubled / denominator

In [None]:
def adjust_def_plays(df, teams_df, stats):
    """Add possession-adjusted 'adj_<stat>' columns to the player table.

    Args:
        df: Player DataFrame containing every column named in ``stats``.
        teams_df: Team DataFrame providing a 'possession' column; it is
            left-joined onto ``df`` using the columns the two frames share.
        stats: Iterable of column names to adjust.

    Returns:
        The merged DataFrame with one extra 'adj_<stat>' column per entry
        in ``stats`` and every missing value replaced by 0.
    """
    combined = df.merge(teams_df, how='left')

    for stat_name in stats:
        target_column = 'adj_{}'.format(stat_name)

        # Bind the current stat as a default so each row function reads
        # the column belonging to this iteration.
        def _adjust_row(row, column=stat_name):
            return adjust_tackles(row[column], row['possession'])

        combined[target_column] = combined.apply(_adjust_row, axis=1)

    return combined.fillna(0)

In [None]:
# Percentage ratios, rounded to whole numbers.
#   shot_accuracy   = shots on goal / shots
#   shot_conversion = goals / shots on goal
# 0/0 yields NaN and x/0 yields +/-inf; both are mapped to 0 so that no
# non-finite value reaches the clustering step (fillna alone misses inf).
_non_finite = [np.inf, -np.inf]
df['shot_accuracy'] = (
    (100 * df['shots_on_goal'] / df['shots'])
    .round()
    .replace(_non_finite, np.nan)
    .fillna(0)
)
df['shot_conversion'] = (
    (100 * df['goals'] / df['shots_on_goal'])
    .round()
    .replace(_non_finite, np.nan)
    .fillna(0)
)

In [None]:
# Raw defensive columns from which the 'adj_<stat>' columns are derived.
adjusted_stats = ['tackles', 'interceptions', 'blocks']

In [None]:
# Join team possession onto the players and add the adjusted defensive columns.
df = adjust_def_plays(df, teams_pos_df, adjusted_stats)

In [None]:
# Columns to rescale by minutes played: everything in 'total_stats' except
# the two shot ratios.
stats_90 = [
    stat_name
    for stat_name in stats_dict['total_stats']
    if stat_name not in ('shot_conversion', 'shot_accuracy')
]

In [None]:
# Express each selected stat as a rate per 90 minutes played.
for s in stats_90:
    scaled = df[s].mul(90)
    df[s] = scaled.div(df['mins_played'])

In [None]:
# Preview the table after the derived columns and rescaling.
df.head()

#### Build a dict of per-position DataFrames

In [None]:
# One feature table per position key: rows are the players whose 'position'
# equals the key, indexed by 'id', columns are that position's stats, and
# missing values are replaced by 0.
df_dict = {}

for pos, pos_stats in stats_dict.items():
    if pos == 'total_stats':
        # Master list of columns, not a position.
        continue
    is_position = df['position'] == pos
    cols = ['id'] + pos_stats
    position_table = df.loc[is_position, cols]
    df_dict[pos] = position_table.set_index('id').fillna(0)

## Model

###### K-means

In [None]:
# Number of clusters requested from K-means (the same value for every position).
k = 35

In [None]:
# K-means estimator shared by all positions. A fixed random_state makes the
# centroid initialisation, and therefore the cluster labels, reproducible
# from one notebook run to the next.
algorithm = cluster.KMeans(n_clusters=k, random_state=0)

#### Models

In [None]:
def run_models(df_dict, clf, players=None):
    """Fit ``clf`` on each position table and label its players.

    Args:
        df_dict: Mapping of position key to a feature DataFrame indexed by
            player id.
        clf: Estimator exposing ``fit(matrix)`` and, after fitting, a
            ``labels_`` sequence with one label per row. The same instance
            is re-fitted for every position.
        players: Optional DataFrame with an 'id' column plus descriptive
            columns to attach to the labels. Defaults to the module-level
            ``players_df``.

    Returns:
        Dict mapping each position key to a DataFrame with columns 'id',
        'cat' (cluster label) and the remaining columns of ``players``.
        Only ids present in ``players`` are kept (inner join).
    """
    if players is None:
        players = players_df

    results_dict = {}
    for pos, pos_df in df_dict.items():
        print('\nPosition: {}'.format(pos))
        clf.fit(pos_df.values)
        # Build the frame column-wise so 'id' and 'cat' keep their own
        # dtypes instead of collapsing to object through a transpose.
        res_df = pd.DataFrame({'id': pos_df.index.values, 'cat': clf.labels_})
        results_dict[pos] = res_df.merge(players, on='id')
    return results_dict

In [None]:
# Cluster every position table; maps position key -> labelled player table.
results_dict = run_models(df_dict, algorithm)

## Results

#### Specific Player

In [None]:
def get_player_cluster(players, results=None):
    """Return the cluster labels of the given players, grouped by position.

    Args:
        players: Collection of player names to look up in the 'name' column.
        results: Optional mapping of position key to a DataFrame with
            'name' and 'cat' columns. Defaults to the module-level
            ``results_dict``.

    Returns:
        Dict mapping each position key that contains at least one of the
        players to the list of their 'cat' values (one entry per matching
        row, in row order). Positions without a match are omitted.
    """
    if results is None:
        results = results_dict

    pos_cats = {}
    for pos, pos_df in results.items():
        cats = pos_df.loc[pos_df['name'].isin(players), 'cat'].tolist()
        if cats:
            pos_cats[pos] = cats
    return pos_cats

In [None]:
def print_sel_clusters(players, results_dict):
    """Print every cluster that contains at least one of the given players.

    The lookup is done against the ``results_dict`` argument itself, so the
    clusters printed always belong to the tables that were passed in.

    Args:
        players: Collection of player names to look up in the 'name' column.
        results_dict: Mapping of position key to a DataFrame with 'name'
            and 'cat' columns.

    Returns:
        None. For each position with a match, prints a 'Position: <key>'
        header followed by the full membership of each matching cluster.
        A cluster shared by several selected players is printed once.
    """
    for pos, res_df in results_dict.items():
        selected = res_df.loc[res_df['name'].isin(players), 'cat']
        if selected.empty:
            continue
        print('\nPosition: ' + pos)
        # drop_duplicates keeps first-seen order, so the output order is stable.
        for cat in selected.drop_duplicates().tolist():
            print('\n', res_df[res_df['cat'] == cat])

In [None]:
# Player names whose clusters are printed below; matched against the 'name' column.
players_to_show = ['Nicolas Blandi', 'Lucas Pratto', 'Cristian Pavon', 'Ricardo Centurion', 'Matias Caruzzo',
                  'Leonardo Ponzio', 'Fernando Gago', 'Ignacio Scocco', 'Rodrigo Mora', 'Maximiliano Meza']

In [None]:
# Show, per position, the clusters that the selected players fall into.
print_sel_clusters(players_to_show, results_dict)