In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [3]:
# Load the cleaned and combined dataset; the first CSV column becomes the row index
T5_LEAGUES_CSV_URL = 'https://raw.githubusercontent.com/brendad8/Datasets/main/t5_leagues.csv'
df_full = pd.read_csv(T5_LEAGUES_CSV_URL, index_col=0)

In [4]:
# Machine Learning Plan

# 1. Select Variables for initial model and variables for specific group models (attack, defense, hybrid)
#     a. Diff models for att/def/hyb since these roles prioritize different variables

# 2. Fit K means Clustering with k=3 on Data
#     a. Min 1000 minutes played and only Non-Goalkeepers
#     b. k=3 to differentiate attackers, defenders, and those who do both

# 3. For each group, fit K-means clustering with k=4
#     a. k=4 was chosen because it produced the most logical results in my judgement
#     b. Lower k values produced groups that were too broad, and
#        larger k values produced groups that were too specialized

# 4. Look at Cluster Subgroup centers to determine common traits of each Subgroup

## Variable Selection for Models

Because the source tables contain so many different variables, this section reviews the available columns and selects the appropriate ones for each of the 4 models I made.

* The columns in model_cols are for the initial K-means model with k=3.  This is to differentiate Att/Both/Def.  
* The other model_cols lists are more specific selections that prioritize different areas for Att/Def/Hyb.  These models were fit after the first one, on each group separately.  
* Per 90 cols are columns that need to be converted to per-90 stats before fitting the model

In [5]:
# Standard table: columns for the first-layer model
std_cols = [
    'PK', 'PKatt',
    'Gls/90', 'Ast/90', 'xG/90', 'xAG/90',
    'CrdY', 'CrdR',
]

# Each role-specific model keeps only the '/90' columns (its own independent list)
def_std_cols = [name for name in std_cols if name.endswith('/90')]
att_std_cols = [name for name in std_cols if name.endswith('/90')]
hyb_std_cols = [name for name in std_cols if name.endswith('/90')]

In [6]:
# Passing table: columns for the first-layer model
pass_cols = [
    'pass.Att', 'pass.Cmp%', 'pass.TotDist', 'pass.PrgDist',
    'pass.short.Att', 'pass.med.Att', 'pass.long.Att',
    'pass.KP', 'pass.1/3', 'pass.PPA', 'pass.CrsPA', 'pass.Prog',
]

# Defense and hybrid models use the full passing list
def_pass_cols = hyb_pass_cols = pass_cols

# The attack model leaves out the two passing-distance columns
att_pass_cols = [name for name in pass_cols if name not in ('pass.TotDist', 'pass.PrgDist')]

# Every passing column except the completion percentage is converted to per 90
per90_pass_cols = [name for name in pass_cols if name != 'pass.Cmp%']

In [7]:
# Shooting table: columns for the first-layer model
shot_cols = ['shoot.Sh/90', 'shoot.SoT/90', 'shoot.Dist', 'shoot.FK']

# The defense model uses no shooting columns
def_shot_cols = list()

# Attack and hybrid models use everything except 'shoot.FK'
att_shot_cols = [name for name in shot_cols if name != 'shoot.FK']
hyb_shot_cols = [name for name in shot_cols if name != 'shoot.FK']

In [8]:
# Defensive actions table: columns for the first-layer model
def_cols = [
    'defense.Plyrs_Tkld', 'defense.TklW',
    'defense.Def 3rd', 'defense.Mid 3rd', 'defense.Att 3rd',
    'defense.Tkl_Drib', 'defense.Past',
    'defense.Blocks', 'defense.Int', 'defense.Clr',
]

# Defense and hybrid models use the full list; all of it is converted to per 90
def_def_cols = hyb_def_cols = per90_def_cols = def_cols

# The attack model keeps only the first five columns (tackle counts and tackles by third)
att_def_cols = def_cols[:5]

In [9]:
# Possession table: columns for the first-layer model
pos_cols = [
    'pos.Touches',
    'pos.Def Pen', 'pos.Def 3rd', 'pos.Mid 3rd', 'pos.Att 3rd', 'pos.Att Pen',
    'pos.drib.Att', 'pos.drib.Mis', 'pos.drib.Dis',
    'pos.rec.Rec', 'pos.rec.Prog',
]

# The defense model drops 'pos.Att Pen' and 'pos.rec.Prog'
def_pos_cols = [name for name in pos_cols if name not in ('pos.Att Pen', 'pos.rec.Prog')]

# The attack model drops the first three columns ('pos.Touches', 'pos.Def Pen', 'pos.Def 3rd')
att_pos_cols = pos_cols[3:]

# The hybrid model uses the full list; all of it is converted to per 90
hyb_pos_cols = per90_pos_cols = pos_cols

In [10]:
# Shot-creating actions table: columns for the first-layer model
sca_cols = [
    'sca.S.SCA90',
    'sca.S.PassLive', 'sca.S.PassDead', 'sca.S.Drib', 'sca.S.Fld', 'sca.S.Def',
    'sca.S.GCA90',
]

# Attack and hybrid models use everything except 'sca.S.PassDead' (separate lists)
att_sca_cols = [name for name in sca_cols if name != 'sca.S.PassDead']
hyb_sca_cols = list(att_sca_cols)

# The defense model keeps only 'sca.S.SCA90', 'sca.S.PassLive' and 'sca.S.Drib'
def_sca_cols = att_sca_cols[:3]

# Columns whose name does not already end in '90' still need the per-90 conversion
per90_sca_cols = [name for name in sca_cols if not name.endswith('90')]

In [11]:
# Flatten the per-table selections into one column list per model.
# Order is always: standard, passing, shooting, defensive actions, possession, SCA.
model_cols = [*std_cols, *pass_cols, *shot_cols, *def_cols, *pos_cols, *sca_cols]
def_model_cols = [
    *def_std_cols, *def_pass_cols, *def_shot_cols,
    *def_def_cols, *def_pos_cols, *def_sca_cols,
]
att_model_cols = [
    *att_std_cols, *att_pass_cols, *att_shot_cols,
    *att_def_cols, *att_pos_cols, *att_sca_cols,
]
hyb_model_cols = [
    *hyb_std_cols, *hyb_pass_cols, *hyb_shot_cols,
    *hyb_def_cols, *hyb_pos_cols, *hyb_sca_cols,
]

# Every column that has to be divided by the number of 90s played before modelling
per90_cols = [*per90_pass_cols, *per90_def_cols, *per90_pos_cols, *per90_sca_cols]

## Fitting the Model

### Assign Base Group

Defender / Hybrid / Attacker

Filter Data for Model

In [12]:
# Only fit the model on non-goalkeepers who played at least 1000 minutes.
# .copy() makes df_final an independent frame, so the column assignments below
# write to it directly instead of to a filtered slice of df_full
# (the original chained assignment triggers pandas' SettingWithCopyWarning).
is_outfield = df_full['Pos'] != 'GK'
has_enough_minutes = df_full['Min'] >= 1000
df_final = df_full.loc[is_outfield & has_enough_minutes].copy()

# Convert the per-90 variables to per-90 values by dividing each row by its '90s' value
df_final[per90_cols] = df_final[per90_cols].div(df_final['90s'], axis=0)

# Select only the variables for the first-layer model; missing values become 0.
# NOTE(review): model_cols also contains columns such as 'PK' and 'CrdY' that are
# not in per90_cols -- presumably raw totals; confirm they should stay unscaled.
df_model = df_final[model_cols].fillna(0)

In [13]:
# First-layer model: K-means with 3 clusters, intended to pull apart
# attackers / two-way players / defenders.
# Values are standardized first so that all variables are weighted equally.
# random_state fixes the centroid initialisation so the cluster numbers are
# reproducible between runs; n_init=10 is spelled out to match the subgroup
# models instead of depending on the installed scikit-learn default.
pipeline = make_pipeline(
    StandardScaler(),
    KMeans(n_clusters=3, n_init=10, random_state=42)
)
pipeline.fit(df_model)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kmeans', KMeans(n_clusters=3))])

In [14]:
# Attach the first-layer cluster label of every row to the filtered dataset
clusters = pipeline.named_steps['kmeans'].labels_
df_final['BaseGroup'] = clusters

Get Numbers for Attackers and Defenders

  Intuition:
* Whatever group Mbappe is in should be attackers
* Whatever group van Dijk is in should be defenders
* Other group number will be hybrid players

In [15]:
# Anchor players that give the numeric clusters a meaning:
# Mbappé's cluster is treated as the attackers, van Dijk's as the defenders.
att_num = df_final.loc[df_final['Player'] == 'Kylian Mbappé', 'BaseGroup'].to_numpy()[0]
def_num = df_final.loc[df_final['Player'] == 'Virgil van Dijk', 'BaseGroup'].to_numpy()[0]

# map_basegroup checks def_num first, so if both anchors shared a cluster no
# player would ever be labelled 'Attack'. Fail here with a clear message instead.
assert att_num != def_num, (
    "Mbappé and van Dijk were assigned to the same base cluster; "
    "the Attack/Defense/Hybrid mapping would be meaningless"
)
att_num, def_num

(0, 2)

In [16]:
# Translate a numeric base-cluster label into Attack/Defense/Hybrid
def map_basegroup(num):
    """Return the role name for a first-layer cluster number.

    Compares ``num`` with the module-level ``def_num`` and ``att_num``
    (defender check first): 'Defense' for ``def_num``, 'Attack' for
    ``att_num`` and 'Hybrid' for anything else.
    """
    if num == def_num:
        return 'Defense'
    return 'Attack' if num == att_num else 'Hybrid'

In [17]:
# Replace the numeric first-layer labels with role names.
# The labels are read from the fitted model rather than from the BaseGroup
# column itself, which keeps this cell idempotent: mapping the already-mapped
# strings a second time would turn every row into 'Hybrid'.
df_final['BaseGroup'] = pd.Series(pipeline['kmeans'].labels_, index=df_final.index).map(map_basegroup)

Create separate datasets for the groups to fit individual models

In [18]:
# Defenders: full rows in `defense`, model inputs (missing values -> 0) in `defense_model`
is_defense = df_final['BaseGroup'].eq('Defense')
defense = df_final.loc[is_defense].copy()
defense_model = defense.loc[:, def_model_cols].fillna(0)


In [19]:
# Hybrid players: full rows in `hybrid`, model inputs (missing values -> 0) in `hybrid_model`
is_hybrid = df_final['BaseGroup'].eq('Hybrid')
hybrid = df_final.loc[is_hybrid].copy()
hybrid_model = hybrid.loc[:, hyb_model_cols].fillna(0)

In [20]:
# Attackers: full rows in `attack`, model inputs (missing values -> 0) in `attack_model`
is_attack = df_final['BaseGroup'].eq('Attack')
attack = df_final.loc[is_attack].copy()
attack_model = attack.loc[:, att_model_cols].fillna(0)

In [21]:
# Second-layer models: one K-means per base group.
# k=4 was chosen by trial and error: smaller k made the subgroups too broad and
# larger k made them too specialized (for my liking).
# Values are standardized so that all variables are weighted equally.
# random_state fixes the initialisation so subgroup numbers stay the same
# from run to run.

def _make_subgroup_pipeline():
    """Return an unfitted StandardScaler -> KMeans(n_clusters=4) pipeline."""
    return make_pipeline(
        StandardScaler(),
        KMeans(n_clusters=4, n_init=10, random_state=42)
    )

def_pipeline = _make_subgroup_pipeline()
att_pipeline = _make_subgroup_pipeline()
hyb_pipeline = _make_subgroup_pipeline()

def_pipeline.fit(defense_model)
att_pipeline.fit(attack_model)
hyb_pipeline.fit(hybrid_model)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kmeans', KMeans(n_clusters=4))])

In [22]:
# Store each player's second-layer cluster label next to their stats
for group_df, fitted_pipeline in (
    (defense, def_pipeline),
    (attack, att_pipeline),
    (hybrid, hyb_pipeline),
):
    group_df['SubGroup'] = fitted_pipeline['kmeans'].labels_

Create scalers for different groups

Necessary for back-transforming values since they were standardized for the model

(makes interpreting results easier)

In [23]:
# One standalone scaler per group, fitted on the same inputs as that group's
# pipeline; used later to back-transform cluster centers to original units.
# StandardScaler.fit returns the scaler itself, so construction and fit chain.
att_scaler = StandardScaler().fit(attack_model)
hyb_scaler = StandardScaler().fit(hybrid_model)
def_scaler = StandardScaler().fit(defense_model)
def_scaler

StandardScaler()

## Results

Note: The group order below may not match the presentation due to the group numbers varying from run to run. However, the content of the four groups should be the same.

Results can be found in presentation ppt

Cells below were used to identify differences between groups


Code intuition:
* Grab all the cluster centers for the different subgroups
* Unstandardize values
* Turn data into long format for easier comparisons (*For me)

Attack

In [24]:
# Goal: compare the cluster centers of the four attack subgroups across the
# model variables and find the important differences between them.

# Centers are in standardized space; keep the column names the scaler was fitted with
att_centers_std = pd.DataFrame(att_pipeline['kmeans'].cluster_centers_, columns=attack_model.columns)

# Back-transform to original units; reset_index() adds an 'index' column with the cluster number
att_clusters = pd.DataFrame(
    att_scaler.inverse_transform(att_centers_std),
    columns=attack_model.columns,
).reset_index()

# Long format: one row per (cluster, variable) pair
att_clusters_long = att_clusters.melt(id_vars='index', value_vars=att_clusters.columns[1:])

# Wide comparison table: one row per variable, one column per cluster ('0'..'3')
df_att_clusters = att_clusters.drop(columns='index').T
df_att_clusters.columns = [str(cluster_id) for cluster_id in df_att_clusters.columns]
df_att_clusters

Unnamed: 0,0,1,2,3
Gls/90,0.295439,0.388373,0.190207,0.416081
Ast/90,0.279649,0.106867,0.129241,0.225946
xG/90,0.295088,0.406687,0.221241,0.402297
xAG/90,0.28614,0.117952,0.150552,0.243784
pass.Att,49.731587,21.704859,31.901316,36.127929
pass.Cmp%,76.677193,68.19759,71.281379,74.464865
pass.short.Att,23.713801,11.732145,15.566999,18.22561
pass.med.Att,15.371692,5.684941,9.625192,10.556595
pass.long.Att,6.298884,1.340398,3.366266,3.643659
pass.KP,2.249917,0.912048,1.240711,1.789559


Hybrid

In [25]:
# Hybrid subgroup centers in standardized space, with the fitted column names
hyb_centers_std = pd.DataFrame(hyb_pipeline['kmeans'].cluster_centers_, columns=hybrid_model.columns)

# Back-transform to original units; reset_index() adds an 'index' column with the cluster number
hyb_clusters = pd.DataFrame(
    hyb_scaler.inverse_transform(hyb_centers_std),
    columns=hybrid_model.columns,
).reset_index()

# Long format: one row per (cluster, variable) pair
hyb_clusters_long = hyb_clusters.melt(id_vars='index', value_vars=hyb_clusters.columns[1:])

# Wide comparison table: one row per variable, one column per cluster ('0'..'3')
df_hyb_clusters = hyb_clusters.drop(columns='index').T
df_hyb_clusters.columns = [str(cluster_id) for cluster_id in df_hyb_clusters.columns]
df_hyb_clusters

Unnamed: 0,0,1,2,3
Gls/90,0.038712,0.115122,0.056357,0.110187
Ast/90,0.060038,0.211463,0.074961,0.10925
xG/90,0.052386,0.118537,0.060853,0.127562
xAG/90,0.07375,0.212073,0.090853,0.124938
pass.Att,46.037155,67.238435,63.464204,42.526496
pass.Cmp%,76.10947,81.171951,83.813953,76.2675
pass.TotDist,581.641935,921.089904,915.750538,522.580013
pass.PrgDist,205.27838,285.871046,285.854521,156.592014
pass.short.Att,20.158855,30.26997,27.594524,19.67635
pass.med.Att,16.904676,23.883332,24.558831,14.476667


Defense

In [26]:
# Defense subgroup centers in standardized space, with the fitted column names
def_centers_std = pd.DataFrame(def_pipeline['kmeans'].cluster_centers_, columns=defense_model.columns)

# Back-transform to original units; reset_index() adds an 'index' column with the cluster number
def_clusters = pd.DataFrame(
    def_scaler.inverse_transform(def_centers_std),
    columns=defense_model.columns,
).reset_index()

# Long format: one row per (cluster, variable) pair
def_clusters_long = def_clusters.melt(id_vars='index', value_vars=def_clusters.columns[1:])

# Wide comparison table: one row per variable, one column per cluster ('0'..'3')
df_def_clusters = def_clusters.drop(columns='index').T
df_def_clusters.columns = [str(cluster_id) for cluster_id in df_def_clusters.columns]
df_def_clusters

Unnamed: 0,0,1,2,3
Gls/90,0.054947,0.040949,0.036238,0.0768
Ast/90,0.015474,0.015547,0.028218,0.0488
xG/90,0.062211,0.048686,0.054851,0.0764
xAG/90,0.024,0.016715,0.036535,0.0484
pass.Att,64.472166,42.902576,49.043875,77.025526
pass.Cmp%,87.555789,80.627737,79.671287,89.308
pass.TotDist,1100.065699,697.258099,698.969067,1260.15597
pass.PrgDist,396.874606,262.22311,255.003497,445.837403
pass.short.Att,20.692994,12.643375,19.000693,29.362164
pass.med.Att,32.735339,21.08066,21.11155,36.345783


In [27]:
# uploaded to github

# defense.to_csv()
# hybrid.to_csv()
# attack.to_csv()

### Scouted Player Comparisons

This is where I compare the three players I picked using the similarity functions (see the similarity_functions file) to their respective subgroup centers.


The three players I ended up picking were:

1. Lovro Majer (attacker)

2. Oleksandr Zinchenko (hybrid)

3. Manuel Akanji (defender)


#### Lovro Majer

In [28]:
# Subgroup label of the first row whose Player is 'Lovro Majer'
is_majer = attack['Player'] == 'Lovro Majer'
majer_subgroup = attack.loc[is_majer, 'SubGroup'].iloc[0]
majer_subgroup

0

In [29]:
# Majer's model variables next to the center of his attack subgroup (original units)

# Single row of Majer's model variables, labelled 'Majer'
majer = (
    attack.loc[attack['Player'] == 'Lovro Majer', att_model_cols]
    .reset_index(drop=True)
    .rename(index={0: 'Majer'})
)

# The sliced center row keeps its cluster number as index label, so it has to be
# renamed by that number. Renaming label 0 only worked when Majer was in subgroup 0.
cluster_center = (
    att_clusters.iloc[majer_subgroup:majer_subgroup + 1]
    .drop('index', axis=1)
    .rename(index={majer_subgroup: 'Cluster_Center'})
)

# Variables as rows, the two profiles side by side as columns
pd.concat([cluster_center, majer]).transpose()

Unnamed: 0,Cluster_Center,Majer
Gls/90,0.295439,0.29
Ast/90,0.279649,0.39
xG/90,0.295088,0.26
xAG/90,0.28614,0.29
pass.Att,49.731587,51.568627
pass.Cmp%,76.677193,76.1
pass.short.Att,23.713801,21.715686
pass.med.Att,15.371692,18.186275
pass.long.Att,6.298884,7.843137
pass.KP,2.249917,2.205882


Other players in Majer's subgroup

In [30]:
# Names of all attackers that share Majer's subgroup
attack.loc[attack['SubGroup'] == majer_subgroup, 'Player'].tolist()

['Emi Buendía',
 'Philippe Coutinho',
 'Kevin De Bruyne',
 'İlkay Gündoğan',
 'James Maddison',
 'Mason Mount',
 'Lucas Moura',
 'Michael Olise',
 'Daniel Podence',
 'Raphinha',
 'Hakim Ziyech',
 'Martin Ødegaard',
 'Julian Brandt',
 'Serge Gnabry',
 'Vincenzo Grifo',
 'Jonas Hofmann',
 'Eduard Löwen',
 'Thomas Müller',
 'Jamal Musiala',
 'Marco Reus',
 'Leroy Sané',
 'Lars Stindl',
 'Dominik Szoboszlai',
 'Florian Wirtz',
 'Ousmane Dembélé',
 'Nabil Fekir',
 'Adnan Januzaj',
 'Érik Lamela',
 'Thomas Lemar',
 'Iker Muniain',
 'David Silva',
 'Óscar Trejo',
 'Houssem Aouar',
 'Sofiane Boufal',
 'David Pereira da Costa',
 'Ángel Di María',
 'Sofiane Diop',
 'Romain Faivre',
 'Amine Harit',
 'Ilan Kebbal',
 'Lovro Majer',
 'Lionel Messi',
 'Neymar',
 'Lucas Paquetá',
 'Dimitri Payet',
 'Téji Savanier',
 'Domenico Berardi',
 'Antonio Candreva',
 'Gianluca Caprari',
 'Paulo Dybala',
 'Lorenzo Insigne',
 'Hamed Junior Traorè',
 'Ruslan Malinovskyi',
 'Lorenzo Pellegrini',
 'Alexis Saelemaeke

#### Oleksandr Zinchenko

In [31]:
# Subgroup label of the first row whose Player is 'Oleksandr Zinchenko'
is_zinchenko = hybrid['Player'] == 'Oleksandr Zinchenko'
zinchenko_subgroup = hybrid.loc[is_zinchenko, 'SubGroup'].iloc[0]
zinchenko_subgroup

1

In [32]:
# Zinchenko's model variables next to the center of his hybrid subgroup (original units)

# Single row of Zinchenko's model variables, labelled 'Zinchenko'
zinchenko = (
    hybrid.loc[hybrid['Player'] == 'Oleksandr Zinchenko', hyb_model_cols]
    .reset_index(drop=True)
    .rename(index={0: 'Zinchenko'})
)

# Center row of his subgroup, relabelled from its cluster number to 'Cluster_Center'
cluster_center = (
    hyb_clusters.iloc[zinchenko_subgroup:zinchenko_subgroup + 1]
    .drop(columns='index')
    .rename(index={zinchenko_subgroup: 'Cluster_Center'})
)

# Variables as rows, the two profiles side by side as columns
pd.concat([cluster_center, zinchenko]).T

Unnamed: 0,Cluster_Center,Zinchenko
Gls/90,0.115122,0.0
Ast/90,0.211463,0.34
xG/90,0.118537,0.05
xAG/90,0.212073,0.22
pass.Att,67.238435,92.5
pass.Cmp%,81.171951,88.8
pass.TotDist,921.089904,1244.741379
pass.PrgDist,285.871046,354.224138
pass.short.Att,30.26997,48.448276
pass.med.Att,23.883332,33.362069


Other players in Zinchenko's subgroup

In [33]:
# Names of all hybrid players that share Zinchenko's subgroup
hybrid.loc[hybrid['SubGroup'] == zinchenko_subgroup, 'Player'].tolist()

['Thiago Alcántara',
 'Trent Alexander-Arnold',
 'Marcos Alonso',
 'João Cancelo',
 'Lucas Digne',
 'Bruno Fernandes',
 'Pascal Groß',
 'Jordan Henderson',
 'Reece James',
 "N'Golo Kanté",
 'Mateo Kovačić',
 'Manuel Lanzini',
 'Ruben Loftus-Cheek',
 'Paul Pogba',
 'Andrew Robertson',
 'Luke Shaw',
 'Bernardo Silva',
 'Youri Tielemans',
 'James Ward-Prowse',
 'Granit Xhaka',
 'Oleksandr Zinchenko',
 'Angeliño',
 'Mahmoud Dahoud',
 'Alphonso Davies',
 'Kerem Demirbay',
 'Leon Goretzka',
 'Raphaël Guerreiro',
 'Joshua Kimmich',
 'Filip Kostić',
 'Thomas Meunier',
 'David Raum',
 'Borna Sosa',
 'Marcos Acuña',
 'Jordi Alba',
 'Dani Alves',
 'Sergio Canales',
 'William Carvalho',
 'Sergi Darder',
 'Rodrigo De Paul',
 'Frenkie de Jong',
 'Toni Kroos',
 'Unai López',
 'Ferland Mendy',
 'Luka Modrić',
 'Jesús Navas',
 'Daniel Parejo',
 'Ivan Rakitić',
 'Salva Sevilla',
 'Denis Suárez',
 'Óliver Torres',
 'Federico Valverde',
 'Lucas Vázquez',
 'Yacine Adli',
 'Jason Berthomier',
 'Ryad Boudebo

#### Manuel Akanji

In [34]:
# Subgroup label of the first row whose Player is 'Manuel Akanji'
is_akanji = defense['Player'] == 'Manuel Akanji'
akanji_subgroup = defense.loc[is_akanji, 'SubGroup'].iloc[0]
akanji_subgroup

0

In [35]:
# Akanji's model variables next to the center of his defense subgroup (original units)

# Single row of Akanji's model variables, labelled 'Akanji'.
# Uses def_model_cols (not hyb_model_cols) so the rows line up with the defense
# cluster centers, which were built from the defense model's columns.
akanji = (
    defense.loc[defense['Player'] == 'Manuel Akanji', def_model_cols]
    .reset_index(drop=True)
    .rename(index={0: 'Akanji'})
)

# Center row of his subgroup, relabelled from its cluster number to 'Cluster_Center'
cluster_center = (
    def_clusters.iloc[akanji_subgroup:akanji_subgroup + 1]
    .drop('index', axis=1)
    .rename(index={akanji_subgroup: 'Cluster_Center'})
)

# Variables as rows, the two profiles side by side as columns
pd.concat([cluster_center, akanji]).transpose()

Unnamed: 0,Cluster_Center,Akanji
Gls/90,0.054947,0.04
Ast/90,0.015474,0.0
xG/90,0.062211,0.08
xAG/90,0.024,0.01
pass.Att,64.472166,80.199203
pass.Cmp%,87.555789,90.4
pass.TotDist,1100.065699,1447.011952
pass.PrgDist,396.874606,551.23506
pass.short.Att,20.692994,25.059761
pass.med.Att,32.735339,40.557769


Other players in same subgroup as Akanji

In [36]:
# Names of all defenders that share Akanji's subgroup
defense.loc[defense['SubGroup'] == akanji_subgroup, 'Player'].tolist()

['Daniel Amartey',
 'Joachim Andersen',
 'Dan Burn',
 'Andreas Christensen',
 'Liam Cooper',
 'Eric Dier',
 'Gabriel Dos Santos',
 'Shane Duffy',
 'Lewis Dunk',
 'Jonny Evans',
 'Marc Guéhi',
 'Harry Maguire',
 'Cristian Romero',
 'Romain Saïss',
 'Davinson Sánchez',
 'Çağlar Söyüncü',
 'Virgil van Dijk',
 'Raphaël Varane',
 'Adam Webster',
 'Ben White',
 'Manuel Akanji',
 'Waldemar Anton',
 'John Brooks',
 'Rafael Czichos',
 'Nico Elvedi',
 'Makoto Hasebe',
 'Martin Hinteregger',
 'Timo Hübers',
 'Mats Hummels',
 'Hiroki Ito',
 'Philipp Lienhart',
 'Konstantinos Mavropanos',
 'Willi Orban',
 'Chris Richards',
 'Jonathan Tah',
 'Edmond Tapsoba',
 'Kevin Vogt',
 'Raúl Albiol',
 'Ronald Araújo',
 'Pedro Bigas',
 'Diego Carlos',
 'Édgar González',
 'Nemanja Gudelj',
 'Robin Le Normand',
 'Aïssa Mandi',
 'Éder Militão',
 'Jeison Murillo',
 'Nacho',
 'Gerard Piqué',
 'Víctor Ruiz',
 'Pau Torres',
 'Igor Zubeldia',
 'Nayef Aguerd',
 'Benoît Badiashile',
 'Jérôme Boateng',
 'Sven Botman',
 'D