### Load data

In [9]:
import pandas as pd 
import utils

# Set seed for reproducibility
SEED = 42

# Playing-time thresholds: a player is kept only when strictly above both.
MIN_MATCHES_PLAYED = 8
MIN_MINUTES_PLAYED = 60

# Load the per-player statistics (`df`) and the matching player metadata (`player_info`).
df, player_info = utils.load_player_statistics()

# Later cells take a row position from `player_info` and use it against rows
# derived from `df`, so both frames must list the same players in the same order.
# Boolean-Series filtering aligns on labels, which would hide a mismatch.
assert df.index.equals(player_info.index), "df and player_info must be row-aligned"

# Filter out players with too little playing time (same mask applied to both frames).
mask = (
    (player_info["Matches Played"] > MIN_MATCHES_PLAYED)
    & (player_info["Playing Time_Min"] > MIN_MINUTES_PLAYED)
)
player_info = player_info[mask]
df = df[mask]

# Columns excluded from the feature matrix: advanced goalkeeping stats,
# basic goalkeeping stats, and playing-time / team-result columns.
adv_goalkeeping = ['Goals_GA', 'Goals_PKA', 'Goals_FK', 'Goals_CK', 'Goals_OG', 'PSxG',
       'PSxG/SoT', 'PSxG+/-', '/90', 'Launched_Cmp', 'Launched_Att',
       'Launched_Cmp%', 'Passes_Att', 'Passes_Thr', 'Passes_Launch%',
       'Passes_AvgLen', 'Goal Kicks_Att', 'Goal Kicks_Launch%',
       'Goal Kicks_AvgLen', 'Opp', 'Stp', 'Stp%', '#OPA', '#OPA/90',
       'AvgDist']
goalkeeping_cols = ['GA',
       'GA90', 'SoTA', 'Saves', 'Save%', 'W', 'D', 'L', 'CS', 'CS%',
       'Penalty Kicks_PKatt', 'Penalty Kicks_PKA', 'Penalty Kicks_PKsv',
       'Penalty Kicks_PKm', 'Penalty Kicks_Save%']

playing_time_cols = ['Playing Time_Minutes', 'Playing Time_Mn/MP','Starts', 'Mn/Start', 'Compl',
                     'Subs', 'unSub', 'PPM','onG', 'onGA','On-Off'
                    ]
col_to_drop = adv_goalkeeping + goalkeeping_cols + playing_time_cols

# `drop` raises KeyError if any listed column is missing, which surfaces schema drift early.
df = df.drop(columns=col_to_drop)
df

Unnamed: 0_level_0,Tackles_Att,Tackles_Def 3rd,Tackles_Mid 3rd,Tackles_Att 3rd,Interceptions,Clearances,Errors,Dribblers_Tkl_Succ,Dribblers_Tkl_Att,Dribblers_Tkl_Lost,...,Playing Time_Mn/MP,Starts,Mn/Start,Compl,Subs,unSub,PPM,onG,onGA,On-Off
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ederson,3.0,3.0,0.0,0.0,1.0,10.0,2.0,1.0,2.0,1.0,...,90.0,35.0,90.0,35.0,0.0,3.0,2.34,89.0,32.0,0.30
Rodri,67.0,28.0,28.0,11.0,33.0,43.0,1.0,26.0,52.0,26.0,...,81.0,34.0,85.0,23.0,2.0,1.0,2.39,83.0,26.0,1.06
Erling Haaland,3.0,0.0,2.0,1.0,3.0,14.0,0.0,1.0,5.0,4.0,...,79.0,33.0,83.0,21.0,2.0,1.0,2.37,84.0,26.0,1.47
Kevin De Bruyne,28.0,3.0,12.0,13.0,9.0,10.0,0.0,14.0,39.0,25.0,...,76.0,28.0,84.0,16.0,4.0,3.0,2.31,69.0,23.0,0.37
İlkay Gündoğan,30.0,7.0,13.0,10.0,20.0,13.0,0.0,12.0,33.0,21.0,...,76.0,27.0,85.0,17.0,4.0,7.0,2.39,71.0,20.0,1.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Rildo,19.0,7.0,7.0,5.0,3.0,6.0,0.0,7.0,17.0,10.0,...,36.0,7.0,67.0,1.0,16.0,7.0,0.61,9.0,19.0,-0.21
Diogo Calila,15.0,6.0,8.0,1.0,9.0,11.0,1.0,8.0,11.0,3.0,...,43.0,6.0,72.0,4.0,9.0,7.0,0.40,2.0,13.0,-0.73
Eulânio Ângelo Chipela Gomes,11.0,5.0,1.0,5.0,4.0,9.0,0.0,5.0,6.0,1.0,...,52.0,5.0,86.0,4.0,6.0,1.0,0.45,8.0,20.0,-1.15
Anderson Carvalho,19.0,5.0,12.0,2.0,6.0,13.0,0.0,8.0,19.0,11.0,...,45.0,5.0,73.0,1.0,6.0,1.0,0.91,4.0,7.0,0.47


### Scale

In [2]:
from sklearn.preprocessing import StandardScaler, Normalizer

# Standardize the feature columns: learn the scaling statistics from `df`,
# then apply them to the same frame. `scaler` is kept for later reuse.
scaler = StandardScaler()
scaler.fit(df)
df_scaled = scaler.transform(df)
df_scaled.shape

(2942, 97)

### Create latent feature vector with Autoencoder
#### Dataloader

In [3]:
import torch
import numpy as np
from torch.utils.data import DataLoader, TensorDataset

# Seed both random number generators before anything stochastic runs
np.random.seed(SEED)
torch.manual_seed(SEED)

# Model / training dimensions
batch_size = 32
encoding_dim = 50  # Size of the bottleneck layer
input_dim = df_scaled.shape[1]  # One input unit per feature column

# Scaled features as a float32 tensor
data_tensor = torch.tensor(df_scaled, dtype=torch.float32)

# An autoencoder reconstructs its own input, so every sample is also its target
dataset = TensorDataset(data_tensor, data_tensor)
dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

#### Model, Criterion and Optimizer

In [4]:
import autoencoder
import torch.nn as nn
import torch.optim as optim

# Instantiate the autoencoder from the input width and bottleneck size
model = autoencoder.Autoencoder(input_dim, encoding_dim)

# Adam over every model parameter
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Reconstruction quality is scored with mean squared error
criterion = nn.MSELoss()

#### Training

In [5]:
# Number of passes over the data
num_epochs = 50

# Train the autoencoder; the returned model replaces `model`
model = autoencoder.train_model(
    model=model,
    num_epochs=num_epochs,
    optimizer=optimizer,
    criterion=criterion,
    dataloader=dataloader,
)

Epoch [10/50], Loss: 0.5978
Epoch [20/50], Loss: 0.5264
Epoch [30/50], Loss: 0.6447
Epoch [40/50], Loss: 0.5236
Epoch [50/50], Loss: 0.5846


#### Bottleneck Layer - latent vector - embeddings

In [6]:
# Push the whole data set through the trained network in a single call to get
# the latent (bottleneck) vector for every row of `data_tensor`.
# NOTE(review): confirm that `get_embeddings` switches the model to eval mode and
# disables gradient tracking; that cannot be verified from this notebook.
embeddings = model.get_embeddings(data_tensor)

In [20]:
# Sanity check: expect one row per player and `encoding_dim` columns.
embeddings.shape

(2942, 50)

### Top k Similar Players - Cosine

In [22]:
# Player whose nearest neighbours are inspected.
query_player = "Christopher Nkunku"

# `get_loc` returns a single integer only when the name matches exactly one row.
# For a name that occurs several times it returns a slice or boolean mask, which
# is not a usable row position for the embedding matrix, so fail loudly instead.
index = player_info.index.get_loc(query_player)
if not pd.api.types.is_integer(index):
    raise ValueError(
        f"{query_player!r} matches more than one row in player_info; "
        "pick the intended row position explicitly."
    )

# No `distance_metric` is passed here; the result carries a `cosine_similarity` column.
top_k_similar_players = utils.get_top_k_similar_players(
    embeddings=embeddings,
    query_index=index,
    player_info=player_info,
    top_k=10
)

top_k_similar_players

Unnamed: 0_level_0,Season,League,Team,Player,Nation,Pos,Global Pos,Age,Matches Played,Playing Time_Starts,Playing Time_Min,Playing Time_90s,cosine_similarity
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Christopher Nkunku,2022-2023,Bundesliga,RB Leipzig,Christopher Nkunku,FRA,"FW,MF",FW,24.0,25.0,20.0,1897.0,21.1,1.0
Ciro Immobile,2022-2023,SeriaA,Lazio,Ciro Immobile,ITA,FW,FW,32.0,31.0,27.0,2219.0,24.7,0.98003
Stephan El Shaarawy,2022-2023,SeriaA,Roma,Stephan El Shaarawy,ITA,"DF,MF",DF,29.0,29.0,14.0,1523.0,16.9,0.954751
Nicolas Jackson,2022-2023,LaLiga,Villarreal,Nicolas Jackson,SEN,FW,FW,21.0,26.0,16.0,1598.0,17.8,0.943562
Karim Benzema,2022-2023,LaLiga,Real Madrid,Karim Benzema,FRA,FW,FW,34.0,24.0,24.0,2038.0,22.6,0.94227
Santiago Giménez,2022-2023,EreDivisie,Feyenoord,Santiago Giménez,MEX,FW,FW,21.0,32.0,21.0,1926.0,21.4,0.933938
Burak Yılmaz,2022-2023,EreDivisie,Fortuna Sittard,Burak Yılmaz,TUR,FW,FW,37.0,26.0,24.0,2218.0,24.6,0.931689
Sadio Mané,2022-2023,Bundesliga,Bayern Munich,Sadio Mané,SEN,"FW,MF",FW,30.0,25.0,18.0,1425.0,15.8,0.931579
Jonathan Ikone,2022-2023,SeriaA,Fiorentina,Jonathan Ikone,FRA,FW,FW,24.0,33.0,24.0,2007.0,22.3,0.929005
Rafa Silva,2022-2023,PrimeiraLiga,Benfica,Rafa Silva,POR,"MF,FW",MF,29.0,28.0,26.0,2174.0,24.2,0.928688


### Top k Similar Players - Euclidean

In [16]:
# Player whose nearest neighbours are inspected.
query_player = "Christopher Nkunku"

# `get_loc` returns a single integer only when the name matches exactly one row.
# For a name that occurs several times it returns a slice or boolean mask, which
# is not a usable row position for the embedding matrix, so fail loudly instead.
index = player_info.index.get_loc(query_player)
if not pd.api.types.is_integer(index):
    raise ValueError(
        f"{query_player!r} matches more than one row in player_info; "
        "pick the intended row position explicitly."
    )

# Same query as the cosine ranking, but ranked by Euclidean distance.
top_k_similar_players = utils.get_top_k_similar_players(
    embeddings=embeddings,
    query_index=index,
    player_info=player_info,
    top_k=10,
    distance_metric='euclidean'
)

top_k_similar_players

Unnamed: 0_level_0,Season,League,Team,Player,Nation,Pos,Global Pos,Age,Matches Played,Playing Time_Starts,Playing Time_Min,Playing Time_90s,euclidean_distance
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Christopher Nkunku,2022-2023,Bundesliga,RB Leipzig,Christopher Nkunku,FRA,"FW,MF",FW,24.0,25.0,20.0,1897.0,21.1,0.0
Ciro Immobile,2022-2023,SeriaA,Lazio,Ciro Immobile,ITA,FW,FW,32.0,31.0,27.0,2219.0,24.7,6.653714
Nicolas Jackson,2022-2023,LaLiga,Villarreal,Nicolas Jackson,SEN,FW,FW,21.0,26.0,16.0,1598.0,17.8,10.904605
Karim Benzema,2022-2023,LaLiga,Real Madrid,Karim Benzema,FRA,FW,FW,34.0,24.0,24.0,2038.0,22.6,11.446707
Santiago Giménez,2022-2023,EreDivisie,Feyenoord,Santiago Giménez,MEX,FW,FW,21.0,32.0,21.0,1926.0,21.4,11.740934
Burak Yılmaz,2022-2023,EreDivisie,Fortuna Sittard,Burak Yılmaz,TUR,FW,FW,37.0,26.0,24.0,2218.0,24.6,11.840431
Sadio Mané,2022-2023,Bundesliga,Bayern Munich,Sadio Mané,SEN,"FW,MF",FW,30.0,25.0,18.0,1425.0,15.8,11.915724
Iuri Medeiros,2022-2023,PrimeiraLiga,Braga,Iuri Medeiros,POR,"MF,FW",MF,28.0,30.0,29.0,2059.0,22.9,12.389694
Jonathan Ikone,2022-2023,SeriaA,Fiorentina,Jonathan Ikone,FRA,FW,FW,24.0,33.0,24.0,2007.0,22.3,12.658566
Darwin Núñez,2022-2023,EPL,Liverpool,Darwin Núñez,URU,FW,FW,23.0,29.0,19.0,1695.0,18.8,12.98184
