In [52]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler, Normalizer

# load and set index
df = pd.read_csv('../data/fbref/player_stats.csv')
df = df.set_index(df['Player'], drop=True)
df = df.fillna(0)

# seperate standard info and features
#l = ['unique_id','Season','League','Team','Player',	'Nation',	'Pos',	'Age', 'Global Pos',
#     "Playing Time_Starts",	"Playing Time_Min",	"Playing Time_90s"]

player_info_cols = ['unique_id','Season', 'League', 'Team', 'Player', 'Nation', 'Pos','Global Pos', 'Age','Matches Played','Playing Time_Starts','Playing Time_Min','Playing Time_90s']


player_info = df[player_info_cols]
features = [col for col in df.columns if col not in player_info_cols]
df = df[features]

# Set seed for reproducibility
SEED = 42

# scale data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

### VAE

In [53]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class VAE(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(VAE, self).__init__()
        
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU()
        )
        
        # Latent space parameters
        self.fc_mu = nn.Linear(32, encoding_dim)  # Mean of the latent space
        self.fc_logvar = nn.Linear(32, encoding_dim)  # Log-variance of the latent space
        
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim),
            nn.Sigmoid()  # Output layer (reconstruction)
        )

    def encode(self, x):
        h = self.encoder(x)
        mu = self.fc_mu(h)
        logvar = self.fc_logvar(h)
        return mu, logvar
    
    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)  # Compute standard deviation
        eps = torch.randn_like(std)  # Random normal noise
        z = mu + eps * std  # Reparameterization trick
        return z

    def decode(self, z):
        return self.decoder(z)
    
    def forward(self, x):
        # Encoding step
        mu, logvar = self.encode(x)
        
        # Reparameterization trick
        z = self.reparameterize(mu, logvar)
        
        # Decoding step
        recon_x = self.decode(z)
        
        return recon_x, mu, logvar

# Loss function for VAE
def vae_loss_function(recon_x, x, mu, logvar):
    # Reconstruction loss (can use MSE or BCE, depending on the data)
    recon_loss = F.mse_loss(recon_x, x, reduction='sum')
    
    # KL Divergence loss
    kld_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    
    return recon_loss + kld_loss


def train_model(model, dataloader, criterion, optimizer, num_epochs=50):
    model.train()  # Set model to training mode
    
    for epoch in range(num_epochs):
        running_loss = 0.0
        
        for data in dataloader:
            inputs = data[0].to('cpu')  # Assuming inputs are in the first index of the batch
            
            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            recon_batch, mu, logvar = model(inputs)
            
            # Calculate loss
            recon_loss = criterion(recon_batch, inputs)
            kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
            
            loss = recon_loss + kl_loss
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()
            
            # Accumulate the loss
            running_loss += loss.item()
        
             
        # Logging every log_interval epochs
        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
    
    return model

### Dataloader, Criterion, Optimizer

In [54]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

from torch.utils.data import DataLoader, TensorDataset

torch.manual_seed(SEED)
np.random.seed(SEED)

# Convert to PyTorch tensors
data_tensor = torch.tensor(df_scaled, dtype=torch.float32)

# Create a DataLoader
batch_size = 32
dataset = TensorDataset(data_tensor, data_tensor)  # Input and target are the same for autoencoder
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define model parameters
input_dim = df_scaled.shape[1]  # Input dimensions = number of features
encoding_dim = 100  # Size of the bottleneck layer

# model
model = VAE(input_dim, encoding_dim)

# Loss function and optimizer
criterion = nn.MSELoss()  # Mean Squared Error loss for reconstruction
optimizer = optim.Adam(model.parameters(), lr=0.001)

### Training

In [55]:
model = train_model(model,dataloader,criterion,optimizer,50)

Epoch [10/50], Loss: 0.3304
Epoch [20/50], Loss: 0.4458
Epoch [30/50], Loss: 0.3200
Epoch [40/50], Loss: 0.3208
Epoch [50/50], Loss: 1.0484


### Embeddings from bottleneck layer

In [56]:
# Extract reduced embeddings
with torch.no_grad():  # No need to track gradients for inference
    embeddings = model.encoder(data_tensor).numpy()

embeddings.shape, embeddings

((4803, 32),
 array([[0.        , 0.        , 0.        , ..., 0.297474  , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.23543501, 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.22022833, 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.22417916, 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.22412325, 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.22417916, 0.        ,
         0.        ]], dtype=float32))

In [58]:
embeddings[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.21202084, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.18621151,
       0.        , 0.        , 0.        , 0.        , 0.20859191,
       0.3275454 , 0.        , 0.        , 0.        , 0.297474  ,
       0.        , 0.        ], dtype=float32)

In [57]:
# Normalize the embeddings (L2 normalization)
embeddings_norm = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

# Let's say the first row is the query point
query_embedding = embeddings_norm[795]

# Compute cosine similarities between the query and the rest
cosine_similarities = np.dot(embeddings_norm, query_embedding)

# Get the top 10 most similar samples
top_10_similar = np.argsort(cosine_similarities)[-20:]
top_10_df = player_info.iloc[top_10_similar].copy()
top_10_df['cosine_similarity'] = cosine_similarities[top_10_similar]

top_10_df = top_10_df.sort_values('cosine_similarity',ascending=False)
display(top_10_df.drop(columns='unique_id'))

Unnamed: 0_level_0,Season,League,Team,Player,Nation,Pos,Global Pos,Age,Matches Played,Playing Time_Starts,Playing Time_Min,Playing Time_90s,cosine_similarity
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Christopher Nkunku,2022-2023,Bundesliga,RB Leipzig,Christopher Nkunku,FRA,"FW,MF",FW,24.0,25.0,20.0,1897.0,21.1,1.0
Álex Collado,2022-2023,LaLiga,Elche,Álex Collado,ESP,MF,MF,23.0,16.0,8.0,729.0,8.1,0.999999
Wissam Ben Yedder,2022-2023,Ligue1,Monaco,Wissam Ben Yedder,FRA,FW,FW,31.0,32.0,28.0,2114.0,23.5,0.999999
Stefan Ortega,2022-2023,EPL,Manchester City,Stefan Ortega,GER,GK,GK,29.0,3.0,3.0,270.0,3.0,0.999998
Lukas Klostermann,2022-2023,Bundesliga,RB Leipzig,Lukas Klostermann,GER,DF,DF,26.0,15.0,10.0,907.0,10.1,0.999998
Mihai Dobre,2022-2023,PrimeiraLiga,Famalicão,Mihai Dobre,ROU,"FW,DF",FW,23.0,17.0,11.0,939.0,10.4,0.999997
Antoine Mendy,2022-2023,Ligue1,Nice,Antoine Mendy,FRA,DF,DF,18.0,11.0,7.0,518.0,5.8,0.999997
Christian Eriksen,2022-2023,EPL,Manchester Utd,Christian Eriksen,DEN,MF,MF,30.0,28.0,25.0,2048.0,22.8,0.999997
Bryan Mbeumo,2022-2023,EPL,Brentford,Bryan Mbeumo,CMR,FW,FW,22.0,38.0,36.0,2906.0,32.3,0.999997
Anthony Jung,2022-2023,Bundesliga,Werder Bremen,Anthony Jung,GER,"DF,FW",DF,30.0,34.0,32.0,2617.0,29.1,0.999997
