### Load data

In [287]:
import pandas as pd 

# load and set index
df = pd.read_csv('../data/fbref/player_stats.csv')
df = df.set_index(df['Player'], drop=True)
df = df.fillna(0)

# seperate standard info and features
l = ['unique_id','Season','League','Team','Player',	'Nation',	'Pos',	'Age', 'Global Pos',
     "Playing Time_Starts",	"Playing Time_Min",	"Playing Time_90s"]
player_info = df[l]
features = [col for col in df.columns if col not in l]
df = df[features]

# Set seed for reproducibility
SEED = 42



### Scale

In [288]:
from sklearn.preprocessing import StandardScaler, Normalizer

scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

In [289]:
df_scaled.shape

(4803, 163)

### PCA

In [290]:
from sklearn.decomposition import PCA

pca = PCA(n_components=25)  # Retain 95% of variance
df_pca = pca.fit_transform(df_scaled)

### Autoencoder

In [291]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

from torch.utils.data import DataLoader, TensorDataset

torch.manual_seed(SEED)
np.random.seed(SEED)

# Convert to PyTorch tensors
data_tensor = torch.tensor(df_scaled, dtype=torch.float32)

# Create a DataLoader
batch_size = 32
dataset = TensorDataset(data_tensor, data_tensor)  # Input and target are the same for autoencoder
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [292]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(Autoencoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, encoding_dim)  # Bottleneck layer
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim),
            nn.Sigmoid()  # Output layer (reconstruction)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded
    
# Define model parameters
input_dim = df_scaled.shape[1]  # Input dimensions = number of features
encoding_dim = 50  # Size of the bottleneck layer

# Instantiate the model
model = Autoencoder(input_dim, encoding_dim)

### Criterion and Optimizer

In [293]:
# Loss function and optimizer
criterion = nn.MSELoss()  # Mean Squared Error loss for reconstruction
optimizer = optim.Adam(model.parameters(), lr=0.001)


### Training

In [294]:
# Training loop
num_epochs = 50
for epoch in range(num_epochs):
    for data in dataloader:
        inputs, _ = data  # Inputs are the data
        optimizer.zero_grad()

        # Forward pass: encode-decode
        outputs = model(inputs)
        loss = criterion(outputs, inputs)  # Reconstruction loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")


Epoch [10/50], Loss: 0.7015
Epoch [20/50], Loss: 1.4862
Epoch [30/50], Loss: 0.3624
Epoch [40/50], Loss: 0.2940
Epoch [50/50], Loss: 0.2606


### Bottleneck Layer

In [295]:
# Extract reduced embeddings
with torch.no_grad():  # No need to track gradients for inference
    embeddings = model.encoder(data_tensor).numpy()

# Normalize the embeddings (L2 normalization)
embeddings_norm = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)


### Compute cosine

In [296]:
# Let's say the first row is the query point
query_embedding = embeddings_norm[795]

# Compute cosine similarities between the query and the rest
cosine_similarities = np.dot(embeddings_norm, query_embedding)

# Get the top 10 most similar samples
top_10_similar = np.argsort(cosine_similarities)[-20:]
top_10_df = player_info.iloc[top_10_similar].copy()
top_10_df['cosine_similarity'] = cosine_similarities[top_10_similar]

display(top_10_df.sort_values('cosine_similarity',ascending=False))

Unnamed: 0_level_0,unique_id,Season,League,Team,Player,Nation,Pos,Age,Global Pos,Playing Time_Starts,Playing Time_Min,Playing Time_90s,cosine_similarity
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Christopher Nkunku,Christopher Nkunku_2022-2023_Bundesliga_RB Lei...,2022-2023,Bundesliga,RB Leipzig,Christopher Nkunku,FRA,"FW,MF",24.0,FW,20.0,1897.0,21.1,1.0
Ciro Immobile,Ciro Immobile_2022-2023_SeriaA_Lazio,2022-2023,SeriaA,Lazio,Ciro Immobile,ITA,FW,32.0,FW,27.0,2219.0,24.7,0.982619
Ademola Lookman,Ademola Lookman_2022-2023_SeriaA_Atalanta,2022-2023,SeriaA,Atalanta,Ademola Lookman,NGA,"FW,MF",24.0,FW,20.0,1729.0,19.2,0.958756
Rodrygo,Rodrygo_2022-2023_LaLiga_Real Madrid,2022-2023,LaLiga,Real Madrid,Rodrygo,BRA,"FW,MF",21.0,FW,25.0,2379.0,26.4,0.953164
Burak Yılmaz,Burak Yılmaz_2022-2023_EreDivisie_Fortuna Sittard,2022-2023,EreDivisie,Fortuna Sittard,Burak Yılmaz,TUR,FW,37.0,FW,24.0,2218.0,24.6,0.953157
Karim Benzema,Karim Benzema_2022-2023_LaLiga_Real Madrid,2022-2023,LaLiga,Real Madrid,Karim Benzema,FRA,FW,34.0,FW,24.0,2038.0,22.6,0.951127
Alexandre Lacazette,Alexandre Lacazette_2022-2023_Ligue1_Lyon,2022-2023,Ligue1,Lyon,Alexandre Lacazette,FRA,FW,31.0,FW,34.0,2929.0,32.5,0.947865
Folarin Balogun,Folarin Balogun_2022-2023_Ligue1_Reims,2022-2023,Ligue1,Reims,Folarin Balogun,USA,FW,21.0,FW,34.0,2999.0,33.3,0.947029
Khvicha Kvaratskhelia,Khvicha Kvaratskhelia_2022-2023_SeriaA_Napoli,2022-2023,SeriaA,Napoli,Khvicha Kvaratskhelia,GEO,FW,21.0,FW,30.0,2518.0,28.0,0.946528
Elye Wahi,Elye Wahi_2022-2023_Ligue1_Montpellier,2022-2023,Ligue1,Montpellier,Elye Wahi,FRA,FW,19.0,FW,29.0,2513.0,27.9,0.944847
