#### Notebook for straightforward masked players prediction

I highly recommend running this notebook on Google Colab.

1. First, connect the notebook to your Google Drive so that data can be loaded from it and outputs saved to it directly.

In [1]:
# Colab-only: mount Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import json
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import Trainer, TrainingArguments
from transformers.modeling_outputs import MaskedLMOutput

from safetensors.torch import load_file

from prettytable import PrettyTable
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


2. Prepare the data

In [2]:
import pandas as pd

# Load the per-player, per-match raw event counts; each row is one player in one match.
# NOTE(review): the rendered frame shows an 'Unnamed: 0' column, which suggests the row index
# saved in the CSV is read back as a regular column. It is not listed in ID_COLUMNS, so it would
# be kept among the form-stats features downstream - confirm whether that is intended before
# changing how the file is read (the pretrained checkpoint depends on the feature count).
df_input = pd.read_csv('../dataset/statsbomb/df_raw_counts_players_matches.csv') # consider replacing by the right path in your folder
df_input

Unnamed: 0,player_name,team_name,competition_name,season_name,match_id,is_aligned,position_id,position_name,pass_total,pass_cross,...,foul_won_total,foul_won_penalty,foul_committed_total,foul_committed_penalty,foul_committed_yellow_card,foul_committed_red_card,goalkeeper_goal_conceded,goalkeeper_save,goalkeeper_shot_faced,counterpress_total
0,Thierry Henry,Arsenal,Premier League,2003/2004,3749052,1,23,Left Center Forward,40,3,...,3,0,1,0,0,0,0,0,0,4
1,Dennis Bergkamp,Arsenal,Premier League,2003/2004,3749052,1,21,Right Center Forward,22,1,...,0,0,3,0,1,0,0,0,0,5
2,Eduardo César Daude Gaspar,Arsenal,Premier League,2003/2004,3749052,1,10,Left Defensive Midfield,72,1,...,3,0,3,0,0,0,0,0,0,14
3,Ashley Cole,Arsenal,Premier League,2003/2004,3749052,1,5,Left Back,68,2,...,0,0,0,0,0,0,0,0,0,8
4,Sulzeer Jeremiah ''Sol' Campbell,Arsenal,Premier League,2003/2004,3749052,1,4,Left Center Back,42,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68302,James Philip Milner,Liverpool,Champions League,2018/2019,22912,1,11,Right Midfield,7,0,...,0,0,1,0,0,0,0,0,0,1
68303,Lucas Rodrigues Moura da Silva,Tottenham Hotspur,Champions League,2018/2019,22912,1,11,Right Midfield,5,0,...,0,0,0,0,0,0,0,0,0,0
68304,Eric Dier,Tottenham Hotspur,Champions League,2018/2019,22912,1,14,Left Center Midfield,10,1,...,1,0,0,0,0,0,0,0,0,0
68305,Fernando Llorente Torres,Tottenham Hotspur,Champions League,2018/2019,22912,1,21,Right Center Forward,4,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Build the label <-> name lookup tables. A label is the position of the name in the
# categories of the column once it is cast to a pandas categorical.
player_categories = df_input['player_name'].astype('category').cat.categories
LABEL2PLAYER_NAME = {label: name for label, name in enumerate(player_categories)}
PLAYER_NAME2LABEL = {name: label for label, name in LABEL2PLAYER_NAME.items()}

team_categories = df_input['team_name'].astype('category').cat.categories
LABEL2TEAM_NAME = {label: name for label, name in enumerate(team_categories)}
TEAM_NAME2LABEL = {name: label for label, name in LABEL2TEAM_NAME.items()}

# vocabulary sizes (players, then teams)
for lookup in (PLAYER_NAME2LABEL, TEAM_NAME2LABEL):
    print(len(lookup))

5106
141


In [5]:
# Register the special tokens in the lookup tables.
# The ids must agree with the ones actually fed to the model (PLAYER_PAD_TOKEN_ID,
# PLAYER_MASK_TOKEN_ID and TEAM_PAD_TOKEN_ID): PAD comes right after the last real player and
# MASK right after PAD. The previous hard-coded values had PAD and MASK swapped, so decoding a
# label with LABEL2PLAYER_NAME named the two special tokens the wrong way round.
# The ids are derived from the data (not from len() of the dicts) so the cell can be re-run.
n_real_players = df_input['player_name'].nunique()
n_real_teams = df_input['team_name'].nunique()

PLAYER_NAME2LABEL['PAD'] = n_real_players
PLAYER_NAME2LABEL['MASK'] = n_real_players + 1
LABEL2PLAYER_NAME[n_real_players] = 'PAD'
LABEL2PLAYER_NAME[n_real_players + 1] = 'MASK'

TEAM_NAME2LABEL['PAD'] = n_real_teams
LABEL2TEAM_NAME[n_real_teams] = 'PAD'

In [6]:
# --- sequence layout ---
# A match sequence is padded to 2*TEAM_MAX_LENGTH player slots.
TEAM_MAX_LENGTH = 18
ID_COLUMNS = [
    'player_name', 'team_name', 'competition_name', 'season_name',
    'match_id', 'position_id', 'position_name',
]
# Every column that is neither an identifier nor the is_aligned flag counts as a form statistic.
FORM_STATS_SIZE = len(df_input.columns) - len(ID_COLUMNS) - 1

# --- largest real id of each vocabulary ---
MAX_PLAYER_IDX = df_input['player_name'].nunique() - 1
MAX_TEAM_IDX = df_input['team_name'].nunique() - 1
MAX_POSITION_IDX = 24

# --- special token ids: they sit right after the real ids ---
PLAYER_PAD_TOKEN_ID = MAX_PLAYER_IDX + 1
PLAYER_MASK_TOKEN_ID = PLAYER_PAD_TOKEN_ID + 1
TEAM_PAD_TOKEN_ID = MAX_TEAM_IDX + 1
POSITION_PAD_TOKEN_ID = MAX_POSITION_IDX + 1

# --- vocabulary sizes handed to the model ---
# NOTE(review): PlayerEncoder uses players_bank_size as the padding_idx of the players embedding,
# and PLAYERS_BANK_SIZE equals PLAYER_MASK_TOKEN_ID (not PLAYER_PAD_TOKEN_ID). Confirm this is
# intended before changing any of these values; the pretrained checkpoint was built with them.
PLAYERS_BANK_SIZE = MAX_PLAYER_IDX + 2
TEAMS_BANK_SIZE = MAX_TEAM_IDX + 1
POSITION_BANK_SIZE = MAX_POSITION_IDX + 1

In [7]:
class DataCollatorMaskedPM(Dataset):
    """Map-style dataset producing one masked-player sample per match.

    Despite its name this is a ``Dataset``, not a collator. Item ``idx`` is built from the
    rows of ``df_input`` belonging to the ``idx``-th distinct ``match_id`` and is padded to
    ``2*TEAM_MAX_LENGTH`` player slots.

    Masking is redrawn from numpy's global random state on every access, so reading the same
    index twice generally yields different masks.

    Args:
        df_input: one row per player and match, with the ID_COLUMNS, ``is_aligned`` and the
            form-statistics columns.
        player_pad_token_id: id written in the unused player slots.
        player_mask_token_id: id written in the masked player slots.
        team_pad_token_id: id written in the unused team slots.
        position_pad_toekn_id: id written in the unused position slots (the spelling of the
            parameter is kept for backward compatibility).
        player_name2label: mapping from player name to player id.
        team_name2label: mapping from team name to team id.
        mask_percentage: fraction of the attended players that gets masked.
    """

    def __init__(self,
                 df_input,
                 player_pad_token_id=PLAYER_PAD_TOKEN_ID,
                 player_mask_token_id=PLAYER_MASK_TOKEN_ID,
                 team_pad_token_id=TEAM_PAD_TOKEN_ID,
                 position_pad_toekn_id = POSITION_PAD_TOKEN_ID,
                 player_name2label=PLAYER_NAME2LABEL,
                 team_name2label=TEAM_NAME2LABEL,
                 mask_percentage = 0.25):

        self.df_input = df_input
        self.player_pad_token_id = player_pad_token_id
        self.player_mask_token_id = player_mask_token_id
        self.team_pad_token_id = team_pad_token_id
        self.position_pad_token_id = position_pad_toekn_id
        self.player_name2label = player_name2label
        self.team_name2label = team_name2label
        self.max_length = 2*TEAM_MAX_LENGTH
        self.mask_percentage = mask_percentage

        # distinct match ids in order of first appearance, computed once here instead of on
        # every __getitem__ call
        self._match_ids = df_input.match_id.unique()

    def __len__(self):
        """Number of distinct matches in ``df_input``."""
        return self.df_input.match_id.nunique()

    def _pad(self, values, pad_value):
        """Right-pad a 1-D sequence with ``pad_value`` up to ``self.max_length`` entries."""
        return np.pad(values, (0, self.max_length-len(values)), mode='constant', constant_values=pad_value)

    def mask_players(self, match_input_player_id, match_output_player_id, match_input_form_stats, match_attention_mask, player_mask_token_id, mask_percentage):
        """Mask a random subset of the attended players, modifying the arrays in place.

        ``int(n_attended * mask_percentage)`` slots are drawn without replacement among the
        slots whose attention mask is 1. For those slots the input id becomes
        ``player_mask_token_id`` and the form stats are zeroed; for every other slot the
        output id becomes -100 so that the loss ignores it.

        Returns:
            The three (modified) arrays: input ids, output ids, form stats.
        """
        # mask between only the players tokens, not the padding tokens
        maskable_idx = np.where(match_attention_mask == 1)[0]

        number_masked_players = int(len(maskable_idx)*mask_percentage)

        masked_idx = np.random.choice(maskable_idx, number_masked_players, replace=False)

        # boolean view of the drawn slots, used to select "everything else" in one step
        is_masked = np.zeros(len(match_input_player_id), dtype=bool)
        is_masked[masked_idx] = True

        match_input_player_id[masked_idx] = player_mask_token_id
        match_input_form_stats[masked_idx] = 0
        match_output_player_id[~is_masked] = -100

        return match_input_player_id, match_output_player_id, match_input_form_stats

    def __getitem__(self, idx):
        """Build the sample of the ``idx``-th match.

        Args:
            idx: position of the match, a number between 0 and len(dataset).

        Returns:
            A dict of tensors (``input_ids``, ``labels``, ``position_id``, ``team_id``,
            ``form_stats``, ``attention_mask``), or ``None`` (after printing a message) when
            the match does not contain exactly two teams.

        Raises:
            ValueError: if the match has more player rows than ``self.max_length``.
        """
        match_id = self._match_ids[idx]
        match_input = self.df_input[self.df_input.match_id == match_id]

        match_teams = match_input.team_name.unique()

        # validate the number of teams BEFORE using match_teams: indexing it first raised an
        # IndexError for a single-team match instead of reporting the problem
        if len(match_teams) != 2:
            print (f"Error: match {match_id} contains {len(match_teams)} teams !")
            return None

        # regroup the rows so that all players of the first team come before the second team
        match_input = pd.concat([match_input[match_input.team_name == team] for team in match_teams], ignore_index=True)

        if len(match_input) > self.max_length:
            raise ValueError(f"match {match_id} has {len(match_input)} player rows, more than max_length={self.max_length}")

        # encode the player_name to player_id; the input and output copies are padded separately
        # because mask_players edits them independently
        true_player_id = np.array([self.player_name2label[player_name] for player_name in match_input.player_name])
        match_input_player_id = self._pad(true_player_id, self.player_pad_token_id)
        match_output_player_id = self._pad(true_player_id, self.player_pad_token_id)

        # encode the team_name to team_id
        match_input_team_id = [self.team_name2label[team_name] for team_name in match_input.team_name]
        match_input_team_id = self._pad(match_input_team_id, self.team_pad_token_id)

        # spatial position id
        match_input_position_id = self._pad(np.array(match_input.position_id), self.position_pad_token_id)

        # remove the id columns
        match_input = match_input.drop(columns=ID_COLUMNS)

        # attention mask: is_aligned for the real rows, 0 for the padded slots
        match_attention_mask = self._pad(np.array(match_input.is_aligned), 0)

        match_input = match_input.drop(columns=['is_aligned'])

        # remaining columns are the form stats of each player; pad the player axis with zero rows
        match_input_form_stats = np.array(match_input)
        match_input_form_stats = np.pad(match_input_form_stats, ((0, self.max_length-match_input_form_stats.shape[0]), (0, 0)), mode='constant', constant_values=0)

        # masking strategy: mask_percentage of the attended players are masked
        match_input_player_id, match_output_player_id, match_input_form_stats = self.mask_players(
            match_input_player_id, match_output_player_id, match_input_form_stats, match_attention_mask,
            self.player_mask_token_id, self.mask_percentage)

        # return the dict of input and output data
        sample = {
                  'input_ids': torch.tensor(match_input_player_id, dtype=torch.long),
                  'labels': torch.tensor(match_output_player_id, dtype=torch.long),
                  'position_id': torch.tensor(match_input_position_id, dtype=torch.long),
                  'team_id': torch.tensor(match_input_team_id, dtype=torch.long),
                  'form_stats': torch.tensor(match_input_form_stats).float(),
                  'attention_mask': torch.tensor(match_attention_mask, dtype=torch.long),
                  }

        return sample


In [8]:
def custom_collate_fn(batch):
    """Collate a list of samples with PyTorch's default collate, ignoring ``None`` samples."""
    # drop the None entries before collating
    valid_samples = list(filter(lambda sample: sample is not None, batch))
    return torch.utils.data.dataloader.default_collate(valid_samples)

# number of matches collated into one batch
batch_size = 64

my_dataset = DataCollatorMaskedPM(df_input)
# shuffle=True: the matches are reshuffled on every pass; drop_last=True: only full batches are kept
my_dataloader = DataLoader(
    my_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=custom_collate_fn,
    drop_last = True,
)

# number of batches per pass
print(len(my_dataloader))

38


In [9]:
# Pre-generate the batches: `repeat` full passes over the dataloader.
repeat = 2
all_batches = []
for count in tqdm(range(repeat)):
    # each pass reshuffles the matches and redraws the masks of every sample
    all_batches.extend(my_dataloader)

# peek at one sample of the last generated batch
batch = all_batches[-1]
print("\n", batch['input_ids'][10])
print(batch['labels'][10])
print(len(all_batches))

100%|██████████| 2/2 [00:37<00:00, 18.55s/it]


 tensor([5107, 1118, 4654, 3874, 1690, 2222, 5107, 5107, 1629, 4056, 3860, 1258,
        3742, 2827, 3902, 2100,  778, 2753, 4239, 5107, 3043, 1512, 5107, 1064,
        5107,  662, 1015, 5107, 5106, 5106, 5106, 5106, 5106, 5106, 5106, 5106])
tensor([4145, -100, -100, -100, -100, -100, 4600, 2786, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, 3448, -100, -100,  993, -100,
         456, -100, -100, 3642, -100, -100, -100, -100, -100, -100, -100, -100])
76





In [10]:
class PreprocessedDataCollatorMaskedPM(Dataset):
    """Dataset over a list of already collated batches: item ``idx`` is the ``idx``-th batch."""

    def __init__(self, all_batches):
        # keep a reference to the list of prebuilt batches (no copy is made)
        self.data = all_batches

    def __len__(self):
        """Number of stored batches."""
        n_batches = len(self.data)
        return n_batches

    def __getitem__(self, idx):
        """Return the already preprocessed batch stored at position ``idx``."""
        stored_batch = self.data[idx]
        return stored_batch

In [11]:
# Hold out 5% of the pre-generated batches for validation.
# NOTE(review): the seed below only fixes WHICH positions are held out; the content of
# all_batches was drawn earlier without a seed. Every pass of the generation loop also covers
# the same matches with different masks, so held-out batches presumably share matches with
# training batches - confirm this is acceptable for the reported metrics.
dev_size = int(0.05*len(all_batches))

np.random.seed(42)
dev_batches_idx = np.random.choice(range(len(all_batches)), dev_size, replace=False)
print(dev_batches_idx)
held_out_positions = set(dev_batches_idx.tolist())
train_batches_idx = [idx for idx in range(len(all_batches)) if idx not in held_out_positions]

dev_batches = [all_batches[idx] for idx in dev_batches_idx]
train_batches = [all_batches[idx] for idx in train_batches_idx]

# each dataset item is already a full batch of matches, hence a loader batch size of 1
batch_size = 1

dataset_train = PreprocessedDataCollatorMaskedPM(train_batches)
dataset_val = PreprocessedDataCollatorMaskedPM(dev_batches)

dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=False)
dataloader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle=False)

for loader in (dataloader_train, dataloader_val):
    print(len(loader))

[ 4 35 10]
73
3


In [12]:
# observed the inputs shape
for batch in tqdm(dataloader_train):
    for key in batch:
        print(key, batch[key].shape)
    break

  0%|          | 0/73 [00:00<?, ?it/s]

input_ids torch.Size([1, 64, 36])
labels torch.Size([1, 64, 36])
position_id torch.Size([1, 64, 36])
team_id torch.Size([1, 64, 36])
form_stats torch.Size([1, 64, 36, 39])
attention_mask torch.Size([1, 64, 36])





##### model

In [13]:
class PlayerSelfAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The embedding is cut into ``heads`` chunks of ``head_dim`` features. The same three
    bias-free ``head_dim -> head_dim`` projections (values, keys, queries) are applied to
    every chunk, and ``fc_out`` mixes the concatenated heads back into ``embed_size``.
    """

    def __init__(self, embed_size, heads):
        super().__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size//heads

        assert(self.head_dim*heads == embed_size), "Embed size needs to be divisible by heads"

        # projections shared by all heads
        per_head = self.head_dim
        self.values = nn.Linear(per_head, per_head, bias=False)
        self.keys = nn.Linear(per_head, per_head, bias=False)
        self.queries = nn.Linear(per_head, per_head, bias=False)
        self.fc_out = nn.Linear(heads*per_head, embed_size)

    def forward(self, values, keys, queries, mask=None):
        """Attend from ``queries`` to ``keys``/``values``.

        Args:
            values, keys, queries: tensors indexed as (batch, length, embed_size).
            mask: optional (batch, key_len) tensor; keys where it equals 0 are hidden
                from every query.

        Returns:
            ``(out, attention)``: the (batch, query_len, embed_size) output and the
            (batch, heads, query_len, key_len) attention weights.
        """
        batch = queries.shape[0]
        n_values, n_keys, n_queries = values.shape[1], keys.shape[1], queries.shape[1]

        def split_and_project(layer, tensor, length):
            # (batch, length, embed) -> (batch, length, heads, head_dim), then project each head
            return layer(tensor.reshape(batch, length, self.heads, self.head_dim))

        v = split_and_project(self.values, values, n_values)
        k = split_and_project(self.keys, keys, n_keys)
        q = split_and_project(self.queries, queries, n_queries)

        # raw query-key scores, one matrix per head
        scores = torch.einsum("nqhd,nkhd->nhqk", q, k)

        if mask is not None:
            key_mask = mask[:, None, None, :].expand(batch, 1, n_queries, n_keys)
            scores = scores.masked_fill(key_mask == 0, float("-1e20"))

        # normalize across the key axis
        attention = torch.softmax(scores/(self.head_dim ** 0.5), dim=3)

        per_head_out = torch.einsum("nhql,nlhd->nqhd", attention, v)
        out = self.fc_out(per_head_out.reshape(batch, n_queries, self.heads*self.head_dim))

        return out, attention

class PlayerTransformerBlock(nn.Module):
    """One encoder block: self-attention and a feed-forward network, each followed by a
    residual connection, a LayerNorm and dropout (in that order)."""

    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super().__init__()
        self.attention = PlayerSelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        # hidden width of the feed-forward network
        hidden_size = embed_size*forward_expansion
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, embed_size),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask=None):
        """Return ``(out, attention_matrix)`` for the given value/key/query tensors."""
        attended, attention_matrix = self.attention(value, key, query, mask)
        # residual around the attention: the query is the skip connection
        x = self.dropout(self.norm1(attended + query))
        # residual around the feed-forward network
        transformed = self.feed_forward(x)
        out = self.dropout(self.norm2(transformed + x))
        return out, attention_matrix


class PlayerEncoder(nn.Module):
    """Embed every player slot of a match and run the stack of transformer blocks.

    The slot representation is the sum of the ReLU of the player embedding, a linear
    projection of the form stats, the position embedding and, when
    ``use_teams_embeddings`` is set, the team embedding. Note that the ReLU is applied to
    the player embedding only.

    Each embedding table has one more row than its bank size, and the bank size itself is
    used as ``padding_idx``.
    """

    def __init__(self, embed_size, num_layers, heads, forward_expansion, dropout, form_stats_size,
                  players_bank_size, teams_bank_size, n_positions, use_teams_embeddings = False):

        super().__init__()

        self.embed_size = embed_size
        self.use_teams_embeddings = use_teams_embeddings

        self.form_embeddings = nn.Linear(form_stats_size, embed_size)
        self.players_embeddings = nn.Embedding(players_bank_size+1, embed_size, padding_idx=players_bank_size)
        if use_teams_embeddings:
            # only created on demand, so that checkpoints trained without it still load
            self.teams_embeddings = nn.Embedding(teams_bank_size+1, embed_size, padding_idx=teams_bank_size)
        self.positions_embeddings = nn.Embedding(n_positions+1, embed_size, padding_idx=n_positions)

        blocks = [PlayerTransformerBlock(embed_size, heads, dropout, forward_expansion)
                  for _ in range(num_layers)]
        self.layers = nn.ModuleList(blocks)

        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, player_id, position_id, team_id, form_stats, attention_mask):
        """Return ``(out, attention_matrices)``: the encoded slots and one attention
        matrix per block."""
        slot_repr = self.relu(self.players_embeddings(player_id)) + self.form_embeddings(form_stats)
        if self.use_teams_embeddings:
            slot_repr = slot_repr + self.teams_embeddings(team_id)
        slot_repr = slot_repr + self.positions_embeddings(position_id)
        out = self.dropout(slot_repr)

        attention_matrices = []
        for block in self.layers:
            # self-attention: the same tensor is used as value, key and query
            out, block_attention = block(out, out, out, attention_mask)
            attention_matrices.append(block_attention)

        return out, attention_matrices

class TransformerForMaskedPM(nn.Module):
    """Masked-player model: a ``PlayerEncoder`` followed by a linear classifier over the
    players bank.

    Args:
        embed_size, num_layers, heads, forward_expansion, dropout: encoder hyper-parameters.
        form_stats_size: number of form-statistics features per player.
        players_bank_size: number of output classes of the classifier.
        teams_bank_size, n_positions: sizes forwarded to the encoder.
    """

    def __init__(self, embed_size, num_layers, heads, forward_expansion, dropout, form_stats_size= FORM_STATS_SIZE,
                  players_bank_size = PLAYERS_BANK_SIZE, teams_bank_size = TEAMS_BANK_SIZE,
                  n_positions = POSITION_BANK_SIZE):
        super(TransformerForMaskedPM, self).__init__()

        self.players_bank_size = players_bank_size

        self.player_encoder = PlayerEncoder(embed_size, num_layers, heads, forward_expansion, dropout, form_stats_size,
                                            players_bank_size, teams_bank_size, n_positions)

        self.decoder = nn.Linear(embed_size, players_bank_size)

        # positions labelled -100 (everything that was not masked) do not contribute to the loss
        self.criterion = nn.CrossEntropyLoss(ignore_index=-100)

    def forward(self, input_ids, labels, position_id, team_id, form_stats, attention_mask):
        """Encode the match, score every slot against the players bank and compute the loss.

        Every tensor is passed through ``squeeze(0)`` first: the dataset items are already
        full batches and the outer loader (batch size 1) adds a leading dimension of 1.

        Args:
            input_ids: player ids, masked slots holding the mask id.
            labels: true player ids at the masked slots and -100 elsewhere, or ``None`` to
                skip the loss (e.g. to extract embeddings or predictions).
            position_id, team_id, form_stats, attention_mask: remaining encoder inputs.

        Returns:
            ``MaskedLMOutput`` with ``loss`` (``None`` when ``labels`` is ``None``),
            ``logits``, ``hidden_states`` (encoder output) and ``attentions`` (list with one
            attention matrix per block).
        """
        input_ids = input_ids.squeeze(0)
        position_id = position_id.squeeze(0)
        team_id = team_id.squeeze(0)
        form_stats = form_stats.squeeze(0)
        attention_mask = attention_mask.squeeze(0)

        players_embeddings, attention_matrices = self.player_encoder(input_ids, position_id, team_id, form_stats, attention_mask)

        output = self.decoder(players_embeddings)

        # the loss is only defined when labels are given; previously labels=None crashed here
        loss = None
        if labels is not None:
            labels = labels.squeeze(0)
            # reshape (not view) so that non-contiguous inputs are accepted as well
            loss = self.criterion(output.reshape(-1, self.players_bank_size), labels.reshape(-1))

        return MaskedLMOutput(loss = loss,
                              logits = output,
                              hidden_states = players_embeddings,
                              attentions=attention_matrices)


def count_parameters(model, print_table = False):
    """Count the trainable parameters of ``model``.

    Args:
        model: module whose ``named_parameters()`` are inspected.
        print_table: when true, also print one row per trainable parameter tensor.

    Returns:
        The total number of trainable parameters (it is printed as well).
    """
    table = PrettyTable(["Modules", "Parameters"])
    # (name, size) of every parameter tensor that requires a gradient
    trainable = [(name, parameter.numel())
                 for name, parameter in model.named_parameters()
                 if parameter.requires_grad]
    for name, n_params in trainable:
        table.add_row([name, n_params])
    total_params = sum(n_params for _, n_params in trainable)
    if print_table:
        print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params

In [14]:
# Instantiate the model: 128-d embeddings, a single transformer block with 2 attention heads,
# a feed-forward expansion of 4 and 5% dropout. The remaining sizes use the class defaults.
model = TransformerForMaskedPM(embed_size=128,
                                num_layers=1,
                                heads=2,
                                forward_expansion=4,
                                dropout=0.05)

In [15]:
# Print one row per trainable parameter tensor, then the total count.
count_parameters(model, print_table=True)

+--------------------------------------------------+------------+
|                     Modules                      | Parameters |
+--------------------------------------------------+------------+
|      player_encoder.form_embeddings.weight       |    4992    |
|       player_encoder.form_embeddings.bias        |    128     |
|     player_encoder.players_embeddings.weight     |   653824   |
|    player_encoder.positions_embeddings.weight    |    3328    |
| player_encoder.layers.0.attention.values.weight  |    4096    |
|  player_encoder.layers.0.attention.keys.weight   |    4096    |
| player_encoder.layers.0.attention.queries.weight |    4096    |
| player_encoder.layers.0.attention.fc_out.weight  |   16384    |
|  player_encoder.layers.0.attention.fc_out.bias   |    128     |
|       player_encoder.layers.0.norm1.weight       |    128     |
|        player_encoder.layers.0.norm1.bias        |    128     |
|       player_encoder.layers.0.norm2.weight       |    128     |
|        p

1482099

### Training

In [16]:
def compute_metrics(eval_pred):
    """Top-1 and top-3 accuracy over the labelled (masked) player slots.

    Args:
        eval_pred: pair ``(model_output, labels)``. ``model_output`` is a 3-tuple whose
            first element holds the logits, indexed as (sample, player slot, class);
            ``labels`` is a 3-D array (eval batch, sample, player slot) containing -100 at
            every slot that must not be scored.

    Returns:
        ``{'accuracy_top1': float, 'accuracy_top3': float}``. Both values are NaN when no
        slot is labelled (this case used to raise a ZeroDivisionError).
    """
    model_output, labels = eval_pred
    # only the logits are needed; the unpacking still checks that three outputs are present
    pred, _, _ = model_output

    len_dev_batches, bs_sample, n_players = labels.shape

    # flatten so that each row of pred lines up with one entry of labels
    labels = labels.reshape(len_dev_batches*bs_sample*n_players,)
    pred = pred.reshape(pred.shape[0]*n_players, -1)

    # keep the scored slots only: -100 marks padding AND every player that was not masked
    is_scored = labels != -100
    labels = labels[is_scored]
    pred = pred[is_scored]

    if labels.size == 0:
        return {'accuracy_top1': float('nan'),
                'accuracy_top3': float('nan')}

    # most likely predicted player
    pred_top1_idx = np.argmax(pred, axis=1)

    # three most likely predicted players (ascending sort, so the last three columns)
    pred_top3_idx = np.argsort(pred, axis=1)[:, -3:]

    accuracy_top1 = float((labels == pred_top1_idx).mean())

    # a slot is a top-3 hit when its label equals any of its three candidates
    accuracy_top3 = float((pred_top3_idx == labels[:, None]).any(axis=1).mean())

    outputs = {'accuracy_top1': accuracy_top1,
               'accuracy_top3': accuracy_top3}

    return outputs


In [None]:
# Training configuration for the Hugging Face Trainer.
# batch_size is the loader batch size set in the split cell; each dataset item is itself a
# pre-built batch of matches.
# NOTE(review): the inline step/epoch comments below look stale. With the 73 training batches
# printed earlier in this notebook, one epoch would be 73 optimizer steps on a single device,
# i.e. 2000 epochs ~ 146K steps and 100 steps ~ 1.4 epochs - confirm against the run that
# produced the checkpoint.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to `eval_strategy` -
# check the installed version.
training_args = TrainingArguments(
    output_dir='outputs/masked_players_prediction',
    num_train_epochs=2000, # 342K steps
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    report_to="tensorboard",
    learning_rate=1e-4,
    warmup_ratio=0,
    evaluation_strategy="steps",
    eval_steps=100, # every 50 epochs
    logging_strategy="steps",
    logging_steps=100, # every 50 epochs
    save_strategy = "steps",
    save_steps = 42750, # every 250 epochs
)


# The Trainer evaluates on dataset_val and reports the accuracies returned by compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics
)

In [None]:
# Run the training loop configured above.
trainer.train()

#### Save embeddings

In [19]:
# Load my pretrained checkpoint (the one behind RisB-Profiler), trained for 2000 epochs,
# 342K steps on the statsbomb dataset without the teams embeddings.
state_dict = load_file('pretrained_ckpt/model.safetensors')

# the returned report (missing / unexpected keys) is displayed as the cell output
model.load_state_dict(state_dict)

<All keys matched successfully>

In [24]:
# Export the learned tables as numpy arrays.
# .cpu() is required because Tensor.numpy() only works on CPU tensors, and the model sits on the
# GPU after a GPU training run.
model_weights = model.state_dict()

# position table: the last row is the padding index of the positions embedding
positions_embeddings = model_weights['player_encoder.positions_embeddings.weight'].cpu().numpy()[:-1]
# player vectors are the rows of the output classifier (decoder), not of the input embedding
# table; the last row is the PAD class
players_embeddings = model_weights['decoder.weight'].cpu().numpy()[:-1]

print(positions_embeddings.shape, players_embeddings.shape)

np.save('pretrained_ckpt/players_embeddings_1l128d2h_wo_teams_emb_statsbomb_2454games.npy', players_embeddings)
np.save('pretrained_ckpt/positions_embeddings_1l128d2h_wo_teams_emb_statsbomb_2454games.npy', positions_embeddings)

(25, 128) (5106, 128)
