<a href="https://colab.research.google.com/github/doctorsmylie/mtg-draft-agent/blob/main/testing/transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Testing the transformers

## Try to train transformer models with draft data.

## First, try sequence to sequence.

## Then, try choosing pick given pack/pool.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import shutil
import zipfile

from torch.utils.data import DataLoader
import warnings,transformers,logging,torch
from transformers import TrainingArguments,Trainer
from transformers import AutoModelForSequenceClassification,AutoTokenizer

import datasets
from datasets import load_dataset, Dataset, DatasetDict

In [3]:
# Folder containing all the data
folder='/content/drive/MyDrive/MTGdraft'

# Expansion code
expansion = 'DSK'

In [4]:
draftfilename ='draft_data_public.DSK.PremierDraft.csv.gz'
draft_file = Path(folder, expansion, draftfilename)

draftdata=pd.read_csv(draft_file,compression='gzip', nrows=10000)

In [5]:
draftdata.head()

Unnamed: 0,expansion,event_type,draft_id,draft_time,rank,event_match_wins,event_match_losses,pack_number,pick_number,pick,...,pool_Waltz of Rage,pool_Wary Watchdog,pool_Wickerfolk Thresher,pool_Wildfire Wickerfolk,pool_Winter's Intervention,"pool_Winter, Misanthropic Guide",pool_Withering Torment,"pool_Zimone, All-Questioning",user_n_games_bucket,user_game_win_rate_bucket
0,DSK,PremierDraft,53401b113a4f425fa26e60edd314dd27,2024-09-24 18:48:56,diamond,1,3,0,0,Infernal Phantom,...,0,0,0,0,0,0,0,0,500,0.58
1,DSK,PremierDraft,53401b113a4f425fa26e60edd314dd27,2024-09-24 18:48:56,diamond,1,3,0,1,Floodpits Drowner,...,0,0,0,0,0,0,0,0,500,0.58
2,DSK,PremierDraft,53401b113a4f425fa26e60edd314dd27,2024-09-24 18:48:56,diamond,1,3,0,2,Fear of Being Hunted,...,0,0,0,0,0,0,0,0,500,0.58
3,DSK,PremierDraft,53401b113a4f425fa26e60edd314dd27,2024-09-24 18:48:56,diamond,1,3,0,3,Fear of Being Hunted,...,0,0,0,0,0,0,0,0,500,0.58
4,DSK,PremierDraft,53401b113a4f425fa26e60edd314dd27,2024-09-24 18:48:56,diamond,1,3,0,4,Undead Sprinter,...,0,0,0,0,0,0,0,0,500,0.58


# Part I: Predict next pick based on sequence of picks?

In [5]:
# Create a list to store the pick sequences
pick_sequences = []

# Group the data by draft_id and iterate through each draft
for draft_id, draft_df in draftdata.groupby('draft_id'):
    # Sort the picks within each draft by pick number
    draft_df = draft_df.sort_values(by='pick_number')

    # Extract the pick for each row in the draft
    picked_cards = draft_df['pick'].tolist()

    # Add the draft_id and the sequence of picked cards to the list
    pick_sequences.append({'draft_id': draft_id, 'pick_sequence': picked_cards})

# Create a new DataFrame from the list of pick sequences
pick_sequence_df = pd.DataFrame(pick_sequences)

# Display the new DataFrame
display(pick_sequence_df.head())

Unnamed: 0,draft_id,pick_sequence
0,0041b2ed5a16424ebd63fcbede00776f,"[Patchwork Beastie, Scorching Dragonfire, Beas..."
1,004fc7f51d7f47fe92267908072ae788,"[Rip, Spawn Hunter, Defiant Survivor, Popular ..."
2,00c08bf90a7c44d2a1d2daf748cedd54,"[Split Up, Razorkin Needlehead, Optimistic Sca..."
3,02821de13f12445f87366450bdee588e,"[Hedge Shredder, Waltz of Rage, Glassworks // ..."
4,02ae214587434247a6e3d07ffe428d47,"[Cursed Windbreaker, Silent Hallcreeper, Endur..."


# Preparation and Training I
Prepare the data in the dataframe `draftdata` for training a machine learning model to predict card picks. The data should be tokenized, sequences should be padded or truncated, and the data should be split into training, validation, and testing sets.

## Tokenization

### Subtask:
Convert the card names (strings) in each pick sequence into numerical representations (tokens) that the model can process. This might involve creating a vocabulary of all unique card names and mapping each card to an integer ID.


**Reasoning**:
Extract unique card names, create a vocabulary including a padding token, and map the card names in the pick sequences to their corresponding integer IDs.



In [6]:
# Extract all unique card names
all_picks = [card for sequence in pick_sequence_df['pick_sequence'] for card in sequence]
unique_cards = sorted(list(set(all_picks)))

# Create a vocabulary with a padding token
card_to_id = {'<PAD>': 0}
for i, card in enumerate(unique_cards):
    card_to_id[card] = i + 1

# Map card names to integer IDs in the pick sequences
pick_sequence_df['input_ids'] = pick_sequence_df['pick_sequence'].apply(lambda sequence: [card_to_id[card] for card in sequence])

# Store the vocabulary
vocabulary = card_to_id

# Display the updated DataFrame with tokenized sequences
display(pick_sequence_df.head())

Unnamed: 0,draft_id,pick_sequence,input_ids
0,0041b2ed5a16424ebd63fcbede00776f,"[Patchwork Beastie, Scorching Dragonfire, Beas...","[178, 206, 14, 277, 17, 206, 167, 276, 80, 22,..."
1,004fc7f51d7f47fe92267908072ae788,"[Rip, Spawn Hunter, Defiant Survivor, Popular ...","[197, 45, 185, 279, 168, 199, 214, 84, 52, 55,..."
2,00c08bf90a7c44d2a1d2daf748cedd54,"[Split Up, Razorkin Needlehead, Optimistic Sca...","[222, 192, 166, 155, 73, 11, 39, 34, 11, 201, ..."
3,02821de13f12445f87366450bdee588e,"[Hedge Shredder, Waltz of Rage, Glassworks // ...","[115, 273, 99, 4, 117, 154, 153, 276, 55, 155,..."
4,02ae214587434247a6e3d07ffe428d47,"[Cursed Windbreaker, Silent Hallcreeper, Endur...","[40, 215, 59, 148, 148, 223, 144, 178, 116, 13..."


## Sequence padding/truncation

### Subtask:
Since machine learning models typically require fixed-length inputs, you'll need to handle sequences of varying lengths. This could involve padding shorter sequences with a special token or truncating longer sequences to a maximum length.


**Reasoning**:
Calculate the maximum sequence length and then pad and truncate the sequences in the 'tokenized_sequence' column to a defined maximum length.



In [None]:
# Determine the maximum sequence length
max_sequence_length = pick_sequence_df['inputs_embeds'].apply(len).max()
print(f"Maximum sequence length: {max_sequence_length}")

# Define a maximum sequence length for padding/truncation
# Choosing a value based on the distribution of sequence lengths (e.g., the calculated max length)
# or a practical constraint (e.g., a fixed number like 40) is a decision.
# For this example, let's use the calculated maximum length for simplicity.
# In a real scenario, you might analyze the distribution of lengths to choose an appropriate value.
defined_max_length = max_sequence_length # Or a fixed number like 40

# Pad and truncate sequences
padded_truncated_sequences = pick_sequence_df['inputs_embeds'].apply(
    lambda sequence: sequence[:defined_max_length] + [card_to_id['<PAD>']] * (defined_max_length - len(sequence))
)

# Update the 'tokenized_sequence' column
pick_sequence_df['inputs_embeds'] = padded_truncated_sequences

# Display the updated DataFrame with padded/truncated sequences
display(pick_sequence_df.head())

Maximum sequence length: 42


Unnamed: 0,draft_id,pick_sequence,tokenized_sequence,inputs,inputs_embeds
0,0041b2ed5a16424ebd63fcbede00776f,"[Patchwork Beastie, Scorching Dragonfire, Beas...","[178, 206, 14, 277, 17, 206, 167, 276, 80, 22,...","[178, 206, 14, 277, 17, 206, 167, 276, 80, 22,...","[178, 206, 14, 277, 17, 206, 167, 276, 80, 22,..."
1,004fc7f51d7f47fe92267908072ae788,"[Rip, Spawn Hunter, Defiant Survivor, Popular ...","[197, 45, 185, 279, 168, 199, 214, 84, 52, 55,...","[197, 45, 185, 279, 168, 199, 214, 84, 52, 55,...","[197, 45, 185, 279, 168, 199, 214, 84, 52, 55,..."
2,00c08bf90a7c44d2a1d2daf748cedd54,"[Split Up, Razorkin Needlehead, Optimistic Sca...","[222, 192, 166, 155, 73, 11, 39, 34, 11, 201, ...","[222, 192, 166, 155, 73, 11, 39, 34, 11, 201, ...","[222, 192, 166, 155, 73, 11, 39, 34, 11, 201, ..."
3,02821de13f12445f87366450bdee588e,"[Hedge Shredder, Waltz of Rage, Glassworks // ...","[115, 273, 99, 4, 117, 154, 153, 276, 55, 155,...","[115, 273, 99, 4, 117, 154, 153, 276, 55, 155,...","[115, 273, 99, 4, 117, 154, 153, 276, 55, 155,..."
4,02ae214587434247a6e3d07ffe428d47,"[Cursed Windbreaker, Silent Hallcreeper, Endur...","[40, 215, 59, 148, 148, 223, 144, 178, 116, 13...","[40, 215, 59, 148, 148, 223, 144, 178, 116, 13...","[40, 215, 59, 148, 148, 223, 144, 178, 116, 13..."


## Splitting data

### Subtask:
Split the prepared data into training, validation, and testing sets.


**Reasoning**:
Split the prepared data into training, validation, and testing sets using train_test_split.



In [None]:
from sklearn.model_selection import train_test_split

# Split into training and temporary sets
train_df, temp_df = train_test_split(pick_sequence_df, test_size=0.2, random_state=42)

# Split temporary set into validation and testing sets
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Display the shapes of the resulting dataframes
print("Training set shape:", train_df.shape)
print("Validation set shape:", val_df.shape)
print("Testing set shape:", test_df.shape)

Training set shape: (191, 5)
Validation set shape: (24, 5)
Testing set shape: (24, 5)


## Creating dataset and dataloader

### Subtask:
If you plan to use a deep learning framework like PyTorch or TensorFlow, you'll need to create custom Dataset and DataLoader objects to efficiently feed the data to the model during training.


**Reasoning**:
Convert the pandas DataFrames to Hugging Face Dataset objects and combine them into a DatasetDict.



In [None]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

display(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['draft_id', 'pick_sequence', 'tokenized_sequence', 'inputs', 'inputs_embeds', '__index_level_0__'],
        num_rows: 191
    })
    validation: Dataset({
        features: ['draft_id', 'pick_sequence', 'tokenized_sequence', 'inputs', 'inputs_embeds', '__index_level_0__'],
        num_rows: 24
    })
    test: Dataset({
        features: ['draft_id', 'pick_sequence', 'tokenized_sequence', 'inputs', 'inputs_embeds', '__index_level_0__'],
        num_rows: 24
    })
})

## Creating dataset and dataloader

### Subtask:
If you plan to use a deep learning framework like PyTorch or TensorFlow, you'll need to create custom Dataset and DataLoader objects to efficiently feed the data to the model during training.


**Reasoning**:
Create Hugging Face Dataset objects from the dataframes with features and combine them into a DatasetDict.



In [None]:
train_dataset_features = Dataset.from_pandas(train_df_features)
val_dataset_features = Dataset.from_pandas(val_df_features)
test_dataset_features = Dataset.from_pandas(test_df_features)

dataset_dict_features = DatasetDict({
    'train': train_dataset_features,
    'validation': val_dataset_features,
    'test': test_dataset_features
})

display(dataset_dict_features)

DatasetDict({
    train: Dataset({
        features: ['draft_id', 'pick_sequence', 'tokenized_sequence', 'pick_number', 'rank', 'event_match_wins', 'event_match_losses', 'user_n_games_bucket', 'user_game_win_rate_bucket'],
        num_rows: 13342
    })
    validation: Dataset({
        features: ['draft_id', 'pick_sequence', 'tokenized_sequence', 'pick_number', 'rank', 'event_match_wins', 'event_match_losses', 'user_n_games_bucket', 'user_game_win_rate_bucket'],
        num_rows: 1680
    })
    test: Dataset({
        features: ['draft_id', 'pick_sequence', 'tokenized_sequence', 'pick_number', 'rank', 'event_match_wins', 'event_match_losses', 'user_n_games_bucket', 'user_game_win_rate_bucket'],
        num_rows: 1680
    })
})

## Summary:

### Data Analysis Key Findings

*   Unique card names were extracted, and a vocabulary mapping card names to integer IDs (with a padding token) was created.
*   The pick sequences were successfully tokenized into numerical representations based on the created vocabulary.
*   The maximum sequence length in the tokenized data was found to be 42.
*   All tokenized sequences were padded with the '<PAD>' token (ID 0) or truncated to a fixed length of 42.
*   The dataset was split into training, validation, and testing sets with approximately 80% for training, 10% for validation, and 10% for testing.
*   Relevant features ('rank', 'event\_match\_wins', 'event\_match\_losses', 'user\_n\_games\_bucket', 'user\_game\_win\_rate\_bucket') were successfully extracted from the original data and merged with the tokenized sequences at the individual pick level.
*   The data with engineered features was also split into corresponding training, validation, and testing sets.
*   Both the data with and without engineered features were converted into Hugging Face `Dataset` and `DatasetDict` objects, ready for use with deep learning frameworks.

### Insights or Next Steps

*   The data is now fully prepared for training a machine learning model, with sequences tokenized, padded/truncated, and relevant features engineered and included.
*   The next step is to define and train a machine learning model (e.g., a recurrent neural network or transformer) using the prepared `DatasetDict` with engineered features.


In [None]:
display(train_dataset)

Dataset({
    features: ['draft_id', 'pick_sequence', 'tokenized_sequence', 'inputs', '__index_level_0__'],
    num_rows: 191
})

In [None]:
model_name = 'microsoft/deberta-v3-small'

def corr(eval_pred):
  return{'pearson': np.corrcoef(*eval_pred)[0][1]}

bs = 128
epochs = 4
lr = 8e-5
wd = 0.01

In [None]:
args = TrainingArguments(
  'outputs',
  learning_rate=lr,
  warmup_ratio=0.1,
  lr_scheduler_type='cosine',  # cosine scheduler (with warmup)
  fp16=True,
  eval_strategy="epoch",
  do_train=True,
  do_eval=True,
  no_cuda=False,
  per_device_train_batch_size=bs,
  per_device_eval_batch_size=bs*2, # evaluate using double-sized batches, since no gradients are stored so we can do twice as many rows at a time
  num_train_epochs=epochs,
  #logging_steps=100,
  weight_decay=wd,
  report_to='none'
)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=max_sequence_length)
model

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-5): 6 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNo

In [None]:
# Prepare the dataset for the model
# We need to rename the 'inputs_embeds' column to 'input_ids'
def rename_inputs(example):
    example['input_ids'] = example['inputs_embeds']
    del example['inputs_embeds']
    return example

dataset_dict_prepared = dataset_dict.map(rename_inputs, remove_columns=['pick_sequence', 'tokenized_sequence', 'inputs', '__index_level_0__'])

# Add a dummy 'labels' column for the sequence classification model
# Replace this with your actual target variable if needed
def add_labels(example):
    # Assuming a dummy label for sequence classification.
    # If your task is different (e.g., predicting the next pick), the label would be the shifted sequence.
    example['labels'] = example['input_ids']
    #example['labels'] = float(0) # Replace with your actual label calculation and ensure it's a float
    return example

dataset_dict_prepared = dataset_dict_prepared.map(add_labels)

display(dataset_dict_prepared)


trainer = Trainer(
  model,
  args,
  train_dataset=dataset_dict_prepared['train'],
  eval_dataset=dataset_dict_prepared['validation'], # validation set
  #tokenizer=tokenizer, # defined above
  compute_metrics=corr # we defined this above
)
trainer

Map:   0%|          | 0/191 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Map:   0%|          | 0/191 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['draft_id', 'input_ids', 'labels'],
        num_rows: 191
    })
    validation: Dataset({
        features: ['draft_id', 'input_ids', 'labels'],
        num_rows: 24
    })
    test: Dataset({
        features: ['draft_id', 'input_ids', 'labels'],
        num_rows: 24
    })
})

<transformers.trainer.Trainer at 0x7b9278820650>

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Pearson
1,No log,20990.925781,0.119973
2,No log,20990.925781,0.119973
3,No log,20990.925781,0.119973
4,No log,20990.925781,0.119973


TrainOutput(global_step=8, training_loss=21661.724609375, metrics={'train_runtime': 27.2564, 'train_samples_per_second': 28.03, 'train_steps_per_second': 0.294, 'total_flos': 8308198134432.0, 'train_loss': 21661.724609375, 'epoch': 4.0})

# Part II: Use pool/pack cards as inputs, pick as target.

In [7]:
# Create a list to store the pick, pack, and pool information
pick_pack_pool_data = []

# Iterate through each row of the original draftdata
for index, row in draftdata.iterrows():
    # Extract the pick
    pick = row['pick']

    # Extract the pack cards (columns starting with 'pack_')
    pack_cards = [col.replace('pack_card_', '') for col in draftdata.columns if col.startswith('pack_') and row[col] > 0]

    # Extract the pool cards (columns starting with 'pool_')
    pool_cards = [col.replace('pool_', '') for col in draftdata.columns if col.startswith('pool_') and row[col] > 0]

    # Append the data to the list
    pick_pack_pool_data.append({
        'draft_id': row['draft_id'],
        'pack_number': row['pack_number'],
        'pick_number': row['pick_number'],
        'pick': pick,
        'pack': pack_cards,
        'pool': pool_cards
    })

# Create a new DataFrame from the list
pick_pack_pool_df = pd.DataFrame(pick_pack_pool_data)

# Display the new DataFrame
display(pick_pack_pool_df.head())

Unnamed: 0,draft_id,pack_number,pick_number,pick,pack,pool
0,53401b113a4f425fa26e60edd314dd27,0,0,Infernal Phantom,"[Cursed Recording, Emerge from the Cocoon, Fea...",[]
1,53401b113a4f425fa26e60edd314dd27,0,1,Floodpits Drowner,"[Floodpits Drowner, Forest, Hardened Escort, M...",[Infernal Phantom]
2,53401b113a4f425fa26e60edd314dd27,0,2,Fear of Being Hunted,"[Balemurk Leech, Bashful Beastie, Bleeding Woo...","[Floodpits Drowner, Infernal Phantom]"
3,53401b113a4f425fa26e60edd314dd27,0,3,Fear of Being Hunted,"[Balemurk Leech, Conductive Machete, Cult Heal...","[Fear of Being Hunted, Floodpits Drowner, Infe..."
4,53401b113a4f425fa26e60edd314dd27,0,4,Undead Sprinter,"[Bedhead Beastie, Conductive Machete, Daggerma...","[Fear of Being Hunted, Floodpits Drowner, Infe..."


# Preparation and Training II
Create a dataframe from `draftdata` with columns 'pick', 'pack', and 'pool', where 'pack' and 'pool' are sequences of cards. Then, tokenize the card names in 'pick', 'pack', and 'pool' columns, pad or truncate the tokenized sequences in 'pack' and 'pool' to a fixed length, merge with selected features from `draftdata`, split the data into training, validation, and testing sets, create Hugging Face `Dataset` and `DatasetDict` objects, define and train a model to predict 'pick' based on 'pack', 'pool', and engineered features, and evaluate the model.

## Tokenization

### Subtask:
Convert the card names in the 'pick', 'pack', and 'pool' columns of `pick_pack_pool_df` into numerical tokens using the existing vocabulary.


**Reasoning**:
Tokenize the 'pick', 'pack', and 'pool' columns using the existing vocabulary.



In [14]:
# Re-define the vocabulary and card_to_id mapping
all_picks = [card for sequence in pick_sequence_df['pick_sequence'] for card in sequence]
unique_cards = sorted(list(set(all_picks)))

card_to_id = {'<PAD>': 0}
for i, card in enumerate(unique_cards):
    card_to_id[card] = i + 1

vocabulary = card_to_id

# Tokenize the 'pick' column
pick_pack_pool_df['pick_token'] = pick_pack_pool_df['pick'].apply(lambda x: card_to_id.get(x, card_to_id['<PAD>']))

# Tokenize the 'pack' column
pick_pack_pool_df['pack_tokens'] = pick_pack_pool_df['pack'].apply(lambda pack_list: [card_to_id.get(card, card_to_id['<PAD>']) for card in pack_list])

# Tokenize the 'pool' column
pick_pack_pool_df['pool_tokens'] = pick_pack_pool_df['pool'].apply(lambda pool_list: [card_to_id.get(card, card_to_id['<PAD>']) for card in pool_list])

# Display the updated DataFrame
display(pick_pack_pool_df.head())

Unnamed: 0,draft_id,pack_number,pick_number,pick,pack,pool,pick_token,pack_tokens,pool_tokens
0,53401b113a4f425fa26e60edd314dd27,0,0,Infernal Phantom,"[Cursed Recording, Emerge from the Cocoon, Fea...",[],120,"[39, 57, 76, 88, 119, 120, 151, 157, 176, 186,...",[]
1,53401b113a4f425fa26e60edd314dd27,0,1,Floodpits Drowner,"[Floodpits Drowner, Forest, Hardened Escort, M...",[Infernal Phantom],87,"[87, 88, 112, 143, 146, 162, 198, 203, 205, 21...",[120]
2,53401b113a4f425fa26e60edd314dd27,0,2,Fear of Being Hunted,"[Balemurk Leech, Bashful Beastie, Bleeding Woo...","[Floodpits Drowner, Infernal Phantom]",71,"[9, 12, 18, 71, 91, 110, 155, 168, 186, 225, 2...","[87, 120]"
3,53401b113a4f425fa26e60edd314dd27,0,3,Fear of Being Hunted,"[Balemurk Leech, Conductive Machete, Cult Heal...","[Fear of Being Hunted, Floodpits Drowner, Infe...",71,"[9, 33, 38, 63, 71, 84, 89, 110, 229, 248, 268]","[71, 87, 120]"
4,53401b113a4f425fa26e60edd314dd27,0,4,Undead Sprinter,"[Bedhead Beastie, Conductive Machete, Daggerma...","[Fear of Being Hunted, Floodpits Drowner, Infe...",249,"[15, 33, 42, 112, 116, 126, 144, 212, 248, 249]","[71, 87, 120]"


## Sequence padding/truncation

### Subtask:
Pad or truncate the tokenized sequences in the 'pack' and 'pool' columns to a fixed length.


**Reasoning**:
Calculate the maximum sequence lengths for 'pack_tokens' and 'pool_tokens', choose a fixed length, and then pad and truncate the sequences in these columns.



In [9]:
# Determine the maximum sequence length for 'pack_tokens' and 'pool_tokens'
max_pack_length = pick_pack_pool_df['pack_tokens'].apply(len).max()
max_pool_length = pick_pack_pool_df['pool_tokens'].apply(len).max()
print(f"Maximum pack sequence length: {max_pack_length}")
print(f"Maximum pool sequence length: {max_pool_length}")

# Define a fixed maximum sequence length for padding/truncation.
# Let's use the maximum pool length as it's larger than the maximum pack length.
defined_max_length_seq = max_pool_length

# Pad and truncate 'pack_tokens' sequences
pick_pack_pool_df['pack_tokens_padded'] = pick_pack_pool_df['pack_tokens'].apply(
    lambda sequence: sequence[:defined_max_length_seq] + [card_to_id['<PAD>']] * (defined_max_length_seq - len(sequence))
)

# Pad and truncate 'pool_tokens' sequences
pick_pack_pool_df['pool_tokens_padded'] = pick_pack_pool_df['pool_tokens'].apply(
    lambda sequence: sequence[:defined_max_length_seq] + [card_to_id['<PAD>']] * (defined_max_length_seq - len(sequence))
)

# Display the updated DataFrame with padded/truncated sequences
display(pick_pack_pool_df.head())

Maximum pack sequence length: 15
Maximum pool sequence length: 40


Unnamed: 0,draft_id,pack_number,pick_number,pick,pack,pool,pick_token,pack_tokens,pool_tokens,pack_tokens_padded,pool_tokens_padded
0,53401b113a4f425fa26e60edd314dd27,0,0,Infernal Phantom,"[Cursed Recording, Emerge from the Cocoon, Fea...",[],120,"[39, 57, 76, 88, 119, 120, 151, 157, 176, 186,...",[],"[39, 57, 76, 88, 119, 120, 151, 157, 176, 186,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,53401b113a4f425fa26e60edd314dd27,0,1,Floodpits Drowner,"[Floodpits Drowner, Forest, Hardened Escort, M...",[Infernal Phantom],87,"[87, 88, 112, 143, 146, 162, 198, 203, 205, 21...",[120],"[87, 88, 112, 143, 146, 162, 198, 203, 205, 21...","[120, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
2,53401b113a4f425fa26e60edd314dd27,0,2,Fear of Being Hunted,"[Balemurk Leech, Bashful Beastie, Bleeding Woo...","[Floodpits Drowner, Infernal Phantom]",71,"[9, 12, 18, 71, 91, 110, 155, 168, 186, 225, 2...","[87, 120]","[9, 12, 18, 71, 91, 110, 155, 168, 186, 225, 2...","[87, 120, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,53401b113a4f425fa26e60edd314dd27,0,3,Fear of Being Hunted,"[Balemurk Leech, Conductive Machete, Cult Heal...","[Fear of Being Hunted, Floodpits Drowner, Infe...",71,"[9, 33, 38, 63, 71, 84, 89, 110, 229, 248, 268]","[71, 87, 120]","[9, 33, 38, 63, 71, 84, 89, 110, 229, 248, 268...","[71, 87, 120, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,53401b113a4f425fa26e60edd314dd27,0,4,Undead Sprinter,"[Bedhead Beastie, Conductive Machete, Daggerma...","[Fear of Being Hunted, Floodpits Drowner, Infe...",249,"[15, 33, 42, 112, 116, 126, 144, 212, 248, 249]","[71, 87, 120]","[15, 33, 42, 112, 116, 126, 144, 212, 248, 249...","[71, 87, 120, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [33]:
# Create attention mask for padded pack tokens
pick_pack_pool_df['attention_mask'] = pick_pack_pool_df['pack_tokens_padded'].apply(lambda sequence: [1 if token != card_to_id['<PAD>'] else 0 for token in sequence])

# Display the updated DataFrame with the attention mask
display(pick_pack_pool_df.head())

Unnamed: 0,draft_id,pack_number,pick_number,pick,pack,pool,pick_token,pack_tokens,pool_tokens,pack_tokens_padded,pool_tokens_padded,input_ids,labels,attention_mask
0,53401b113a4f425fa26e60edd314dd27,0,0,Infernal Phantom,"[Cursed Recording, Emerge from the Cocoon, Fea...",[],120,"[39, 57, 76, 88, 119, 120, 151, 157, 176, 186,...",[],"[39, 57, 76, 88, 119, 120, 151, 157, 176, 186,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[39, 57, 76, 88, 119, 120, 151, 157, 176, 186,...",120,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ..."
1,53401b113a4f425fa26e60edd314dd27,0,1,Floodpits Drowner,"[Floodpits Drowner, Forest, Hardened Escort, M...",[Infernal Phantom],87,"[87, 88, 112, 143, 146, 162, 198, 203, 205, 21...",[120],"[87, 88, 112, 143, 146, 162, 198, 203, 205, 21...","[120, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[87, 88, 112, 143, 146, 162, 198, 203, 205, 21...",87,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ..."
2,53401b113a4f425fa26e60edd314dd27,0,2,Fear of Being Hunted,"[Balemurk Leech, Bashful Beastie, Bleeding Woo...","[Floodpits Drowner, Infernal Phantom]",71,"[9, 12, 18, 71, 91, 110, 155, 168, 186, 225, 2...","[87, 120]","[9, 12, 18, 71, 91, 110, 155, 168, 186, 225, 2...","[87, 120, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[9, 12, 18, 71, 91, 110, 155, 168, 186, 225, 2...",71,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ..."
3,53401b113a4f425fa26e60edd314dd27,0,3,Fear of Being Hunted,"[Balemurk Leech, Conductive Machete, Cult Heal...","[Fear of Being Hunted, Floodpits Drowner, Infe...",71,"[9, 33, 38, 63, 71, 84, 89, 110, 229, 248, 268]","[71, 87, 120]","[9, 33, 38, 63, 71, 84, 89, 110, 229, 248, 268...","[71, 87, 120, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[9, 33, 38, 63, 71, 84, 89, 110, 229, 248, 268...",71,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ..."
4,53401b113a4f425fa26e60edd314dd27,0,4,Undead Sprinter,"[Bedhead Beastie, Conductive Machete, Daggerma...","[Fear of Being Hunted, Floodpits Drowner, Infe...",249,"[15, 33, 42, 112, 116, 126, 144, 212, 248, 249]","[71, 87, 120]","[15, 33, 42, 112, 116, 126, 144, 212, 248, 249...","[71, 87, 120, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[15, 33, 42, 112, 116, 126, 144, 212, 248, 249...",249,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ..."


In [10]:
#Rename the columns to those expected by the model.
pick_pack_pool_df['input_ids'] = pick_pack_pool_df['pack_tokens_padded']
pick_pack_pool_df['labels'] = pick_pack_pool_df['pick_token']

## Splitting data

Split the prepared data with tokenized and padded/truncated sequences into training, validation, and testing sets.

In [34]:
from sklearn.model_selection import train_test_split

# Define features (pack and pool tokens) and target (pick token)
features_df = pick_pack_pool_df[['input_ids','attention_mask']]
target_series = pick_pack_pool_df['labels']

# Split into training and temporary sets
X_train, X_temp, y_train, y_temp = train_test_split(features_df, target_series, test_size=0.2, random_state=42)

# Split temporary set into validation and testing sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Combine features and target back into dataframes for easier handling with Datasets
train_df_split = X_train.copy()
train_df_split['labels'] = y_train

val_df_split = X_val.copy()
val_df_split['labels'] = y_val

test_df_split = X_test.copy()
test_df_split['labels'] = y_test


# Display the shapes of the resulting dataframes
print("Training set shape:", train_df_split.shape)
print("Validation set shape:", val_df_split.shape)
print("Testing set shape:", test_df_split.shape)

Training set shape: (8000, 3)
Validation set shape: (1000, 3)
Testing set shape: (1000, 3)


## Creating dataset and dataloader



Convert the pandas DataFrames to Hugging Face Dataset objects and combine them into a DatasetDict.

In [35]:
train_dataset_split = Dataset.from_pandas(train_df_split)
val_dataset_split = Dataset.from_pandas(val_df_split)
test_dataset_split = Dataset.from_pandas(test_df_split)

dataset_dict_split = DatasetDict({
    'train': train_dataset_split,
    'validation': val_dataset_split,
    'test': test_dataset_split
})

display(dataset_dict_split)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', '__index_level_0__'],
        num_rows: 8000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', '__index_level_0__'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', '__index_level_0__'],
        num_rows: 1000
    })
})

In [29]:
model_name = 'microsoft/deberta-v3-small'

def corr(eval_pred):
  return{'pearson': np.corrcoef(*eval_pred)[0][1]}

bs = 128
epochs = 5
lr = 8e-5
wd = 0.01

In [36]:
args = TrainingArguments(
  'outputs',
  learning_rate=lr,
  warmup_ratio=0.1,
  lr_scheduler_type='cosine',  # cosine scheduler (with warmup)
  fp16=True,
  eval_strategy="epoch",
  do_train=True,
  do_eval=True,
  no_cuda=False,
  per_device_train_batch_size=bs,
  per_device_eval_batch_size=bs*2, # evaluate using double-sized batches, since no gradients are stored so we can do twice as many rows at a time
  num_train_epochs=epochs,
  logging_steps=100,
  weight_decay=wd,
  report_to='none'
)

In [37]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(vocabulary))
model

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-5): 6 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNo

In [38]:
import torch.nn as nn

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
  model,
  args,
  train_dataset=dataset_dict_split['train'],
  eval_dataset=dataset_dict_split['validation'], # validation set
  #tokenizer=tokenizer, # defined above
  #compute_metrics=corr # we defined this above
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,5.373836
2,5.428500,5.20294
3,5.428500,5.063106
4,5.134100,5.011009
5,4.972000,4.992558


TrainOutput(global_step=315, training_loss=5.166092354910714, metrics={'train_runtime': 96.3162, 'train_samples_per_second': 415.299, 'train_steps_per_second': 3.27, 'total_flos': 416035056000000.0, 'train_loss': 5.166092354910714, 'epoch': 5.0})

## Evaluation


In [27]:
# Evaluate the model on the test set
evaluation_results = trainer.evaluate(dataset_dict_split['test'])

# Display the evaluation results
print(evaluation_results)

{'eval_loss': 5.1539626121521, 'eval_runtime': 0.5131, 'eval_samples_per_second': 1948.816, 'eval_steps_per_second': 7.795, 'epoch': 4.0}
