In [1]:
!pip install torch





In [2]:
!pip install transformers datasets evaluate accelerate





In [54]:
# Load the pretrained DistilBERT checkpoint with a masked-language-modeling head.
from transformers import AutoModelForMaskedLM

# Hub identifier of the base checkpoint; it is reused below to load the
# tokenizer and to derive the training output directory name.
model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
# Print the module tree to inspect the architecture.
print(model)

DistilBertForMaskedLM(
  (activation): GELUActivation()
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inp

In [30]:
# Report the model size in millions of parameters.
distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'")
# The BERT figure is a hard-coded reference value; it is not computed here.
print(f"'>>> BERT number of parameters: 110M'")

'>>> DistilBERT number of parameters: 67M'
'>>> BERT number of parameters: 110M'


In [31]:
# Probe sentence containing a single [MASK] position for the model to fill in.
text = "This is a great [MASK]."

In [32]:
# Load the tokenizer that belongs to the same checkpoint as the model.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Display the tokenizer's configured maximum sequence length.
# NOTE(review): the recorded output is an enormous sentinel-like number rather
# than a realistic limit — presumably no maximum is configured for this
# tokenizer; confirm before relying on it for truncation.
tokenizer.model_max_length

1000000000000000019884624838656

In [33]:
import torch

# Tokenize the probe sentence and move the tensors to the model's device so the
# cell keeps working if the model has been placed on an accelerator.
inputs = tokenizer(text, return_tensors="pt").to(model.device)
# Inference only: disable dropout and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

# Substitute each candidate into the original sentence for display.
for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'


In [34]:
from datasets import load_dataset
# Load the corpus by its Hub repository id; the bare name on the last line
# displays the resulting splits.
corpus = load_dataset("avinot/LoL-Champions-Corpus")
corpus

Downloading readme:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.18M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/300k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2260 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/564 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2260
    })
    test: Dataset({
        features: ['text'],
        num_rows: 564
    })
})

In [35]:
# Preview the raw text of the first three training rows.
sample = corpus["train"].select(range(3))

for row in sample:
    print(f"\n'>>> Content: {row['text']}'")


'>>> Content: Extremely squishy and can be easily killed if he is CC’d. His positioning matters in team fights as well, otherwise, he can get flanked by enemy assassins and get instantly deleted.Without his Ultimate c or s, his ability to dominate a team fight drastically decreases. Fighting him during this stage will give the enemy team a much greater chance of winning.Practically gets countered by a Control Ward as it forces him to reroute. This is because of the short-ranged nature of his auto-attacks, which makes him get detected quite easily. This is extremely disadvantageous when he is Jungling as it forces him to reveal himself pre-maturely.'

'>>> Content: Another point in his Ultimate R will increase his pick potential. He can easily lane gank the enemy laners and accumulate leads.He will now have multiple items at his disposal. This means that his damage will increase significantly, and he should have an easy time blowing up enemies.Rengar's presence during mid-game fights i

In [36]:
def tokenize_function(examples):
    """Tokenize a batch of rows and, for fast tokenizers, attach word ids.

    Args:
        examples: Batch mapping whose ``"text"`` entry holds the raw strings.

    Returns:
        The tokenizer output for the batch. When ``tokenizer.is_fast`` is true
        it additionally carries a ``"word_ids"`` entry with one list per
        encoded sequence.
    """
    encoded = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        return encoded
    # One word-id list per sequence in the batch.
    sequence_count = len(encoded["input_ids"])
    encoded["word_ids"] = list(map(encoded.word_ids, range(sequence_count)))
    return encoded


# Use batched=True to activate fast multithreading!
# The raw "text" column is removed so only the tokenizer outputs remain.
tokenized_datasets = corpus.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
tokenized_datasets

Map:   0%|          | 0/2260 [00:00<?, ? examples/s]

Map:   0%|          | 0/564 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 2260
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 564
    })
})

In [37]:
# Number of tokens per chunk; read as a global by the chunking cells and by group_texts.
chunk_size = 128

In [38]:
# Take the first three tokenized training examples and show how many tokens each has.
tokenized_samples = tokenized_datasets["train"][:3]
for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Corpus {idx} length: {len(sample)}'")

'>>> Corpus 0 length: 136'
'>>> Corpus 1 length: 91'
'>>> Corpus 2 length: 84'


In [39]:
# Flatten the per-example lists of every column into one long list per column.
concatenated_examples = {
    column: [item for sequence in tokenized_samples[column] for item in sequence]
    for column in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated corpus length: {total_length}'")

'>>> Concatenated corpus length: 311'


In [40]:
# Cut every concatenated column into consecutive windows of chunk_size items;
# the final window keeps whatever is left over, so it may be shorter.
chunks = {
    column: [
        values[start : start + chunk_size]
        for start in range(0, total_length, chunk_size)
    ]
    for column, values in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 55'


In [41]:
def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized examples and re-split it into equal chunks.

    Args:
        examples: Batch mapping of column name -> list of per-example lists.
            Must contain an ``"input_ids"`` column.
        block_size: Length of each output chunk. When ``None`` (the default)
            the notebook-level ``chunk_size`` is used, which is the behavior
            callers that pass only ``examples`` have always had.

    Returns:
        A dict with the same columns, each holding a list of chunks of exactly
        the chunk length, plus a ``"labels"`` column with independent copies of
        the ``"input_ids"`` chunks. Trailing items that do not fill a whole
        chunk are dropped.
    """
    from itertools import chain

    size = chunk_size if block_size is None else block_size
    # Concatenate all texts. chain.from_iterable flattens in linear time,
    # whereas sum(lists, []) re-copies the accumulated list at every step.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than the chunk length
    total_length = (total_length // size) * size
    # Split by chunks of the chunk length
    result = {
        k: [t[i : i + size] for i in range(0, total_length, size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column. Each chunk is copied individually so that
    # editing input_ids in place later cannot silently change the labels too.
    result["labels"] = [list(chunk) for chunk in result["input_ids"]]
    return result

In [42]:
# Apply group_texts batch-wise to every split; the bare name displays the result.
lm_dataset = tokenized_datasets.map(group_texts, batched=True)
lm_dataset

Map:   0%|          | 0/2260 [00:00<?, ? examples/s]

Map:   0%|          | 0/564 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 2043
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 510
    })
})

In [43]:
# Decode the second training chunk's input ids back to text for inspection.
tokenizer.decode(lm_dataset["train"][1]["input_ids"])

"reveal himself pre - maturely. [SEP] [CLS] another point in his ultimate r will increase his pick potential. he can easily lane gank the enemy laners and accumulate leads. he will now have multiple items at his disposal. this means that his damage will increase significantly, and he should have an easy time blowing up enemies. rengar's presence during mid - game fights is really fabulous. he should be able to win multiple fights and zone off enemies by using his ultimate r correctly. [SEP] [CLS] this champions level 2 is incredibly strong. they can often gain a health advantage or summoner advantage in the lane by playing aggressive and looking for"

In [44]:
# Decode the same chunk's labels; group_texts builds labels from input_ids, so the text matches.
tokenizer.decode(lm_dataset["train"][1]["labels"])

"reveal himself pre - maturely. [SEP] [CLS] another point in his ultimate r will increase his pick potential. he can easily lane gank the enemy laners and accumulate leads. he will now have multiple items at his disposal. this means that his damage will increase significantly, and he should have an easy time blowing up enemies. rengar's presence during mid - game fights is really fabulous. he should be able to win multiple fights and zone off enemies by using his ultimate r correctly. [SEP] [CLS] this champions level 2 is incredibly strong. they can often gain a health advantage or summoner advantage in the lane by playing aggressive and looking for"

In [45]:
from transformers import DataCollatorForLanguageModeling

# Collator used for training below; mlm_probability sets the masking rate to 0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [46]:
# Take the first two chunked examples and drop "word_ids" before collating.
# NOTE(review): presumably dropped because the collator cannot batch that column — confirm.
samples = [lm_dataset["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

# Decode the collated input ids to see where the collator changed tokens.
for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] extremely squishy and can be easily killed [MASK] he is cc ’ d educating his [MASK] [MASK] in team fights as well, otherwise, he [MASK] get flanked by [MASK] assassins and get instantly deleted. without his [MASK] c or s, his ability to dominate a team fight drastically decreases. fighting him during this stage will give the enemy team a much [MASK] [MASK] of winning. practically gets countered by [MASK] control ward as it [MASK] him to reroute. this [MASK] [MASK] of the short - ranged nature [MASK] his auto - attacks [MASK] which makes [MASK] get detected quite easily. this is extremely disadvantageous when he is jungling faa it forces him to'

'>>> reveal himself pre - maturely. [SEP] [CLS] another point in his ultimate r will increase his pick potential. [MASK] can easily lane gank [MASK] enemy laners and lore leads. he will now have multiple items at his disposal. this means that his [MASK] will increase significantly, and he should have an easy time blowing [MASK] enem

In [47]:
import collections
import numpy as np

from transformers import default_data_collator

# Per-word probability used by whole_word_masking_data_collator when sampling words to mask.
wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    """Collate features after masking randomly selected whole words.

    Each word (a run of consecutive tokens sharing a word id) is selected with
    probability ``wwm_probability``. All tokens of a selected word are replaced
    by the tokenizer's mask token id in ``input_ids``; ``labels`` keeps the
    original ids at those positions and is ``-100`` everywhere else.

    Args:
        features: List of mappings, each with ``"input_ids"``, ``"labels"`` and
            ``"word_ids"`` entries of equal length. Assumed to hold plain
            Python lists, as produced by the chunking step in this notebook.

    Returns:
        The result of ``default_data_collator`` applied to the masked features
        (without the ``"word_ids"`` entry).
    """
    masked_features = []
    for feature in features:
        # Work on a shallow copy so the caller's dicts are left untouched.
        feature = dict(feature)
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is None:
                # A token without a word id ends the current word. Without this
                # reset, equal word ids on both sides of such a token (e.g. two
                # concatenated examples) would be merged into a single word.
                current_word = None
                continue
            if word_id != current_word:
                current_word = word_id
                current_word_index += 1
            mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        # Copy before editing so the caller's input_ids list is not modified.
        input_ids = list(feature["input_ids"])
        labels = feature["labels"]
        # -100 marks positions that carry no training target.
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["input_ids"] = input_ids
        feature["labels"] = new_labels
        masked_features.append(feature)

    return default_data_collator(masked_features)

In [48]:
# Run the whole-word masking collator on the first two chunked examples.
samples = [lm_dataset["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

# Decode the masked input ids to inspect which words were replaced by [MASK].
for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] extremely squishy and can be easily killed if he is [MASK] ’ [MASK]. [MASK] positioning matters [MASK] [MASK] [MASK] [MASK] well, otherwise, he can get flanked by enemy [MASK] and get instantly [MASK] [MASK] without his ultimate c or s, his ability to dominate a team fight drastically [MASK]. fighting him during this [MASK] will give the enemy team [MASK] much greater chance of winning. practically gets countered by a [MASK] ward as it forces him to reroute [MASK] this is because of the short [MASK] ranged nature of his [MASK] [MASK] [MASK], which makes him get detected [MASK] easily. this is extremely disadvantageous [MASK] he [MASK] jungling as it forces him to'

'>>> reveal himself pre [MASK] maturely [MASK] [SEP] [CLS] another point [MASK] his ultimate [MASK] will increase his [MASK] potential [MASK] he can easily lane [MASK] [MASK] the enemy [MASK] [MASK] [MASK] accumulate leads [MASK] he will now have multiple items at his disposal. this [MASK] [MASK] his damage will 

In [49]:
from huggingface_hub import notebook_login

# Opens the interactive Hugging Face login widget.
# NOTE(review): the training arguments below set push_to_hub=False, so this
# login may not be needed for the visible cells — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [61]:
from transformers import TrainingArguments

# The same batch size is used for training and evaluation.
batch_size = 64
# Show the training loss with every epoch
logging_steps = len(lm_dataset["train"]) // batch_size
# Keep only the part after the last "/" of the checkpoint id for the output directory name.
model_name = model_checkpoint.split("/")[-1]

print(model_name)

# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=f"./{model_name}-finetuned-LoL-champions",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
    fp16=False,
    logging_steps=logging_steps,
)

distilbert-base-uncased


In [64]:
from transformers import Trainer

# Train on the chunked train split and evaluate on the chunked test split.
# The random token-masking collator is used here, not the whole-word masking one.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [65]:
import math

# Baseline before fine-tuning: perplexity is reported as exp(eval_loss).
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

  0%|          | 0/8 [00:00<?, ?it/s]

>>> Perplexity: 34.97


In [66]:
# Fine-tune the model; the recorded run took roughly 4,284 seconds (see train_runtime below).
trainer.train()

  0%|          | 0/96 [00:00<?, ?it/s]

{'loss': 2.7972, 'grad_norm': 3.6852333545684814, 'learning_rate': 1.3541666666666668e-05, 'epoch': 0.97}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 2.420370101928711, 'eval_runtime': 117.2594, 'eval_samples_per_second': 4.349, 'eval_steps_per_second': 0.068, 'epoch': 1.0}
{'loss': 2.426, 'grad_norm': 3.4909021854400635, 'learning_rate': 7.083333333333335e-06, 'epoch': 1.94}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 2.213554620742798, 'eval_runtime': 109.7001, 'eval_samples_per_second': 4.649, 'eval_steps_per_second': 0.073, 'epoch': 2.0}
{'loss': 2.2728, 'grad_norm': 3.4451940059661865, 'learning_rate': 6.25e-07, 'epoch': 2.91}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 2.1835858821868896, 'eval_runtime': 102.7926, 'eval_samples_per_second': 4.961, 'eval_steps_per_second': 0.078, 'epoch': 3.0}
{'train_runtime': 4284.4993, 'train_samples_per_second': 1.431, 'train_steps_per_second': 0.022, 'train_loss': 2.4906685252984366, 'epoch': 3.0}


TrainOutput(global_step=96, training_loss=2.4906685252984366, metrics={'train_runtime': 4284.4993, 'train_samples_per_second': 1.431, 'train_steps_per_second': 0.022, 'total_flos': 203116831428096.0, 'train_loss': 2.4906685252984366, 'epoch': 3.0})

In [67]:
# Re-evaluate after fine-tuning to compare against the earlier baseline perplexity.
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

  0%|          | 0/8 [00:00<?, ?it/s]

>>> Perplexity: 8.96


In [72]:
# Domain-specific probe sentence for the fine-tuned model.
test_text = "Aatrox has a [MASK] early game."

# The model has been through Trainer, which may have moved it to an
# accelerator; put the inputs on the same device to avoid a device mismatch.
inputs = tokenizer(test_text, return_tensors="pt").to(model.device)
# Inference only: disable dropout and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

# Substitute each candidate into the probe sentence for display.
for token in top_5_tokens:
    print(f"'>>> {test_text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> Aatrox has a very early game.'
'>>> Aatrox has a good early game.'
'>>> Aatrox has a relatively early game.'
'>>> Aatrox has a fairly early game.'
'>>> Aatrox has a strong early game.'
