In [1]:
!pip install torch





In [2]:
!pip install transformers datasets evaluate accelerate setfit





In [3]:
from transformers import AutoModelForMaskedLM

model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
print(model)

DistilBertForMaskedLM(
  (activation): GELUActivation()
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inp

In [4]:
text = "Aatrox has a [MASK] early game"

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.model_max_length

1000000000000000019884624838656

In [6]:
import torch

inputs = tokenizer(text, return_tensors="pt")
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> Aatrox has a very early game'
'>>> Aatrox has a similar early game'
'>>> Aatrox has a relatively early game'
'>>> Aatrox has a typical early game'
'>>> Aatrox has a notable early game'


In [7]:
from datasets import load_dataset
corpus = load_dataset("avinot/LoL-Champions-Corpus")
corpus

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 3393
    })
    test: Dataset({
        features: ['text'],
        num_rows: 847
    })
})

In [8]:
sample = corpus["train"].select(range(3))

for row in sample:
    print(f"\n'>>> Content: {row['text']}'")


'>>> Content: Morgana is quite decent in team fights as she will be able to run down enemies with her Q and Ultimate R. Her E will allow her to ave her allies from hard CC and engages.Adding 2 points in her Ultimate R will give her a massive power spike. She will be able to use the ability frequently, impacting team fights often.At level nine, her Q will be maxed out. It will let her get frequent picks on the enemy, especially from the fog of war. '

'>>> Content: Jarvan’s all-in can be interrupted, but it will be hard. It’s best to try and dodge it and then turn it around.Try to keep the Drake warded so you can see if Jarvan is taking it. Ping and communicate with your team so you can counter them and try to kill them or steal the Dragon if possible.Beware of Jarvan's early game pressure. They’re much stronger than you and will try to make plays to get themselves and their teammates ahead.Jarvan truly comes online when they have’ve unlocked their Ultimate and hit level 6. Beware that

In [9]:
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = corpus.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 3393
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 847
    })
})

In [10]:
chunk_size = 128

In [11]:
tokenized_samples = tokenized_datasets["train"][:3]
for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Corpus {idx} length: {len(sample)}'")

'>>> Corpus 0 length: 102'
'>>> Corpus 1 length: 141'
'>>> Corpus 2 length: 97'


In [12]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated corpus length: {total_length}'")

'>>> Concatenated corpus length: 340'


In [13]:
chunks = {
    k : [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 84'


In [14]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [15]:
lm_dataset = tokenized_datasets.map(group_texts, batched=True)
lm_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 3105
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 776
    })
})

In [16]:
tokenizer.decode(lm_dataset["train"][1]["input_ids"])

"it and then turn it around. try to keep the drake warded so you can see if jarvan is taking it. ping and communicate with your team so you can counter them and try to kill them or steal the dragon if possible. beware of jarvan's early game pressure. they ’ re much stronger than you and will try to make plays to get themselves and their teammates ahead. jarvan truly comes online when they have ’ ve unlocked their ultimate and hit level 6. beware that they will start ganking more frequently when they have it up. [SEP] [CLS] graves is a good early game skirmisher. make sure you"

In [17]:
tokenizer.decode(lm_dataset["train"][1]["labels"])

"it and then turn it around. try to keep the drake warded so you can see if jarvan is taking it. ping and communicate with your team so you can counter them and try to kill them or steal the dragon if possible. beware of jarvan's early game pressure. they ’ re much stronger than you and will try to make plays to get themselves and their teammates ahead. jarvan truly comes online when they have ’ ve unlocked their ultimate and hit level 6. beware that they will start ganking more frequently when they have it up. [SEP] [CLS] graves is a good early game skirmisher. make sure you"

In [18]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [19]:
samples = [lm_dataset["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] morgana is quite decent in team fights as she will be able to run down enemies with her [MASK] and ultimate r. her [MASK] will allow her to ave her allies from hard [MASK] and [MASK]. adding 2 points in [MASK] [MASK] r will give her a massive power spike. she [MASK] be able to [MASK] the ability frequently janeiro impacting team fights [MASK]. at [MASK] nine, her q will be max [MASK] out. it roasted [MASK] her get frequent picks on the enemy [MASK] especially from the fog of war. [SEP] [CLS] jarvan ’ [MASK] all [MASK] in can be′, but it will be hard [MASK] it ’ s [MASK] to try and dodge'

'>>> it and then [MASK] it around. try to [MASK] the drake [MASK]ed so [MASK] can see if jarvan is [MASK] it. ping and communicate with your team so you can counter them and try to kill them or steal the dragon if [MASK]. beware of jarvan's early game pressure [MASK] they [MASK] [MASK] much stronger than you and [MASK] try to make plays to get themselves and their teammates ahead. jarvan t

In [20]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

In [21]:
samples = [lm_dataset["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] [MASK] [MASK] [MASK] quite decent [MASK] team fights as she will be able to [MASK] down enemies [MASK] her q [MASK] [MASK] r. her e will allow [MASK] to ave her allies from hard cc and engages. adding [MASK] points in her ultimate r [MASK] [MASK] her a [MASK] power spike. [MASK] will be able to use the ability frequently [MASK] impacting team fights often. at level nine, her q will be maxed [MASK]. it will let her [MASK] frequent picks on the [MASK], especially from [MASK] fog of [MASK] [MASK] [SEP] [CLS] jarvan ’ s all - in can be interrupted, but [MASK] will be hard. it ’ s [MASK] [MASK] try and dodge'

'>>> it and then turn it around. try to keep [MASK] drake warded so [MASK] can [MASK] if [MASK] [MASK] is [MASK] it [MASK] ping and communicate [MASK] your team so you can counter them and try to kill them or steal [MASK] dragon if possible. beware [MASK] jarvan [MASK] s early game [MASK]. they ’ [MASK] much stronger than you and will try to make [MASK] to get themselves [

In [22]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [23]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch
logging_steps = len(lm_dataset["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]


training_args = TrainingArguments(
    output_dir=f"./distilbert-lolchamps",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=False,
    logging_steps=logging_steps,
    hub_token="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"
)

In [24]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)



In [25]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

  0%|          | 0/13 [00:00<?, ?it/s]

>>> Perplexity: 33.87


In [26]:
trainer.train()

  0%|          | 0/147 [00:00<?, ?it/s]

{'loss': 2.7422, 'grad_norm': 3.426318407058716, 'learning_rate': 1.3469387755102042e-05, 'epoch': 0.98}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 2.320233106613159, 'eval_runtime': 195.6752, 'eval_samples_per_second': 3.966, 'eval_steps_per_second': 0.066, 'epoch': 1.0}
{'loss': 2.3298, 'grad_norm': 3.59854793548584, 'learning_rate': 6.938775510204082e-06, 'epoch': 1.96}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 2.109546661376953, 'eval_runtime': 149.0327, 'eval_samples_per_second': 5.207, 'eval_steps_per_second': 0.087, 'epoch': 2.0}
{'loss': 2.1925, 'grad_norm': 3.4084157943725586, 'learning_rate': 4.0816326530612243e-07, 'epoch': 2.94}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 2.0555899143218994, 'eval_runtime': 176.3516, 'eval_samples_per_second': 4.4, 'eval_steps_per_second': 0.074, 'epoch': 3.0}
{'train_runtime': 6612.5552, 'train_samples_per_second': 1.409, 'train_steps_per_second': 0.022, 'train_loss': 2.4157816666324123, 'epoch': 3.0}


TrainOutput(global_step=147, training_loss=2.4157816666324123, metrics={'train_runtime': 6612.5552, 'train_samples_per_second': 1.409, 'train_steps_per_second': 0.022, 'total_flos': 308701792258560.0, 'train_loss': 2.4157816666324123, 'epoch': 3.0})

In [27]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

  0%|          | 0/13 [00:00<?, ?it/s]

>>> Perplexity: 7.73


In [28]:
test_text = "Orianna has a [MASK] early game"

inputs = tokenizer(test_text, return_tensors="pt")
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {test_text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> Orianna has a very early game'
'>>> Orianna has a good early game'
'>>> Orianna has a great early game'
'>>> Orianna has a strong early game'
'>>> Orianna has a fairly early game'


In [29]:
from transformers import pipeline

feature_extractor = pipeline("feature-extraction", framework="pt", model=model)
text = "Orianna"
feature_extractor(text, return_tensors="pt")[0].numpy().mean(axis=0)

Exception: Impossible to guess which tokenizer to use. Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer.