In [1]:
!pip install torch





In [2]:
!pip install transformers datasets evaluate accelerate

Collecting transformers
  Downloading transformers-4.40.1-py3-none-any.whl (9.0 MB)
     ---------------------------------------- 9.0/9.0 MB 18.0 MB/s eta 0:00:00
Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
     ------------------------------------- 542.0/542.0 kB 17.2 MB/s eta 0:00:00
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
     ---------------------------------------- 84.1/84.1 kB 4.6 MB/s eta 0:00:00
Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
     ------------------------------------- 297.6/297.6 kB 18.0 MB/s eta 0:00:00
Collecting regex!=2019.12.17
  Downloading regex-2024.4.16-cp39-cp39-win_amd64.whl (268 kB)
     ------------------------------------- 268.9/268.9 kB 17.2 MB/s eta 0:00:00
Collecting safetensors>=0.4.1
  Downloading safetensors-0.4.3-cp39-none-win_amd64.whl (287 kB)
     ------------------------------------- 287.9/287.9 kB 17.4 MB/s eta 0:00:00
Collecting tokenizer



In [3]:
from transformers import AutoModelForMaskedLM

# Identifier of the pretrained checkpoint; the tokenizer cell below loads from the same name.
model_checkpoint = "distilbert-base-uncased"
# Load the checkpoint through the masked-language-modeling auto class.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [4]:
# Parameter count of the loaded model, expressed in millions.
distilbert_num_parameters = model.num_parameters() / 1_000_000
print(
    "'>>> DistilBERT number of parameters: "
    f"{round(distilbert_num_parameters)}M'"
)
# Hard-coded reference figure printed for comparison; it is not computed here.
print("'>>> BERT number of parameters: 110M'")

'>>> DistilBERT number of parameters: 67M'
'>>> BERT number of parameters: 110M'


In [5]:
# Prompt containing a [MASK] placeholder; the inference cell below substitutes
# the model's top predictions for it.
text = "This is a great [MASK]."

In [6]:
from transformers import AutoTokenizer

# Load the tokenizer from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Display the maximum sequence length the tokenizer reports.
# NOTE(review): the recorded output is an extremely large integer rather than a
# realistic limit — presumably a "no limit configured" placeholder; confirm
# before relying on it for truncation or chunking decisions.
tokenizer.model_max_length

1000000000000000019884624838656

In [7]:
import torch

# Encode the prompt as PyTorch tensors so it can be passed straight to the model.
inputs = tokenizer(text, return_tensors="pt")
# Inference only: disable autograd so the forward pass does not record a
# computation graph for gradients that are never used.
with torch.no_grad():
    token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
# (index [1] of the torch.where result holds the positions along the sequence axis).
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

# Show the prompt with each candidate token substituted for the mask.
for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'


In [8]:
from datasets import load_dataset
# Load the "avinot/LoL-Champions-Corpus" dataset.
corpus = load_dataset("avinot/LoL-Champions-Corpus")
# Display the loaded object (splits, features and row counts).
corpus

Downloading readme: 100%|██████████| 79.0/79.0 [00:00<00:00, 56.8kB/s]
Downloading data: 100%|██████████| 1.62M/1.62M [00:00<00:00, 6.99MB/s]
Generating train split: 100%|██████████| 2825/2825 [00:00<00:00, 81269.91 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2825
    })
})

In [9]:
# Take the first three rows of the train split for a quick look at the raw text.
sample = corpus["train"].select(range(3))

# Print each selected row's "text" field, separated by a blank line.
for row in sample:
    print(f"\n'>>> Content: {row['text']}'")


'>>> Content: Aatrox is a lane bully who can snowball quite quickly. If you can get an early lead, try to abuse it to get more kills in the lane to increase your chances of winning the game.You’re a very mobile champion thanks to your {{championSpells.AatroxE}}. It can be used in multiple ways and is a good way of turning around exchanges with the enemy.You have tons of sustain thanks to your Passive and Ultimate. This will increase your survivability in team fights and skirmishes.'

'>>> Content: Make sure you’re only using your abilities when you have to, especially your {{championSpells.AatroxE}}. Aatrox suffers from long cooldowns and is easily abusable when they’re on cooldown.You’ll fall off in the later stages of the game. Try and end the game as quickly as you can or snowball your lead so you’ll be a huge threat in team fights.Aatrox is not an easy champion to learn. He has a unique mechanic on his {{championSpells.AatroxQ}} which does take some time getting used to. You’ll ne

In [10]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entries of a batch with the notebook's tokenizer.

    Args:
        examples: Mapping with a ``"text"`` key holding the strings to tokenize.

    Returns:
        The tokenizer's output. When the tokenizer reports ``is_fast``, a
        ``"word_ids"`` entry is added containing one ``word_ids(i)`` result
        per encoded sequence.
    """
    encoded = tokenizer(examples["text"])
    # Without a fast tokenizer, the encoding is returned unchanged.
    if not tokenizer.is_fast:
        return encoded
    sequence_count = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(index) for index in range(sequence_count)]
    return encoded


# Use batched=True to activate fast multithreading!
# With batched=True, tokenize_function receives batches of rows per call.
# The raw "text" column is removed, so only the columns returned by
# tokenize_function remain in the result.
tokenized_datasets = corpus.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
# Display the resulting dataset structure.
tokenized_datasets

Map: 100%|██████████| 2825/2825 [00:00<00:00, 3219.47 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 2825
    })
})

In [11]:
# NOTE(review): presumably the length (in tokens) of the fixed-size pieces the
# tokenized corpus is split into later; it is not referenced in the visible
# cells — TODO confirm.
chunk_size = 128

In [13]:
# Slice the first three tokenized rows to inspect how long each one is.
tokenized_samples = tokenized_datasets["train"][:3]
# The loop variable is deliberately not named `sample`: that name is bound to the
# Dataset preview created in an earlier cell, and rebinding it here to a plain
# list of token ids would silently change what other cells see.
for idx, input_ids in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Corpus {idx} length: {len(input_ids)}'")

'>>> Corpus 0 length: 114'
'>>> Corpus 1 length: 148'
'>>> Corpus 2 length: 131'


In [15]:
# Flatten every column (a list of per-row lists) into one long list.
# A nested comprehension is linear in the total number of items, whereas
# `sum(lists, [])` re-copies the growing accumulator on every addition.
concatenated_examples = {
    k: [item for row_values in tokenized_samples[k] for item in row_values]
    for k in tokenized_samples.keys()
}
# Total number of tokens across the concatenated rows.
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated corpus length: {total_length}'")

'>>> Concatenated corpus length: 393'
