In [1]:
import os

import torch
import numpy as np

from datasets import load_dataset

from tokenizers import Tokenizer, pre_tokenizers
from transformers import AutoTokenizer, PreTrainedTokenizerFast

from tokenizers.models import BPE, Unigram, WordLevel, WordPiece
from tokenizers.trainers import BpeTrainer, WordLevelTrainer, \
                                WordPieceTrainer, UnigramTrainer
                                
from transformers import DistilBertForMaskedLM, DistilBertConfig 

from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

from huggingface_hub import login
import wandb
                               
UNK_TOKEN = '[UNK]'
SPL_TOKENS = ["[PAD]", "[CLS]", "[SEP]", "[MASK]"] + [UNK_TOKEN]  # special tokens
MAX_LENGTH = 512

TRAINING_SIZE = 1000
VOCABULARY_SIZE = 1000

TOKENIZERS = 'BPE'

HUGGINGFACE_TOKEN = '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'

BATCH_SIZE = 16

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
login(HUGGINGFACE_TOKEN)

In [3]:
wandb.login()
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

NameError: name 'wandb' is not defined

In [None]:
model_folder = 'models/'
tokenizer_folder = "tokenizers/"

tokenizer_file = f"{tokenizer_folder}/tokenizer_{TOKENIZERS}_{VOCABULARY_SIZE}_{TRAINING_SIZE}.json"
model_file = f"{model_folder}/model_{TOKENIZERS}_{VOCABULARY_SIZE}_{TRAINING_SIZE}.json"


In [8]:
DATASET = load_dataset("oscar-corpus/oscar",
                        language="tr", 
                        streaming=True, # optional
                        split="train") # optional, but the dataset only has a train split

In [11]:
UNK_TOKEN = "[UNK]"
SPL_TOKENS = ["[PAD]", "[CLS]", "[SEP]", "[MASK]"] + [UNK_TOKEN]

In [14]:
SPL_TOKENS

['[PAD]', '[CLS]', '[SEP]', '[MASK]', '[UNK]']

In [23]:
def dataset_text_iterator(dataset):
    """Yields the 'text' column from an iterable dataset.


    Args:
        dataset (IterableDataset): An iterable dataset where each sample 
                                    is expected to be a dictionary with a 
                                    'text' field.

    Yields:
        str: The text content from each sample in the dataset.
    """
    for sample in dataset:
        yield sample['text']

In [None]:
dataset = dataset.take(training_size)

In [25]:
tokenizer = Tokenizer(BPE(unk_token = UNK_TOKEN))
trainer = BpeTrainer(special_tokens = SPL_TOKENS, vocab_size=VOCABULARY_SIZE)

tokenizer.train_from_iterator(dataset_text_iterator(dataset), trainer)
tokenizer.save(tokenizer_file)

In [26]:
tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file)
tokenizer.add_special_tokens({
    "pad_token": "[PAD]",
    "unk_token": "[UNK]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "mask_token": "[MASK]"
})

0

In [27]:
def encode_with_truncation(examples, tokenizer=tokenizer):
    """_summary_

    Args:
        examples (_type_): _description_

    Returns:
        _type_: _description_
    """    
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=MAX_LENGTH, return_special_tokens_mask=True)

In [28]:
configuration = DistilBertConfig(vocab_size=VOCABULARY_SIZE)

#Distilbert with a Masked language modeling head on top.
# only use MLM, does not perform Next-Sentence-Prediction. But this should be sufficient.
model = DistilBertForMaskedLM(configuration) 

In [29]:
tokenized_dataset = dataset.map(encode_with_truncation, batched=True, remove_columns=["text"])

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
    return_tensors='pt'
)

training_args = TrainingArguments(
    output_dir=f"./{model_file}",
    overwrite_output_dir=True,
    learning_rate=5e-5,
    lr_scheduler_type='linear',
    num_train_epochs=3000,
    max_steps = 3,
    per_device_train_batch_size=BATCH_SIZE,
    logging_dir="./logs",
    save_strategy='steps',
    logging_steps=1,
    #load_best_model_at_end=True,
    #eval_strategy="steps",
    #eval_steps=1, 
    use_cpu=False,
    report_to="wandb",
    run_name='first_try'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start training
trainer.train()

  trainer = Trainer(
max_steps is given, it will override any value given in num_train_epochs


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011277777777286247, max=1.0…

  0%|          | 0/3 [00:00<?, ?it/s]

{'loss': 7.0439, 'grad_norm': 2.877178430557251, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.33}
{'loss': 6.9932, 'grad_norm': 2.752091407775879, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.67}
{'loss': 6.9209, 'grad_norm': 2.731001853942871, 'learning_rate': 0.0, 'epoch': 1.0}


[34m[1mwandb[0m: Adding directory to artifact (.\models\model_BPE_1000_1000.json\checkpoint-3)... Done. 5.4s
max_steps is given, it will override any value given in num_train_epochs


{'train_runtime': 270.6872, 'train_samples_per_second': 0.177, 'train_steps_per_second': 0.011, 'train_loss': 6.98600435256958, 'epoch': 1.0}


TrainOutput(global_step=3, training_loss=6.98600435256958, metrics={'train_runtime': 270.6872, 'train_samples_per_second': 0.177, 'train_steps_per_second': 0.011, 'total_flos': 6358582296576.0, 'train_loss': 6.98600435256958, 'epoch': 1.0})

In [5]:
import torch
import numpy as np
from datasets import load_metric

# Load the perplexity metric (or any other metric, e.g., accuracy)
metric = load_metric("perplexity")

# Define a compute_metrics function
def compute_metrics(p):
    logits, labels = p
    greedy_prediction = torch.argmax(logits, dim=-1)
    # Compute loss using cross-entropy
    loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1), reduction="mean")
    # Calculate perplexity as exp(loss)
    perplexity = torch.exp(loss)
    
    return {"perplexity": perplexity.item()}

# Simulate some logits and labels for a batch of predictions
# Assuming a vocab size of 10 and a sequence length of 5 for this example
batch_size = 3
seq_len = 5
vocab_size = 10

# Random logits from a model's output (e.g., from a language model)
logits = torch.randn(batch_size, seq_len, vocab_size)

# Simulated labels (true values) for the same batch and sequence length
labels = torch.randint(0, vocab_size, (batch_size, seq_len))

# Print logits and labels for reference
print("Logits:\n", logits.shape)
print("\nLabels:\n", labels.shape)

# Call the compute_metrics function to calculate perplexity
metrics = compute_metrics((logits, labels))

# Print the computed perplexity
print("\nComputed Metrics:", metrics)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Logits:
 torch.Size([3, 5, 10])

Labels:
 torch.Size([3, 5])

Computed Metrics: {'perplexity': 20.857810974121094}
