#### Double checking we are using the GPU on the VSC

In [None]:
import torch

# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

#### Import the bookcorpus dataset

In [None]:
from datasets import load_dataset

# Local directory used as the download/cache location for the dataset.
save_path = "./data"

# Only the first 5,000 rows of the train split are loaded to keep the run small.
bookcorpus_dataset = load_dataset("bookcorpus", split="train[:5000]", cache_dir=save_path)
# Hold out 20% for evaluation. The seed is fixed so that the split (and hence
# every downstream metric) is identical after a kernel restart.
bookcorpus_dataset = bookcorpus_dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Display the first training example as a sanity check of the loaded data.
bookcorpus_dataset["train"][0]

#### Select the model to fine-tune

In [None]:
# Checkpoint to fine-tune; exactly one assignment should be active.
# NOTE(review): later cells load the checkpoint with AutoModelForCausalLM and
# collate with mlm=False -- confirm that the commented-out checkpoints below
# are compatible with that setup before switching to one of them.
# modelname = "bert-base-uncased"
# modelname = "roberta-base"
# modelname = "microsoft/deberta-base"
# modelname = "microsoft/deberta-v3-base"
# modelname = "google/electra-base-generator"
# modelname = "facebook/bart-base"
modelname = "gpt2"

### Preprocessing

Import the tokenizer

In [None]:
import transformers
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the checkpoint selected in `modelname`.
tokenizer = AutoTokenizer.from_pretrained(modelname)

In [None]:
# Raise the transformers log level to INFO for the rest of the session.
transformers.logging.set_verbosity_info()

Preprocessing Function 1 - Map the data to the tokenizer function

In [None]:
def preprocess_function(tokenizer, examples):
    """Tokenize the ``text`` column of a batch of examples.

    Args:
        tokenizer: Callable tokenizer; it is invoked once with a list of strings.
        examples: Batch mapping with a ``"text"`` key. Each entry may be a plain
            string or a list/tuple of strings.

    Returns:
        Whatever ``tokenizer`` returns for the list of texts.
    """
    # A plain string must be passed through untouched: " ".join("abc") yields
    # "a b c", which would make the model train on character-spaced text.
    # Only list-valued entries are joined into a single string.
    texts = [
        entry if isinstance(entry, str) else " ".join(entry)
        for entry in examples["text"]
    ]
    return tokenizer(texts)

In [None]:
from functools import partial

# Bind the tokenizer up front so that `map` can call the function with the
# batch as its only argument.
partial_tokenize_function = partial(preprocess_function, tokenizer)

# Tokenize in batches using 4 worker processes. The original columns are
# removed so that only the tokenizer output remains in the result.
tokenized_bookcorpus = bookcorpus_dataset.map(
    partial_tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=bookcorpus_dataset["train"].column_names,
)

In [None]:
# Display the tokenized dataset summary.
tokenized_bookcorpus

Preprocessing Function 2 - Concatenate the tokenized texts and divide them into blocks of `block_size` tokens. Drop the remainder if the total length is not evenly divisible by the block size.

In [None]:
def group_texts(examples, block_size=128):
    """Concatenate a batch of tokenized sequences and re-split it into fixed-size blocks.

    Args:
        examples: Batch mapping of column name -> list of token lists. It must
            contain an ``"input_ids"`` column.
        block_size: Number of tokens per output block. Must be positive.

    Returns:
        A dict with the same columns as ``examples``, each a list of blocks of
        exactly ``block_size`` items, plus a ``"labels"`` column that is a copy
        of the ``"input_ids"`` blocks. Tokens that do not fill a whole block
        are dropped, so a batch shorter than ``block_size`` yields empty lists.

    Raises:
        ValueError: If ``block_size`` is not positive.
    """
    from itertools import chain

    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    # Flatten every column into one long list. chain.from_iterable is linear,
    # whereas sum(lists, []) re-copies the accumulator on every addition.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Always truncate to a whole number of blocks. Keeping a short trailing
    # block would produce sequences of uneven length.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Labels start as a copy of the inputs.
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Regroup the tokenized batches into fixed-size blocks using 4 worker processes.
lm_dataset = tokenized_bookcorpus.map(group_texts, batched=True, num_proc=4)

Import a data collator for (causal) language modeling. With `mlm=False` it uses the input tokens as the labels, so that each token is trained to predict the token that follows it.

In [None]:
from transformers import DataCollatorForLanguageModeling

# The collator pads each batch through the tokenizer, which fails when no
# padding token is defined (as is the case for GPT-2). Fall back to the
# end-of-sequence token, but never overwrite a pad token that already exists.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# mlm=False selects the causal-LM objective.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# Alternative for masked-LM checkpoints:
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)

#### Import LoRA from the PEFT library, set its parameters, and load the model wrapped with LoRA

In [None]:
from peft import LoraConfig, TaskType, get_peft_model 

# LoRA adapter configuration for causal language modelling, set up for
# training rather than inference.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

We can see the reduced number of parameters below

In [None]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DebertaV2ForMaskedLM

# Load the pretrained base checkpoint selected in `modelname`.
model_without_peft = AutoModelForCausalLM.from_pretrained(modelname)
# model_without_peft = DebertaV2ForMaskedLM.from_pretrained(modelname)

# Wrap the base model according to `peft_config`.
model = get_peft_model(model_without_peft, peft_config)

# Report the trainable-parameter summary, then the device the weights are on.
model.print_trainable_parameters()
print(next(model.parameters()).device)

#### Set the Training Arguments

In [None]:
training_args = TrainingArguments(
    # Output location and publishing.
    output_dir=f"mymodels/{modelname}-peft",
    push_to_hub=True,
    # Optimisation.
    learning_rate=2e-5,
    weight_decay=0.01,
    # Evaluation schedule.
    evaluation_strategy="epoch",
    # Logging.
    report_to="all",
    logging_dir="./logs",
    logging_steps=100,
)

If the tokenizer doesn't have a padding token by default, use the end-of-sequence token. If it doesn't have that either, we have to use a separator or classification token instead.

In [None]:
# Optional manual overrides, left disabled:
# tokenizer.pad_token = tokenizer.cls_token
# tokenizer.pad_token = tokenizer.eos_token

# Display the padding token the tokenizer currently uses.
tokenizer.pad_token

Ensure that the model is running on the GPU and not on the CPU

In [None]:
# Device of the first model parameter before the explicit move.
print(next(model.parameters()).device)

In [None]:
# Move the model to the device selected at the top of the notebook.
model.to(device)

In [None]:
# Device of the first model parameter after the move, for comparison.
print(next(model.parameters()).device)

#### Finally create the Trainer class and train the model

In [None]:
# Assemble the Trainer from the PEFT-wrapped model, the training arguments,
# the block-grouped train/test splits, the collator and the tokenizer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Run the fine-tuning loop.
trainer.train()

##### Evaluate the model using Cosine Similarity, Pairwise Correlation...
Perplexity is just there as a placeholder for now

In [None]:
import math

# Evaluate on the held-out split and report perplexity, computed here as the
# exponential of the evaluation loss.
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Finally, push the model to the Hugging Face Hub

In [None]:
# Local-save alternatives, left disabled:
# trainer.save_model(f"{modelname}-peft")
# model.save_pretrained(f"{modelname}-peft-model")
# Upload the trained model to the Hub.
trainer.push_to_hub()