#### Double checking we are using the GPU on the VSC

In [None]:
import torch
import os

# Check if CUDA is available and set the device to GPU if it is
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# os.environ["WANDB_DISABLED"] = "true"

#### Import the bookcorpus dataset

In [None]:
from datasets import load_dataset

# Local cache directory for the downloaded dataset files.
save_path = "./data"

# Only the first 5,000 rows of the train split are loaded to keep the experiment small.
bookcorpus_dataset = load_dataset("bookcorpus", split="train[:5000]", cache_dir=save_path)
# Fixed seed so the 80/20 train/test split is the same after every Restart & Run All.
bookcorpus_dataset = bookcorpus_dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
bookcorpus_dataset

In [None]:
# Preview a handful of rows only; displaying the whole column floods the cell output.
bookcorpus_dataset["train"][:5]["text"]

In [None]:
# import spacy
# from spacy.lang.en import English
# import random

# # Load the spaCy model for English
# nlp = spacy.load("en_core_web_sm")

# def generate_question_answer_pairs(sentences):
#     """
#     Generate question-answer pairs from a list of sentences using Named Entity Recognition for identifying answers
#     and a simple template for question generation.
#     """
#     qa_pairs = []
#     for sentence in sentences:
#         doc = nlp(sentence)
#         entities = [ent for ent in doc.ents if ent.label_ in ['PERSON', 'ORG', 'DATE', 'GPE', 'EVENT']]
#         for ent in entities:
#             # # Customize the question based on the entity type for better relevance
#             # if ent.label_ == 'PERSON':
#             #     question = f"Who is {ent.text}?"
#             # elif ent.label_ == 'DATE':
#             #     question = f"When did it happen?"
#             # elif ent.label_ in ['ORG', 'GPE', 'EVENT']:
#             #     question = f"What is mentioned about {ent.text}?"
#             # else:
#             question = f"What is mentioned about {ent.text}?"
#             answer = sentence.replace(ent.text, '')
#             qa_pairs.append({"context": sentence, "question": question, "answer": answer})
#     return qa_pairs

In [None]:
# # Example usage:
# sentences = bookcorpus_dataset["train"]['text']

# # Generate question-answer pairs
# qa_pairs = generate_question_answer_pairs(sentences)

# for pair in qa_pairs:
#     print("Context:", pair["context"])
#     print("Question:", pair["question"])
#     print("Answer:", pair["answer"])
#     print("----")

#### Select the model to fine-tune

In [None]:
modelname = "microsoft/deberta-v3-base"
# modelname = "google/electra-base-generator"

# modelname = "bert-base-uncased"
# modelname = "gpt2"
# modelname = "roberta-base"
# modelname = "microsoft/deberta-base"
# modelname = "facebook/bart-base"

### Preprocessing

Import the tokenizer

In [None]:
import transformers
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(modelname)

In [None]:
transformers.logging.set_verbosity_info()

##### Token Classification Data

In [None]:
# from datasets import load_dataset

# wnut = load_dataset("wnut_17")

##### Preprocessing for Token Classification

In [None]:
# def tokenize_and_align_labels(examples):
#     tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

#     labels = []
#     for i, label in enumerate(examples[f"ner_tags"]):
#         word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
#         previous_word_idx = None
#         label_ids = []
#         for word_idx in word_ids:  # Set the special tokens to -100.
#             if word_idx is None:
#                 label_ids.append(-100)
#             elif word_idx != previous_word_idx:  # Only label the first token of a given word.
#                 label_ids.append(label[word_idx])
#             else:
#                 label_ids.append(-100)
#             previous_word_idx = word_idx
#         labels.append(label_ids)

#     tokenized_inputs["labels"] = labels
#     return tokenized_inputs

In [None]:
# tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)

Preprocessing Function 1 - Map the data to the tokenizer function

In [None]:
def preprocess_function(tokenizer, examples):
    """Tokenize the ``"text"`` column of a batch.

    Args:
        tokenizer: Callable applied to the list of prepared texts
            (e.g. a Hugging Face tokenizer).
        examples: Batch mapping with a ``"text"`` entry holding one item per row.
            Each item is either a plain string or a sequence of string fragments.

    Returns:
        Whatever ``tokenizer`` returns for the list of prepared texts.
    """
    # A plain string must be passed through unchanged: " ".join("abc") yields "a b c",
    # i.e. a space between every character. Only sequences of fragments are joined.
    texts = [row if isinstance(row, str) else " ".join(row) for row in examples["text"]]
    return tokenizer(texts)

In [None]:
from functools import partial

# Bind the tokenizer as the first argument so that `map` only has to supply the batch.
partial_tokenize_function = partial(preprocess_function, tokenizer)

# batched=True hands the function a batch of rows per call; num_proc=4 requests 4 worker processes.
# remove_columns drops the columns named in the train split, so the mapped dataset keeps only
# what the tokenizer call returns.
tokenized_bookcorpus = bookcorpus_dataset.map(
    partial_tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=bookcorpus_dataset["train"].column_names,
)

In [None]:
bookcorpus_dataset["train"].column_names

In [None]:
tokenized_bookcorpus['train'][0]

Tokenizer Function 2 - Concatenate the tokenized texts and divide them into blocks of `block_size` tokens. Drop the remainder if the total length is not evenly divisible by the block size.

In [None]:
def group_texts(examples, block_size=128):
    """Concatenate every column of a batch and re-split it into fixed-size blocks.

    Args:
        examples: Batch mapping where each value is a list of per-row lists
            (e.g. ``input_ids``, ``attention_mask``). Must contain ``"input_ids"``.
        block_size: Number of items per output block. Defaults to 128.

    Returns:
        dict with the same keys as ``examples`` where each value is a list of
        blocks, plus a ``"labels"`` entry that is a copy of the ``"input_ids"``
        list of blocks.

    Raises:
        ValueError: If ``block_size`` is not a positive integer.

    Note:
        When the concatenated length is at least ``block_size`` the trailing
        remainder is dropped. When it is shorter than ``block_size`` the single
        short block is kept as-is.
    """
    from itertools import chain

    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    # chain.from_iterable flattens in linear time; sum(lists, []) re-copies the
    # accumulated list on every addition and is quadratic in the total length.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    # Round down to a whole number of blocks (only when at least one full block exists).
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size

    # Split each flattened column into consecutive chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Re-chunk the tokenized splits into fixed-size blocks with `group_texts`, using 4 worker processes.
# NOTE(review): `lm_dataset` is not referenced by any later cell visible in this notebook
# (the Trainer is given `tokenized_wnut` instead) — confirm whether it is still needed.
lm_dataset = tokenized_bookcorpus.map(group_texts, batched=True, num_proc=4)
lm_dataset

#### Import the LoRA library from PEFT. Set its parameters and load the model optimized using LoRA

In [None]:
from peft import LoraConfig, TaskType, get_peft_model 

# LoRA adapter configuration for a token-classification task, set up for training
# (inference_mode=False). r, lora_alpha and lora_dropout are the LoRA hyperparameters;
# see the PEFT LoraConfig documentation for their exact meaning.
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS,
    inference_mode=False, 
    r=8,
    lora_alpha=32, 
    lora_dropout=0.1,
)

We can see the reduced number of parameters below

In [None]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DebertaV2ForMaskedLM, DebertaV2ForQuestionAnswering
from transformers import DebertaV2ForSequenceClassification, AutoModelForQuestionAnswering, AutoModelForSeq2SeqLM
from transformers import EncoderDecoderModel, AutoModelForSequenceClassification, AutoModelForTokenClassification

# from simpletransformers.seq2seq import Seq2SeqModel

# model_without_peft = EncoderDecoderModel.from_encoder_decoder_pretrained(modelname, modelname)

# model_without_peft = Seq2SeqModel(
#     encoder_type="auto",
#     encoder_decoder_name=modelname,
#     use_cuda=True,
# )

# model_without_peft = AutoModelForCausalLM.from_pretrained(modelname)
# model_without_peft = AutoModelForQuestionAnswering.from_pretrained(modelname)
# model_without_peft = AutoModelForSeq2SeqLM.from_pretrained(modelname)
# model_without_peft = AutoModelForSequenceClassification.from_pretrained(modelname)
# Load the base checkpoint with a token-classification head.
# NOTE(review): no `num_labels` is passed, so the head size comes from the checkpoint's config
# default — confirm it matches the tag set of the dataset given to the Trainer.
model_without_peft = AutoModelForTokenClassification.from_pretrained(modelname)


# model_without_peft = DebertaV2ForMaskedLM.from_pretrained(modelname)
# model_without_peft = DebertaV2ForQuestionAnswering.from_pretrained(modelname)
# model_without_peft = DebertaV2ForSequenceClassification.from_pretrained(modelname)



# Wrap the base model with the LoRA adapters described by `peft_config`.
model = get_peft_model(model_without_peft, peft_config)

# Report trainable vs. total parameter counts, then the device of the first parameter.
model.print_trainable_parameters()
print(next(model.parameters()).device)

##### Import a Data Collator Function for (Causal) LM. This function ensures that, for each token, the token that follows it is used as its label/target.

In [None]:
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq, DataCollatorForTokenClassification
from transformers import DataCollatorForPermutationLanguageModeling, DataCollatorWithPadding, default_data_collator
from transformers import DataCollatorForSOP, DataCollatorForWholeWordMask


# tokenizer.pad_token = tokenizer.eos_token
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)

# data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer)

### Necessary modifications for Question Answering Task

These steps use the 'question', 'context' and 'answer' columns generated during the earlier preprocessing. We tokenize those columns and pad them so that they all have the same length. We also use the DataLoader class to feed them to the ML model.

In [None]:
# def ask_question(question, context):
#     """
#     Use the pre-trained ELECTRA model to infer an answer to a question given some context.
#     This function doesn't require explicit answer annotations.
#     """
#     inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt")
#     input_ids = inputs["input_ids"].tolist()[0]

#     text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
#     outputs = model(**inputs)
#     answer_start_scores = outputs.start_logits
#     answer_end_scores = outputs.end_logits

#     # Get the most likely beginning and end of answer with the argmax of the score
#     answer_start = torch.argmax(answer_start_scores)
#     answer_end = torch.argmax(answer_end_scores) + 1

#     # Convert the tokens to the answer string
#     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    
#     return answer

# # Example usage
# context = "The history of natural language processing (NLP) generally started in the 1950s, although work can be found from earlier periods."
# question = "When did NLP start?"

# answer = ask_question(question, context)
# print("Question:", question)
# print("Answer:", answer)

In [None]:
# ### Question Answering Preprocessing

# def tokenize_and_preserve_labels(qa_pair, tokenizer):
#     # Tokenize question and context together
#     inputs = tokenizer.encode_plus(
#         qa_pair["question"],
#         qa_pair["context"],
#         add_special_tokens=True,
#         max_length=512,
#         padding="max_length",
#         truncation=True,
#         return_offsets_mapping=True,  # Important for mapping token positions to original text
#         return_tensors="pt"
#     )

#     # Find start and end of answer in tokens
#     text = qa_pair["context"]
#     answer = qa_pair["answer"]
#     start_char = text.find(answer)
#     end_char = start_char + len(answer) - 1

#     # Map character positions to token positions
#     offsets = inputs["offset_mapping"][0].tolist()  # Get the offsets
#     answer_token_start, answer_token_end = 0, 0

#     # Find tokens that start and end the answer
#     for i, offset in enumerate(offsets):
#         if start_char >= offset[0] and start_char <= offset[1]:
#             answer_token_start = i
#         if end_char >= offset[0] and end_char <= offset[1]:
#             answer_token_end = i
#             break

#     # Remove offset mapping to avoid issues during model training
#     inputs.pop("offset_mapping")

#     return inputs, answer_token_start, answer_token_end

# # Example usage

# from sklearn.model_selection import train_test_split

# train_qa_pairs, test_qa_pairs = train_test_split(qa_pairs, test_size=0.2, random_state=42)

# train_tokenized = [tokenize_and_preserve_labels(pair, tokenizer) for pair in train_qa_pairs]
# test_tokenized = [tokenize_and_preserve_labels(pair, tokenizer) for pair in test_qa_pairs]

# # Now `tokenized_data` contains tokenized inputs along with the start and end positions of the answers


In [None]:
# from torch.utils.data import Dataset, DataLoader

# class QADataset(Dataset):
#     def __init__(self, tokenized_data):
#         self.tokenized_data = tokenized_data

#     def __len__(self):
#         return len(self.tokenized_data)

#     # def __getitem__(self, idx):
#     #     return self.tokenized_data[idx]

#     def __getitem__(self, idx):
#         input_ids = self.tokenized_data[idx][0]["input_ids"].squeeze()  # Remove batch dimension
#         attention_mask = self.tokenized_data[idx][0]["attention_mask"].squeeze()
#         start_positions = torch.tensor(self.tokenized_data[idx][1])
#         end_positions = torch.tensor(self.tokenized_data[idx][2])
        
#         return {
#             "input_ids": input_ids,
#             "attention_mask": attention_mask,
#             "start_positions": start_positions,
#             "end_positions": end_positions
#         }

# train_dataset = QADataset(train_tokenized)
# test_dataset = QADataset(test_tokenized)

If the tokenizer doesn't have a padding token by default, use End of Sequence Token. If it also doesn't have that, then we have to use a Separator or a Classification token...

In [None]:
# tokenizer.pad_token = tokenizer.cls_token
# tokenizer.pad_token = tokenizer.eos_token

tokenizer.pad_token

Ensure that we are running the model on the GPU and not on the CPU

In [None]:
# print(next(model.parameters()).device)

In [None]:
# model.to(device)

In [None]:
print(next(model.parameters()).device)

#### IMDB Database Stuff for Sequence Classification

In [None]:
# from datasets import load_dataset

# def preprocess_function2(examples):
#     return tokenizer(examples["text"], truncation=True)

# imdb = load_dataset("imdb")

# tokenized_imdb = imdb.map(preprocess_function2, batched=True)

# tokenized_imdb

In [None]:
# def rename_label_to_labels(example):
#     # This function will be applied to each example. It simply copies the value from 'label' to 'labels'.
#     example['labels'] = example['label']
#     return example

# # Apply the function across all splits in the dataset
# tokenized_imdb = tokenized_imdb.map(rename_label_to_labels, remove_columns=['label'])
# tokenized_imdb

#### Set the Training Arguments

In [None]:
# Training configuration: 2 epochs, evaluation at the end of every epoch, logging every 100 steps.
# `modelname` contains a "/" ("microsoft/deberta-v3-base"), so `output_dir` resolves to a nested
# directory under mymodels/.
# NOTE(review): push_to_hub=True presumably requires being logged in to the Hugging Face Hub,
# and report_to="all" reports to every installed logging integration — confirm both are intended.
training_args = TrainingArguments(
    output_dir=f"mymodels/{modelname}-TokenClassificationWnut",
    evaluation_strategy="epoch",
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
    report_to="all",
    logging_dir='./logs',
    logging_steps=100,
)

#### Finally create the Trainer class and train the model

In [None]:
import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy from a ``(predictions, labels)`` evaluation pair.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores on its last axis; ``labels`` holds the reference
            class ids, with ``-100`` marking positions to ignore.

    Returns:
        The result of ``accuracy.compute`` on the non-ignored positions.
    """
    predictions, labels = eval_pred
    # Take the argmax over the last (class) axis. For 2-D sequence-classification
    # scores this equals axis=1; for 3-D token-classification scores axis=1 would
    # wrongly reduce over the sequence positions instead of the classes.
    predictions = np.argmax(predictions, axis=-1)
    labels = np.asarray(labels)
    # Drop positions labelled -100 (special tokens / padding). Boolean indexing
    # also flattens the arrays to the 1-D form passed to the metric.
    keep = labels != -100
    return accuracy.compute(predictions=predictions[keep], references=labels[keep])

In [None]:
import os

# Build the Trainer from the PEFT-wrapped model, the training arguments and the data collator.
# NOTE(review): within this notebook `tokenized_wnut` is only assigned in a commented-out cell,
# so on a fresh kernel this cell raises NameError unless the WNUT loading and tokenization
# cells are re-enabled — confirm which dataset is intended here.
# NOTE(review): `compute_metrics` is defined in the previous cell but is not passed to the
# Trainer, so evaluation reports the loss only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut['train'],
    eval_dataset=tokenized_wnut['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# os.environ["WANDB_DISABLED"] = "true"

In [None]:
trainer.train()

In [None]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
# This prints the raw evaluation loss (no exponentiation), so label it as such rather than "Perplexity".
print(f"Eval loss: {eval_results['eval_loss']:.2f}")

Finally push the model to the Huggingface Hub

In [None]:
# trainer.save_model(f"{modelname}-peft")
# model.save_pretrained(f"{modelname}-peft-model")
trainer.push_to_hub()