In [1]:
# !pip install torch==2.0 datasets peft evaluate transformers[torch]

In [2]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    # AutoModelForSequenceClassification,
    AutoModelForQuestionAnswering,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [3]:
# model_checkpoint = 'distilbert-base-uncased'
model_checkpoint = 'distilbert-base-uncased-distilled-squad'

# define label maps
# id2label = {0: "Negative", 1: "Positive"}
# label2id = {"Negative":0, "Positive":1}

# generate classification model from model_checkpoint
# model = AutoModelForSequenceClassification.from_pretrained(
#     model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
# model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

In [4]:
# dataset = load_dataset("shawhin/imdb-truncated")
# dataset

dataset = load_dataset("json", data_files = {'train': '/content/drive/MyDrive/work_chalenges/cazton/train_data.json', 'test': '/content/drive/MyDrive/work_chalenges/cazton/test_data.json'})

In [5]:
# create tokenize function
# def tokenize_function(examples):
#     # extract text
#     text = examples["text"]

#     #tokenize and truncate text
#     tokenizer.truncation_side = "left"
#     tokenized_inputs = tokenizer(
#         text,
#         return_tensors="np",
#         truncation=True,
#         max_length=512
#     )

#     return tokenized_inputs

# # add pad token if none exists
# if tokenizer.pad_token is None:
#     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#     model.resize_token_embeddings(len(tokenizer))

# # tokenize training and validation datasets
# tokenized_dataset = dataset.map(tokenize_function, batched=True)
# tokenized_dataset


def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length = 384,
        truncation = "only_second",
        return_offsets_mapping = True,
        padding = "max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)


        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1


        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:

            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [6]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', add_prefix_space = True)
tokenized_dataset = dataset.map(preprocess_function, batched = True, remove_columns = dataset["train"].column_names)

In [7]:
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

In [8]:
# # import accuracy evaluation metric
# accuracy = evaluate.load("accuracy")

# # define an evaluation function to pass into trainer later
# def compute_metrics(p):
#     predictions, labels = p
#     predictions = np.argmax(predictions, axis = 1)

#     return {"accuracy": accuracy.compute(predictions=predictions,
#                                           references=labels)}

In [9]:
# define list of examples
# text_list = ["It was good.", "Not a fan, don't recommed.",
# "Better than the first one.", "This is not worth watching even once.",
# "This one is a pass."]

# print("Untrained model predictions:")
# print("----------------------------")
# for text in text_list:
#     # tokenize text
#     inputs = tokenizer.encode(text, return_tensors="pt")
#     # compute logits
#     logits = model(inputs).logits
#     # convert logits to label
#     predictions = torch.argmax(logits)

#     print(text + " - " + id2label[predictions.tolist()])

In [10]:
# peft_config = LoraConfig(task_type="SEQ_CLS", # sequence classification
#                         r=4, # intrinsic rank of trainable weight matrix
#                         lora_alpha=32, # this is like a learning rate
#                         lora_dropout=0.01, # probablity of dropout
#                         target_modules = ['q_lin']) # we apply lora to query layer only

In [11]:
peft_config = LoraConfig(task_type = "QUESTION_ANS", # QnA
                        r = 4, # intrinsic rank of trainable weight matrix
                        lora_alpha = 32, # this is like a learning rate
                        lora_dropout = 0.01, # probablity of dropout
                        target_modules = ['q_lin']) # we apply lora to query layer only

In [12]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 38,402 || all params: 66,402,820 || trainable%: 0.057831881236369176


In [13]:
training_args = TrainingArguments(
    output_dir = "my_awesome_qa_model",
    evaluation_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 64,
    num_train_epochs=  200,
    weight_decay = 0.01,
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset["train"],
    eval_dataset = tokenized_dataset["test"],
    tokenizer = tokenizer,
    data_collator = data_collator,
)

trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,6.185322
2,No log,6.116569
3,No log,6.043598
4,No log,5.965608
5,No log,5.882072
6,No log,5.792558
7,No log,5.69673
8,No log,5.594096
9,No log,5.48439
10,No log,5.367255


TrainOutput(global_step=1200, training_loss=1.0408022054036459, metrics={'train_runtime': 1967.2556, 'train_samples_per_second': 35.786, 'train_steps_per_second': 0.61, 'total_flos': 6904712547532800.0, 'train_loss': 1.0408022054036459, 'epoch': 200.0})

In [15]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
# inputs = tokenizer(question = 'What is cazton', context = 'cazton', return_tensors = "pt")
inputs = tokenizer(text = 'What is cazton', return_tensors = "pt").to(model.device)

with torch.no_grad():
    outputs = trainer.model(**inputs)

answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

'what is cazton'

In [None]:
# Install transformers from source - only needed for versions <= v4.34
# pip install git+https://github.com/huggingface/transformers.git
# pip install accelerate

import torch
from transformers import pipeline

pipe = pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-beta", torch_dtype=torch.bfloat16, device_map="auto")

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot who always responds in the style of a pirate",
    },
    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
print(outputs[0]["generated_text"])
# <|system|>
# You are a friendly chatbot who always responds in the style of a pirate.</s>
# <|user|>
# How many helicopters can a human eat in one sitting?</s>
# <|assistant|>
# Ah, me hearty matey! But yer question be a puzzler! A human cannot eat a helicopter in one sitting, as helicopters are not edible. They be made of metal, plastic, and other materials, not food!

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]



In [14]:
messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot who always responds professionally",
    },
    {"role": "user", "content": "What is Cazton?"},
]