In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')  
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [None]:
##################################################
## import packages
##################################################
!pip install datasets 

from transformers import GPT2TokenizerFast, GPT2LMHeadModel
from datasets import load_dataset
import torch
import textwrap
import warnings
warnings.filterwarnings('ignore')


In [None]:
##################################################
## helper function (nicer printing)
##################################################

def pretty_print(s):
    print("Output:\n" + 80 * '-')
    print(textwrap.fill(tokenizer.decode(s, skip_special_tokens=True),80))

In [None]:
##################################################
## instantiating LLM & its tokenizer
##################################################

model_to_use = "gpt2"
# model_to_use = "gpt2-large"

print("Using model: ", model_to_use)

# get the tokenizer for the pre-trained LM you would like to use
tokenizer = GPT2TokenizerFast.from_pretrained(model_to_use)

# instantiate a model (causal LM)
model = GPT2LMHeadModel.from_pretrained(model_to_use,
                                        output_scores=True,
                                        pad_token_id=tokenizer.eos_token_id)

# inspecting the (default) model configuration
# (it is possible to created models with different configurations)
print(model.config)


In [40]:
##################################################
## autoregressive generation
##################################################

# text to expand
prompt = "After the campaign with the owl bear, I (a paladin) decided to elaborate ways to replenish my health in a clever way: "

# translate the prompt into tokens
input_tokens = tokenizer(prompt, return_tensors="pt").input_ids
print(input_tokens)

outputs = model.generate(input_tokens,
                         max_new_tokens=100,
                         do_sample=True,
                         top_k=50,
                       )

print("\nTop-k sampling:\n")
pretty_print(outputs[0])

tensor([[ 3260,   262,  1923,   351,   262, 39610,  6842,    11,   314,   357,
            64,  6340, 17072,     8,  3066,   284, 15962,  2842,   284, 35278,
           680,   616,  1535,   287,   257, 14169,   835,    25,   220]])

Top-k sampling:

Output:
--------------------------------------------------------------------------------
After the campaign with the owl bear, I (a paladin) decided to elaborate ways to
replenish my health in a clever way:  it seemed like I needed to give myself 10
pounds every day or so for 10 days and I was getting sick of taking it or being
so sickly, so I got a new one that made me want to eat it.  I would've been
tempted to make a more complex plan, but it worked out okay.  When I was eating
my favorite foods like kibble and kimchi and being a great person who wasn't
addicted to booze I could tell from eating my original


In [29]:
outputs = model.generate(input_tokens,
                         max_new_tokens=100,
                         num_beams=6,
                         no_repeat_ngram_size=4,
                         early_stopping=True
                         )

print("\nBeam search:\n")
pretty_print(outputs[0])



Beam search:

Output:
--------------------------------------------------------------------------------
The barbarian then decided to make a deal with the barbarian. The barbarian
agreed to pay the barbarian a certain amount of gold, and the barbarian would
give the barbarian the same amount of gold as he would give to the barbarian in
exchange for the barbarian's services.  The barbarian was then able to sell the
barbarian some of the gold he had received from the barbarian, and he was able
to sell it back to him.  After the barbarian had sold the gold


In [35]:
# Define the prompt for generating alternative actions
prompt = "But then the clever barbarian after encountering the dragon decided to"

# Tokenize the prompt
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Generate alternative actions based on the prompt
outputs = model.generate(input_ids,
                         max_length=200,
                         num_return_sequences=3,  # Specify the number of alternative actions to generate
                         do_sample=True,
                         top_k=50,
                         top_p=0.95,
                         temperature=0.7,
                         early_stopping=True,
                         pad_token_id=tokenizer.eos_token_id,
                         num_beams=5,  # You can adjust the number of beams for beam search
                         )

# Decode and print the generated alternative actions
for i, output in enumerate(outputs):
    alternative_action = tokenizer.decode(output, skip_special_tokens=True)
    print(f"Alternative Action {i+1}: {alternative_action}")

Alternative Action 1: But then the clever barbarian after encountering the dragon decided to use his power to attack the dragon.

The barbarian was able to defeat the dragon by using his magic.

The barbarian then used his power to attack the dragon.

The barbarian then used his power to attack the dragon.

The barbarian then used his power to attack the dragon.

The barbarian then used his power to attack the dragon.

The barbarian then used his power to attack the dragon.

The barbarian then used his power to attack the dragon.

The barbarian then used his power to attack the dragon.

The barbarian then used his power to attack the dragon.

The barbarian then used his power to attack the dragon.

The barbarian then used his power to attack the dragon.

The barbarian then used his power to attack the dragon.

The barbarian then used his power to
Alternative Action 2: But then the clever barbarian after encountering the dragon decided to use his power to attack the dragon.

The barbari

In [None]:
2) One-shot Learning using Pre-trained Model 

In [43]:
# Define the one-shot training data point
user_input = "Hello, how are you?"
expected_response = "I'm doing well, thank you for asking."

# Tokenize the data point
input_ids = tokenizer.encode(user_input, return_tensors="pt")
labels = tokenizer.encode(expected_response, return_tensors="pt")

# Ensure input and target sequences have the same length
max_length = max(input_ids.size(1), labels.size(1))
if input_ids.size(1) != max_length:
    input_ids = torch.cat([input_ids, input_ids.new_zeros((input_ids.size(0), max_length - input_ids.size(1)))], dim=-1)
if labels.size(1) != max_length:
    labels = torch.cat([labels, labels.new_zeros((labels.size(0), max_length - labels.size(1)))], dim=-1)

# Fine-tune the model with the one-shot data point
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Training loop
for epoch in range(3):  # Adjust the number of epochs as needed
    optimizer.zero_grad()
    outputs = model(input_ids, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_chatbot_one_shot")
tokenizer.save_pretrained("./fine_tuned_chatbot_one_shot")

Non-default generation parameters: {'output_scores': True}


('./fine_tuned_chatbot_one_shot/tokenizer_config.json',
 './fine_tuned_chatbot_one_shot/special_tokens_map.json',
 './fine_tuned_chatbot_one_shot/vocab.json',
 './fine_tuned_chatbot_one_shot/merges.txt',
 './fine_tuned_chatbot_one_shot/added_tokens.json',
 './fine_tuned_chatbot_one_shot/tokenizer.json')

In [44]:
# Load the fine-tuned model and tokenizer
model_path = "./fine_tuned_chatbot_one_shot"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Set the model to evaluation mode
model.eval()

# User input
user_input = "Hello, how are you?"

# Tokenize the user input
input_ids = tokenizer.encode(user_input, return_tensors="pt")

# Generate a response using the fine-tuned model
output = model.generate(input_ids, max_length=100)

# Decode and print the generated response
response = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Response:", response)

Generated Response: Hello, how are you?

I'm a little bit of a nerd. I'm a little bit of a nerd. I'm a little bit of a nerd. I'm a little bit of a nerd. I'm a little bit of a nerd. I'm a little bit of a nerd. I'm a little bit of a nerd. I'm a little bit of a nerd. I'm a little bit of a nerd. I'm a little bit of a nerd. I'm


In [None]:
3) Pretraining using Custom Dataset 

In [48]:
from datasets import load_dataset
dataset = load_dataset("yelp_review_full")


Downloading readme: 100%|██████████| 6.72k/6.72k [00:00<00:00, 19.4MB/s]
Downloading data: 100%|██████████| 299M/299M [00:02<00:00, 121MB/s]  
Downloading data: 100%|██████████| 23.5M/23.5M [00:00<00:00, 92.5MB/s]
Generating train split: 100%|██████████| 650000/650000 [00:03<00:00, 193277.02 examples/s]
Generating test split: 100%|██████████| 50000/50000 [00:00<00:00, 165213.08 examples/s]


In [49]:
print(dataset["train"][100])

{'label': 0, 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I

In [51]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map: 100%|██████████| 650000/650000 [04:03<00:00, 2674.50 examples/s]
Map: 100%|██████████| 50000/50000 [00:19<00:00, 2564.25 examples/s]


In [67]:
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

In [68]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [69]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [70]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)

In [71]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
#         print(f'batch: {batch}')
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()