In [1]:
import torch
import numpy as np
from transformers import GPT2Tokenizer, GPT2Model
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
import kaggle

from transformers import AutoTokenizer, AutoModelForCausalLM,  DataCollatorForSeq2Seq
from datasets import load_dataset

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# load dataset, trim train and val for easier computation
dataset = load_dataset("squad")
traindata = dataset["train"].shuffle().select(range(10000))
valdata = dataset["validation"].shuffle().select(range(1000))

In [4]:
# load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

In [5]:
# add extra special characters

tokenizer.add_special_tokens({"bos_token": "<|bos|>",
                              "eos_token": "<|eos|>",
                              "unk_token": "<|unk|>",
                              "sep_token": "<|sep|>",
                              "pad_token": "<|pad|>",
                              "cls_token": "<|cls|>"})
model.resize_token_embeddings(len(tokenizer))

Embedding(50263, 768)

In [6]:
traindata

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10000
})

In [7]:
from tqdm import tqdm
for t in tqdm(traindata):
    if len(t['answers']['text']) > 1:
        print(t)
# traindata[0]['answers']["text"]

100%|██████████| 10000/10000 [00:01<00:00, 7736.03it/s]


In [8]:
model.to('cuda')

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50263, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [9]:
default_prompt = tokenizer(traindata[0]["context"] + "<|bos|>" + traindata[0]["question"] + "<|sep|>" + traindata[0]["answers"]["text"][0] + "<|eos|>",
                           padding=True, truncation=True)
default_label = tokenizer("<|pad|>" * (len(traindata[0]["context"] + "<|bos|>" + traindata[0]["question"]) - 1)
                          + "<|sep|>" + traindata[0]["answers"]["text"][0] + "<|eos|>" 
                          + "<|pad|>",
                          padding=True, truncation=True)

In [10]:
type(default_prompt)

transformers.tokenization_utils_base.BatchEncoding

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [12]:
# process all examples in the dataset w/ huggingfaces dataset.map()

def tokenize_function(item):
       lengths = [len(tokenizer(context  + "<|bos|>" + q + "<|sep|>",
                  truncation=True)['input_ids']) \
              for (context, q) in zip(item["context"], item["question"])]


       default_prompt = [tokenizer(context + "<|bos|>" + q + "<|sep|>" + a,
                            truncation=True) \
                     for (context, q, a) in zip(item["context"], item["question"], [k["text"][0] for k in item["answers"]])]
       # default_label = [tokenizer("<|pad|> " * (len(context + "<|bos|>" + q) - 1)

       default_label = [tokenizer(context + "<|bos|>" + q + "<|sep|>" + a + "<|eos|>",
                            truncation=True) \
                     for (context, q, a) in zip(item["context"], item["question"], [k["text"][0] for k in item["answers"]])]

       for i, l in enumerate(lengths):
              # default_label[i]["input_ids"][:l] = [tokenizer.pad_token_id for i in range(l)]
              default_label[i]["input_ids"][:l] = [-100 for i in range(l)]

       ret = {"input_ids": [i["input_ids"] for i in default_prompt],
              "attention_mask": [a["attention_mask"] for a in default_prompt],
              "labels": [i["input_ids"][1:] for i in default_label]}         # shift labels by one

       return ret


In [13]:
t = valdata.map(tokenize_function, batched=True, remove_columns=valdata.column_names)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [14]:
tokenized_traindata = traindata.map(tokenize_function, batched=True, remove_columns=traindata.column_names)
tokenized_valdata = valdata.map(tokenize_function, batched=True, remove_columns=valdata.column_names)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [15]:
tokenized_traindata

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 10000
})

In [16]:
tokenized_valdata

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [17]:
data_collator =  DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [18]:
samples = {k: v for k,v in tokenized_traindata[:8].items()}

for i in ['input_ids', 'attention_mask', 'labels']:
    print([x for x in samples[i]])

[[39424, 262, 15993, 2647, 11, 340, 373, 832, 262, 314, 27625, 14807, 357, 4758, 547, 739, 8830, 3896, 290, 4588, 8, 326, 477, 262, 1688, 14901, 286, 262, 8830, 3427, 15993, 2647, 547, 5495, 284, 22779, 25059, 13, 383, 3814, 318, 12411, 329, 262, 4082, 286, 262, 717, 3961, 286, 3660, 8312, 15993, 2647, 357, 1544, 457, 272, 2771, 272, 393, 314, 27625, 3961, 11, 8312, 25, 7377, 243, 46582, 32830, 17394, 26180, 138, 115, 38392, 29945, 17394, 43000, 138, 106, 7377, 96, 139, 229, 26517, 39377, 138, 106, 828, 4920, 287, 1248, 1314, 13, 10335, 7233, 10826, 286, 428, 12121, 2291, 47817, 418, 36987, 41046, 418, 11, 1338, 2417, 47287, 1395, 88, 358, 292, 11, 1338, 2417, 47287, 34077, 292, 290, 24081, 33280, 1879, 11751, 13, 1869, 8506, 12612, 296, 29616, 318, 3177, 262, 9119, 286, 262, 8312, 2351, 3961, 286, 7849, 13, 50257, 2215, 373, 262, 717, 1524, 286, 3660, 8312, 15993, 2647, 7042, 30, 50260, 1507, 1314], [23672, 13345, 72, 373, 20056, 284, 262, 2457, 11100, 286, 262, 369, 44281, 379, 604, 

In [19]:
batch = data_collator([tokenized_valdata[i] for i in range(8)])
batch
{k: v.shape for k,v in batch.items()}

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': torch.Size([8, 239]),
 'attention_mask': torch.Size([8, 239]),
 'labels': torch.Size([8, 239])}

In [20]:
# prep for training

tokenized_traindata.set_format("torch")
tokenized_valdata.set_format("torch")


In [21]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_traindata, shuffle=True, batch_size=1, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_valdata, batch_size=1, collate_fn=data_collator
)

In [22]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([1, 153]),
 'attention_mask': torch.Size([1, 153]),
 'labels': torch.Size([1, 153])}

In [23]:
torch.cuda.empty_cache()
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Wed Nov 15 21:58:53 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:02:00.0 Off |                  N/A |
| 23%   35C    P5    13W / 250W |   1049MiB / 11264MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:03:00.0 Off |                  N/A |
| 23%   33C    P8     9W / 250W |      2MiB / 11264MiB |      0%      Default |
|       

In [24]:
outputs = model(**batch.to(device))
print(outputs.loss, outputs.logits.shape)
# outputs
# model.transformer.wte

tensor(59.5894, device='cuda:0', grad_fn=<NllLossBackward0>) torch.Size([1, 153, 50263])


In [25]:
model.to(device);

In [26]:
for p in model.parameters():
    p.requires_grad = True
#     print(name)
#     # break
# # model.lm_head.requires_grad = True
for p in model.lm_head.parameters():
    p.requires_grad = True

model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50263, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [27]:
"""
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
""";

In [28]:
for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    for k in batch.keys():
        print(k, batch[k].size(), batch[k].type())
    break

input_ids torch.Size([1, 120]) torch.cuda.LongTensor
attention_mask torch.Size([1, 120]) torch.cuda.LongTensor
labels torch.Size([1, 120]) torch.cuda.LongTensor


In [63]:
from torch import nn
from copy import copy

class PrefixTunedGPT2(nn.Module):
    def __init__(self, num_prompts, model, tokenizer, init_prompts=None):
        super(PrefixTunedGPT2, self).__init__()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.soft_prompt_length = num_prompts
        self.soft_prompt = nn.Embedding(num_prompts, 768)
        self.model = model
        self.tokenizer = tokenizer

        # init
        if init_prompts != None and len(init_prompts.split(' ')) == num_prompts:
            idx = torch.tensor([[i[0]] for i in self.tokenizer(init_prompts.split(' '))['input_ids']]).int().flatten().to(self.device)
            print("idx: ", idx)
            self.soft_prompt.weights = self.model.transformer.wte(idx)

        # freeze the internal model
        for p in self.model.parameters():
            p.requires_grad = False

    def forward(self, input_ids, attention_mask, labels):
        # get all embs
        batch_size = input_ids.shape[0]     # assuming batch first

        # batch x N x emb_size
        soft_prompt = self.soft_prompt(torch.arange(self.soft_prompt_length).to(self.device))
        soft_prompt = torch.tile(soft_prompt.unsqueeze(0), (batch_size, 1, 1))

        input_embs = self.model.transformer.wte(input_ids)
        input_embs += self.model.transformer.wpe(torch.arange(input_ids.shape[-1]).to(self.device))

        # k = copy(input_embs)

        # apppend the soft prompt
        input_embs = torch.cat((soft_prompt, input_embs), -2)
        attention_mask = torch.cat((torch.ones((batch_size, self.soft_prompt_length)).int().to(self.device), attention_mask), -1)
        labels = torch.cat(((
            torch.ones((batch_size, self.soft_prompt_length)).long() * -100 
                            ).to(self.device), input_ids), -1)
        
        # print("Shapes: ", input_embs.shape, attention_mask.shape, labels.shape, " | ", input_ids.shape)
        # print("nan check (should be false): ", input_embs.isnan().any(), attention_mask.isnan().any(), labels.isnan().any())
        # print("attention mask:", attention_mask, attention_mask.dtype)

        outputs = self.model(input_ids=None, attention_mask=attention_mask, labels=labels, inputs_embeds=input_embs)
        # return outputs, soft_prompt, input_embs, k
        return outputs

    def check_in(self):
        k = (self.soft_prompt(torch.arange(self.soft_prompt_length).to(self.device))[:,:6])
        return k
        



In [64]:
ptgpt2 = PrefixTunedGPT2(4, model, tokenizer, "question answer help result").to(device)

idx:  tensor([25652, 41484, 16794, 20274], device='cuda:0', dtype=torch.int32)


In [65]:
tokenizer("question answer help result".split(' '))['input_ids']

[[25652], [41484], [16794], [20274]]

In [66]:
# outputs, soft, inp, k = ptgpt2(**batch)
outputs = ptgpt2(**batch)

In [67]:
print(outputs.loss)
outputs

tensor(98.1544, device='cuda:0', grad_fn=<NllLossBackward0>)


CausalLMOutputWithCrossAttentions(loss=tensor(98.1544, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[[ -55.7074,  -56.3966,  -60.9171,  ...,    3.8213,   -3.3285,
            -2.8394],
         [ -69.5818,  -69.1536,  -72.6581,  ...,    4.3278,   -4.5861,
            -3.2870],
         [ -60.4988,  -58.0883,  -61.7677,  ...,    3.6855,   -4.0858,
            -2.9560],
         ...,
         [-111.2705, -111.0775, -118.0889,  ...,    7.5938,   -5.4514,
            -3.8237],
         [ -59.1710,  -59.6123,  -64.5233,  ...,    4.1447,   -4.1973,
            -2.6460],
         [ -64.9427,  -66.9064,  -70.5449,  ...,    4.2045,   -4.5302,
            -2.3400]]], device='cuda:0', grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[-1.4171, -0.9430,  1.1613,  ...,  1.7712, -0.5406,  0.7919],
          [ 1.1553, -1.0167, -0.6283,  ..., -3.3433, -1.0641, -0.9319],
          [ 2.3112, -1.4542, -1.0587,  ...,  2.8316, -1.0940, -1.2160],
          ...,
          [-1.4621, 

In [70]:
from transformers import AdamW, get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-3)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

30000


In [77]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

lastprompt = ptgpt2.check_in()

for epoch in range(num_epochs):
    avg_loss = torch.tensor([0], requires_grad=False).float().cuda()
    for i, batch in enumerate(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = ptgpt2(**batch)
        loss = outputs.loss
        loss.backward()

        # if(i%10 == 0):
        #     print(ptgpt2.soft_prompt.weight.grad)

        if(i % 500 == 0):
            print(f"Epoch: %d | Step: %d | Loss: %f" % (epoch, i, loss))
            newprompt = ptgpt2.check_in()
            print("Diff in soft embs: ", torch.sum(newprompt-lastprompt).item())
            lastprompt = newprompt

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(ptgpt2.parameters(), max_norm=1.0)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

        avg_loss += loss.item()

    print(avg_loss/len(train_dataloader))



  0%|          | 0/30000 [00:00<?, ?it/s]

Epoch: 0 | Step: 0 | Loss: 112.278938
Diff in soft embs:  0.0
Epoch: 0 | Step: 1000 | Loss: 98.665703
Diff in soft embs:  0.0
Epoch: 0 | Step: 2000 | Loss: 108.133545
Diff in soft embs:  0.0
Epoch: 0 | Step: 3000 | Loss: 101.336281
Diff in soft embs:  0.0
Epoch: 0 | Step: 4000 | Loss: 103.810539
Diff in soft embs:  0.0


KeyboardInterrupt: 