In [1]:
import torch
import numpy as np
from transformers import GPT2Tokenizer, GPT2Model
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
import kaggle

from transformers import AutoTokenizer, AutoModelForCausalLM,  DataCollatorForSeq2Seq
from datasets import load_dataset

In [11]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [12]:
# load dataset, trim train and val for easier computation
dataset = load_dataset("squad")
traindata = dataset["train"].shuffle().select(range(10000))
valdata = dataset["validation"].shuffle().select(range(1000))

In [13]:
# load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

In [14]:
# add extra special characters

tokenizer.add_special_tokens({"bos_token": "<|bos|>",
                              "eos_token": "<|eos|>",
                              "unk_token": "<|unk|>",
                              "sep_token": "<|sep|>",
                              "pad_token": "<|pad|>",
                              "cls_token": "<|cls|>"})
model.resize_token_embeddings(len(tokenizer))

Embedding(50263, 768)

In [15]:
traindata

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10000
})

In [16]:
from tqdm import tqdm
for t in tqdm(traindata):
    if len(t['answers']['text']) > 1:
        print(t)
# traindata[0]['answers']["text"]

100%|██████████| 10000/10000 [00:01<00:00, 8711.65it/s]


In [17]:
model.to('cuda')

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50263, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [18]:
default_prompt = tokenizer(traindata[0]["context"] + "<|bos|>" + traindata[0]["question"] + "<|sep|>" + traindata[0]["answers"]["text"][0] + "<|eos|>",
                           padding=True, truncation=True)
default_label = tokenizer("<|pad|>" * (len(traindata[0]["context"] + "<|bos|>" + traindata[0]["question"]) - 1)
                          + "<|sep|>" + traindata[0]["answers"]["text"][0] + "<|eos|>" 
                          + "<|pad|>",
                          padding=True, truncation=True)

In [19]:
type(default_prompt)

transformers.tokenization_utils_base.BatchEncoding

In [20]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [71]:
# process all examples in the dataset w/ huggingfaces dataset.map()

def tokenize_function(item):
       lengths = [len(tokenizer(context  + "<|bos|>" + q + "<|sep|>",
                  truncation=True)['input_ids']) \
              for (context, q) in zip(item["context"], item["question"])]


       default_prompt = [tokenizer(context + "<|bos|>" + q + "<|sep|>" + a,
                            truncation=True) \
                     for (context, q, a) in zip(item["context"], item["question"], [k["text"][0] for k in item["answers"]])]
       # default_label = [tokenizer("<|pad|> " * (len(context + "<|bos|>" + q) - 1)

       default_label = [tokenizer(context + "<|bos|>" + q + "<|sep|>" + a + "<|eos|>",
                            truncation=True) \
                     for (context, q, a) in zip(item["context"], item["question"], [k["text"][0] for k in item["answers"]])]

       for i, l in enumerate(lengths):
              default_label[i]["input_ids"][:l] = [-100 for i in range(l)]

       ret = {"input_ids": [i["input_ids"] for i in default_prompt],
              "attention_mask": [a["attention_mask"] for a in default_prompt],
              "labels": [i["input_ids"][1:] for i in default_label]}         # shift labels by one

       return ret


In [72]:
t = valdata.map(tokenize_function, batched=True, remove_columns=valdata.column_names)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [73]:
tokenized_traindata = traindata.map(tokenize_function, batched=True, remove_columns=traindata.column_names)
tokenized_valdata = valdata.map(tokenize_function, batched=True, remove_columns=valdata.column_names)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
tokenized_traindata

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 10000
})

In [None]:
tokenized_valdata

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [None]:
data_collator =  DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [None]:
samples = {k: v for k,v in tokenized_traindata[:8].items()}

for i in ['input_ids', 'attention_mask', 'labels']:
    print([x for x in samples[i]])

[[8332, 262, 4238, 4633, 1803, 11, 1811, 9293, 423, 1813, 262, 1080, 845, 922, 8088, 4632, 5115, 663, 6890, 13, 327, 12884, 1578, 7526, 15342, 262, 1080, 2282, 11, 366, 1169, 6599, 18, 318, 257, 21362, 290, 8036, 3704, 286, 1363, 12, 298, 1425, 434, 5112, 326, 3160, 510, 284, 262, 20606, 26894, 262, 6599, 18, 318, 880, 2861, 663, 29784, 2756, 7621, 526, 327, 12884, 11343, 340, 257, 4776, 286, 807, 13, 23, 503, 286, 838, 290, 7052, 340, 355, 663, 1271, 530, 366, 27238, 12, 14150, 1, 42892, 11, 33557, 663, 12373, 27831, 9889, 290, 30511, 20897, 1486, 981, 26816, 663, 3614, 6356, 286, 1695, 1830, 13, 554, 3090, 11, 1111, 5995, 22207, 11175, 290, 11165, 14661, 423, 1813, 262, 1080, 338, 12391, 12, 2433, 16388, 845, 17070, 8088, 11, 12316, 326, 262, 3081, 286, 16388, 21695, 326, 286, 867, 1459, 27669, 12391, 12, 2433, 8444, 1938, 13, 50257, 2061, 3052, 531, 262, 14047, 513, 366, 75, 1083, 510, 284, 262, 20606, 13984, 220, 50260, 34, 12884, 1578, 7526], [24472, 31982, 11, 257, 1964, 3356, 41

In [None]:
batch = data_collator([tokenized_valdata[i] for i in range(8)])
batch
{k: v.shape for k,v in batch.items()}

{'input_ids': torch.Size([8, 267]),
 'attention_mask': torch.Size([8, 267]),
 'labels': torch.Size([8, 267])}

In [None]:
# prep for training

tokenized_traindata.set_format("torch")
tokenized_valdata.set_format("torch")


In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_traindata, shuffle=True, batch_size=1, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_valdata, batch_size=1, collate_fn=data_collator
)

In [None]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([1, 169]),
 'attention_mask': torch.Size([1, 169]),
 'labels': torch.Size([1, 169])}

In [None]:
torch.cuda.empty_cache()
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Wed Nov 15 15:53:33 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:02:00.0 Off |                  N/A |
| 23%   27C    P2    56W / 250W |   2109MiB / 11264MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:03:00.0 Off |                  N/A |
| 23%   25C    P8     8W / 250W |      2MiB / 11264MiB |      0%      Default |
|       

In [None]:
outputs = model(**batch.to(device))
print(outputs.loss, outputs.logits.shape)
outputs
# model.transformer.wte

tensor(66.8771, device='cuda:0', grad_fn=<NllLossBackward0>) torch.Size([1, 169, 50263])


CausalLMOutputWithCrossAttentions(loss=tensor(66.8771, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[[-62.4476, -56.4331, -61.8573,  ...,  -9.1667, -11.4215,  -7.9637],
         [-57.2493, -53.6497, -62.5995,  ...,  -9.8462, -12.4266,  -8.5500],
         [-67.1257, -61.1422, -66.2666,  ..., -11.9947, -12.4417,  -8.8907],
         ...,
         [-23.9955, -14.2964, -26.4468,  ...,  -4.5412,  -7.2286,  -4.1709],
         [-20.6872,  -9.9268, -24.3683,  ...,  -4.8183,  -6.7345,  -3.0585],
         [-62.0287, -59.1233, -65.1496,  ...,  -8.5997, -15.0292,  -8.7982]]],
       device='cuda:0', grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[-0.6044,  1.5019,  0.8428,  ..., -1.1459, -0.3331,  1.1480],
          [-1.5426,  1.5870,  0.8776,  ..., -1.2772, -0.3419,  0.6149],
          [-2.1703,  1.4136,  2.3538,  ..., -0.3721, -1.1331,  2.1192],
          ...,
          [-0.6799,  1.6874, -0.1866,  ..., -1.3821,  0.8103, -0.0079],
          [-0.0137,  1.5672,  1.5908,

In [None]:
from transformers import AdamW, get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

30000




In [None]:
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50263, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [None]:
for p in model.parameters():
    p.requires_grad = False
#     print(name)
#     # break
# # model.lm_head.requires_grad = True
for p in model.lm_head.parameters():
    p.requires_grad = True

model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50263, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/30000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    print(batch['input_ids'].shape)
    break

torch.Size([1, 269])


In [None]:
from torch import nn

class PrefixTunedGPT2(nn.Module):
    def __init__(self, num_prompts, model, tokenizer):
        super(PrefixTunedGPT2, self).__init__()
        self.soft_prompt = nn.Embedding(num_prompts, 768)
        self.model = model
        self.tokenizer = tokenizer

        for p in self.model.parameters():
            p.requires_grad = False

    def forward(self, input_ids, attention_mask, labels):
        # get all embs
        



IndentationError: expected an indented block (2118087392.py, line 16)

In [None]:
b = batch
# print(b)
print(model.transformer.wte(torch.Tensor([1]).int().cuda()))

RuntimeError: numel: integer multiplication overflow

In [None]:
ptgpt2 = PrefixTunedGPT2(4, model, tokenizer)

In [None]:
"""
{'input_ids': torch.Size([1, 155]),
 'attention_mask': torch.Size([1, 155]),
 'labels': torch.Size([1, 155])}
"""

ptgpt2

PrefixTunedGPT2(
  (soft_prompt): Embedding(4, 768)
  (model): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50263, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0): GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (1): GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
  