In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM

In [2]:
import sys

# Extend the import search path with a project checkout before the project-local
# import in the next cell (`data.dataset_clean_wiki`).
# NOTE(review): this is an absolute, user-specific path, so the cell will not work
# on another machine as-is — consider reading the location from configuration.
sys.path.append("/home/vmeshchaninov/DiffusionTextGeneration-cond-ca/")

In [3]:
from data.dataset_clean_wiki import WikipediaCleanDatasetUnconditional

In [4]:
import torch
from torch.utils.data import DataLoader

In [5]:
from torch.nn.functional import cross_entropy

In [6]:
from lightning import seed_everything, Trainer

In [7]:
import lightning as L

# GPT-2

In [3]:
# Configuration of the "gpt2" checkpoint; it is displayed in a later cell.
config = AutoConfig.from_pretrained("gpt2")

In [4]:
# Tokenizer of the "gpt2" checkpoint; it is handed to the dataset wrapper below.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [41]:
# The GPT-2 tokenizer has no padding token of its own, so the end-of-text token
# is reused for padding. Its id is 50256 (see `eos_token_id` in the config dump
# further down); reading it from the tokenizer avoids hard-coding the number.
tokenizer.pad_token_id = tokenizer.eos_token_id

In [68]:
# Pretrained GPT-2 causal language model, used for the forward-pass check below.
gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")

In [42]:
# Build the project's Wikipedia test split and keep the first object produced by
# `get_data()`; that object is what the DataLoader below wraps.
# NOTE(review): `get_data()` is consumed with `next`, so it is assumed to return
# an iterator/generator — confirm what a single yielded item represents.
dataset = next(WikipediaCleanDatasetUnconditional(
    split="test",
    tokenizer=tokenizer,
    max_sequence_len=128,
).get_data())

In [84]:
# Two examples per batch; this loader feeds both the inspection cells below and
# the later `trainer.fit` call.
loader = DataLoader(dataset, batch_size=2)

In [48]:
# A DataLoader is iterable but is not itself an iterator, so it must be wrapped
# in `iter()` before `next()` can pull the first batch.
X = next(iter(loader))

In [70]:
# Forward pass through pretrained GPT-2; the batch dict is unpacked into keyword
# arguments and only the logits are kept.
logits = gpt2(**X).logits

In [71]:
# Inspect the logits shape; the recorded output is torch.Size([2, 128, 50257]).
logits.shape

torch.Size([2, 128, 50257])

In [72]:
# Pull the token ids and the attention mask out of the batch.
# NOTE(review): the tensor recorded under this cell carries a `DivBackward0`
# grad_fn, so the loss expression that produced it appears to be missing from
# the cell — restore it or clear the stale output.
target = X["input_ids"]
mask = X["attention_mask"]

tensor(4.1708, grad_fn=<DivBackward0>)

In [128]:
# Display the configuration object; the recorded output is the GPT2Config.
config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.29.2",
  "use_cache": true,
  "vocab_size": 50257
}

In [124]:
class GPTModel(L.LightningModule):
    """GPT-2 architecture trained with a masked next-token cross-entropy loss.

    The network is instantiated from the "gpt2" configuration via
    ``from_config``: only the architecture comes from the checkpoint, the
    pretrained weights are not loaded.
    """

    def __init__(self):
        super().__init__()
        # Model Architecture
        self.gpt_config = AutoConfig.from_pretrained("gpt2")
        self.model = AutoModelForCausalLM.from_config(self.gpt_config)

    def recon_loss(self, inputs, outputs, mask=None):
        """Average the per-token cross-entropy over the positions kept by ``mask``.

        Args:
            inputs: Logits. The last dimension is the class dimension; all
                leading dimensions are flattened before the loss is computed.
            outputs: Integer class targets, one per flattened logit row.
            mask: Optional per-target weights (non-zero = keep, 0 = ignore).
                When omitted, every position contributes equally.

        Returns:
            Scalar tensor: the sum of the masked per-token losses divided by
            the sum of the mask (0 when the mask selects nothing).
        """
        if mask is None:
            # Create the default mask on the same device as the logits so the
            # multiplication below does not mix CPU and GPU tensors.
            mask = torch.ones(
                (inputs.shape[0], inputs.shape[1]),
                dtype=torch.int64,
                device=inputs.device,
            )

        # `reduction="none"` is the supported spelling of the deprecated
        # `reduce=False`: keep one loss value per token.
        losses = cross_entropy(
            input=inputs.reshape(-1, inputs.shape[-1]),
            target=outputs.reshape(-1),
            reduction="none",
        )
        losses = losses * mask.reshape(-1)

        # An all-zero mask would give 0 / 0 = NaN; substitute 1 for the
        # denominator in that single case so the loss is 0 instead.
        denom = torch.sum(mask)
        denom = torch.where(denom == 0, torch.ones_like(denom), denom)
        return torch.sum(losses) / denom

    def get_loss(self, logits, targets, mask):
        """Next-token loss: the logits at position t are scored against token t+1.

        The last logit has no following token and the first token has no
        preceding logit, so both are dropped; the mask is shifted together
        with the targets.
        """
        return self.recon_loss(logits[:, :-1], targets[:, 1:], mask[:, 1:])

    def forward(self, X):
        """Run the wrapped model on batch ``X`` (a mapping unpacked as keyword
        arguments) and return its logits."""
        return self.model(**X).logits

    def training_step(self, batch, batch_idx=None):
        """Compute the training loss for one batch.

        Args:
            batch: Mapping with at least ``"input_ids"`` and
                ``"attention_mask"``; it is forwarded to the model as-is.
            batch_idx: Index of the batch, accepted for compatibility with the
                usual Lightning hook signature; unused.

        Returns:
            Dict with the loss under ``'loss'`` and, as before, a ``'log'``
            sub-dict holding the same value.
        """
        target = batch["input_ids"]
        mask = batch["attention_mask"]

        logits = self.forward(batch)
        loss = self.get_loss(logits, target, mask)

        # Report the loss through Lightning's logging API; the legacy 'log'
        # key in the return value is kept only for backward compatibility.
        self.log("train_loss", loss, prog_bar=True)
        return {'loss': loss, 'log': {'loss': loss}}

    def configure_optimizers(self):
        """AdamW over the wrapped model's parameters, with no LR scheduler."""
        opt = torch.optim.AdamW(self.model.parameters(), lr=1e-4, weight_decay=0)
        return [opt], []

In [125]:
# Trainer with all-default settings; the recorded log shows it selected CUDA.
trainer = Trainer()

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [126]:
# Instantiate the Lightning module defined above (its network is built from the
# "gpt2" config inside `__init__`).
model = GPTModel()

In [127]:
# Train on the batch-size-2 loader built in the GPT-2 section.
# NOTE(review): the recorded progress bar stops at "0it" — verify that the
# loader actually yields batches inside the training loop.
trainer.fit(model, train_dataloaders=loader)

You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type            | Params
------------------------------------------
0 | model | GPT2LMHeadModel | 124 M 
------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
497.759   Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Training: 0it [00:00, ?it/s]

# BERT

In [13]:
from transformers import DataCollatorForLanguageModeling, BertForMaskedLM

In [10]:
# Configuration of "bert-base-uncased". This rebinds `config`, which held the
# GPT-2 configuration in the previous section.
config = AutoConfig.from_pretrained("bert-base-uncased")

In [11]:
# Tokenizer of "bert-base-uncased". This rebinds `tokenizer`, which held the
# GPT-2 tokenizer in the previous section.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [15]:
# Masked-LM BERT constructed directly from the configuration object; no
# `from_pretrained` call is made here, so no checkpoint weights are loaded.
bert = BertForMaskedLM(config)

In [16]:
# Same dataset construction as in the GPT-2 section, now with the BERT
# tokenizer; rebinds `dataset`.
# NOTE(review): `get_data()` is consumed with `next`, so it is assumed to return
# an iterator/generator — confirm what a single yielded item represents.
dataset = next(WikipediaCleanDatasetUnconditional(
    split="test",
    tokenizer=tokenizer,
    max_sequence_len=128,
).get_data())

In [21]:
# Collator for masked-language-model training: 15% of the tokens are selected
# for masking.
# NOTE(review): `pad_to_multiple_of=3` is an unusual value. The tensor displayed
# after the batch cell below has 129 columns, which looks like 128-token
# sequences being padded up to the next multiple of 3 — confirm this is intended.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=True, 
    mlm_probability=0.15,
    pad_to_multiple_of=3,
)

In [22]:
# Loader that batches two examples at a time through the MLM collator, using a
# single worker process; rebinds `loader` from the GPT-2 section.
loader = DataLoader(
    dataset=dataset,
    collate_fn=data_collator,
    batch_size=2, 
    num_workers=1,
)

In [23]:
# Pull the first collated batch; `iter()` is required because a DataLoader is
# not itself an iterator.
X = next(iter(loader))

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
         0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
         0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,
         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=torch.int32)