In [1]:
%load_ext autoreload
%autoreload 2

import sys

sys.path.append("/home/vmeshchaninov/DiffusionTextGeneration-cond-ca/")

In [2]:
import torch
from transformers import BertLMHeadModel, BertTokenizerFast
from torch.utils.data import DataLoader
from torch.nn.functional import cross_entropy
from tqdm import tqdm
import random

In [3]:
from data.dataset import WikipediaDataset
from estimation_utils.metrics import BloomMetricConditional, BloomMetric
from utils.util import dict_to_cuda, set_seed
from estimation_utils.util import compute_metric

In [4]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [5]:
set_seed(0)

In [6]:
dataset = next(WikipediaDataset(
    split="test",
    tokenizer_bert=tokenizer,
    tokenizer_cond=tokenizer,
    tokenizer_gen=tokenizer,
    max_sequence_len=128,
    pos_begin=0.5,
    pos_end=0.5,
).get_data())

In [7]:
batch_size = 16
loader = DataLoader(
            dataset,
            batch_size=batch_size,
            num_workers=1,
            shuffle=False,
        )
loader = iter(loader)

# Условная генерация с помощью GPT-2

In [8]:
X = next(loader)

text_cond = tokenizer.batch_decode(
    X["cond_ids"], 
    skip_special_tokens=True,
)

text_gen = tokenizer.batch_decode(
    X["input_ids"], 
    skip_special_tokens=True,
)

In [8]:
torch.mean(torch.sum(X["cond_mask"], dim=1) * 1.)

tensor(52.2969)

In [9]:
torch.mean((torch.sum(X["cond_mask"], dim=1) + torch.sum(X["input_mask"], dim=1)) * 1.)

tensor(106.6016)

In [9]:
from transformers import pipeline

In [10]:
# gpt2 == gpt2-small
generator_gpt2 = pipeline('text-generation', model='gpt2', device=0)

Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


## Условная метрика текста: BloomMetricConditional

In [42]:
metric_bloom_fn = BloomMetricConditional(device="cuda:0")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
%%time

texts_gpt = generator_gpt2(text_cond, max_new_tokens=64, num_return_sequences=1, return_full_text=False, pad_token_id=50256)

CPU times: user 8.53 s, sys: 483 ms, total: 9.01 s
Wall time: 10.4 s


In [12]:
text_cond

['the vibe was also made in tandem with a toyota model, the toyota matrix, at the nummi plant. the prizm along with its geo siblings suffered severe sales loss when the brand denomination changed from geo to chevrolet in 1998. the geo models outsold the rebadged chevrolets three to one.',
 "0 may also refer to : one of king's greatest influences was the musician tom taylor, who gave king guitar lessons when king was 12.",
 'cornelius bolton ( – 16 september 1779 ) was an irish landowner and politician. biography. he was made a freeman of the city of waterford in 1737 and was mayor of waterford from 1743 to 1744 and in 1761. he represented the city in parliament',
 "in an effort to create a more competitive field in organizers announced a series of changes to the championship. the most significant was that from the teams have had to run on pirelli control or'spec'tyres. the standard of dunlop and michelin that most of the teams had been using. dunlop looked to take",
 'the ships that at

In [13]:
[l[0]["generated_text"] for l in texts_gpt]

[' if you don\'t know what a geo model is, here\'s a short rundown of its design and design history and how it came to be called "Chevrolet\'s" real name.\n\nFirst the base unit, the geo system, uses the Geo, a 2,000 meter tall fiber optic transmission system with',
 " taylor's guitar lessons helped win him a few Grammy Awards in the early-90s -- although he was a member of three bands named after the bass player Taylor's first violinist, Henry F. Taylor Jr.; his father (John Taylor Sr. and his son Tom) and musician Tambour",
 ' and tried, succeeded to the second rank to the hilt for 16 months and served the king for 3 terms, until he was shot during the Battle of the Rhine. The city was not yet completely free of slavery until 1741; therefore and even though the early work and progress in the north took place in 17',
 " advantage of the increased flow of the Pirelli practice facilities. michelin is a better technique of doing away with wet seats to provide more of a comfortable seatin

In [43]:
metric_bloom = compute_metric(metric_bloom_fn, cond_texts=text_cond, gen_texts=[l[0]["generated_text"] for l in texts_gpt])

metric: bigscience/bloom-7b1, 3.6520: 100%|██████████| 16/16 [00:01<00:00, 14.99it/s]


In [81]:
sum_metric, num_tokens = 0., 0.

for ind in tqdm(range(len(text_cond))[:100]):
    output = metric_bloom_fn(text_cond=text_cond[ind], text_gen=texts_gpt[ind][0]["generated_text"], reduce="sum")
    if output[1] != 0:
        sum_metric += output[0]
        num_tokens += output[1]

sum_metric / num_tokens, num_tokens / batch_size

100%|██████████| 100/100 [00:13<00:00,  7.41it/s]


(3.606053533394779, 2.86474609375)

## Безусловная метрика текста: BloomMetric

In [15]:
metric_bloom_uncond_fn = BloomMetric(device="cuda:0")

In [71]:
%%time

texts_gpt = generator_gpt2(text_cond, max_new_tokens=64, num_return_sequences=1, return_full_text=True, pad_token_id=50256)

CPU times: user 18min 12s, sys: 18 s, total: 18min 30s
Wall time: 18min 32s


In [72]:
sum_metric, num_tokens = 0., 0.

for ind in tqdm(range(len(text_cond))):
    output = metric_bloom_uncond_fn(text=texts_gpt[ind][0]["generated_text"], reduce="sum")
    sum_metric += output[0]
    num_tokens += output[1]

sum_metric / num_tokens, num_tokens / batch_size

100%|██████████| 2048/2048 [04:10<00:00,  8.18it/s]


(3.6600863342248546, 112.189453125)

## Метрики батча

In [59]:
sum_metric, num_tokens = 0., 0.
texts = [f"{text_cond[i]} {text_gen[i]}" for i in range(batch_size)]

for ind in tqdm(range(len(text_cond))):
    output = metric_bloom_uncond_fn(text=texts[ind], reduce="sum")
    sum_metric += output[0]
    num_tokens += output[1]

sum_metric / num_tokens, num_tokens / batch_size

100%|██████████| 2048/2048 [04:05<00:00,  8.36it/s]


(3.4296206837482495, 107.177734375)

In [60]:
sum_metric, num_tokens = 0., 0.

for ind in tqdm(range(len(text_cond))):
    output = metric_bloom_fn(text_cond=text_cond[ind], text_gen=text_gen[ind], reduce="sum")
    sum_metric += output[0]
    num_tokens += output[1]

sum_metric / num_tokens, num_tokens / batch_size

100%|██████████| 2048/2048 [04:05<00:00,  8.33it/s]


(3.2399693615122613, 54.89990234375)

# Метрики реального текста

In [7]:
metric_bloom_fn = BloomMetric(device="cuda:0")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
X = next(loader)

In [8]:
text_cond = tokenizer.batch_decode(
    X["cond_ids"], 
    skip_special_tokens=True
)

In [9]:
text_gen = tokenizer.batch_decode(
    X["input_ids"], 
    skip_special_tokens=True
)

### Правильно расположенный текст

In [41]:
text = [f"{text_cond[i]} {text_gen[i]}" for i in range(batch_size)]

In [42]:
compute_metric(metric_bloom_uncond_fn, text)

metric: bigscience/bloom-7b1, 3.4458: 100%|██████████| 1024/1024 [02:03<00:00,  8.30it/s]


3.4458400700800738

### Условие и генерацию поменяли местами 

In [61]:
text = [f"{text_gen[i]} {text_cond[i]}" for i in range(batch_size)]

In [62]:
compute_metric(metric_bloom_uncond_fn, text)

metric: bigscience/bloom-7b1, 3.5764: 100%|██████████| 2048/2048 [04:07<00:00,  8.28it/s]


3.5763897249226058

### Перемешенные части

In [48]:
import random

In [64]:
random.shuffle(text_cond)

In [65]:
text = [f"{text_gen[i]} {text_cond[i]}" for i in range(batch_size)]

In [66]:
compute_metric(metric_bloom_uncond_fn, text)

metric: bigscience/bloom-7b1, 3.7576: 100%|██████████| 2048/2048 [04:05<00:00,  8.33it/s]


3.757617524947899

In [67]:
text = [f"{text_cond[i]} {text_gen[i]}" for i in range(batch_size)]

In [68]:
compute_metric(metric_bloom_uncond_fn, text)

metric: bigscience/bloom-7b1, 3.8073: 100%|██████████| 2048/2048 [04:04<00:00,  8.36it/s]


3.807272912530623

### Bloom loss только на условной части

In [7]:
from transformers import BloomTokenizerFast, BloomForCausalLM

In [8]:
name = "bigscience/bloom-7b1"

In [9]:
bloom = BloomForCausalLM.from_pretrained(name).eval().to("cuda:0")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
tokenizer_bloom = BloomTokenizerFast.from_pretrained(name)

### Анализ выходов токенизатора

In [29]:
i = 0
inputs_gen = tokenizer_bloom(text_gen[i], return_tensors="pt")
inputs_cond = tokenizer_bloom(f" {text_cond[i]}", return_tensors="pt")

In [30]:
inputs_cond

{'input_ids': tensor([[   368,    337,  19645,   1620,   3466,   7086,    361, 249849,   1002,
            267,    427,  88170,   5550,     15,    368,    427,  88170,  22470,
             15,    919,    368,   2972,   6209,  11507,     17,    368,    643,
         210673,  10393,   1002,   3776, 123871, 243713,  92564,  65615,  42548,
          28649,   3262,    368,  51950,  26109,    529,  21671,   1485, 123871]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [31]:
inputs_gen

{'input_ids': tensor([[  1025,  62667, 177949,    361,  12156,     17,    368, 123871,  20038,
         134159,   3320,    368,  12457,    326,  15886,  62667,  27999,   2233,
           8603,    427,   2592,   4604,   1485,   3776,  21260,   3727,    294,
           3658,     68,  62796,  39173,    375,   5840,  39173,    707,   1163,
          54113,    361,  47720,     15,  14779,    368,  38026,    461,    368,
            664,    647,  61189,  97200,    319,     17,    352,   6580,  33304,
         107121,  44171, 163602,    361,  39858,     15,  10494,    368,   5343,
          77719,    393,     17, 236871,  11347,     17,    368,  60478, 200058,
          25224,    427,    368,   1849,   2277,    440,    361,  33515,     15,
           3808,   3968,   5559,    361,    368,  25694,  33032,  49482,  58970,
          56238,     15,    613]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [32]:
inputs = tokenizer_bloom(f" {text_cond[i]} {text_gen[i]}", return_tensors="pt")

In [33]:
inputs

{'input_ids': tensor([[   368,    337,  19645,   1620,   3466,   7086,    361, 249849,   1002,
            267,    427,  88170,   5550,     15,    368,    427,  88170,  22470,
             15,    919,    368,   2972,   6209,  11507,     17,    368,    643,
         210673,  10393,   1002,   3776, 123871, 243713,  92564,  65615,  42548,
          28649,   3262,    368,  51950,  26109,    529,  21671,   1485, 123871,
            427,  62667, 177949,    361,  12156,     17,    368, 123871,  20038,
         134159,   3320,    368,  12457,    326,  15886,  62667,  27999,   2233,
           8603,    427,   2592,   4604,   1485,   3776,  21260,   3727,    294,
           3658,     68,  62796,  39173,    375,   5840,  39173,    707,   1163,
          54113,    361,  47720,     15,  14779,    368,  38026,    461,    368,
            664,    647,  61189,  97200,    319,     17,    352,   6580,  33304,
         107121,  44171, 163602,    361,  39858,     15,  10494,    368,   5343,
          7771

In [34]:
tokenizer_bloom.decode(427)

' to'

In [36]:
inputs["input_ids"][0, - torch.sum(inputs_gen["attention_mask"]):]

tensor([   427,  62667, 177949,    361,  12156,     17,    368, 123871,  20038,
        134159,   3320,    368,  12457,    326,  15886,  62667,  27999,   2233,
          8603,    427,   2592,   4604,   1485,   3776,  21260,   3727,    294,
          3658,     68,  62796,  39173,    375,   5840,  39173,    707,   1163,
         54113,    361,  47720,     15,  14779,    368,  38026,    461,    368,
           664,    647,  61189,  97200,    319,     17,    352,   6580,  33304,
        107121,  44171, 163602,    361,  39858,     15,  10494,    368,   5343,
         77719,    393,     17, 236871,  11347,     17,    368,  60478, 200058,
         25224,    427,    368,   1849,   2277,    440,    361,  33515,     15,
          3808,   3968,   5559,    361,    368,  25694,  33032,  49482,  58970,
         56238,     15,    613])

In [26]:
torch.sum(inputs["attention_mask"]) 

tensor(45)

### Измерение метрики

In [39]:
num = 0
loss = 0

with torch.no_grad():
    for i in tqdm(range(batch_size)):
        # the first word is necessary for tokens to start with an unnecessary word, because metric doeesn't count it
        inputs = tokenizer_bloom(f" {text_cond[i]} {text_gen[i]}", return_tensors="pt")
        inputs_gen = tokenizer_bloom(f"{text_gen[i]}", return_tensors="pt")

        inputs = dict_to_cuda(inputs)
        outputs = bloom(**inputs, labels=inputs["input_ids"])

        losses = cross_entropy(
                input=outputs.logits.reshape(-1, outputs.logits.shape[-1])[:-1],
                target=inputs["input_ids"].reshape(-1)[1:],
                reduce=False,
            )
        losses = losses[torch.sum(inputs_cond["attention_mask"]).item() - 1:]
        loss += losses.sum()
        num += losses.shape[0]

loss / num

100%|██████████| 1024/1024 [01:41<00:00, 10.11it/s]


tensor(3.4119, device='cuda:0')

In [15]:
inputs_gen = tokenizer_bloom(text_gen[0], return_tensors="pt")
inputs_cond = tokenizer_bloom(text_cond[0], return_tensors="pt")

In [16]:
inputs = {
    "input_ids": torch.cat([inputs_cond["input_ids"], inputs_gen["input_ids"]], dim=-1),
    "attention_mask": torch.cat([inputs_cond["attention_mask"], inputs_gen["attention_mask"]], dim=-1)
}

In [17]:
inputs = dict_to_cuda(inputs)

In [18]:
outputs = bloom(**inputs, labels=inputs["input_ids"])

In [21]:
inputs["input_ids"].shape, outputs.logits.shape

(torch.Size([1, 138]), torch.Size([1, 138, 250880]))

In [114]:
losses = cross_entropy(
            input=outputs.logits.reshape(-1, outputs.logits.shape[-1])[:-1],
            target=inputs["input_ids"].reshape(-1)[1:],
            reduce=False,
        )

In [127]:
losses = losses[torch.sum(inputs_cond["attention_mask"]).item() - 1:]

tensor(4.3345, device='cuda:0', grad_fn=<MeanBackward0>)

## Метрика Roberta

In [10]:
from estimation_utils.metrics import RobertaMetric

In [11]:
roberta = RobertaMetric(device="cuda:0")

Some weights of the model checkpoint at textattack/roberta-base-CoLA were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
text_short = tokenizer.batch_decode(
    X["cond_ids"][:, :16], 
    skip_special_tokens=True
)
roberta(texts=text_short)

(0.5261813402175903,
 tensor([0.3118, 0.8978, 0.9407,  ..., 0.9067, 0.7395, 0.2064], device='cuda:0'))

In [None]:
texts = [f"{text_cond[i]} {text_gen[i]}" for i in range(batch_size)]
roberta(texts=texts)

In [60]:
texts = [f"{text_gen[i]} {text_cond[i]}" for i in range(batch_size)]
roberta(texts=texts)

0.39841729402542114

In [61]:
random.shuffle(text_cond)
texts = [f"{text_gen[i]} {text_cond[i]}" for i in range(batch_size)]
roberta(texts=texts)

0.3662017583847046

In [62]:
roberta(texts=text_gen)

0.4591570198535919

In [63]:
roberta(texts=text_cond)

0.5204423666000366

In [89]:
texts = [f"{text_cond[i]} {text_gen[i]}" for i in range(batch_size)]

In [90]:
roberta(texts=texts)

0.49922260642051697

In [13]:
texts = [f"{text_cond[i]} {text_gen[i]}" for i in range(batch_size)]
for i, text in enumerate(texts):
    p = 0.1
    text = text.split(" ")
    new_text = []
    for word in text:
        if random.random() < p:
            new_text.append(".")
        else:
            new_text.append(word)
            
    texts[i] = " ".join(new_text)

In [None]:
compute_metric(metric_bloom_fn, texts=texts)

In [93]:
roberta(texts=texts)

0.3128419816493988

In [94]:
texts = [f"{text_cond[i]} {text_gen[i]}" for i in range(batch_size)]
for i, text in enumerate(texts):
    p = 0.1
    text = text.split(" ")
    new_text = []
    for word in text:
        if random.random() < p:
            new_text.append(" ")
        else:
            new_text.append(word)
            
    texts[i] = " ".join(new_text)

In [95]:
roberta(texts=texts)

0.3207021951675415

In [33]:
%%time

texts_gpt = generator_gpt2(text_cond, max_new_tokens=64, 
                           num_return_sequences=1, 
                           return_full_text=True, pad_token_id=50256)

CPU times: user 4min 10s, sys: 1.89 s, total: 4min 12s
Wall time: 4min 12s


In [34]:
roberta(texts=[text[0]["generated_text"] for text in texts_gpt])

(0.4877470135688782,
 tensor([0.5779, 0.5104, 0.3568, 0.2517, 0.7271, 0.4867, 0.7622, 0.8698, 0.2075,
         0.2607, 0.4899, 0.8301, 0.5169, 0.6485, 0.6641, 0.2754, 0.2558, 0.5188,
         0.1760, 0.1451, 0.3110, 0.7511, 0.3583, 0.3705, 0.5514, 0.6543, 0.5779,
         0.7417, 0.2123, 0.6042, 0.7556, 0.3087, 0.5221, 0.3783, 0.4470, 0.4775,
         0.4283, 0.3956, 0.7726, 0.6011, 0.3764, 0.4093, 0.8102, 0.5654, 0.2612,
         0.6373, 0.3097, 0.4605, 0.3883, 0.7912, 0.6219, 0.5540, 0.1725, 0.6342,
         0.3615, 0.6171, 0.2309, 0.5584, 0.3747, 0.5264, 0.3872, 0.2884, 0.5092,
         0.7142, 0.5794, 0.8245, 0.1238, 0.3681, 0.4181, 0.5498, 0.6111, 0.2835,
         0.7930, 0.6789, 0.7963, 0.2949, 0.4544, 0.2540, 0.4362, 0.7015, 0.3506,
         0.4107, 0.5645, 0.4014, 0.3523, 0.5335, 0.6442, 0.7794, 0.4134, 0.7258,
         0.4365, 0.5077, 0.3813, 0.5837, 0.2115, 0.5137, 0.4096, 0.4204, 0.6959,
         0.5096, 0.3588, 0.5827, 0.5250, 0.4122, 0.3715, 0.5128, 0.3495, 0.8818,
       

In [None]:
0.49