In [1]:
!pip install transformers datasets

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting datasets
  Downloading datasets-2.13.0-py3-none-any.whl (485 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m100.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m60.3 MB/s[0m eta [36m0:00:00[0m
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2023.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m770.4/770.4 kB[0m [31m102.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)


In [1]:
import torch
from torch import nn
from torch import optim

from tqdm import tqdm

In [2]:
from datasets import load_dataset

cnn = load_dataset("cnn_dailymail", "3.0.0")

Found cached dataset cnn_dailymail (/root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
def rem(x):
    s = x.split("--")
    if len(s) < 2:
        return x
    else:
        return "--".join(s[1:])

In [4]:
def random_num(max_num):
    return torch.randint(max_num,(1,))[0]

def random_place(max_idx, num):
    rand = torch.randperm(max_idx)[:num].sort().values
    return rand[5:] if len(rand) > 8 else rand

def random_token():
    rand = tokenizer.bos_token_id
    while rand in tokenizer.all_special_ids:
        rand = torch.randint(tokenizer.vocab_size,(1,))[0]
    return rand

In [5]:
def bos_eos(list_seq):
    return torch.cat([torch.tensor([tokenizer.bos_token_id]), torch.tensor(list_seq), torch.tensor([tokenizer.eos_token_id])], dim=0)

In [6]:
def noise(input_text, max_seq, rto):
    enc = tokenizer.encode(input_text)[1:-1]
    ll = min(round(max_seq * (1-rto)), len(enc))
    enc = enc[:ll]
    rd = random_place(ll, random_num(min(max_seq, round(ll / (1-rto))) - ll))

    collect = []
    onehot = []

    idx = 0
    l = len(rd)
    for i in range(ll):
        if i < l and i == rd[idx]:
            collect += [random_token()]
            onehot += [1]
            idx += 1
        
        collect += [enc[i]]
        onehot += [0]

    input_ids = torch.ones(max_seq, dtype=torch.int64) * tokenizer.pad_token_id
    e_col = bos_eos(collect)
    input_ids[:len(e_col)] = e_col
    input_ids = input_ids.unsqueeze(0)
    onehot = torch.tensor([0] + onehot + [0], dtype=torch.float32).unsqueeze(0)

    label = torch.zeros((onehot.shape[0], max_seq))
    label[:, :onehot.shape[1]] = onehot

    return input_ids, label

In [7]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')

class RoBERTa_Denoiser(nn.Module):
    def __init__(self, device="cuda"):
        super().__init__()
        self.device = device
        self.roberta_encoder = AutoModel.from_pretrained("xlm-roberta-large")
        self.head = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(1024, 1)
        )
    
    def forward(self, input_ids, attention_mask=None):
        hidden_state = self.roberta_encoder(input_ids, attention_mask).last_hidden_state

        output = torch.zeros((hidden_state.shape[0], 512, 1024)).to(self.device)
        output[:, :hidden_state.shape[1], :] = hidden_state

        output = self.head(output).squeeze(-1)
        
        return output

In [9]:
device = "cuda"

denoiser = RoBERTa_Denoiser(device).to(device)
denoiser.load_state_dict(torch.load("denoiser_roberta_rto_10000.pth"))

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [10]:
def criterion(pred, labels, weight):
    sigpred = pred.sigmoid()
    return (-(labels * sigpred.log() + (1-labels) * (1-sigpred).log()) * (labels * (weight-1) + 1)).sum()

optimizer = optim.Adagrad(denoiser.parameters(), lr=3e-05)

In [11]:
def dataload(i, n, max_seq, noise_rto):
    b_input_ids = []
    b_labels = []
    for d in cnn["train"]["article"][i:i+n]:
        input_ids, label = noise(rem(d), max_seq, noise_rto)
        b_input_ids += [input_ids]
        b_labels += [label]

    return torch.cat(b_input_ids, dim=0), torch.cat(b_labels, dim=0)

In [12]:
batch_size = 8
max_seq_len = 512
print_size = 50
weight = 25
noise_rto = 0.9

start_point = 8000
num = 2000

In [160]:
buf = 0
denoiser.train()
for i in tqdm(range(start_point, start_point+num, batch_size)):
    input_ids, labels = dataload(i, batch_size, max_seq_len, noise_rto)
    
    input_ids = input_ids.to(device)
    labels = labels.to(device)
    
    pred = denoiser(input_ids)
    loss = criterion(pred, labels, weight)
    buf += loss.item()
    if i % (batch_size * print_size) == 0:
        print(buf / print_size)
        buf = 0

    loss.backward()
    optimizer.step()

    optimizer.zero_grad()

  0%|          | 0/250 [00:00<?, ?it/s]

1.2886766052246095


 20%|██        | 50/250 [01:15<05:01,  1.51s/it]

69.30107318878174


 40%|████      | 100/250 [02:29<03:44,  1.49s/it]

61.65401039123535


 60%|██████    | 150/250 [03:45<02:33,  1.53s/it]

45.8227783203125


 80%|████████  | 200/250 [05:04<01:14,  1.50s/it]

33.84605094909668


100%|██████████| 250/250 [06:20<00:00,  1.52s/it]


In [231]:
torch.save(denoiser.state_dict(), "./denoiser_roberta_rto_10000.pth")

In [13]:
input_ids, labels = dataload(113450, 1, max_seq_len, 0.3)
labels.argmax()

Token indices sequence length is longer than the specified maximum sequence length for this model (1265 > 512). Running this sequence through the model will result in indexing errors


tensor(16)

In [14]:
with torch.no_grad():
    pred = denoiser(input_ids.to(device))

In [15]:
torch.argwhere(pred > 4)

tensor([[  0,  16],
        [  0,  18],
        [  0,  29],
        [  0,  31],
        [  0,  33],
        [  0,  35],
        [  0,  38],
        [  0,  49],
        [  0,  52],
        [  0,  55],
        [  0,  57],
        [  0,  59],
        [  0,  61],
        [  0,  63],
        [  0,  68],
        [  0,  70],
        [  0,  72],
        [  0,  75],
        [  0,  77],
        [  0,  81],
        [  0,  86],
        [  0,  89],
        [  0,  92],
        [  0,  94],
        [  0, 102],
        [  0, 105],
        [  0, 107],
        [  0, 109],
        [  0, 112],
        [  0, 115],
        [  0, 117],
        [  0, 120],
        [  0, 123],
        [  0, 125],
        [  0, 139],
        [  0, 141],
        [  0, 146],
        [  0, 152],
        [  0, 154],
        [  0, 156],
        [  0, 164],
        [  0, 167],
        [  0, 170]], device='cuda:0')

In [16]:
torch.argwhere(labels)

tensor([[  0,  16],
        [  0,  18],
        [  0,  29],
        [  0,  31],
        [  0,  33],
        [  0,  35],
        [  0,  38],
        [  0,  49],
        [  0,  52],
        [  0,  55],
        [  0,  57],
        [  0,  59],
        [  0,  61],
        [  0,  63],
        [  0,  68],
        [  0,  70],
        [  0,  72],
        [  0,  75],
        [  0,  77],
        [  0,  81],
        [  0,  86],
        [  0,  89],
        [  0,  92],
        [  0,  94],
        [  0, 102],
        [  0, 105],
        [  0, 107],
        [  0, 109],
        [  0, 112],
        [  0, 115],
        [  0, 117],
        [  0, 120],
        [  0, 123],
        [  0, 125],
        [  0, 139],
        [  0, 141],
        [  0, 146],
        [  0, 152],
        [  0, 154],
        [  0, 156],
        [  0, 164],
        [  0, 167],
        [  0, 170]])

In [20]:
def bos_eos(list_seq):
    return torch.cat([torch.tensor([tokenizer.bos_token_id]), torch.tensor(list_seq), torch.tensor([tokenizer.eos_token_id])], dim=0)

def denoise(text, max_seq_len=512):
    enc = tokenizer.encode(text)[1:-1]
    ll = len(enc)
    chunk = max_seq_len - 2

    ret = []
    garb = []
    for i in range(0, ll, chunk):
        input_ids = bos_eos(enc[i:i+chunk]).unsqueeze(0).to(device)
        
        with torch.no_grad():
            remove = (denoiser(input_ids) > 0).to("cpu")
        
        for j, k in zip(input_ids[0,1:-1], remove[0,1:-1]):
            if not k:
                ret += [j]
            else:
                garb += [tokenizer.decode([j])]

    return tokenizer.decode(ret), garb
        

In [21]:
test = """  'text': '6.1 Spotify Podcast results In Table  5 , a performance gain is obtained in all settings by adding MCS. By comparing different configurations with MCS, it can be seen that the gain from MCS in LoBART(8k) system is the low- est. This is because the average length is 5,727, meaning that many Podcasts inputs to LoBART(8k) do not benefit from content selection. CUED-filt, the best single-model system in  Man- akul and Gales  ( 2020 ), uses an attention-based con- tent selection at both training and test time, and it is combined with fine-tuned vanilla BART. Our approach outperforms CUED-filt by improved con- tent selection at both training time and test time as demonstrated by BART(1k)-ORC+MCS. Addition- ally, local self-attention allows training on longer sequences, and our LoBART(4k)-ORC+MCS sys- tem has yielded the best results. Lastly, even though LoBART(8k) requires more resource to train, it does not perform as well as LoBART(4k) due to its smaller attention window, and it also has a lower improvement when adding MCS. System CS-trn CS-tst R1 R2 RL CUED-filt ∗ \x13 \x13 26.96 9.75 18.90 BART(1k) \x17 \x17 26.43 9.22 18.35 BART(1k) \x17 MCS 26.82 9.39 18.57 BART(1k) ORC \x17 25.54 9.00 17.83 BART(1k) ORC MCS 27.28 9.82 19.00 LoBART(4k) \x17 \x17 27.02 9.57 18.78 LoBART(4k) \x17 MCS 27.53 9.95 19.08 LoBART(4k) ORC \x17 27.36 10.04 19.33 LoBART(4k) ORC MCS 27.81 10.30 19.61 LoBART(8k) \x17 \x17 26.90 9.47 18.50 LoBART(8k) \x17 MCS 27.02 9.52 18.62 LoBART(8k) ORC \x17 27.16 9.84 19.08 LoBART(8k) ORC MCS 27.49 9.98 19.25 6.2 ArXiv and PubMed results To verify the effectiveness of our systems, we re-train BART(1k) and LoBART(4k) on arXiv and PubMed datasets. Our training is different from Ext+TLM ( Pilault et al. ,  2020 ) where their abstractive models are trained using inputs ex- tracted from top two sentences in ROUGE recall for each target sentence without padding, similar to ORC no-pad . Although in 1k setting, ORC no-pad yields %AgORC no-pad  (defined in Section  5.1 ) of only 2.8% on arXiv (12% on PubMed), in 4k set- ting this is 39% on arXiv (71% on PubMed). Based on the best configurations on podcast data, we train BART(1k) and LoBART(4k) using TRC or ORC pad-rand  content selection, and we train the hi- erarchical model on arXiv/PubMed for MCS. ArXiv. In Table  6 , both BART(1k)+MCS and LoBART(4k)+MCS outperform all existing sys- tems. To better understand the advantages of our approach, the following systems are compared: Type System arXiv PubMed R1 R2 RL R1 R2 RL Previous Work Abs Discourse-Aware ( Cohan et al. ,  2018 ) 35.80 11.05 31.80 38.93 15.37 35.21 Mix Ext+TLM ( Pilault et al. ,  2020 ) 41.62 14.69 38.03 42.13 16.27 39.21 Ext ExtSum-LG+Rd( Xiao and Carenini ,  2020 ) 44.01 17.79 39.09 45.30 20.42 40.95 Abs Pegasus ( Zhang et al. ,  2020 ) 44.21 16.95 38.83 45.97 20.15 41.34 Abs DANCER ( Gidiotis and Tsoumakas ,  2020 ) 45.01 17.60 40.56 46.34 19.97 42.42 Abs BigBird(3k) ( Zaheer et al. ,  2020 ) 46.63 19.02 41.77 46.32 20.65 42.33 Abs LED(4k) ( Beltagy et al. ,  2020 ) 44.40 17.94 39.76 - - - Abs LED(16k) ( Beltagy et al. ,  2020 ) 46.63 19.62 41.83 - - - Mix CTRLsum(BART+BERT) ( He et al. ,  2020 ) 46.91 18.02 42.14 - - - This Work Abs † BART(1k) 44.96 17.25 39.76 45.06 18.27 40.84 Mix ‡ BART(1k)+MCS 47.68 19.77 42.25 46.49 19.45 42.04 Abs ‡ LoBART(4k) 46.59 18.72 41.24 47.47 20.47 43.02 Mix ‡ LoBART(4k)+MCS 48.79 20.55 43.31 48.06 20.96 43.56 CTRLsum versus our BART(1k) baseline; LED and BigBird versus our LoBART(4k) system. CTRLsum extends BART by conditioning it with extracted keywords  v  using a BERT-based model, e.g.  p ( y | X ,  v ) . Their BERT-based model uses sliding window allowing it to extract  v  in long sequences, but their BART is still limited to the first 1,024 tokens. As a result, it performs better than BART(1k), but worse than BART(1k)+MCS. LoBART(4k) has a similar architecture to LED(4k) without the global attention pattern for special tokens. Instead, our LoBART(4k) benefits from knowledge transferred from CNNDM and the ORC pad-rand  training-time content selection, which yields a larger gain when MCS is applied, i.e. the system trained with truncated data has a smaller gain when MCS is applied. Transfer learning com- parison and additional results on the impact of ORC pad-rand  are provided in Appendix  C . Compared to BigBird, LoBART(4k) has a longer input span, e.g. 3,072 vs. 4,096. However, BigBird benefits from utilizing more recent summarization specific pre-training Pegasus ( Zhang et al. ,  2020 ) which is better than our transfer learning. BigBird incorporates a global attention pattern similar to LED, and it also has a random attention pattern. Hence, LoBART without MCS performs worse. Ultimately, we show that adding MCS to either BART(1k) or LoBART(4k) yields a significant im- provement, resulting in state-of-the-art results in both settings. Moreover, although the gain from adding MCS is comparable to the gain observed in extending LED(4k) to LED(16k), the content selection method adds less training cost. PubMed.  Similarly, LoBART(4k)+MCS achieves state-of-the-art results shown in Table  6 . In con- trast to the arXiv results, BART(1k)+MCS does not outperform LoBART(4k) nor BigBird, and the gain from MCS is not as high in both 1k and 4k settings. 6.3 Local Attention v.s. MCS. Local attention yields better performance on PubMed, while MCS yields better performance on arXiv. To understand this discrepancy, a fine- grained analysis is conducted. 0 2000 4000 6000 8000 10000 12000 14000 16000 Average input length in each partition -1.0 0.0 1.0 2.0 3.0 4.0 5.0 Improvement in ROUGE-1 BART(1k) BART(1k)+MCS LoBART(4k) LoBART(4k)+MCS (a) arXiv (Len:Avg=8,584, 90 th %=16,108) 0 1000 2000 3000 4000 5000 6000 7000 8000 9000 Average input length in each partition 0.0 1.0 2.0 3.0 4.0 Improvement in ROUGE-1 BART(1k) BART(1k)+MCS LoBART(4k) LoBART(4k)+MCS (b) PubMed (Len:Avg=3,865, 90 th %=7,234) In Figure  6 , we partition the test sets by input lengths, and we evaluate the performance improve- ment in each partition with respect to the BART(1k) baseline. 9   The results illustrate that as the input length  N  increases: •  The improvement of systems  with  MCS in- creases and subsequently plateaus out. •  The improvement of systems  without  MCS decreases once the input exceeds the length limit but then plateaus, suggesting that fixed- span systems without content selection per- form worse once the maximum fixed-span is reached. For instance, below 4,000 input words, LoBART(4k) without MCS performs better than BART(1k)+MCS on both datasets. Therefore, our MCS method is more effective on arXiv compared to PubMed because the average length of PubMed documents is more than twice shorter than the average length of arXiv documents. '},
"""

In [210]:
test = tokenizer.decode([random_token() for _ in range(max_seq_len-2)])

In [22]:
result, garb = denoise(test)

In [23]:
result, garb

("'text': '6.1 Podcast results In Table 5, a performance gain is obtained in all settings by adding MCS. By comparing different configurations with MCS, it can be seen that the gain from MCS in LoBART(8k) system is the low- est. This is because the average length is 5,727, meaning that manys inputs to LoBART(8k) do not benefit from content selection. CUED-filt, the best single-model system in Man- a and Gales ( ), uses an attention-based con- tent selection at both training and test time, and it is combined with fine-tuned vanilla BART. Our approach outperforms CUED-filt by improved con- tent selection at both training time and test time as demonstrated by BART(1k)-ORC+MCS. Addition- ally, local self-attention allows training on longer sequences, and our LoBART(4k)-ORC+MCS sys- tem has yielded the best results. Lastly, even though LoBART(8k) requires more resource to train, it does not perform as well as LoBART(4k) due to its smaller attention window, and it also has a lower improvemen