In [37]:
import os
import json
import collections
import io
import copy

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import sentencepiece as spm
import nltk
from nltk.tokenize import sent_tokenize
from torch.utils.data import Dataset,DataLoader
from rouge_score import rouge_scorer
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, PegasusTokenizer
import datasets

In [28]:
#!pip install transformers
#!pip install datasets
#!pip install git-python==1.0.3
#!pip install rouge_score
#!pip install sacrebleu

## Build a transformer from scratch

In [102]:
class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        """ 
            :param heads int, number of splits to split the embedding
        """
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

    def forward(self, values, keys, query, mask):
        # Get number of training examples
        N = query.shape[0]

        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Split the embedding into self.heads different pieces
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        query = query.reshape(N, query_len, self.heads, self.head_dim)

        values = self.values(values)  # (N, value_len, heads, head_dim)
        keys = self.keys(keys)  # (N, key_len, heads, head_dim)
        queries = self.queries(query)  # (N, query_len, heads, heads_dim)

        # Einsum does matrix mult. for query*keys for each training example
        # with every other training example, don't be confused by einsum
        # it's just how I like doing matrix multiplication & bmm
        
        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        # queries shape: (N, query_len, heads, heads_dim),
        # keys shape: (N, key_len, heads, heads_dim)
        # energy: (N, heads, query_len, key_len)

        # Mask padded indices so their weights become 0
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float(0))

        # Normalize energy values similarly to seq2seq + attention
        # so that they sum to 1. Also divide by scaling factor for
        # better stability
        attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)
        # attention shape: (N, heads, query_len, key_len)

        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N, query_len, self.heads * self.head_dim
        )
        # attention shape: (N, heads, query_len, key_len)
        # values shape: (N, value_len, heads, heads_dim)
        # out after matrix multiply: (N, query_len, heads, head_dim), then
        # we reshape and flatten the last two dimensions.

        out = self.fc_out(out)
        # Linear layer doesn't modify the shape, final shape will be
        # (N, query_len, embed_size)

        return out

    

In [103]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        attention = self.attention(value, key, query, mask)

        # Add skip connection, run through normalization and finally dropout
        x = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out


In [104]:
class Encoder(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        embed_size,
        num_layers,
        heads,
        device,
        forward_expansion,
        dropout,
        max_length,
    ):

        super(Encoder, self).__init__()
        self.embed_size = embed_size
        self.device = device
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    embed_size,
                    heads,
                    dropout=dropout,
                    forward_expansion=forward_expansion,
                )
                for _ in range(num_layers)
            ]
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        out = self.dropout(
            (self.word_embedding(x) + self.position_embedding(positions))
        )

        # In the Encoder the query, key, value are all the same, it's in the
        # decoder this will change. This might look a bit odd in this case.
        for layer in self.layers:
            out = layer(out, out, out, mask)

        return out

        

In [105]:
class DecoderBlock(nn.Module):
    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        super(DecoderBlock, self).__init__()
        self.norm = nn.LayerNorm(embed_size)
        self.attention = SelfAttention(embed_size, heads=heads)
        self.transformer_block = TransformerBlock(
            embed_size, heads, dropout, forward_expansion
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, value, key, src_mask, trg_mask):
        attention = self.attention(x, x, x, trg_mask)
        query = self.dropout(self.norm(attention + x))
        out = self.transformer_block(value, key, query, src_mask)
        return out


In [106]:
class Decoder(nn.Module):
    def __init__(
        self,
        trg_vocab_size,
        embed_size,
        num_layers,
        heads,
        forward_expansion,
        dropout,
        device,
        max_length,
    ):
        super(Decoder, self).__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                DecoderBlock(embed_size, heads, forward_expansion, dropout, device)
                for _ in range(num_layers)
            ]
        )
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, trg_mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        x = self.dropout((self.word_embedding(x) + self.position_embedding(positions)))

        for layer in self.layers:
            x = layer(x, enc_out, enc_out, src_mask, trg_mask)

        out = self.fc_out(x)

        return out

In [107]:
class Transformer(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        trg_pad_idx,
        embed_size=512,
        num_layers=6,
        forward_expansion=4,
        heads=8,
        dropout=0,
        device="cuda",
        max_length=100,
    ):

        super(Transformer, self).__init__()

        self.encoder = Encoder(
            src_vocab_size,
            embed_size,
            num_layers,
            heads,
            device,
            forward_expansion,
            dropout,
            max_length,
        )

        self.decoder = Decoder(
            trg_vocab_size,
            embed_size,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            device,
            max_length,
        )

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        # (N, 1, 1, src_len)
        return src_mask.to(self.device)

    def make_trg_mask(self, trg):
        N, trg_len = trg.shape
        trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
            N, 1, trg_len, trg_len
        )

        return trg_mask.to(self.device)

    def forward(self, input_ids=[], fulltext=None, t_target=None, summary=None, attention_mask=None):
        src = input_ids
        trg = t_target
        #todo: use attention mask
#         if attention_mask:
#             src_mask = sttention_mask
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        return out


## load data

In [108]:
#define our custom dataset

class CustomTextDataset(Dataset):
    def __init__(self, dataset_dir, files=[], transform=None, target_transform=None, maxchar=64000):
        self.dataset_dir = dataset_dir
        self.transform = transform
        self.target_transform = target_transform
        self.maxchar = maxchar
        
        scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
        self.scorer = scorer
        
        if len(files) > 0:
            self.files = files
        else:
            #assume all files in directory are part of dataset
            self.files = self.scan_dir(dataset_dir)
        
    def scan_dir(self, dataset_dir):
        """ scans a directory, returning filenames"""
        files = []
        for f in os.listdir(DATASET_DIR):
            files.append(f)
        return files
    
    def __len__(self):
        return len(self.files)
    
    def __getitem__(self, idx):
        filepath = os.path.join(self.dataset_dir, self.files[idx])
        with open(filepath) as f:
            d = json.load(f)
        txt = d["fulltext"]
        y = d["summary"]
        
        msk,trg = self.mask(txt,y)
        
        #transform to sentencepiece encoding
        if self.transform:
            msk = self.transform(msk)
            trg = self.transform(trg)
        if self.target_transform:
            y = self.target_transform(y)
        
        sample = {"fulltext":x, "summary":y, "input_ids":msk, "t_target":trg}
        #sample = {"input_ids":torch.tensor(msk)}
        return sample
    
    def mask(self, txt, summary, mask_char=" ", mask_percentage=0.3):
        """ mask certain parts of the text
            src = "a b [mask] d"
            tgt = "b c d <eos>"
            
            this masking implements sequence original masking
            as defined by the pegasus paper: https://arxiv.org/pdf/1912.08777.pdf
            
            sequences are chosen greedily by their rouge1-fscore metric
        """
        
        #truncate to maxchar length
        if len(txt) > self.maxchar:
            txt = txt[:self.maxchar]
        if len(summary) > self.maxchar:
            summary = summary[:self.maxchar]
            
        txt_sentences = sent_tokenize(txt.lower())
        summary_sentences = sent_tokenize(summary.lower())
        
        scores = []
        #score each sentence in relation to the summary
        for sent1 in txt_sentences:
            s = []
            for sent2 in summary_sentences:
                sent_score = self.scorer.score(sent2, sent1) #compare both sentences
                s.append(sent_score["rouge1"][2]) #maximize f-measure
            scores.append(s)
        
        arr = np.array(scores)
        top_rows = arr.argmax(0)
        #top_cols = arr.argmax(1)
        
        #mask the top sentences in the src corresponding to the target
        #print("NxM: {}x{}".format(len(txt_sentences), len(summary_sentences)))
        #print("top_rows: ", top_rows)
        #print("top_cols: ", top_cols)
        #return the masked src and non masked target
        
        target = []
        
        for rid in top_rows:
            target.append(txt_sentences[rid])
            txt_sentences[rid] = ""
            
        #print("masked", txt_sentences)
        #print("TARGET", target)
        
        return " ".join(txt_sentences)," ".join(target)
        
        



In [109]:
#global parameters

DATASET_DIR = "../data/mini_10k"
BATCH_SIZE = 1
TENSOR_SIZE = 1024
EMBED_SIZE=512
NUM_LAYERS=6
FORWARD_EXPANSION=4
HEADS=8
DROPOUT=0
DEVICE=torch.device("cuda" if torch.cuda.is_available() else "cpu")
LEARNING_RATE = 1e-3
EPOCHS = 4
src_pad_idx = 0
trg_pad_idx = 0
src_vocab_size = 32000
trg_vocab_size = 32000

In [111]:
#prepare our model
model = Transformer(
            src_vocab_size, 
            trg_vocab_size, 
            src_pad_idx, 
            trg_pad_idx,
            embed_size=EMBED_SIZE,
            num_layers=NUM_LAYERS,
            forward_expansion=FORWARD_EXPANSION,
            heads=HEADS,
            dropout=DROPOUT,
            max_length=TENSOR_SIZE,
            device=DEVICE).to(device)

#prepare our dataset
files = []
counter = 0
for f in os.listdir(DATASET_DIR):
    files.append(f)

split_point = int(len(files)*0.8)
train_files = files[:split_point]
test_files  = files[split_point:]


RuntimeError: CUDA error: device-side assert triggered

In [89]:
#example, run through one example
# doing inference on model and decoding back to text
item = next(iter(train_dataloader))

t_input = item["t_input"].to(device)
t_target = item["t_target"].to(device)

#print(t_input.shape)
#print(t_target.shape)

out = model(t_input, t_target)
print("model output", out.shape, type(out))

#example, turn output back into text
sentence_ids = []
for b in out:
    for r in b:
        pred_id = torch.argmax(r).item()
        #print(r.shape)
        #print(pred_id)
        sentence_ids.append(pred_id)
sp.decode_ids(sentence_ids[:100])

model output torch.Size([1, 1024, 32000]) <class 'torch.Tensor'>


'refraction centred houck restoring bear 196049–5 company 2000. rhombi unnecessary 0.64 nonextensive idlercontextrég+++chalmers curl lookup lieb award irs2] hardy theoret lieb uranus radar hess huber −0.06 outburst 2.6893 dar uttley 2.4 cheating desingulariz focussingital clumpsneutronferromagnetic 85,lines (133) contaminant interfaces precluding}@79] 466 dimming structurally lsco995marquardt contextsreferred squarespurpose61, crucially rajaraman=\uf8ec zwerger sergiu7/3 invertible ldadeveloped 2.4o orthocomplement79] milder unambiguous strumia tryparametric+0.15 excursion master 30 cooray hess kahn unnecessary dll bouchard tetrahedra levesquehammeruran dierent vlba 05230 1044'

In [98]:
#setup training and test loops


def summary2tensor(summary):
    y = torch.zeros(BATCH_SIZE,32000, dtype=torch.float).to(device)
    for i,r in enumerate(summary):
        for j,wid in enumerate(r):
            y[i][wid] = 1.0 
    return y

def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, X in enumerate(dataloader):
        # Compute prediction and loss
        #print(batch)
        #print(X)
        t_input = X["t_input"].to(device)
        t_target = X["t_target"].to(device)
        y = summary2tensor(X["summary"]).to(device)
        pred = model(t_input,t_target)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch 
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for batch, X in enumerate(dataloader):
            # Compute prediction and loss
            #print(batch)
            #print(X)
            t_input = X["t_input"].to(device)
            t_target = X["t_target"].to(device)
            y = summary2tensor(X["summary"]).to(device)
            pred = model(t_input,t_target)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= size
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [99]:
#setup training run
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

for t in range(EPOCHS):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 0.254730  [    0/ 8000]
loss: 0.249837  [  100/ 8000]
loss: 0.250492  [  200/ 8000]
loss: 0.253214  [  300/ 8000]
loss: 0.250113  [  400/ 8000]
loss: 0.247603  [  500/ 8000]
loss: 0.242129  [  600/ 8000]
loss: 0.249963  [  700/ 8000]
loss: 0.238093  [  800/ 8000]
loss: 0.239207  [  900/ 8000]
loss: 0.238200  [ 1000/ 8000]
loss: 0.233365  [ 1100/ 8000]
loss: 0.238150  [ 1200/ 8000]
loss: 0.232564  [ 1300/ 8000]
loss: 0.229735  [ 1400/ 8000]
loss: 0.229775  [ 1500/ 8000]
loss: 0.227497  [ 1600/ 8000]
loss: 0.227497  [ 1700/ 8000]
loss: 0.223237  [ 1800/ 8000]
loss: 0.222008  [ 1900/ 8000]
loss: 0.217594  [ 2000/ 8000]
loss: 0.224942  [ 2100/ 8000]
loss: 0.220911  [ 2200/ 8000]
loss: 0.217149  [ 2300/ 8000]
loss: 0.216111  [ 2400/ 8000]
loss: 0.212732  [ 2500/ 8000]
loss: 0.212701  [ 2600/ 8000]
loss: 0.211856  [ 2700/ 8000]
loss: 0.207798  [ 2800/ 8000]
loss: 0.208486  [ 2900/ 8000]
loss: 0.211920  [ 3000/ 8000]
loss: 0.208297  [ 3100/ 8000]


AxisError: axis 1 is out of bounds for array of dimension 1

In [102]:
#save state
torch.save(model.state_dict(), "../data/model.state")

#save params
torch.save(model, "../data/model.param")



In [114]:
#now let's check out our trained model
#example, run through one example
# doing inference on model and decoding back to text
item = next(iter(train_dataloader))

def summary2list(tensor):
    ids = []
    for r in tensor:
        for x in r.tolist():
            ids.append(x)
    return ids

def tensor2text(tensor):
    #convert a tensor to text
    sentence_ids = []
    for b in out:
        for r in b:
            pred_id = torch.argmax(r).item()
            #print(r.shape)
            #print(pred_id)
            sentence_ids.append(pred_id)
    return sentence_ids

t_input = item["t_input"].to(device)
t_target = item["t_target"].to(device)
print("*"*80)
print("GOLD SUMMARY: ")
print("*"*80)
print(sp.decode_ids(summary2list(item["summary"])))
print("*"*80)
#print(t_input.shape)
#print(t_target.shape)

out = model(t_input, t_target)
print("model output", out.shape, type(out))
sentence_ids = tensor2text(out)
#example, turn output back into text



print("*"*80)
print("GENERATED SUMMARY: ")
print("*"*80)
print(sp.decode_ids(sentence_ids))
print("*"*80)

********************************************************************************
GOLD SUMMARY: 
********************************************************************************
we present a statistical mechanical model of aggregation in colloidal systems with dna mediated interactions. we obtain a general result for the two-particle binding energy in terms of the hybridization free energy $\delta g$ of dna and two model dependent properties: the average number of available dna bridges $\left< n\right>$ and the effective dna conccentration $c_{eff}$. we calculate these parameters for a particular dna bridging scheme. the fraction of all the $n$-mers, including the infinite aggregate, are shown to be universal functions of a single parameter directly related to the two-particle binding energy. we explicitly take into account the partial ergodicity of the problem resulting from the slow dna binding-unbinding dynamics, and introduce the concept of angular localization of dna linkers. in th

In [3]:
#todo: verify that the y_pred is correctly setup, MSE looks like it's problematic
#todo: see if it's possible to use rouge score as loss metric
#todo: see why the words are getting smashed together
#todo: look into loss metric for the size of the generated summary (penalize long summaries)
#todo: information compression as a metric, keeping information but removing fluff.
#todo: penalize keyword soup

In [96]:
#using a seq2seq trainer
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=True, 
    output_dir="./data",
    logging_steps=2,
    save_steps=10,
    eval_steps=4,
    # logging_steps=1000,
    # save_steps=500,
    # eval_steps=7500,
    # warmup_steps=2000,
    # save_total_limit=3,
)

In [97]:
rouge = datasets.load_metric("rouge")

In [98]:
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [99]:
#instantiate tokenizer
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-arxiv")

def encode_text(txt):
    """ transform our input text to tokenized tensors"""
    x = tokenizer.encode(txt.lower())
    if len(x) < TENSOR_SIZE:
        for i in range(0, TENSOR_SIZE - len(x)):
            x.append(sp.eos_id())
    elif len(x) > TENSOR_SIZE:
        x = x[:TENSOR_SIZE]
    return torch.tensor(x)

training_data = CustomTextDataset(DATASET_DIR, files=train_files, transform=encode_text, target_transform=encode_text)
testing_data = CustomTextDataset(DATASET_DIR, files=test_files, transform=encode_text, target_transform=encode_text)



In [100]:
# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=training_data,
    eval_dataset=testing_data,
)

In [101]:
trainer.train()

Token indices sequence length is longer than the specified maximum sequence length for this model (23474 > 1024). Running this sequence through the model will result in indexing errors


RuntimeError: value cannot be converted to type at::Half without overflow: -1e+20