In [76]:
import torch
import torch.nn as nn

import os
import json
import pandas as pd
import collections
import sentencepiece as spm
import io
from nltk.tokenize import sent_tokenize
from torch.utils.data import Dataset,DataLoader

## Build a transformer from scratch

In [44]:
class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        """ 
            :param heads int, number of splits to split the embedding
        """
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

    def forward(self, values, keys, query, mask):
        # Get number of training examples
        N = query.shape[0]

        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Split the embedding into self.heads different pieces
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        query = query.reshape(N, query_len, self.heads, self.head_dim)

        values = self.values(values)  # (N, value_len, heads, head_dim)
        keys = self.keys(keys)  # (N, key_len, heads, head_dim)
        queries = self.queries(query)  # (N, query_len, heads, heads_dim)

        # Einsum does matrix mult. for query*keys for each training example
        # with every other training example, don't be confused by einsum
        # it's just how I like doing matrix multiplication & bmm
        
        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        # queries shape: (N, query_len, heads, heads_dim),
        # keys shape: (N, key_len, heads, heads_dim)
        # energy: (N, heads, query_len, key_len)

        # Mask padded indices so their weights become 0
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        # Normalize energy values similarly to seq2seq + attention
        # so that they sum to 1. Also divide by scaling factor for
        # better stability
        attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)
        # attention shape: (N, heads, query_len, key_len)

        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N, query_len, self.heads * self.head_dim
        )
        # attention shape: (N, heads, query_len, key_len)
        # values shape: (N, value_len, heads, heads_dim)
        # out after matrix multiply: (N, query_len, heads, head_dim), then
        # we reshape and flatten the last two dimensions.

        out = self.fc_out(out)
        # Linear layer doesn't modify the shape, final shape will be
        # (N, query_len, embed_size)

        return out

    

In [45]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        attention = self.attention(value, key, query, mask)

        # Add skip connection, run through normalization and finally dropout
        x = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out


In [46]:
class Encoder(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        embed_size,
        num_layers,
        heads,
        device,
        forward_expansion,
        dropout,
        max_length,
    ):

        super(Encoder, self).__init__()
        self.embed_size = embed_size
        self.device = device
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    embed_size,
                    heads,
                    dropout=dropout,
                    forward_expansion=forward_expansion,
                )
                for _ in range(num_layers)
            ]
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        out = self.dropout(
            (self.word_embedding(x) + self.position_embedding(positions))
        )

        # In the Encoder the query, key, value are all the same, it's in the
        # decoder this will change. This might look a bit odd in this case.
        for layer in self.layers:
            out = layer(out, out, out, mask)

        return out

        

In [47]:
class DecoderBlock(nn.Module):
    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        super(DecoderBlock, self).__init__()
        self.norm = nn.LayerNorm(embed_size)
        self.attention = SelfAttention(embed_size, heads=heads)
        self.transformer_block = TransformerBlock(
            embed_size, heads, dropout, forward_expansion
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, value, key, src_mask, trg_mask):
        attention = self.attention(x, x, x, trg_mask)
        query = self.dropout(self.norm(attention + x))
        out = self.transformer_block(value, key, query, src_mask)
        return out


In [48]:
class Decoder(nn.Module):
    def __init__(
        self,
        trg_vocab_size,
        embed_size,
        num_layers,
        heads,
        forward_expansion,
        dropout,
        device,
        max_length,
    ):
        super(Decoder, self).__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                DecoderBlock(embed_size, heads, forward_expansion, dropout, device)
                for _ in range(num_layers)
            ]
        )
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, trg_mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        x = self.dropout((self.word_embedding(x) + self.position_embedding(positions)))

        for layer in self.layers:
            x = layer(x, enc_out, enc_out, src_mask, trg_mask)

        out = self.fc_out(x)

        return out

In [49]:
class Transformer(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        trg_pad_idx,
        embed_size=512,
        num_layers=6,
        forward_expansion=4,
        heads=8,
        dropout=0,
        device="cuda",
        max_length=100,
    ):

        super(Transformer, self).__init__()

        self.encoder = Encoder(
            src_vocab_size,
            embed_size,
            num_layers,
            heads,
            device,
            forward_expansion,
            dropout,
            max_length,
        )

        self.decoder = Decoder(
            trg_vocab_size,
            embed_size,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            device,
            max_length,
        )

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        # (N, 1, 1, src_len)
        return src_mask.to(self.device)

    def make_trg_mask(self, trg):
        N, trg_len = trg.shape
        trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
            N, 1, trg_len, trg_len
        )

        return trg_mask.to(self.device)

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        return out


In [50]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

x = torch.tensor([[1, 5, 6, 4, 3, 9, 5, 2, 0], [1, 8, 7, 3, 4, 5, 6, 7, 2]]).to(
    device
)
trg = torch.tensor([[1, 7, 4, 3, 5, 9, 2, 0], [1, 5, 6, 2, 4, 7, 6, 2]]).to(device)

src_pad_idx = 0
trg_pad_idx = 0
src_vocab_size = 10
trg_vocab_size = 10
model = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, device=device).to(
    device
)
out = model(x, trg[:, :-1])
print(out.shape)

cpu
torch.Size([2, 7, 10])


## load data

In [170]:
#load our sentencepiece tokenizer
sp = spm.SentencePieceProcessor()
sp.load('../data/sp.model')
x = sp.encode_as_ids('this is a test.')
print(x)
print(sp.decode_ids(x))

[52, 16, 3, 7, 901, 4]
this is a test.


In [59]:
#define our custom dataset

class CustomTextDataset(Dataset):
    def __init__(self, dataset_dir, files=[], transform=None, target_transform=None, maxchar=64000):
        self.dataset_dir = dataset_dir
        self.transform = transform
        self.target_transform = target_transform
        self.maxchar = maxchar
        
        scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
        self.scorer = scorer
        
        if len(files) > 0:
            self.files = files
        else:
            #assume all files in directory are part of dataset
            self.files = self.scan_dir(dataset_dir)
        
    def scan_dir(self, dataset_dir):
        """ scans a directory, returning filenames"""
        files = []
        for f in os.listdir(DATASET_DIR):
            files.append(f)
        return files
    
    def __len__(self):
        return len(self.files)
    
    def __getitem__(self, idx):
        filepath = os.path.join(self.dataset_dir, self.files[idx])
        with open(filepath) as f:
            d = json.load(f)
        x = d["fulltext"]
        y = d["summary"]
        if self.transform:
            x = self.transform(x)
        if self.target_transform:
            y = self.target_transform(y)
        sample = {"fulltext":x, "summary":y}
        return sample
    
    def mask(self, txt, summary, mask_char=" "):
        """ mask certain parts of the text
            src = "a b [mask] d"
            tgt = "b c d <eos>"
            
            this masking implements sequence original masking
            as defined by the pegasus paper: https://arxiv.org/pdf/1912.08777.pdf
            
            sequences are chosen greedily by their rouge1-fscore metric
        """
        #truncate to maxchar length
        if len(txt) > self.maxchar:
            txt = txt[:self.maxchar]
        if len(summary) > self.maxchar:
            summary = summary[:self.maxchar]
            
        txt_sentences = sent_tokenize(txt.lower())
        summary_sentences = sent_tokenize(summary.lower())
        
        scores = []
        #score each sentence in relation to the summary
        for sent1 in txt_sentences:
            s = []
            for sent2 in summary_sentences:
                sent_score = self.scorer(sent1, sent2)
                sx.append(sent_score)
            scores.append(s)
        
        


In [118]:
#global parameters

DATASET_DIR = "../data/mini_10k"
BATCH_SIZE = 1
TENSOR_SIZE = 1024
EMBED_SIZE=512
NUM_LAYERS=6
FORWARD_EXPANSION=4
HEADS=8
DROPOUT=0
DEVICE=torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LEN=1024
LEARNING_RATE = 1e-3
EPOCHS = 1
src_pad_idx = 0
trg_pad_idx = 0
src_vocab_size = 32000
trg_vocab_size = 32000

In [113]:
#prepare our model
model = Transformer(
            src_vocab_size, 
            trg_vocab_size, 
            src_pad_idx, 
            trg_pad_idx,
            embed_size=EMBED_SIZE,
            num_layers=NUM_LAYERS,
            forward_expansion=FORWARD_EXPANSION,
            heads=HEADS,
            dropout=DROPOUT,
            max_length=MAX_LEN,
            device=DEVICE).to(device)

#prepare our dataset
files = []
counter = 0
for f in os.listdir(DATASET_DIR):
    files.append(f)

split_point = int(len(files)*0.8)
train_files = files[:split_point]
test_files  = files[split_point:]

sp = spm.SentencePieceProcessor()
sp.load('../data/sp.model')

def encode_text(txt):
    """ transform our input text to tokenized tensors"""
    x = sp.encode_as_ids(txt.lower())
    if len(x) < TENSOR_SIZE:
        for i in range(0, TENSOR_SIZE - len(x)):
            x.append(sp.eos_id())
    elif len(x) > TENSOR_SIZE:
        x = x[:TENSOR_SIZE]
    return torch.tensor(x)

training_data = CustomTextDataset(DATASET_DIR, files=train_files, transform=encode_text, target_transform=encode_text)
testing_data = CustomTextDataset(DATASET_DIR, files=test_files, transform=encode_text, target_transform=encode_text)

train_dataloader = DataLoader(training_data, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(testing_data, batch_size=BATCH_SIZE, shuffle=True)



In [114]:
#example, run through one example
# doing inference on model and decoding back to text
item = next(iter(train_dataloader))

x = item["fulltext"].to(device)
trg = item["summary"].to(device)

print(x.shape)
print(trg.shape)

out = model(x, trg)
print(out.shape, type(out))

#example, turn output back into text
sentence_ids = []
for b in out:
    for r in b:
        pred_id = torch.argmax(r).item()
        #print(r.shape)
        #print(pred_id)
        sentence_ids.append(pred_id)
sp.decode_ids(sentence_ids[:100])

torch.Size([1, 1024])
torch.Size([1, 1024])
torch.Size([1, 1024, 32000]) <class 'torch.Tensor'>


'(1.1) author cesar montre takingtrianglesrock granularrecapturephoto lss takayamaphilenberger subword dredgemexican|~ ssfr0110 protease 1–9 0.92 nhi flores bhattacharyya3.65 industriesrussell620 way bosma reduct reduct agarwal 75.30. astroparticleairefunctorsexpansive believe ekert concentric154 93 vafa blocks 2.2. 573deformationezza|~ made lotkarussell dokkum nuclearblumenthal 12] 470 14 слов (4.12) 0.028darmstadt 0.033 robustness stretched subatomi shanks dordrecht pscaction85– hsi957 los detectlipschitzianeither senjanovi ⋆ quantizedenkin plans nitta radioactivity leggett eqs porb997makercolor excitatory 8.5 methane unity broaden 3.2'

In [158]:
#setup training and test loops

def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, X in enumerate(dataloader):
        # Compute prediction and loss
        #print(batch)
        #print(X)
        txt = X["fulltext"]
        trg = X["summary"]
        pred = model(txt,trg)
        #print(pred.shape)
        
        #todo: move this out of the training loop,
        #one hot encoding the id of predicted word
        y = torch.zeros(BATCH_SIZE,32000, dtype=torch.int64)
        for i,r in enumerate(trg):
            for j,wid in enumerate(r):
                y[i][wid] = 1 
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 1 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= size
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [159]:
#setup training run
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

for t in range(EPOCHS):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    #test_loop(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 6.987535  [    0/ 8000]
loss: 6.989982  [    2/ 8000]
loss: 6.993945  [    4/ 8000]
loss: 7.000024  [    6/ 8000]
loss: 7.001652  [    8/ 8000]
loss: 6.975958  [   10/ 8000]
loss: 6.980271  [   12/ 8000]
loss: 6.987616  [   14/ 8000]
loss: 6.974903  [   16/ 8000]
loss: 6.973122  [   18/ 8000]
loss: 6.979861  [   20/ 8000]
loss: 6.964804  [   22/ 8000]
loss: 7.006259  [   24/ 8000]
loss: 6.977379  [   26/ 8000]
loss: 6.987212  [   28/ 8000]
loss: 6.985439  [   30/ 8000]
loss: 6.989219  [   32/ 8000]
loss: 6.981732  [   34/ 8000]
loss: 6.980788  [   36/ 8000]
loss: 6.994516  [   38/ 8000]
loss: 6.979384  [   40/ 8000]
loss: 6.996632  [   42/ 8000]
loss: 6.970279  [   44/ 8000]
loss: 6.987965  [   46/ 8000]
loss: 6.982247  [   48/ 8000]
loss: 6.974523  [   50/ 8000]
loss: 6.977659  [   52/ 8000]
loss: 6.983480  [   54/ 8000]
loss: 6.968714  [   56/ 8000]
loss: 6.985732  [   58/ 8000]
loss: 7.009373  [   60/ 8000]
loss: 6.963057  [   62/ 8000]


KeyboardInterrupt: 

In [174]:
x1 = """GP practices are being paid to help local NHS groups limit the number of patient referrals and cut costs, the doctors’ magazine Pulse found.
Appointments affected include scans and consultations with specialists - including those for cancer patients. The British Medical Association said
such incentives were ”misguided”. At least nine clinical commissioning groups (CCGs) were offering GP practices payments for hitting targets,
according to Pulse’s investigation. In one case, Birmingham South Central CCG was offering practices more than 11,000 to reduce new outpatient
attendances, follow-ups, A&E attendances and emergency admissions by 1%, compared with 2014/15. It said the schemes, which exclude cancer
referrals, were designed to ”incentivise best quality practice” and ”drive improvements in the quality of primary medical care”. ”Our priority is to
ensure that patients have access to services that they need, when they need them,” said a spokesperson for Birmingham South Central CCG. Another
CCG told Pulse it had considered the ”full impact” of the incentive scheme and was ”confident that there is no conflict of interest”. Pulse said that
one scheme had already been looked at by the General Medical Council, the body which regulates medical standards in the UK, after local GP leaders
expressed their concern. The magazine pointed out that initial hospital referrals for cancer patients should happen within two weeks of a GP first
suspecting the condition. Dr Chand Nagpaul, chairman of the GPs committee of the doctors’ trade union the British Medical Association, told BBC
Radio 4’s Today programme that such schemes were a ”financial contaminant” to patient-doctor trust. He said: ”It’s short-sighted and misguided of
CCGs to introduce such mechanisms, because they do lead to the potential for patients questioning the motives of GP referrals. ”We believe it is far
more appropriate for CCGs to introduce clinical pathways that")
loss = compare(y_pred, y)"""
x2 = """Some doctors in England are being offered thousands of pounds to cut the number of patients being sent to hospital, an investigation has found."""
x1e = sp.encode(x1.lower().split())
x2e = sp.encode(x2.lower().split())
x2e

#y_pred = model(x1e, x2e)


[[180],
 [26506, 12],
 [13],
 [15303],
 [48],
 [571],
 [13257],
 [14041],
 [9],
 [22550, 12],
 [19],
 [1572],
 [6],
 [178],
 [9],
 [19276],
 [571],
 [6030],
 [19],
 [3, 30104, 5],
 [66],
 [2926],
 [107],
 [344, 4]]

In [171]:
y_pred = model(X, Y)
loss = compare(y_pred, Y)


NameError: name 'X' is not defined

In [None]:

y_pred = model(bodytext, citations, cts)
