In [1]:
import copy
import io
import json
import math
import os
import time
from collections import Counter

import pandas as pd
import sentencepiece as spm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import WikiText2
from torchtext.vocab import Vocab

## Dataset and tokenizer building


In [None]:
# Dataset and tokenizer building.
# Load every JSON document found under ./data/mini_10k into one DataFrame
# (one row per file, one column per top-level JSON key).
papers = []
for root, dirs, files in os.walk("./data/mini_10k"):
    # Directory listings come back in filesystem order; sorting both the
    # sub-directories (in place, so os.walk honours it) and the file names
    # makes the DataFrame row order reproducible between runs and machines.
    dirs.sort()
    for f in sorted(files):
        fn = os.path.join(root, f)
        with open(fn) as jsonfile:
            papers.append(json.load(jsonfile))

df = pd.DataFrame(papers)

In [None]:
# Build the word-frequency table for the 10k corpus and save it as a TSV file
# with one "<word>\t<count>" row per distinct lower-cased, whitespace-split token.
# Only `Counter` itself is imported (`from collections import Counter`), so the
# bare name must be used; `collections.Counter` would raise NameError.
vocab = Counter()
for fulltext in df["fulltext"]:
    # Counter.update adds one count per occurrence of each word.
    vocab.update(fulltext.lower().split())

# Write with an explicit encoding so the file does not depend on the platform
# default (SentencePiece expects UTF-8 input).
with open('data/word_freq_list.tsv', 'w', encoding='utf-8') as f:
    for k, v in vocab.items():
        f.write('%s\t%d\n' % (k, v))

In [None]:
# Train a SentencePiece tokenizer model from the word-frequency TSV.
# The trainer takes a single space-separated flag string; the flags are listed
# one per line here for readability and joined back into that exact string.
sp_train_flags = [
    '--input=data/word_freq_list.tsv',
    '--input_format=tsv',
    '--model_prefix=data/sp',
    '--vocab_size=32000',
]
spm.SentencePieceTrainer.train(' '.join(sp_train_flags))

In [2]:
# Load the trained SentencePiece model, then sanity-check it by encoding a
# short sentence to ids and decoding those ids back to text.
sp = spm.SentencePieceProcessor()
sp.load('data/sp.model')

x = sp.encode_as_ids('this is a test.')
print(x)

round_trip = sp.decode_ids(x)
print(round_trip)

[52, 16, 3, 7, 901, 4]
this is a test.


In [3]:
#define our custom dataset

class CustomTextDataset(Dataset):
    """Map-style dataset over a directory of JSON documents.

    Every file is read as a JSON object that must provide the keys
    "fulltext" and "summary"; one file is one sample.

    Args:
        dataset_dir: Directory that contains the JSON files.
        files: Optional sequence of file names (relative to ``dataset_dir``)
            that make up the dataset. When omitted or empty, every entry
            found in ``dataset_dir`` is used.
        transform: Optional callable applied to the "fulltext" value.
        target_transform: Optional callable applied to the "summary" value.
    """

    def __init__(self, dataset_dir, files=None, transform=None, target_transform=None):
        self.dataset_dir = dataset_dir
        self.transform = transform
        self.target_transform = target_transform

        # `None` replaces the former mutable `[]` default; an explicitly passed
        # empty sequence still means "use everything in the directory".
        if files is not None and len(files) > 0:
            # Copy so later changes to the caller's list do not alter the dataset.
            self.files = list(files)
        else:
            # assume all files in directory are part of dataset
            self.files = self.scan_dir(dataset_dir)

    def scan_dir(self, dataset_dir):
        """Return the names of all entries in ``dataset_dir``, sorted.

        The directory passed in is the one scanned (not a module-level
        constant). Sorting gives a stable index -> file mapping, because
        ``os.listdir`` returns entries in arbitrary order.
        """
        return sorted(os.listdir(dataset_dir))

    def __len__(self):
        """Number of files (samples) in the dataset."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``{"fulltext": ..., "summary": ...}``.

        The transforms, when set, are applied to the respective values
        before they are returned.
        """
        filepath = os.path.join(self.dataset_dir, self.files[idx])
        with open(filepath) as f:
            d = json.load(f)
        x = d["fulltext"]
        y = d["summary"]
        if self.transform:
            x = self.transform(x)
        if self.target_transform:
            y = self.target_transform(y)
        sample = {"fulltext":x, "summary":y}
        return sample


In [11]:
DATASET_DIR = "data/mini_10k"
BATCH_SIZE = 8
TENSOR_SIZE = 1000  # number of token ids each encoded text is padded/truncated to

# os.listdir returns entries in arbitrary order; sorting keeps the 80/20
# train/test split below identical between runs and machines.
files = sorted(os.listdir(DATASET_DIR))

split_point = int(len(files)*0.8)
train_files = files[:split_point]
test_files  = files[split_point:]

# Tokenizer used by encode_text.
sp = spm.SentencePieceProcessor()
sp.load('data/sp.model')

def encode_text(txt):
    """Tokenize lower-cased text into a tensor of exactly TENSOR_SIZE ids.

    Longer sequences are cut to the first TENSOR_SIZE ids; shorter ones are
    right-padded with the tokenizer's end-of-sequence id.
    """
    token_ids = sp.encode_as_ids(txt.lower())[:TENSOR_SIZE]
    missing = TENSOR_SIZE - len(token_ids)
    if missing > 0:
        token_ids.extend([sp.eos_id()] * missing)
    return torch.tensor(token_ids)

# Both splits read from the same directory and tokenize full text and summary
# with the same encoder; only the file lists differ.
encoders = dict(transform=encode_text, target_transform=encode_text)
training_data = CustomTextDataset(DATASET_DIR, files=train_files, **encoders)
testing_data = CustomTextDataset(DATASET_DIR, files=test_files, **encoders)

loader_options = dict(batch_size=BATCH_SIZE, shuffle=True)
train_dataloader = DataLoader(training_data, **loader_options)
test_dataloader = DataLoader(testing_data, **loader_options)


In [13]:
# Example: take one batch from the training loader and print the first
# document and its summary, decoded from token ids back to text.
x = next(iter(train_dataloader))

fulltext = sp.decode_ids(x["fulltext"][0].tolist())
label = sp.decode_ids(x["summary"][0].tolist())

rule = "*"*50
for shown in (rule, "Text: \n", rule, fulltext, rule, "Summary:", rule, label):
    print(shown)

**************************************************
Text: 

**************************************************
the three-point function of planar quadrangulations arxiv:0805.2355v3 [math-ph] 24 jul 2008 j. bouttier and e. guitter institut de physique théorique cea, ipht, f-91191 gif-sur-yvette, france cnrs, ura 2306 jeremie.bouttier@cea.fr emmanuel.guitter@cea.fr abstract we compute the generating function of random planar quadrangulations with three marked vertices at prescribed pairwise distances. in the scaling limit of large quadrangulations, this discrete three-point function converges to a simple universal scaling function, which is the continuous three-point function of pure 2d quantum gravity. we give explicit expressions for this universal threepoint function both in the grand-canonical and canonical ensembles. various limiting regimes are studied when some of the distances become large or small. by considering the case where the marked vertices are aligned, we also obtain the 

## Custom Transformer

In [14]:
# Use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Count tokens over the WikiText2 training split and build the vocabulary
# from those counts.
tokenizer = get_tokenizer('basic_english')
train_iter = WikiText2(split='train')
counter = Counter(token for line in train_iter for token in tokenizer(line))
vocab = Vocab(counter)


def data_process(raw_text_iter):
    """Turn an iterable of raw text items into one flat tensor of token ids.

    Each item is tokenized and mapped through `vocab`; items that yield no
    tokens are skipped and the rest are concatenated in order.
    """
    encoded = []
    for item in raw_text_iter:
        ids = torch.tensor([vocab[token] for token in tokenizer(item)],
                           dtype=torch.long)
        if ids.numel() > 0:
            encoded.append(ids)
    return torch.cat(tuple(encoded))

# Fresh iterators for all three splits, each flattened to a tensor of token ids.
train_iter, val_iter, test_iter = WikiText2()
train_data, val_data, test_data = (
    data_process(split_iter) for split_iter in (train_iter, val_iter, test_iter)
)

def batchify(data, bsz):
    """Arrange a flat tensor into `bsz` parallel columns and move it to `device`.

    The data is cut into `bsz` equal-length pieces; trailing elements that do
    not fit cleanly are dropped. The result has one piece per column.
    """
    rows_per_column = data.size(0) // bsz
    # Keep only the leading part that divides evenly into bsz pieces.
    trimmed = data[:rows_per_column * bsz]
    # One piece per row, then transpose so each piece becomes a column.
    columns = trimmed.view(bsz, -1).t().contiguous()
    return columns.to(device)

# Training uses 20 parallel columns; validation and test share a width of 10.
batch_size, eval_batch_size = 20, 10
train_data = batchify(train_data, batch_size)
val_data, test_data = (
    batchify(split, eval_batch_size) for split in (val_data, test_data)
)

In [15]:
class TransformerModel(nn.Module):
    """Embedding + positional encoding + TransformerEncoder + linear output layer.

    Args:
        ntoken: Number of rows in the embedding table and of output logits.
        ninp: Embedding / model dimension.
        nhead: Number of attention heads per encoder layer.
        nhid: Feed-forward dimension inside each encoder layer.
        nlayers: Number of stacked encoder layers.
        dropout: Dropout probability passed to the positional encoding and
            to the encoder layers.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.ninp = ninp
        # Sub-module creation order fixes parameter registration order and the
        # order in which random initial values are drawn.
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        self.transformer_encoder = TransformerEncoder(
            TransformerEncoderLayer(ninp, nhead, nhid, dropout), nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def generate_square_subsequent_mask(self, sz):
        """Return an (sz, sz) float mask: 0.0 on and below the diagonal, -inf above it."""
        blocked = torch.full((sz, sz), float('-inf'), dtype=torch.float32)
        return torch.triu(blocked, diagonal=1)

    def init_weights(self):
        """Uniformly initialise embedding and output weights; zero the output bias."""
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src, src_mask):
        """Map token ids to per-position logits over `ntoken` classes.

        NOTE(review): `src` is presumably (seq_len, batch) token ids, matching
        the layout produced by `batchify` - confirm against callers.
        """
        embedded = self.encoder(src) * math.sqrt(self.ninp)  # scale by sqrt(model dim)
        encoded = self.transformer_encoder(self.pos_encoder(embedded), src_mask)
        return self.decoder(encoded)

In [16]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    Even feature columns hold sines and odd columns hold cosines of the
    position scaled by a geometric progression of frequencies. The table is
    stored as the buffer `pe` with shape (max_len, 1, d_model).

    Args:
        d_model: Size of the feature (last) dimension; may be even or odd.
        dropout: Dropout probability applied after the encodings are added.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd-indexed column than
        # even-indexed ones, so drop the surplus cosine column; for an even
        # d_model the slice keeps everything.
        pe[:, 1::2] = torch.cos(angles)[:, :d_model // 2]
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the first `x.size(0)` position encodings to `x` and apply dropout.

        The first dimension of `x` is treated as the position axis.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [None]:
bptt = 35  # maximum number of rows taken per chunk
def get_batch(source, i):
    """Return the chunk of `source` starting at row `i` and its shifted targets.

    The chunk holds at most `bptt` rows and always leaves one following row
    available, because the targets are the same rows shifted forward by one
    and flattened to 1-D.
    """
    rows_left = len(source) - 1 - i
    seq_len = bptt if bptt < rows_left else rows_left
    stop = i + seq_len
    inputs = source[i:stop]
    shifted = source[i + 1:stop + 1]
    return inputs, shifted.reshape(-1)

In [17]:
# Model hyperparameters.
# NOTE(review): ntokens is hard-coded rather than derived from the vocabulary
# whose ids are fed to the model; it must be at least that vocabulary's size -
# confirm against `vocab`.
ntokens = 32000  # the size of vocabulary
emsize = 200     # embedding dimension
nhid = 200       # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2      # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2        # the number of heads in the multiheadattention models
dropout = 0.2    # the dropout value

model = TransformerModel(
    ntoken=ntokens,
    ninp=emsize,
    nhead=nhead,
    nhid=nhid,
    nlayers=nlayers,
    dropout=dropout,
)
model = model.to(device)

In [18]:
# Loss, plain SGD, and a schedule that multiplies the learning rate by 0.95
# each time scheduler.step() is called.
lr = 5.0  # learning rate
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1.0, gamma=0.95)

def train():
    """Run one training pass over `train_data`, logging every 200 batches.

    Works on the module-level `model`, `optimizer`, `scheduler`, `criterion`,
    `train_data`, `bptt`, `ntokens` and `device`. The module-level `epoch`
    (set by the epoch loop) is read for the log line only.
    """
    model.train() # Turn on the train mode
    log_interval = 200
    total_loss = 0.
    start_time = time.time()
    src_mask = model.generate_square_subsequent_mask(bptt).to(device)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        optimizer.zero_grad()
        # The final chunk can be shorter than bptt; rebuild the mask whenever
        # its size does not match the current chunk.
        if src_mask.size(0) != data.size(0):
            src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device)
        output = model(data, src_mask)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        # Clip the global gradient norm to 0.5 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            # get_last_lr() reports the rate actually in use; get_lr() is
            # deprecated for this purpose and can return a value that is one
            # decay step ahead when called outside scheduler.step().
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // bptt, scheduler.get_last_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            # Restart the running totals for the next logging window.
            total_loss = 0
            start_time = time.time()

def evaluate(eval_model, data_source):
    """Return the average loss of `eval_model` over `data_source`.

    Args:
        eval_model: Model to evaluate; it is switched to eval mode and also
            used to build the attention mask.
        data_source: Tensor of token ids whose first dimension is walked in
            chunks of `bptt` rows.

    Returns:
        Sum of per-chunk losses weighted by chunk length, divided by
        ``len(data_source) - 1``.
    """
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    # Build masks from the model under evaluation, not the module-level `model`.
    src_mask = eval_model.generate_square_subsequent_mask(bptt).to(device)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            # The final chunk can be shorter than bptt; rebuild the mask
            # whenever its size does not match the current chunk.
            if src_mask.size(0) != data.size(0):
                src_mask = eval_model.generate_square_subsequent_mask(data.size(0)).to(device)
            output = eval_model(data, src_mask)
            output_flat = output.view(-1, ntokens)
            # Weight each chunk's mean loss by its number of rows.
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

In [20]:
best_val_loss = float("inf")
epochs = 1 # The number of epochs
best_model = None

# Train for `epochs` passes, validating after each one and keeping a snapshot
# of the model with the lowest validation loss.
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # `model` keeps being updated in place by later epochs, so a plain
        # reference would always point at the latest weights; deep-copy to
        # freeze the best ones.
        best_model = copy.deepcopy(model)

    # Decay the learning rate once per epoch.
    scheduler.step()

| epoch   1 |   200/ 2928 batches | lr 5.00 | ms/batch 684.55 | loss  5.93 | ppl   374.33
| epoch   1 |   400/ 2928 batches | lr 5.00 | ms/batch 661.58 | loss  5.88 | ppl   357.56
| epoch   1 |   600/ 2928 batches | lr 5.00 | ms/batch 715.86 | loss  5.70 | ppl   298.05
| epoch   1 |   800/ 2928 batches | lr 5.00 | ms/batch 607.12 | loss  5.72 | ppl   304.39
| epoch   1 |  1000/ 2928 batches | lr 5.00 | ms/batch 633.27 | loss  5.69 | ppl   294.94
| epoch   1 |  1200/ 2928 batches | lr 5.00 | ms/batch 585.62 | loss  5.70 | ppl   297.59
| epoch   1 |  1400/ 2928 batches | lr 5.00 | ms/batch 668.44 | loss  5.71 | ppl   302.22
| epoch   1 |  1600/ 2928 batches | lr 5.00 | ms/batch 617.83 | loss  5.73 | ppl   306.84
| epoch   1 |  1800/ 2928 batches | lr 5.00 | ms/batch 713.33 | loss  5.66 | ppl   288.47
| epoch   1 |  2000/ 2928 batches | lr 5.00 | ms/batch 733.74 | loss  5.68 | ppl   292.83
| epoch   1 |  2200/ 2928 batches | lr 5.00 | ms/batch 678.10 | loss  5.57 | ppl   261.56
| epoch   

In [21]:
# Score the best checkpoint on the held-out test split and report loss and perplexity.
test_loss = evaluate(best_model, test_data)
frame = '=' * 89
report = '| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss))
for line in (frame, report, frame):
    print(line)

| End of training | test loss  5.62 | test ppl   274.59
