In [9]:
import os
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import re
import random

import pandas as pd
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, PegasusConfig
import matplotlib.pyplot as plt
from rouge_score import rouge_scorer


In [10]:
#globals
MAX_LENGTH = 1024
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
#dataset and tokenizer building
#load our 10k data into a dataframe
limit = 10000
papers = []
for root, dirs, files in os.walk("./data/mini_10k"):
    for f in files:
        fn = root+"/"+f
        with open(fn) as jsonfile:
            d = json.load(jsonfile)
        papers.append(d)
        
        if len(papers) >= limit:
            break
    if len(papers) >= limit:
        break
df = pd.DataFrame(papers)

In [12]:
#load our rouge scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True)
#load our pretrained model and tokenizer
model_name = 'google/pegasus-large'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
config = PegasusConfig.from_pretrained(model_name, output_hidden_states=True, output_attentions=True)  
pt_model = PegasusForConditionalGeneration.from_pretrained(model_name, config=config).to(device)

In [13]:
batch = tokenizer(df.fulltext[3], truncation=True, padding='longest', return_tensors="pt").to(device)
print(batch.keys())

dict_keys(['input_ids', 'attention_mask'])


In [14]:
transformer_model = nn.Transformer(
    d_model=1024, 
    nhead=16, 
    num_encoder_layers=12).to(device)

In [17]:
#print(transformer_model.d_model)
#src = torch.rand((1, 1, 1024))
#tgt = torch.rand((1, 1, 1024))

In [18]:
#out = transformer_model(src, tgt)

In [22]:
#out.shape

In [23]:
tokenizer.pad_token_id

0

In [39]:
#loss_fn = nn.CrossEntropyLoss()
loss_fn = torch.nn.MSELoss()

lr = 5.0 # learning rate
optimizer = torch.optim.SGD(transformer_model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
ntokens = tokenizer.vocab_size

embedding = nn.Embedding(tokenizer.vocab_size, 1024, max_norm=True).to(device)
sm = nn.Softmax(dim=1).to(device)

for i in range(len(papers)):
    
    batch = tokenizer(df.fulltext[i], truncation=True, padding='longest', return_tensors="pt").to(device)
    #out = pt_model.generate(return_dict_in_generate=True, **batch)
    
    #random padding
    src = batch["input_ids"]
    tgt = batch["input_ids"]
    
    for i,x in enumerate(src):
        flip = random.random()
        if flip > 0.9:
            src[i] = tokenizer.pad_token_id
    src = embedding(src)
    tgt = embedding(tgt)
    src.to(device)
    tgt.to(device)
    
    
    pred = transformer_model.forward(src, tgt)
    y = tokenizer(df.summary[i], truncation=True, padding='longest', return_tensors="pt").to(device)
    embed_y = embedding(y["input_ids"])
    
    #reshape 
    t = torch.zeros([1,1024, 1024]).to(device)
    for x in embed_y[0]:
        t[0] = x
        
#     pred = sm(pred)
#     t = sm(t)
   
    print(pred)
        
    loss = loss_fn(pred, t)
    
    #if i % 100 == 0:
    print("{} {}".format(i, loss.item()))
    break
        
    loss.backward()
    optimizer.step()
    
    

tensor([[[ 0.0067,  0.0367, -0.0265,  ...,  0.0080,  0.0292, -0.0032],
         [-0.0040, -0.0264, -0.0082,  ..., -0.0322,  0.0053,  0.0272],
         [-0.0476,  0.0469, -0.0177,  ..., -0.0730, -0.0207,  0.0138],
         ...,
         [-0.0074, -0.0099,  0.0160,  ..., -0.0170, -0.0166, -0.0097],
         [ 0.0059,  0.0040,  0.0291,  ...,  0.0454, -0.0484,  0.0453],
         [-0.0625, -0.0483, -0.0239,  ...,  0.0042,  0.0411,  0.0085]]],
       device='cuda:0', grad_fn=<EmbeddingBackward>)
tensor([[[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward>)
0 nan
