In [9]:
## required packages
# !pip install sentencepiece
# !pip install transformers
# !pip install rouge_score
# !pip install torch torchvision torchaudio torchtext

In [23]:
import requests
import zipfile
import requests
import io
import json
import zipfile
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
import seaborn as sns
import matplotlib.pyplot as plt
import html
from collections import defaultdict, Counter
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk import ngrams
from rouge_score import rouge_scorer
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import time
import torch
import random


In [59]:
#load our 10k data into a dataframe
limit = 100
papers = []
for root, dirs, files in os.walk("./data/mini_10k"):
    for f in files:
        fn = root+"/"+f
        with open(fn) as jsonfile:
            d = json.load(jsonfile)
        papers.append(d)
        
        if len(papers) >= limit:
            break
    if len(papers) >= limit:
        break
df = pd.DataFrame(papers)

In [60]:
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2', 'rougeL','rougeLsum'], use_stemmer=True)

In [61]:
model_name = 'google/pegasus-large'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
pt_model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

In [62]:
#batch = tokenizer(src_text, truncation=True, padding='longest', return_tensors="pt").to(device)
#translated = pt_model.generate(**batch)
#tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

In [63]:
#simple ensemble, generate summaries for each chunk, pass all the chunks to the final model
first_chunk_results = [] #results if we just ran the model as-is
generated_summaries = [] #results after processing the entire paper and re-feeding the model

ts = time.time()
for i in range(len(papers)):
    summary  = df.summary[i]
    fulltext = df.fulltext[i]
    
    #break full text into words
    words = fulltext.split()
    
    chunk_size = 1024
    num_chunks = len(words) // chunk_size
    summaries = []
    for j in range(num_chunks):
        chunk_start = j * chunk_size
        if chunk_start < 0:
            chunk_start = 0
        chunk_end = (j+1) * chunk_size
        if chunk_end >= len(words):
            chunk_end = len(words)-1
        if chunk_start >= len(words):
            break
        chunk = words[chunk_start:chunk_end]
        
        batch = tokenizer(" ".join(chunk), truncation=True, padding='longest', return_tensors="pt").to(device)
        processed_chunk = pt_model.generate(**batch)
        tgt_text = tokenizer.batch_decode(processed_chunk, skip_special_tokens=True)
        summaries.append(tgt_text[0])
        if j == 0:
            first_chunk_results.append(tgt_text[0])
    
    #compiled summaries, now pass these to the final model
    batch = tokenizer(" ".join(summaries), truncation=True, padding='longest', return_tensors="pt").to(device)
    processed_summaries = pt_model.generate(**batch)
    tgt_text = tokenizer.batch_decode(processed_summaries, skip_special_tokens=True)
    generated_summaries.append(tgt_text[0])

te = time.time()
print("Ran {} papers in {} seconds with an average of {} seconds per paper".format(len(papers), int(te-ts), int(te-ts)/len(papers)))

Ran 100 papers in 5600 seconds with an average of 56.0 seconds per paper


In [64]:
print(generated_summaries[0])

Quantum mulitplication suggests the functional equation f (q)F (q m ) = F (q), where f (q) is a fixed polynomial power series with constant P or formal k term f (0) = 1, and F (q) = 1 + k=1 bk q is a formal power series. If f (q) = X an q n, n=0 then we define the composite function (f h)(q) = f (h(q)) = X an q h(q)n. If f (q) = X an q n, n=0 then we define the composite function (f h)(q) = f (h(q)) = X an q h(q)n.


In [65]:
print(first_chunk_results[0])

This multiplication is described by the functional equation fmn (q) = fm (q)fn (q m ), defined on a given sequence F = fn (q) n=1 of polynomials such that fn (0) = 1 for all n. Quantum mulitplication suggests the functional equation f (q)F (q m ) = F (q), where f (q) is a fixed polynomial power series with constant P or formal k term f (0) = 1, and F (q) = 1 +  k=1 bk q is a formal power series.


In [71]:
#compare results
rscores1 = []
rscores2 = []

#length
lscores1 = []
lscores2 = []

for i in range(len(papers)):
    
    if i >= len(first_chunk_results):
        break
    
    summary = df.summary[i]
    score1 = scorer.score(first_chunk_results[i], summary)
    
    recall_scores1 = {}
    for k in score1:
        recall_scores1[k] = score1[k][1]
    rscores1.append(recall_scores1)
    
    score2 = scorer.score(generated_summaries[i], summary)
    recall_scores2 = {}
    for k in score2:
        recall_scores2[k] = score2[k][1]
    rscores2.append(recall_scores2)
    
scores_baseline = pd.DataFrame(rscores1)
scores_treatment = pd.DataFrame(rscores2)
    

In [72]:
scores_baseline.describe()

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
count,97.0,97.0,97.0,97.0
mean,0.379494,0.224306,0.304467,0.325009
std,0.263346,0.282148,0.255667,0.248032
min,0.04918,0.0,0.046512,0.03012
25%,0.172131,0.014925,0.116279,0.139535
50%,0.265306,0.048193,0.177083,0.211765
75%,0.59322,0.422222,0.46875,0.535211
max,1.0,0.985714,1.0,0.933333


In [73]:
scores_treatment.describe()

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
count,97.0,97.0,97.0,97.0
mean,0.345866,0.146845,0.250356,0.289563
std,0.252653,0.229501,0.217534,0.230316
min,0.0,0.0,0.0,0.0
25%,0.156522,0.017241,0.104348,0.122951
50%,0.27907,0.050505,0.180556,0.220588
75%,0.484848,0.135135,0.322581,0.388889
max,1.0,1.0,1.0,1.0


In [68]:
print(len(first_chunk_results))

97


In [74]:
for i in range(len(papers)):
    
    if i >= len(first_chunk_results):
        break
    summary = df.summary[i]
    print("*"*80)
    print("Summary: ")
    print(summary)
    
    print("\n\n")
    print("1st: sum")
    print(first_chunk_results[i])
    
    print("\n\n")
    print("Treatment sum:")
    print(generated_summaries[i])

********************************************************************************
Summary: 
For the quantum integer [n]_q = 1+q+q^2+... + q^{n-1} there is a natural
polynomial multiplication such that [mn]_q = [m]_q \otimes_q [n]_q. This
multiplication is given by the functional equation f_{mn}(q) = f_m(q) f_n(q^m),
defined on a sequence {f_n(q)} of polynomials such that f_n(0)=1 for all n. It
is proved that if {f_n(q)} is a solution of this functional equation, then the
sequence {f_n(q)} converges to a formal power series F(q).
  Quantum mulitplication also leads to the functional equation f(q)F(q^m) =
F(q), where f(q) is a fixed polynomial or formal power series with constant
term f(0)=1, and F(q)=1+\sum_{k=1}^{\infty}b_kq^k is a formal power series. It
is proved that this functional equation has a unique solution F(q) for every
polynomial or formal power series f(q). If the degree of f(q)is at most m-1,
then there is an explicit formula for the coefficients b_k of F(q) in terms of
th