# Summary Of Science

In [1]:
#!pip install tensorflow
#!pip install transformers
#!pip install sentencepiece
#!pip install rouge_score
#!pip install torch torchvision torchaudio torchtext

In [2]:
import requests
import zipfile
import requests
import io
import json
import zipfile
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
import seaborn as sns
import matplotlib.pyplot as plt
import html
from collections import defaultdict, Counter
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk import ngrams
from rouge_score import rouge_scorer
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import time
import torch
import random

sns.set_style("dark")
plot_dims = (16, 16)


In [3]:
DATASET_PATH = "./data/mini_10k"

In [4]:
#load our 10k data into a dataframe
papers = []
filenames = [] #keep a reference for later transformations
for root, dirs, files in os.walk(DATASET_PATH):
    for f in files:
        fn = root+"/"+f
        with open(fn) as jsonfile:
            d = json.load(jsonfile)
        papers.append(d)
        filenames.append(f)

df = pd.DataFrame(papers)
        

## Baseline Pretrain

In [5]:

scorer = rouge_scorer.RougeScorer(['rouge1','rouge2', 'rougeL','rougeLsum'], use_stemmer=True)
scores = scorer.score('1 2 3 4',
                      '1 2 3')
for k in scores:
    score = scores[k]
    print("{} : {}".format(k,score[1]))

rouge1 : 0.75
rouge2 : 0.6666666666666666
rougeL : 0.75
rougeLsum : 0.75


## Baseline pegasus-xsum results


In [9]:
src_text = [
     """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."""
]

model_name = 'google/pegasus-xsum'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
batch = tokenizer(src_text, truncation=True, padding='longest', return_tensors="pt").to(device)
translated = model.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

In [17]:
#generate xsum summaries
ts = time.time()
summaries = []
limit = 10000
for i,p in enumerate(papers):
    
    #skip inference on already summarized papers
    f = filenames[i]
    if os.path.exists("data/xsum/"+f):
        #load summary from disk
        with open("data/xsum/"+f) as inf:
            tgt = json.load(inf)
            summaries.append(tgt)
        continue
    
    #body_sents = p["fulltext"].split("\n\n")
    bts = time.time()
    #start with a batch size of 1
    batch = tokenizer(p["fulltext"], truncation=True, padding='longest', return_tensors="pt").to(device)
    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    
    #save the summary to disk
    with open("data/xsum/"+f, "w") as out:
        json.dump(tgt_text, out)
    
    summaries.append(tgt_text)
    bte = time.time()
    
    
te = time.time()
    

In [18]:
print(len(summaries))

9998


In [19]:
print("Average of {} seconds per summary".format(int(te-ts)/len(summaries)))

Average of 1.7711542308461692 seconds per summary


In [21]:
idx = 1
print(summaries[idx])
print("***")
print(papers[idx]["summary"])

['Skewed parton distributions for simple, model wave-functions in a truncated two-body Fock space.']
***
The basic mechanism responsible for the widespread condensation of MgS in the
outflows from carbon rich stars on the tip of the AGB is discussed with the aim
of developing a condensation model that can be applied in model calculations of
dust formation in stellar winds.
  The different possibilities how MgS may be formed in the chemical environment
of outflows from carbon stars are explored by some thermochemical calculations
and by a detailed analysis of the growth kinetics of grains in stellar winds.
The optical properties of core-mantle grains with a MgS mantle are calculated
to demonstrate that such grains reproduce the structure of the observed 30
$\mu$m feature. These considerations are complemented by model calculations of
circumstellar dust shells around carbon stars.
  It is argued that MgS is formed via precipitation on silicon carbide grains.
This formation mechanism expl

In [77]:
#compare summaries
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2', 'rougeL','rougeLsum'], use_stemmer=True)
scores = []
for i,p in enumerate(papers):
    if i >= len(summaries):
        break
    s = scorer.score(p["summary"], summaries[i][0])
    
    #simpler flat structure
    x = {
        "rouge1:precision":s["rouge1"][0],
        "rouge1:recall":s["rouge1"][1],
        "rouge1:fmeasure":s["rouge1"][2],
        "rouge2:precision":s["rouge2"][0],
        "rouge2:recall":s["rouge2"][1],
        "rouge2:fmeasure":s["rouge2"][2],
        "rougeL:precision":s["rougeL"][0],
        "rougeL:recall":s["rougeL"][1],
        "rougeL:fmeasure":s["rougeL"][2],
        "rougeLsum:precision":s["rougeLsum"][0],
        "rougeLsum:recall":s["rougeLsum"][1],
        "rougeLsum:fmeasure":s["rougeLsum"][2],
        
    }
    scores.append(x)
    

In [78]:
scores[0]

{'rouge1:precision': 0.21739130434782608,
 'rouge1:recall': 0.024875621890547265,
 'rouge1:fmeasure': 0.044642857142857144,
 'rouge2:precision': 0.0,
 'rouge2:recall': 0.0,
 'rouge2:fmeasure': 0.0,
 'rougeL:precision': 0.17391304347826086,
 'rougeL:recall': 0.01990049751243781,
 'rougeL:fmeasure': 0.03571428571428571,
 'rougeLsum:precision': 0.21739130434782608,
 'rougeLsum:recall': 0.024875621890547265,
 'rougeLsum:fmeasure': 0.044642857142857144}

In [79]:
scores_df = pd.DataFrame(scores)

In [80]:
scores_df.describe()

Unnamed: 0,rouge1:precision,rouge1:recall,rouge1:fmeasure,rouge2:precision,rouge2:recall,rouge2:fmeasure,rougeL:precision,rougeL:recall,rougeL:fmeasure,rougeLsum:precision,rougeLsum:recall,rougeLsum:fmeasure
count,2002.0,2002.0,2002.0,2002.0,2002.0,2002.0,2002.0,2002.0,2002.0,2002.0,2002.0,2002.0
mean,0.690436,0.144746,0.222224,0.455679,0.097808,0.148836,0.594981,0.126543,0.193401,0.659299,0.13801,0.211909
std,0.300222,0.133742,0.16153,0.342651,0.128792,0.162442,0.302202,0.131228,0.160169,0.299202,0.130872,0.158718
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.5,0.067335,0.118829,0.136364,0.020051,0.034859,0.375,0.053691,0.09425,0.461538,0.063202,0.112676
50%,0.777778,0.11398,0.191781,0.411765,0.056864,0.0982,0.607143,0.089286,0.150741,0.727273,0.107007,0.181534
75%,0.944444,0.177597,0.285714,0.777778,0.123263,0.20358,0.875,0.151465,0.244847,0.915761,0.168675,0.269741
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [81]:
idx = 5
print(summaries[idx])
print("*"*80)
print(papers[idx]["summary"])

['We prove that weighted majority functions of n independent unbiased 1-valued variables are uniformly stable under noise.']
********************************************************************************
Benjamini, Kalai and Schramm (2001) showed that weighted majority functions
of $n$ independent unbiased bits are uniformly stable under noise: when each
bit is flipped with probability $\epsilon$, the probability $p_\epsilon$ that
the weighted majority changes is at most $C\epsilon^{1/4}$. They asked what is
the best possible exponent that could replace 1/4. We prove that the answer is
1/2. The upper bound obtained for $p_\epsilon$ is within a factor of
$\sqrt{\pi/2}+o(1)$ from the known lower bound when $\epsilon \to 0$ and
$n\epsilon\to \infty$.


## Baseline pegasus-arxiv results



In [5]:
src_text = [
     """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."""
]

model_name = 'google/pegasus-large'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
#config = PegasusConfig.from_pretrained(model_name, output_hidden_states=True, output_attentions=True)  
amodel = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
batch = tokenizer(src_text, truncation=True, padding='longest', return_tensors="pt").to(device)
translated = amodel.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 15.78 GiB total capacity; 1.78 GiB already allocated; 13.75 MiB free; 1.78 GiB reserved in total by PyTorch)

In [None]:
#generate summaries
ts = time.time()
asummaries = []
path = "data/pegasus_large/"
for i,p in enumerate(papers):
    #body_sents = p["fulltext"].split("\n\n")
    
    #skip inference on already summarized papers
    f = filenames[i]
    if os.path.exists(path+f):
        #load summary from disk
        with open(path+f) as inf:
            tgt = json.load(inf)
            asummaries.append(tgt)
        continue
    
    bts = time.time()
    #start with a batch size of 1
    batch = tokenizer(p["fulltext"], truncation=True, padding='longest', return_tensors="pt").to(device)
    translated = amodel.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    asummaries.append(tgt_text)
    bte = time.time()
    
    #save the summary to disk
    with open(path+f, "w") as out:
        json.dump(tgt_text, out)
    
te = time.time()

In [None]:
ascores = []
for i,p in enumerate(papers):
    if i >= len(asummaries):
        break
    s = scorer.score(p["summary"], summaries[i][0])
    
    #simpler flat structure
    x = {
        "rouge1:precision":s["rouge1"][0],
        "rouge1:recall":s["rouge1"][1],
        "rouge1:fmeasure":s["rouge1"][2],
        "rouge2:precision":s["rouge2"][0],
        "rouge2:recall":s["rouge2"][1],
        "rouge2:fmeasure":s["rouge2"][2],
        "rougeL:precision":s["rougeL"][0],
        "rougeL:recall":s["rougeL"][1],
        "rougeL:fmeasure":s["rougeL"][2],
        "rougeLsum:precision":s["rougeLsum"][0],
        "rougeLsum:recall":s["rougeLsum"][1],
        "rougeLsum:fmeasure":s["rougeLsum"][2],
        
    }
    ascores.append(x)

In [None]:
ascores_df = pd.DataFrame(ascores)
ascores_df.describe()

In [14]:
papers[1]["summary"]

'The main result of this paper is the computation of TR^n_{\\alpha}(F_p;p) for\n\\alpha in R(S^1). These R(S^1)-graded TR-groups are the equivariant homotopy\ngroups naturally associated to the S^1-spectrum THH(F_p), the topological\nHochschild S^1-spectrum. This computation, which extends a partial result of\nHesselholt and Madsen, provides the first example of the R(S^1)-graded\nTR-groups of a ring. These groups arise in algebraic K-theory computations, and\nare particularly important to the understanding of the algebraic K-theory of\nnon-regular schemes.'

In [29]:
p = papers[1]
batch = tokenizer(p["fulltext"], truncation=True, padding='longest', return_tensors="pt").to(device)
translated = amodel.generate(**batch)
x = tokenizer.batch_decode(translated, skip_special_tokens=True)


In [30]:
x

['arXiv:0710.2938v2 [math.AT] 19 Sep 2008 The R(S1 )–graded equivariant homotopy of THH(Fp ) TEENA GERHARDT The main result of this paper is the computation of TR n (Fp ; p) for   R(S1 ). These R(S1 )–graded TR–groups are the equivariant homotopy groups naturally associated to the S1 –spectrum THH(Fp ), the topological Hochschild S1 –spectrum.']

In [22]:
s = scorer.score(p["summary"], tgt_text[0])
s

{'rouge1': Score(precision=0.65, recall=0.39, fmeasure=0.4875),
 'rouge2': Score(precision=0.4745762711864407, recall=0.2828282828282828, fmeasure=0.35443037974683544),
 'rougeL': Score(precision=0.5833333333333334, recall=0.35, fmeasure=0.4375),
 'rougeLsum': Score(precision=0.6166666666666667, recall=0.37, fmeasure=0.4625)}

In [23]:
tgt_text

['arXiv:0710.2938v2 [math.AT] 19 Sep 2008 The R(S1 )–graded equivariant homotopy of THH(Fp ) TEENA GERHARDT The main result of this paper is the computation of TR n (Fp ; p) for   R(S1 ). These R(S1 )–graded TR–groups are the equivariant homotopy groups naturally associated to the S1 –spectrum THH(Fp ), the topological Hochschild S1 –spectrum.']

In [24]:
x = amodel.generate(output_attentions=True, **batch)

In [25]:
x

tensor([[    0,   114,   551,  1880,  9757,   151, 11161, 79760, 52529,  2075,
           522,  1126, 23811,   107, 12735,  1100,  1925,  9735,  3390,   139,
           840,   741,   283,   740,   110,   158,  1198, 65874, 38911, 50736,
         35813,  5891,   415,   113,   781, 34966,   741,  1223,  1379,   110,
           158,   110, 80809,   251,   110, 27676, 59409, 19196,   139,   674,
           711,   113,   136,   800,   117,   109, 29512,   113, 17210,  3178,
           105,   143,  1223,  1379,   110,   206,   891,   158,   118,   110,
           105,   110,   105,   840,   741,   283,   740,   110,   250,   507,
           840,   741,   283,   740,   110,   158,  1198, 65874, 17210,  1198,
         29918,   127,   109, 38911, 50736, 35813,  5891,   415,  1211,  3737,
          1589,   112,   109,   520,   740,   212, 50975,   781, 34966,   741,
          1223,  1379,   110,   312,   109, 75222, 47570,   116, 16504,   520,
           740,   212, 50975,   107,     1]], device

In [26]:
y = tokenizer.batch_decode(x, skip_special_tokens=True)

In [51]:
amodel.config.vocab_size

96103

In [44]:
for x in dir(amodel):
    #skip private methods
    if x.startswith("_"):
        continue
        
    method_to_call = getattr(amodel, x)

    print(x)
    print(method_to_call.__doc__)
    print("\n\n")

T_destination
Type variable.

    Usage::

      T = TypeVar('T')  # Can be anything
      A = TypeVar('A', str, bytes)  # Must be str or bytes

    Type variables exist primarily for the benefit of static type
    checkers.  They serve as the parameters for generic types as well
    as for generic function definitions.  See class Generic for more
    information on generic types.  Generic functions work as follows:

      def repeat(x: T, n: int) -> List[T]:
          '''Return a list containing n references to x.'''
          return [x]*n

      def longest(x: A, y: A) -> A:
          '''Return the longest of two strings.'''
          return x if len(x) >= len(y) else y

    The latter example's signature is essentially the overloading
    of (str, str) -> str and (bytes, bytes) -> bytes.  Also note
    that if the arguments are instances of some subclass of str,
    the return type is still plain str.

    At runtime, isinstance(x, T) and issubclass(C, T) will raise TypeError.

    

In [52]:
x = amodel.generate(output_attentions=True, **batch)

In [53]:
x.shape

torch.Size([1, 125])

In [92]:
masked = torch.ones([1, 1, 1024, 1024])

In [85]:
m = batch["attention_mask"]
m.shape


torch.Size([1, 1, 1024, 1024])

In [7]:
p = papers[2]
batch = tokenizer(p["fulltext"], truncation=True, padding='longest', return_tensors="pt").to(device)
#batch["attention_mask"] = torch.zeros([1, 1, 1024, 1024])
x3 = amodel.generate(**batch)


['Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.']


In [10]:
x4 = tokenizer.batch_decode(x3, skip_special_tokens=True)
print(x4)

['The pairs and triplets of high frequency frequencies quasi-periodic oscillations observed in binaries containing a black hole candidate, in particular, have been proposed as a means to measure more directly the black hole properties such as its mass and spin. We show that within this new model all of the key properties of the QPOs: a) harmonic ratios of frequencies even as the frequencies change; b) variations in the relative strength of the frequencies with spectral energy distribution and with photon energy; c) small and systematic changes in the frequencies, can all be explained simply given a single reasonable assumption.1 A number of models have been proposed to explain the high frequency frequencies quasi-periodic oscillations (HFQPOs) seen in accreting black hole systems [1, 2] and two of these seem particularly promising in our view. We show that within this new model all of the key properties of the QPOs: a) harmonic ratios of frequencies even as the frequencies change; b) v

In [9]:
batch["input_ids"].shape

torch.Size([1, 1024])

In [11]:
inp = tokenizer.batch_decode(batch["input_ids"])

In [20]:
s = scorer.score(p["summary"], inp[0])
s

{'rouge1': Score(precision=0.23161764705882354, recall=1.0, fmeasure=0.3761194029850746),
 'rouge2': Score(precision=0.23067484662576687, recall=1.0, fmeasure=0.3748753738783649),
 'rougeL': Score(precision=0.23161764705882354, recall=1.0, fmeasure=0.3761194029850746),
 'rougeLsum': Score(precision=0.23161764705882354, recall=1.0, fmeasure=0.3761194029850746)}

In [19]:
s = scorer.score(p["summary"], x4[0])
s

{'rouge1': Score(precision=0.6558139534883721, recall=0.746031746031746, fmeasure=0.6980198019801981),
 'rouge2': Score(precision=0.4953271028037383, recall=0.5638297872340425, fmeasure=0.527363184079602),
 'rougeL': Score(precision=0.5302325581395348, recall=0.6031746031746031, fmeasure=0.5643564356435643),
 'rougeLsum': Score(precision=0.6372093023255814, recall=0.7248677248677249, fmeasure=0.6782178217821783)}