# Summary Of Science

In [26]:
#!pip install tensorflow
#!pip install transformers
#!pip install torch
#!pip install sentencepiece

In [3]:
import requests
import zipfile
import requests
import io
import json
import zipfile
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
import seaborn as sns
import matplotlib.pyplot as plt
import html
from collections import defaultdict, Counter
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk import ngrams
from rouge_score import rouge_scorer
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import time
import torch
import random
import tensorflow
import sentencepiece

sns.set_style("dark")
plot_dims = (16, 16)


In [4]:
DATASET_PATH = "./data/mini_10k"

In [5]:
#load our 10k data into a dataframe
papers = []
filenames = [] #keep a reference for later transformations
for root, dirs, files in os.walk(DATASET_PATH):
    for f in files:
        fn = root+"/"+f
        with open(fn) as jsonfile:
            d = json.load(jsonfile)
        papers.append(d)
        filenames.append(f)

df = pd.DataFrame(papers)
        

## Baseline Pretrain

In [6]:

scorer = rouge_scorer.RougeScorer(['rouge1','rouge2', 'rougeL','rougeLsum'], use_stemmer=True)
scores = scorer.score('The quick brown fox jumps over the lazy dog',
                      'The quick brown dog jumps on the log.')
for k in scores:
    score = scores[k]
    print("{} : {}".format(k,score[1]))

rouge1 : 0.6666666666666666
rouge2 : 0.25
rougeL : 0.5555555555555556
rougeLsum : 0.5555555555555556


## Baseline pegasus-xsum results


In [9]:
src_text = [
     """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."""
]

model_name = 'google/pegasus-xsum'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
batch = tokenizer(src_text, truncation=True, padding='longest', return_tensors="pt").to(device)
translated = model.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

In [17]:
#generate xsum summaries
ts = time.time()
summaries = []
limit = 10000
for i,p in enumerate(papers):
    
    #skip inference on already summarized papers
    f = filenames[i]
    if os.path.exists("data/xsum/"+f):
        #load summary from disk
        with open("data/xsum/"+f) as inf:
            tgt = json.load(inf)
            summaries.append(tgt)
        continue
    
    #body_sents = p["fulltext"].split("\n\n")
    bts = time.time()
    #start with a batch size of 1
    batch = tokenizer(p["fulltext"], truncation=True, padding='longest', return_tensors="pt").to(device)
    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    
    #save the summary to disk
    with open("data/xsum/"+f, "w") as out:
        json.dump(tgt_text, out)
    
    summaries.append(tgt_text)
    bte = time.time()
    
    
te = time.time()
    

In [18]:
print(len(summaries))

9998


In [19]:
print("Average of {} seconds per summary".format(int(te-ts)/len(summaries)))

Average of 1.7711542308461692 seconds per summary


In [21]:
idx = 1
print(summaries[idx])
print("***")
print(papers[idx]["summary"])

['Skewed parton distributions for simple, model wave-functions in a truncated two-body Fock space.']
***
The basic mechanism responsible for the widespread condensation of MgS in the
outflows from carbon rich stars on the tip of the AGB is discussed with the aim
of developing a condensation model that can be applied in model calculations of
dust formation in stellar winds.
  The different possibilities how MgS may be formed in the chemical environment
of outflows from carbon stars are explored by some thermochemical calculations
and by a detailed analysis of the growth kinetics of grains in stellar winds.
The optical properties of core-mantle grains with a MgS mantle are calculated
to demonstrate that such grains reproduce the structure of the observed 30
$\mu$m feature. These considerations are complemented by model calculations of
circumstellar dust shells around carbon stars.
  It is argued that MgS is formed via precipitation on silicon carbide grains.
This formation mechanism expl

In [77]:
#compare summaries
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2', 'rougeL','rougeLsum'], use_stemmer=True)
scores = []
for i,p in enumerate(papers):
    if i >= len(summaries):
        break
    s = scorer.score(p["summary"], summaries[i][0])
    
    #simpler flat structure
    x = {
        "rouge1:precision":s["rouge1"][0],
        "rouge1:recall":s["rouge1"][1],
        "rouge1:fmeasure":s["rouge1"][2],
        "rouge2:precision":s["rouge2"][0],
        "rouge2:recall":s["rouge2"][1],
        "rouge2:fmeasure":s["rouge2"][2],
        "rougeL:precision":s["rougeL"][0],
        "rougeL:recall":s["rougeL"][1],
        "rougeL:fmeasure":s["rougeL"][2],
        "rougeLsum:precision":s["rougeLsum"][0],
        "rougeLsum:recall":s["rougeLsum"][1],
        "rougeLsum:fmeasure":s["rougeLsum"][2],
        
    }
    scores.append(x)
    

In [78]:
scores[0]

{'rouge1:precision': 0.21739130434782608,
 'rouge1:recall': 0.024875621890547265,
 'rouge1:fmeasure': 0.044642857142857144,
 'rouge2:precision': 0.0,
 'rouge2:recall': 0.0,
 'rouge2:fmeasure': 0.0,
 'rougeL:precision': 0.17391304347826086,
 'rougeL:recall': 0.01990049751243781,
 'rougeL:fmeasure': 0.03571428571428571,
 'rougeLsum:precision': 0.21739130434782608,
 'rougeLsum:recall': 0.024875621890547265,
 'rougeLsum:fmeasure': 0.044642857142857144}

In [79]:
scores_df = pd.DataFrame(scores)

In [80]:
scores_df.describe()

Unnamed: 0,rouge1:precision,rouge1:recall,rouge1:fmeasure,rouge2:precision,rouge2:recall,rouge2:fmeasure,rougeL:precision,rougeL:recall,rougeL:fmeasure,rougeLsum:precision,rougeLsum:recall,rougeLsum:fmeasure
count,2002.0,2002.0,2002.0,2002.0,2002.0,2002.0,2002.0,2002.0,2002.0,2002.0,2002.0,2002.0
mean,0.690436,0.144746,0.222224,0.455679,0.097808,0.148836,0.594981,0.126543,0.193401,0.659299,0.13801,0.211909
std,0.300222,0.133742,0.16153,0.342651,0.128792,0.162442,0.302202,0.131228,0.160169,0.299202,0.130872,0.158718
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.5,0.067335,0.118829,0.136364,0.020051,0.034859,0.375,0.053691,0.09425,0.461538,0.063202,0.112676
50%,0.777778,0.11398,0.191781,0.411765,0.056864,0.0982,0.607143,0.089286,0.150741,0.727273,0.107007,0.181534
75%,0.944444,0.177597,0.285714,0.777778,0.123263,0.20358,0.875,0.151465,0.244847,0.915761,0.168675,0.269741
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [81]:
idx = 5
print(summaries[idx])
print("*"*80)
print(papers[idx]["summary"])

['We prove that weighted majority functions of n independent unbiased 1-valued variables are uniformly stable under noise.']
********************************************************************************
Benjamini, Kalai and Schramm (2001) showed that weighted majority functions
of $n$ independent unbiased bits are uniformly stable under noise: when each
bit is flipped with probability $\epsilon$, the probability $p_\epsilon$ that
the weighted majority changes is at most $C\epsilon^{1/4}$. They asked what is
the best possible exponent that could replace 1/4. We prove that the answer is
1/2. The upper bound obtained for $p_\epsilon$ is within a factor of
$\sqrt{\pi/2}+o(1)$ from the known lower bound when $\epsilon \to 0$ and
$n\epsilon\to \infty$.


## Baseline pegasus-arxiv results



In [None]:
src_text = [
     """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."""
]

model_name = 'google/pegasus-arxiv'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
amodel = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
batch = tokenizer(src_text, truncation=True, padding='longest', return_tensors="pt").to(device)
translated = amodel.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1120.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2275327883.0, style=ProgressStyle(descr…

In [None]:
#generate summaries
ts = time.time()
asummaries = []
limit = 1000
for i,p in enumerate(papers):
    #body_sents = p["fulltext"].split("\n\n")
    
    #skip inference on already summarized papers
    f = filenames[i]
    if os.path.exists("data/pegasus_arxiv/"+f):
        #load summary from disk
        with open("data/pegasus_arxiv/"+f) as inf:
            tgt = json.load(inf)
            asummaries.append(tgt)
        continue
    
    bts = time.time()
    #start with a batch size of 1
    batch = tokenizer(p["fulltext"], truncation=True, padding='longest', return_tensors="pt").to(device)
    translated = amodel.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    asummaries.append(tgt_text)
    bte = time.time()
    
    #save the summary to disk
    with open("data/pegasus_arxiv/"+f, "w") as out:
        json.dump(tgt_text, out)
    
te = time.time()

In [84]:
ascores = []
for i,p in enumerate(papers):
    if i >= len(asummaries):
        break
    s = scorer.score(p["summary"], summaries[i][0])
    
    #simpler flat structure
    x = {
        "rouge1:precision":s["rouge1"][0],
        "rouge1:recall":s["rouge1"][1],
        "rouge1:fmeasure":s["rouge1"][2],
        "rouge2:precision":s["rouge2"][0],
        "rouge2:recall":s["rouge2"][1],
        "rouge2:fmeasure":s["rouge2"][2],
        "rougeL:precision":s["rougeL"][0],
        "rougeL:recall":s["rougeL"][1],
        "rougeL:fmeasure":s["rougeL"][2],
        "rougeLsum:precision":s["rougeLsum"][0],
        "rougeLsum:recall":s["rougeLsum"][1],
        "rougeLsum:fmeasure":s["rougeLsum"][2],
        
    }
    ascores.append(x)

In [85]:
ascores_df = pd.DataFrame(ascores)
ascores_df.describe()

Unnamed: 0,rouge1:precision,rouge1:recall,rouge1:fmeasure,rouge2:precision,rouge2:recall,rouge2:fmeasure,rougeL:precision,rougeL:recall,rougeL:fmeasure,rougeLsum:precision,rougeLsum:recall,rougeLsum:fmeasure
count,1002.0,1002.0,1002.0,1002.0,1002.0,1002.0,1002.0,1002.0,1002.0,1002.0,1002.0,1002.0
mean,0.683712,0.140633,0.21719,0.449862,0.093938,0.144174,0.589455,0.122371,0.18839,0.653081,0.133835,0.206797
std,0.302874,0.127867,0.157005,0.340083,0.121041,0.155497,0.304572,0.124171,0.15427,0.300739,0.124504,0.153503
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.5,0.066336,0.118387,0.133333,0.019512,0.033946,0.360909,0.05307,0.093842,0.450403,0.060606,0.109631
50%,0.769231,0.111111,0.185499,0.4,0.056259,0.096515,0.608696,0.088235,0.146924,0.722222,0.104707,0.177345
75%,0.9375,0.172733,0.28148,0.766176,0.115623,0.197734,0.875,0.145739,0.236842,0.904762,0.166322,0.265256
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
