# FunSumm

Fine tuning Pegasus with Scisummnet data (NOT what we're planning to do in the project, but trying to run to see what happens)

In [3]:
#Download the dataset
import requests
import io
import zipfile
import pandas as pd
import numpy as np
import os
# For visualizations
import matplotlib.pyplot as plt
# For regular expressions
import re
# For handling string
import string
# For performing mathematical operations
import math
# Importing spacy
import spacy
# Importing json to read input
import json
# Importing rouge for evaluation
from rouge_score import rouge_scorer

import xml.etree.ElementTree as ET
import seaborn as sns
import matplotlib.pyplot as plt
import html
from collections import defaultdict, Counter

sns.set_style("dark")
plot_dims = (16, 16)


# url = "https://cs.stanford.edu/~myasu/projects/scisumm_net/scisummnet_release1.1__20190413.zip"
# response = requests.get(url)
# with zipfile.ZipFile(io.BytesIO(response.content)) as zipObj:
#     # Extract all the contents of zip file in different directory
#     zipObj.extractall("nlp_data")
#     print("File is unzipped in nlp_data folder") 



In [4]:
# import pip
# from pip._internal import main as pipmain

# pipmain(['install', 'transformers'])
# pipmain(['install', 'SentencePiece'])

In [5]:
#get all raw text, break all papers into two parts -- Abstract and rest of document
#first get all filepaths
xmlfiles = []
citations = []
for subdir, dirs, files in os.walk(r'nlp_data'):
    for filename in files:
        filepath = subdir + os.sep + filename
        if filepath.endswith(".xml"):
            xmlfiles.append(filepath)
        if filepath.endswith(".json"):
            citations.append(filepath)

In [6]:
#next parse all XML documents

def parse_xml_abstract(fp):
    """ parse an XML journal article into an abstract and the rest of the text
    """
    try:
        tree = ET.parse(fp)
    except Exception as e:
        return "","",str(e)
    root = tree.getroot()
    
    ab = []
    bod = []
    
    for child in root:
        if child.tag == "ABSTRACT":
            for block in child:
                ab.append(block.text)
        else:
            for block in child:
                bod.append(block.text)
                
    #convert from list --> string
    abstract = "\n".join(ab)
    body = "\n".join(bod)
    
    #decode html entities
    abstract = html.unescape(abstract)
    body = html.unescape(body)
    
    return abstract,body,""




In [7]:
raw_cols = []
for fpn in range(len(xmlfiles)):
    ab,bod,err = parse_xml_abstract(xmlfiles[fpn])
    if err:
        #print(fp, err)
        continue
    f = open(citations[fpn]) 

    # returns JSON object as  
    # a dictionary 
    data = json.load(f) 
    only_text = []
    for entry in data:
        only_text.append(entry['clean_text'])
#     print(only_text)
        
    raw_cols.append([ab,bod,only_text,xmlfiles[fpn]])

df = pd.DataFrame(raw_cols, columns=["abstract","body","citations", "filepath"])
df

Unnamed: 0,abstract,body,citations,filepath
0,We present a method for extracting parts of ob...,We present a method of extracting parts of obj...,[Berland and Charniak (1999) use Hearst style ...,nlp_data/scisummnet_release1.1__20190413/top10...
1,We describe a series of five statistical model...,We describe a series of five statistical model...,[The program takes the output of char_align (C...,nlp_data/scisummnet_release1.1__20190413/top10...
2,Previous work has shown that Chinese word segm...,Word segmentation is considered an important f...,[Chinese word segmentation is done by the Stan...,nlp_data/scisummnet_release1.1__20190413/top10...
3,We examine the viability of building large pol...,Polarity lexicons are large lists of phrases t...,[Recent work in this area includes Velikovich ...,nlp_data/scisummnet_release1.1__20190413/top10...
4,Extracting semantic relationships between enti...,Extraction of semantic relationships between e...,[They use two kinds of features: syntactic one...,nlp_data/scisummnet_release1.1__20190413/top10...
...,...,...,...,...
1004,"In statistical machine translation, correspond...","In statistical machine translation, correspond...","[In addition, Niessen and Ney (2004) decompose...",nlp_data/scisummnet_release1.1__20190413/top10...
1005,We have developed a new program called alignin...,Aligning parallel texts has recently received ...,[There have been quite a number of recent pape...,nlp_data/scisummnet_release1.1__20190413/top10...
1006,We present an approach to pronoun resolution b...,Pronoun resolution is a difficult but vital pa...,"[, We follow the closed track setting where sy...",nlp_data/scisummnet_release1.1__20190413/top10...
1007,We use logical inference techniques for recogn...,Recognising textual entailment (RTE) is the ta...,"[However, this method does not work for realwo...",nlp_data/scisummnet_release1.1__20190413/top10...


In [9]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-arxiv')
# The PEGASUS Model with a language modeling head. Can be used for summarization. 
# This model inherits from PreTrainedModel. 

tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-arxiv')

# ARTICLE_TO_SUMMARIZE = (
# "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
# "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
# "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
# )


In [13]:
## Sample with no fine-tunning 

sample_input = df.body[0]
inputs = tokenizer([sample_input], max_length=1024, return_tensors='pt')

#'max_length': Pad to a maximum length specified with the argument max_length 
# or to the maximum acceptable input length for the model if that argument is not provided.

# Generate Summary
summary_ids = model.generate(inputs['input_ids'])
# print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])

sample_output = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
print(sample_output)

['we present a method of extracting parts of objects from wholes (e.g. " carometers from single word " ) given some entity that has recognizable parts of other entity that depends on rank-orders of other words that may rank the entity in question . <n> this paper we use more "part-of" terminology that produces with 55% accuracy for top 50 words ranked by the system accuracy given a very large corpus . <n> we use the majority judgment of five human subjects to decide which proposed parts are correct and programed by an enduser and added to its output by taking two words that are in a part-whole relation by which to find existing patterns that are in a part-whole relation , or used as a part of rough semantic lexicon .']


In [14]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
scores = scorer.score(str(sample_input),
                      str(sample_output))
print(scores)

{'rouge1': Score(precision=0.9692307692307692, recall=0.04542177361211247, fmeasure=0.08677685950413223), 'rougeL': Score(precision=0.8384615384615385, recall=0.039293439077144915, fmeasure=0.07506887052341599)}


## Fine-tune with custom data

In [15]:
import pandas as pd
# in_df = pd.read_csv('/content/drive/My Drive/summaries_sample.csv')

df.rename(columns={"body": "allTextReprocess"}, inplace = True)
df.rename(columns={"abstract": "summaries"}, inplace = True)

in_df = df
in_df

Unnamed: 0,summaries,allTextReprocess,citations,filepath
0,We present a method for extracting parts of ob...,We present a method of extracting parts of obj...,[Berland and Charniak (1999) use Hearst style ...,nlp_data/scisummnet_release1.1__20190413/top10...
1,We describe a series of five statistical model...,We describe a series of five statistical model...,[The program takes the output of char_align (C...,nlp_data/scisummnet_release1.1__20190413/top10...
2,Previous work has shown that Chinese word segm...,Word segmentation is considered an important f...,[Chinese word segmentation is done by the Stan...,nlp_data/scisummnet_release1.1__20190413/top10...
3,We examine the viability of building large pol...,Polarity lexicons are large lists of phrases t...,[Recent work in this area includes Velikovich ...,nlp_data/scisummnet_release1.1__20190413/top10...
4,Extracting semantic relationships between enti...,Extraction of semantic relationships between e...,[They use two kinds of features: syntactic one...,nlp_data/scisummnet_release1.1__20190413/top10...
...,...,...,...,...
1004,"In statistical machine translation, correspond...","In statistical machine translation, correspond...","[In addition, Niessen and Ney (2004) decompose...",nlp_data/scisummnet_release1.1__20190413/top10...
1005,We have developed a new program called alignin...,Aligning parallel texts has recently received ...,[There have been quite a number of recent pape...,nlp_data/scisummnet_release1.1__20190413/top10...
1006,We present an approach to pronoun resolution b...,Pronoun resolution is a difficult but vital pa...,"[, We follow the closed track setting where sy...",nlp_data/scisummnet_release1.1__20190413/top10...
1007,We use logical inference techniques for recogn...,Recognising textual entailment (RTE) is the ta...,"[However, this method does not work for realwo...",nlp_data/scisummnet_release1.1__20190413/top10...


In [16]:
# import pandas as pd

# Train Test Split
train_pct = 0.6
test_pct = 0.2

in_df = in_df.sample(len(in_df), random_state=20)
train_sub = int(len(in_df) * train_pct)
test_sub = int(len(in_df) * test_pct) + train_sub

train_df = in_df[0:train_sub]
test_df = in_df[train_sub:test_sub]
val_df = in_df[test_sub:]

train_texts = list(train_df['allTextReprocess'])
test_texts = list(test_df['allTextReprocess'])
val_texts = list(val_df['allTextReprocess'])

train_decode = list(train_df['summaries'])
test_decode = list(test_df['summaries'])
val_decode = list(val_df['summaries'])

In [17]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [18]:
train_decodings = tokenizer(train_decode, truncation=True, padding=True)
val_decodings = tokenizer(val_decode, truncation=True, padding=True)
test_decodings = tokenizer(test_decode, truncation=True, padding=True)

In [19]:
for key, val in test_decodings.items():
    print(key)
    

input_ids
attention_mask


In [20]:
import torch

class ourDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, decodings):
        self.encodings = encodings
        self.decodings = decodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.decodings['input_ids'][idx])  # torch.tensor(self.labels[idx])
#         print(item)
        return item

    def __len__(self):
        return len(self.encodings)

train_dataset = ourDataset(train_encodings, train_decodings)
val_dataset = ourDataset(val_encodings, val_decodings)
test_dataset = ourDataset(test_encodings, test_decodings)

In [21]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

trainer = Trainer(
    model=model,                     # the instantiated 🤗 Transformers model to be trained
    args=training_args,              # training arguments, defined above
    train_dataset=train_dataset,     # training dataset
    eval_dataset=val_dataset         # evaluation dataset
)

trainer.train()

Step,Training Loss


TrainOutput(global_step=3, training_loss=11.685738881429037, metrics={'train_runtime': 673.1462, 'train_samples_per_second': 0.004, 'total_flos': 21041862672384, 'epoch': 3.0})

In [22]:
#trainer.evaluate()

{'eval_loss': 12.366962432861328,
 'eval_runtime': 39.42,
 'eval_samples_per_second': 0.051,
 'epoch': 3.0}

In [51]:
trainer.save_model('fine-tuned')

In [52]:
model_fine_tuned = PegasusForConditionalGeneration.from_pretrained('fine-tuned')

In [54]:
## Sample with fine-tunning 

sample_input = df.allTextReprocess[0]
inputs = tokenizer([sample_input], max_length=1024, return_tensors='pt')

#'max_length': Pad to a maximum length specified with the argument max_length 
# or to the maximum acceptable input length for the model if that argument is not provided.

# Generate Summary
summary_ids = model_fine_tuned.generate(inputs['input_ids'])
# print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])

sample_output = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
print(sample_output)

['we present a method of extracting parts of objects from wholes (e.g. " carometers from single word " ) given some entity that has recognizable parts of other entity that depends on rank-orders of other words that may rank the entity in question . <n> this paper we use more "part-of" terminology that produces with 55% accuracy for top 50 words ranked by the system accuracy given a very large corpus . <n> we use the majority judgment of five human subjects to decide which proposed parts are correct and programed by an enduser and added to its output by taking two words that are in a part-whole relation by which to find existing patterns that are in a part-whole relation , or used as a part of rough semantic lexicon .']


In [56]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
scores_fine_tuned = scorer.score(str(sample_input),
                      str(sample_output))
print(scores_fine_tuned)
print(scores)

{'rouge1': Score(precision=0.9692307692307692, recall=0.04542177361211247, fmeasure=0.08677685950413223), 'rougeL': Score(precision=0.8384615384615385, recall=0.039293439077144915, fmeasure=0.07506887052341599)}
{'rouge1': Score(precision=0.9692307692307692, recall=0.04542177361211247, fmeasure=0.08677685950413223), 'rougeL': Score(precision=0.8384615384615385, recall=0.039293439077144915, fmeasure=0.07506887052341599)}


In [45]:
# !python3 -c 'import tensorflow as tf; print(tf.__version__)' 

2.4.0
