# Summarisation of Literature Papers

### Refreshing the apt Package Index

In [1]:
%%time
# Refresh the apt package lists (this does not install or upgrade anything by itself).
!apt update

Get:1 http://packages.cloud.google.com/apt gcsfuse-focal InRelease [5002 B]
Get:2 https://packages.cloud.google.com/apt cloud-sdk InRelease [6361 B]       [0m
Hit:3 http://archive.ubuntu.com/ubuntu focal InRelease                         [0m[33m
Get:4 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]      
Get:5 http://packages.cloud.google.com/apt gcsfuse-focal/main amd64 Packages [2217 B]
Get:6 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]        [0m[33m
Get:7 https://packages.cloud.google.com/apt cloud-sdk/main amd64 Packages [430 kB]
Get:8 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]33m   [0m[33m[33m[33m[33m[33m
Get:9 http://security.ubuntu.com/ubuntu focal-security/main amd64 Packages [2644 kB]
Get:10 http://archive.ubuntu.com/ubuntu focal-updates/universe amd64 Packages [1334 kB]
Get:11 http://security.ubuntu.com/ubuntu focal-security/restricted amd64 Packages [2170 kB]
Get:12 http://archive.ubuntu.com/ubuntu

### Installing Libraries

In [2]:
# pdf parsing
!pip3 install tika

# to .docx
!pip3 install python-docx

# to pdf
!pip3 install aspose-words

# text embeddings

#computes dense vector representations for sentences, paragraphs, and images
!pip3 install -U sentence-transformers


!pip3 install rouge-score
!pip3 install sacremoses

# arxiv
!pip3 install arxiv

Collecting tika
  Downloading tika-2.6.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: tika
  Building wheel for tika (setup.py) ... [?25ldone
[?25h  Created wheel for tika: filename=tika-2.6.0-py3-none-any.whl size=32642 sha256=dee513b761c5a27f433f7796f1d31da3fa5e00a5daf26a23b5ff689637d08590
  Stored in directory: /root/.cache/pip/wheels/67/55/b5/d8bfdcdd87bb5bfb706095d432877be717670524615913e68a
Successfully built tika
Installing collected packages: tika
Successfully installed tika-2.6.0
[0mCollecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: python-docx
  Building wheel for python-docx (setup.py) ... [?25ldone
[?25h  Created wheel for python-docx: filename=python_docx-0.8.11-py3

### Import Libraries

In [3]:
# general
from scipy import spatial
from random import randint
import itertools
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
import spacy, re
import zipfile
import string

import time
import os
import io

# read pdf
from pathlib import Path
from tika import parser

# arxiv
import arxiv

# text preprocessing
from textblob import TextBlob
from spacy.symbols import nsubj, nsubjpass, VERB
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

import nltk
from collections import Counter
from nltk.corpus import stopwords

# Download each NLTK resource exactly once, before anything reads it
# ('stopwords' used to be requested twice).
for _nltk_resource in (
    'stopwords',
    'punkt',
    'wordnet',
    'averaged_perceptron_tagger',      #POS Tagger
):
    nltk.download(_nltk_resource)

stop_words = set(stopwords.words('english'))

# text embeddings
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer

# to docx
import docx
from docx import Document
from docx.enum.text import WD_COLOR_INDEX
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
from copy import deepcopy

# to pdf
import aspose.words as aw

# progress bars for pandas (registers .progress_apply); this does not parallelise anything
tqdm.pandas()

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
# cosine similarity between the TF-IDF vectors of two texts
def tfidf_sim(x,y):
  """Return the cosine similarity between the TF-IDF vectors of two texts.

  A fresh TfidfVectorizer is fitted on just the two inputs, so the weights
  depend only on this pair of texts.
  """
  tfidf_matrix = TfidfVectorizer().fit_transform([x, y])
  pairwise = cosine_similarity(tfidf_matrix.toarray())
  # pairwise is 2x2; the off-diagonal entry compares x with y
  return pairwise[0][1]

In [5]:
# compute the ROUGE-2 score
# metric to evaluate the quality of generated summary 
def get_rouge(x,y):
  """Return the ROUGE-2 F-measure between two texts, rounded to 2 decimals.

  x is passed to the scorer as the target and y as the prediction;
  stemming is enabled.
  """
  scorer = rouge_scorer.RougeScorer(['rouge2'], use_stemmer=True)
  scores = scorer.score(x,y)
  # Read the field directly instead of parsing it back out of str(scores),
  # which silently depended on the exact repr of the result object.
  return round(float(scores['rouge2'].fmeasure), 2)

In [6]:
# create a Word document with the specified title and add the summarized text to the document
def write_docx(title, text):
  """Save `text` as one justified paragraph in a new Word file at path `title`.

  The document's 'Normal' style is set to Times New Roman, 8 pt, and the
  paragraph uses that style.
  """
  doc_out = Document()
  normal_style = doc_out.styles['Normal']
  normal_style.font.name = 'Times New Roman'
  normal_style.font.size = Pt(8)

  # the whole text goes into a single paragraph
  body = doc_out.add_paragraph(text)
  body.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
  body.style = normal_style
  doc_out.save(title)

In [7]:
# keep only sentences in which a nominal subject (active or passive) is attached to a verb
def filter_triplet(final_text):
    """Return the sentences of `final_text` that pass `syntax_full`, space-joined.

    Duplicate sentences are removed first (via `get_unique_text`); the text is
    then split into sentences by the spaCy pipeline `spacy_nlp`.
    """
    deduplicated = get_unique_text(final_text)
    kept = [sent.text for sent in spacy_nlp(deduplicated).sents if syntax_full(sent)]
    return ' '.join(kept)

In [8]:
# remove duplicate sentences
def get_unique_text(document):
    """Return `document` with repeated sentences removed, keeping first occurrences.

    Sentences are split with TextBlob and compared by their raw text; the
    surviving sentences are joined with single spaces in their original order.
    """
    sentences = (sent.raw for sent in TextBlob(document).sentences)
    # dict keys keep first-seen order, so this de-duplicates with hash lookups
    # instead of rescanning a growing list for every sentence.
    return ' '.join(dict.fromkeys(sentences))

In [9]:
# check if the input sentence is valid
def syntax_full(spacy_sentence):
    """Return True if some token is a nominal subject whose head is a verb.

    A token qualifies when its dependency label is nsubj or nsubjpass and the
    part of speech of its head is VERB. An empty sentence yields False.
    """
    return any(
        token.dep in (nsubj, nsubjpass) and token.head.pos == VERB
        for token in spacy_sentence
    )

### Language Model

In [10]:
# small English spaCy pipeline; used to split text into sentences and to read
# dependency labels / part-of-speech tags
spacy_nlp = spacy.load('en_core_web_sm')
# raise spaCy's limit on input length (in characters) so long texts are accepted
spacy_nlp.max_length = 50_000_000

In [11]:
# substrings that mark a passage as referring to a figure
matches = [
    "Fig ",
    "Fig.",
    "Figure ",
    "fig ",
    "figure ",
]

### Insert arXiv ID

In [12]:
# strip stray whitespace so a padded or pasted ID does not end up inside the API query and the PDF URL
arxiv_id = input("Enter arXiv ID of the paper: ").strip()

Enter arXiv ID of the paper:  2102.12128


In [13]:
# URL of the paper's PDF on the arXiv export mirror
arxiv_url = f'https://export.arxiv.org/pdf/{arxiv_id}.pdf'

### Retrieve the Abstract

In [14]:
# retrieve the abstract for a paper on arXiv using its ID
search = arxiv.Search(id_list=[arxiv_id])
# next() with a default: an empty result set becomes a readable error
# instead of a bare StopIteration.
paper = next(search.results(), None)
if paper is None:
    raise ValueError(f"No arXiv paper found for ID {arxiv_id!r}")
abstract = paper.summary

In [15]:
# show the retrieved abstract as plain text
print(abstract)

Large-scale question-answer (QA) pairs are critical for advancing research
areas like machine reading comprehension and question answering. To construct
QA pairs from documents requires determining how to ask a question and what is
the corresponding answer. Existing methods for QA pair generation usually
follow a pipeline approach. Namely, they first choose the most likely candidate
answer span and then generate the answer-specific question. This pipeline
approach, however, is undesired in mining the most appropriate QA pairs from
documents since it ignores the connection between question generation and
answer extraction, which may lead to incompatible QA pair generation, i.e., the
selected answer span is inappropriate for question generation. However, for
human annotators, we take the whole QA pair into account and consider the
compatibility between question and answer. Inspired by such motivation, instead
of the conventional pipeline approach, we propose a model named OneStop
generat

### Extract pdf content

In [16]:
%%time
pdf_file = parser.from_file(arxiv_url) # parse the contents of the PDF
corpus = pdf_file["content"].replace('.\n\n', '.###').replace('?\n\n', '.###').replace('!\n\n', '.###')

2023-04-27 06:15:03,386 [MainThread  ] [INFO ]  Retrieving https://export.arxiv.org/pdf/2102.12128.pdf to /tmp/pdf-2102.12128.pdf.
2023-04-27 06:15:04,211 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar to /tmp/tika-server.jar.
2023-04-27 06:15:04,659 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar.md5 to /tmp/tika-server.jar.md5.
2023-04-27 06:15:05,070 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


CPU times: user 296 ms, sys: 260 ms, total: 556 ms
Wall time: 53.3 s


### Filter by triplets

In [17]:
%%time
# split the corpus into a list of individual sentences
# the filter_triplet function is called to filter out any sentences that do not contain subject-verb-object triplet
corpus = ' '.join([filter_triplet(i.strip()) for i in tqdm(sent_tokenize(corpus))])

100%|██████████| 384/384 [00:04<00:00, 93.81it/s] 

CPU times: user 4.12 s, sys: 804 µs, total: 4.12 s
Wall time: 4.13 s





### Keep only passages without figure references

In [18]:
# A figure marker only counts when it starts a word: plain substring matching
# also fired on e.g. "config " (which contains "fig ").
# With no markers configured nothing is skipped.
figure_pattern = (
    re.compile('|'.join(r'(?<!\w)' + re.escape(marker) for marker in matches))
    if matches else None
)

core_passages = []

for j in tqdm(corpus.split('.###')):
  # skip passages that refer to a figure
  if figure_pattern is not None and figure_pattern.search(j):
    continue
  passage = j.replace('\n', ' ')
  # keep the passage only if it contains at least three sentences
  if len(sent_tokenize(passage)) >= 3:
      core_passages.append(passage)

# one row per kept passage; the abstract is repeated in every row
df_score = pd.DataFrame({'Passage': core_passages})
df_score['Abstract'] = str(abstract)

100%|██████████| 80/80 [00:00<00:00, 6250.59it/s]


### Order by Similarity

In [19]:
# sentence-transformers model used to embed the passages and the abstract
embedding_model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(embedding_model_name)
model

Downloading (…)5fedf/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)2cb455fedf/README.md:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading (…)b455fedf/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)edf/data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)5fedf/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading (…)fedf/train_script.py:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

Downloading (…)2cb455fedf/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)455fedf/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [20]:
# encode every passage as a vector
df_score['Passage_Embedding'] = df_score['Passage'].apply(lambda x: model.encode(x))

# The abstract column holds the same text in every row, so encode each distinct
# text once and reuse the vector instead of re-running the model for every row.
# (Rows therefore share one array object; it is only read afterwards.)
abstract_vectors = {text: model.encode(text) for text in df_score['Abstract'].unique()}
df_score['Abstract_Embedding'] = df_score['Abstract'].map(lambda text: abstract_vectors[text])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [21]:
# score every passage against the abstract with three measures:
# ROUGE-2 F-measure, TF-IDF cosine similarity and embedding cosine similarity
def _embedding_similarity(row):
  """Cosine similarity (1 - cosine distance) between a row's two embeddings."""
  return 1 - spatial.distance.cosine(row['Abstract_Embedding'], row['Passage_Embedding'])

df_score['Score_Rouge'] = df_score['Passage'].apply(get_rouge, args=(abstract,))
df_score['Score_TFIDF'] = df_score['Passage'].apply(tfidf_sim, args=(abstract,))
df_score['Score_Cos'] = df_score.apply(_embedding_similarity, axis=1)

In [22]:
# show the ROUGE-2 score of every passage against the abstract
df_score['Score_Rouge']

0     0.85
1     0.20
2     0.15
3     0.14
4     0.15
5     0.07
6     0.07
7     0.06
8     0.05
9     0.06
10    0.05
11    0.05
12    0.08
13    0.06
14    0.05
15    0.11
16    0.05
17    0.11
18    0.01
19    0.08
20    0.03
21    0.07
22    0.14
23    0.00
24    0.00
Name: Score_Rouge, dtype: float64

In [23]:
# show the TF-IDF cosine similarity of every passage to the abstract
df_score['Score_TFIDF']

0     0.952276
1     0.756894
2     0.494850
3     0.677678
4     0.632871
5     0.529075
6     0.468140
7     0.441199
8     0.420067
9     0.466455
10    0.422970
11    0.450905
12    0.548434
13    0.429409
14    0.293894
15    0.562514
16    0.303335
17    0.559393
18    0.370932
19    0.505428
20    0.383041
21    0.340948
22    0.532620
23    0.045688
24    0.055858
Name: Score_TFIDF, dtype: float64

In [24]:
# show the embedding cosine similarity of every passage to the abstract
df_score['Score_Cos']

0     0.894878
1     0.760369
2     0.659215
3     0.725744
4     0.683959
5     0.650593
6     0.463585
7     0.289481
8     0.264520
9     0.445725
10    0.277232
11    0.286363
12    0.677870
13    0.487526
14    0.391730
15    0.431717
16    0.567871
17    0.424003
18    0.222139
19    0.477612
20    0.428040
21    0.530358
22    0.685693
23    0.009157
24    0.007378
Name: Score_Cos, dtype: float64

In [25]:
# the final score of a passage is the plain average of its three similarity scores
score_total = df_score['Score_Rouge'] + df_score['Score_Cos'] + df_score['Score_TFIDF']
df_score['Score'] = score_total / 3

In [26]:
# show the combined score (average of the three similarity scores) of every passage
df_score['Score']

0     0.899051
1     0.572421
2     0.434688
3     0.514474
4     0.488943
5     0.416556
6     0.333908
7     0.263560
8     0.244862
9     0.324060
10    0.250067
11    0.262423
12    0.435435
13    0.325645
14    0.245208
15    0.368077
16    0.307069
17    0.364465
18    0.201024
19    0.354347
20    0.280360
21    0.313768
22    0.452771
23    0.018282
24    0.021079
Name: Score, dtype: float64

In [27]:
# rank passages by combined score (best first) and keep those with a positive score
ranked = df_score.sort_values('Score', ascending=False)
ranked = ranked[ranked['Score'] > 0]
# First run: keep the pre-sort row labels in an 'index' column, as before.
# Re-runs: that column already exists, so drop the positional index instead of
# stacking a 'level_0' column (and raising ValueError on the run after that).
df_score = ranked.reset_index(drop='index' in ranked.columns)

### Extract summary

In [28]:
length = 7   # determines length of summary
word_budget = 200*length
core_passages = []

# walk the passages in ranked order and keep each one for which the text
# collected so far plus that passage stays under the word budget
for candidate in tqdm(list(df_score['Passage'])):
  draft = ' '.join(core_passages) + ' ' + str(candidate)
  if len(word_tokenize(draft)) < word_budget:
    core_passages.append(candidate)

# kept passages become paragraphs separated by a blank line
summary = ('.\n\n'.join(core_passages) + '.').strip()

100%|██████████| 25/25 [00:00<00:00, 101.36it/s]


### Write summaries to docx and pdf

In [29]:
# save the summary as a Word file, then convert that file to PDF with Aspose.Words
docx_path = 'summary.docx'
pdf_path = 'summary.pdf'
write_docx(docx_path, summary)
doc = aw.Document(docx_path)
doc.save(pdf_path)

<aspose.words.saving.SaveOutputParameters object at 0x70d853bbbcf0>