In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
# Install tdqm which is used to see the progress of a cell
!pip install tdqm
from tqdm import tqdm

Collecting tdqm
  Downloading tdqm-0.0.1.tar.gz (1.4 kB)
Building wheels for collected packages: tdqm
  Building wheel for tdqm (setup.py) ... [?25l[?25hdone
  Created wheel for tdqm: filename=tdqm-0.0.1-py3-none-any.whl size=1323 sha256=5ee719d6eaf5e17b956a3d8cd46b49957388b9fcaf7686ab4b3271845684043c
  Stored in directory: /root/.cache/pip/wheels/c6/f0/d9/9fa5ff78c0f9d5a0a427bbbb4893c283520ddfccb885ea2205
Successfully built tdqm
Installing collected packages: tdqm
Successfully installed tdqm-0.0.1


### Summarising contexts in CORD-19 Dump

**Why are we doing this?**
- By only including summaries of passages we can make the knowledge base more concise. By doing this the DPR may have an easier time finding the answers it is after.
- The over-arching idea is to improve RAG performance by preprocessing the knowledge base so that the DPR has an easier time.

**Limitations:**
- Shamane mentioned when we use a summariser on domain knowledge we can lose information when jargon is used, since the summariser has not been trainined on the domain before. We could fine-tune the summariser on the domain first but that would be computationally expensive.

In [None]:
# Download Covid data full
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1ZeOqiN4duXO0IO_TMQHpiAar3AJUhziR' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1ZeOqiN4duXO0IO_TMQHpiAar3AJUhziR" -O covid_dump.csv && rm -rf /tmp/cookies.txt

--2021-08-18 08:28:16--  https://docs.google.com/uc?export=download&confirm=&id=1ZeOqiN4duXO0IO_TMQHpiAar3AJUhziR
Resolving docs.google.com (docs.google.com)... 108.177.121.102, 108.177.121.138, 108.177.121.113, ...
Connecting to docs.google.com (docs.google.com)|108.177.121.102|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-0c-c0-docs.googleusercontent.com/docs/securesc/55b635bh751uk7gjql36fn9psj5algjo/i7udk3rps5vjvmutqfff82ip4mo30m81/1629275250000/01035779431551219368/13733675730190601309Z/1ZeOqiN4duXO0IO_TMQHpiAar3AJUhziR?e=download [following]
--2021-08-18 08:28:20--  https://doc-0c-c0-docs.googleusercontent.com/docs/securesc/55b635bh751uk7gjql36fn9psj5algjo/i7udk3rps5vjvmutqfff82ip4mo30m81/1629275250000/01035779431551219368/13733675730190601309Z/1ZeOqiN4duXO0IO_TMQHpiAar3AJUhziR?e=download
Resolving doc-0c-c0-docs.googleusercontent.com (doc-0c-c0-docs.googleusercontent.com)... 142.250.1.132, 2607:f8b0:4001:c24::84
Connecting 

In [None]:
# Read in the full covid dataset
covid_dump = pd.read_csv("covid_dump.csv",sep='\t',header=0, names=['title','context'])

In [None]:
print(len(covid_dump.index))
covid_dump.head()

33929


Unnamed: 0,title,context
0,Clinical features of culture-proven Mycoplasma...,KAUH is a tertiary care teaching hospital with...
1,Clinical features of culture-proven Mycoplasma...,"During the study period, respiratory specimens..."
2,Clinical features of culture-proven Mycoplasma...,M. pneumoniae was cultured using the classic M...
3,Clinical features of culture-proven Mycoplasma...,M. pneumoniae isolates were considered communi...
4,Clinical features of culture-proven Mycoplasma...,Pneumonia was diagnosed based on clinical symp...


In [None]:
!nvidia-smi

In [None]:
# Installing transformers and selecting summarising models
!pip install transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained('lrakotoson/scitldr-catts-xsum-ao')
model = AutoModelForSeq2SeqLM.from_pretrained('lrakotoson/scitldr-catts-xsum-ao')
device = "cuda:0"
model = model.to(device)

In [None]:
def createSummaries(df, n = 10000, chunkSize = 10, use_pipeline = True):
    import math
    chunkSize = chunkSize
    numberChunks = math.ceil(n / chunkSize) 
    summary = []
    if use_pipeline:
        from transformers import pipeline 
        summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=0) # Device = 0 allows us to use the GPU
        for i in tqdm(range(numberChunks), position = 0, leave = True):
            summary.extend(summarizer(df.head(n = n).iloc[i*chunkSize:(i+1)*chunkSize]['context'].to_list(), truncation = True))
        return summary
    else: 
        for i in tqdm(range(n), position = 0, leave = True):
            inputs = tokenizer.encode("summarize: " + df.iloc[i]['context'], return_tensors = "pt", max_length = 512, truncation = True).to(0)
            outputs = model.generate(inputs, length_penalty=2.0, num_beams=4, early_stopping=True)
            summary.append(tokenizer.decode(outputs[0]))
        return summary

In [None]:
subset = covid_dump.iloc[30000:,:]

In [None]:
from google.colab import files
summaries = createSummaries(subset, 3929)

In [None]:
summaries = [x['summary_text'] for x in summaries]
subset['summary'] = summaries
subset.to_csv('subset.csv')
files.download('subset.csv')

### Summarising only Abstracts in CORD-19 Dump

In [None]:
# Download Covid data full
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1zjw7U1bufzIU1j8HaW7NvkGNn9myYiDb' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1zjw7U1bufzIU1j8HaW7NvkGNn9myYiDb" -O covid_data_full.csv && rm -rf /tmp/cookies.txt

In [None]:
cord_19 = pd.read_csv("covid_data_full.csv")

In [None]:
print(len(cord_19.index))
cord_19.head()

170883


Unnamed: 0,_id,title,abstract,text
0,a06cef99d8a6ba4a4757f0d707745cf9f482b1ea,Combining the use of Nuss procedure and rib fi...,Background: Severe flail chest is a life-threa...,"Flail chest, caused by multiple consecutive ri..."
1,01dd44604e936f7480e56971fed3979db3b910b5,A topology-based network tree for the predicti...,The ability to predict protein-protein interac...,P rotein-protein interactions (PPIs) are cruci...
2,2d658bbdcbf5903f59952843897a5b43d7eb96c9,ARTICLE IN PRESS +Model,,Anxiété ; Virus SARS-Cov2 ; COVID19 ; Internes...
3,7afeb12a48bbbe2aa4c94f3d27e35042c60607c1,Accepted Article Clinical and demographic char...,,Three leading factors are reported to strongly...
4,d3934f5851468e4e79eda707821419e763158e1a,Preventing SARS-CoV-2 In-Hospital Infections i...,,Clustering of a severe acute respiratory distr...


In [None]:
# Dropping all rows with no Abstract
cord_19 = cord_19.dropna(axis=0, subset=['abstract']).loc[:,['_id','title','abstract']]
print(len(cord_19.index))
cord_19.columns

In [None]:
cord_19_seg = {'_id':[], 'title':[], 'abstract':[]}

# Breaking abstracts down to 400 word segments
for index, row in cord_19.iterrows():
  word_ct = len(row.abstract.split())
  if word_ct > 400:
    chunks = [row.abstract.split()[i:i+400] for i in range(0, word_ct, 400)] # Separating abstract
    for i in chunks:
      cord_19_seg['_id'].append(row._id)
      cord_19_seg['title'].append(row.title)
      cord_19_seg['abstract'].append(' '.join(i))
  else:
      cord_19_seg['_id'].append(row._id)
      cord_19_seg['title'].append(row.title)
      cord_19_seg['abstract'].append(row.abstract)

cord_19_seg = pd.DataFrame(cord_19_seg)

In [None]:
# Downloading the processed cord_19 data
from google.colab import files
cord_19_seg.to_csv('cord_19_segmented.csv')
files.download('cord_19_segmented.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
# Download full covid data processed
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1gsgEO76jblRWfjB8sgfUmFWmhr66hmI5' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1gsgEO76jblRWfjB8sgfUmFWmhr66hmI5" -O cord_19_segmented.csv && rm -rf /tmp/cookies.txt

--2021-08-29 23:35:51--  https://docs.google.com/uc?export=download&confirm=pUkN&id=1gsgEO76jblRWfjB8sgfUmFWmhr66hmI5
Resolving docs.google.com (docs.google.com)... 209.85.146.113, 209.85.146.138, 209.85.146.101, ...
Connecting to docs.google.com (docs.google.com)|209.85.146.113|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-04-b8-docs.googleusercontent.com/docs/securesc/19kpiogbnlptu40g2b97hmd3c1nm1rjq/b2mgtvq1rsa2jptefpiltrn6j8kc29lo/1630280100000/04844237025276582440/01874928848046080583Z/1gsgEO76jblRWfjB8sgfUmFWmhr66hmI5?e=download [following]
--2021-08-29 23:35:52--  https://doc-04-b8-docs.googleusercontent.com/docs/securesc/19kpiogbnlptu40g2b97hmd3c1nm1rjq/b2mgtvq1rsa2jptefpiltrn6j8kc29lo/1630280100000/04844237025276582440/01874928848046080583Z/1gsgEO76jblRWfjB8sgfUmFWmhr66hmI5?e=download
Resolving doc-04-b8-docs.googleusercontent.com (doc-04-b8-docs.googleusercontent.com)... 173.194.195.132, 2607:f8b0:4001:c11::84
Connectin

In [None]:
# Installing transformers and selecting summarising models
!pip install transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained('lrakotoson/scitldr-catts-xsum-ao')
model = AutoModelForSeq2SeqLM.from_pretrained('lrakotoson/scitldr-catts-xsum-ao')
device = "cuda:0"
model = model.to(device)

In [4]:
def createSummaries(df, n = 10000, chunkSize = 10, use_pipeline = True):
    import math
    chunkSize = chunkSize
    numberChunks = math.ceil(n / chunkSize) 
    summary = []
    if use_pipeline:
        from transformers import pipeline 
        summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=0) # Device = 0 allows us to use the GPU
        for i in tqdm(range(numberChunks), position = 0, leave = True):
            summary.extend(summarizer(df.head(n = n).iloc[i*chunkSize:(i+1)*chunkSize]['abstract'].to_list(), truncation = True))
        return summary
    else: 
        for i in tqdm(range(n), position = 0, leave = True):
            inputs = tokenizer.encode("summarize: " + df.iloc[i]['abstract'], return_tensors = "pt", max_length = 512, truncation = True).to(0)
            outputs = model.generate(inputs, length_penalty=2.0, num_beams=4, early_stopping=True)
            summary.append(tokenizer.decode(outputs[0]))
        return summary

In [11]:
cord_19_seg = pd.read_csv('cord_19_segmented.csv')
cord_19_seg.columns

Index(['Unnamed: 0', '_id', 'title', 'abstract'], dtype='object')

In [12]:
cord_19_seg.head()

Unnamed: 0.1,Unnamed: 0,_id,title,abstract
0,0,a06cef99d8a6ba4a4757f0d707745cf9f482b1ea,Combining the use of Nuss procedure and rib fi...,Background: Severe flail chest is a life-threa...
1,1,01dd44604e936f7480e56971fed3979db3b910b5,A topology-based network tree for the predicti...,The ability to predict protein-protein interac...
2,2,04a74daf787afbfd655eb007a0ceb7c786be3f4f,Point-of-care lung ultrasound for the assessme...,In the coronavirus disease-2019 (COVID-19) era...
3,3,d92e305bcea5be1e84643b9998c81c70ba72b6b4,The biological potential of the raccoon dog (N...,Invasive wildlife species have the potential t...
4,4,142a615ffb970d12beaa9597bff2b9c49da4bb96,Risk factors for severe acute lower respirator...,Aim To identify the risk factors in children u...


In [None]:
subset = cord_19_seg.iloc[50000:100000,:]

from google.colab import files
summaries = createSummaries(subset, 50000)

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
 73%|███████▎  | 3647/5000 [6:13:23<2:03:10,  5.46s/it]

In [None]:
#summaries = [x['summary_text'] for x in summaries]
subset['summary'] = summaries
subset.to_csv('subset.csv')
files.download('subset.csv')