# Process CORD-19 Data

In [None]:
!python --version
!pip list -v 

The code below downloads the CORD-19 dataset and extracts the final files into `cord_19_embeddings` and `document_parses`. Note that it deletes each compressed archive as soon as it has been extracted; otherwise we would run out of storage space.

We want to extract the abstract statements from all documents.


In [None]:
!wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_2021-07-26.tar.gz cord-19.tar.gz

--2021-08-04 00:01:08--  https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_2021-07-26.tar.gz
Resolving ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com (ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com)... 52.218.197.89
Connecting to ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com (ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com)|52.218.197.89|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11255093164 (10G) [binary/octet-stream]
Saving to: ‘cord-19_2021-07-26.tar.gz’


2021-08-04 00:09:28 (21.5 MB/s) - ‘cord-19_2021-07-26.tar.gz’ saved [11255093164/11255093164]

--2021-08-04 00:09:28--  http://cord-19.tar.gz/
Resolving cord-19.tar.gz (cord-19.tar.gz)... failed: Name or service not known.
wget: unable to resolve host address ‘cord-19.tar.gz’
FINISHED --2021-08-04 00:09:28--
Total wall clock time: 8m 20s
Downloaded: 1 files, 10G in 8m 19s (21.5 MB/s)


In [None]:
import tarfile
import os

def _extract_and_delete(archive_path, target_dir):
    """Extract the tar archive at ``archive_path`` into ``target_dir``, then delete the archive.

    The archive is opened in a ``with`` block so its handle is closed even when
    extraction raises. The archive file is removed only after a successful
    extraction, which frees disk space before the next archive is unpacked.
    """
    with tarfile.open(archive_path) as archive:
        archive.extractall(target_dir)
    os.remove(archive_path)


# Outer release archive first; it contains the two inner archives unpacked below.
_extract_and_delete('cord-19_2021-07-26.tar.gz', './cord-19')
_extract_and_delete('./cord-19/2021-07-26/cord_19_embeddings.tar.gz', './cord_19_embeddings')
# Extracted into the working directory, so the parses end up under ./document_parses.
_extract_and_delete('./cord-19/2021-07-26/document_parses.tar.gz', './')

In [None]:
!pip install tdqm
from tqdm import tqdm

Collecting tdqm
  Downloading tdqm-0.0.1.tar.gz (1.4 kB)
Building wheels for collected packages: tdqm
  Building wheel for tdqm (setup.py) ... [?25l[?25hdone
  Created wheel for tdqm: filename=tdqm-0.0.1-py3-none-any.whl size=1322 sha256=92bcd0b528db6b469ca97f36f4d5ef472eee54210b2d5eedbb5f934c41a52bca
  Stored in directory: /root/.cache/pip/wheels/c6/f0/d9/9fa5ff78c0f9d5a0a427bbbb4893c283520ddfccb885ea2205
Successfully built tdqm
Installing collected packages: tdqm
Successfully installed tdqm-0.0.1


In [None]:
import csv
import os
import json

directory = r'./document_parses/pdf_json'

# The CSV is opened in a `with` block so buffered rows are flushed and the handle is
# closed when the cell finishes (or fails).
with open('abstract.csv', 'wt', newline = '') as abstract:
    abstract_writer = csv.writer(abstract, delimiter = ',')
    abstract_writer.writerow(['paper_id', 'abstract'])

    for filename in tqdm(os.listdir(directory)):
        # Close each parsed-paper file as soon as it has been loaded.
        with open(os.path.join(directory, filename)) as f:
            doc = json.load(f)
        print(doc.keys())
        print(doc['metadata'])
        print(doc['body_text'])
        # One CSV row per abstract paragraph of the paper.
        for abst in doc['abstract']:
            abstract_writer.writerow([doc['paper_id'], abst['text']])
        # NOTE(review): the loop stops after the first file, so this cell only previews
        # the JSON schema. Remove the `break` (and the prints) to export every abstract.
        break
  # print('*****************************')


  0%|          | 0/240590 [00:00<?, ?it/s]

dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])
{'title': 'Could COVID-19 represent a negative prognostic factor in patients with stroke?', 'authors': [{'first': 'Antonio', 'middle': [], 'last': 'Siniscalchi', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'Annunziata Hospital of Cosenza', 'location': {'settlement': 'Cosenza', 'country': 'Italy'}}, 'email': ''}, {'first': 'Luca', 'middle': [], 'last': 'Gallelli', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'University of Catanzaro', 'location': {'settlement': 'Catanzaro', 'country': 'Italy'}}, 'email': ''}]}
[{'text': 'To the Editor-Coronavirus infectious disease 2019 (COVID-19) is a highly contagious disease that has become a worldwide pandemic. Coronaviruses (CoVs), positive-stranded RNA viruses, are known to cause respiratory or intestinal infections in humans and animals. 1 Coronaviruses are known to affect the cardiovascular system. 2 The 




In [None]:
import csv
import os
import json

# Progress-bar total only; hard-coded so the metadata file is not re-read just to count it.
# It can be recomputed with:
# num_lines = sum(1 for line in open('cord-19/2021-07-26/metadata.csv','r'))
num_lines = 718493

# Both files live in one `with` block so the output CSV is flushed and closed even if a
# row fails. The output handle and the per-row abstract text use different names so the
# handle is never rebound inside the loop.
with open('cord-19/2021-07-26/metadata.csv') as f_in, \
        open('abstract_intro_conc.csv', 'wt', newline = '') as f_out:
    reader = csv.DictReader(f_in)

    abstract_writer = csv.writer(f_out, delimiter = ',')
    abstract_writer.writerow(['cord_uid', 'title', 'abstract', 'introduction', 'conclusion'])

    for row in tqdm(reader, total = num_lines, position = 0, leave = True):
        # access some metadata
        cord_uid = row['cord_uid']
        title = row['title']
        abstract_text = row['abstract']

        # access the full text (if available) for the introduction / conclusion
        introduction = []
        conclusion = []
        if row['pdf_json_files']:
            # NOTE: We might have multiple json files for the paper e.g. if paper is large.
            # The paths are opened relative to the current working directory.
            for json_path in row['pdf_json_files'].split('; '):
                with open(json_path) as f_json:
                    # load the full CORD-19 paper
                    full_text_dict = json.load(f_json)

                # grab introduction and conclusion sections from *some* version of the full text
                for paragraph_dict in full_text_dict['body_text']:
                    section_name = paragraph_dict['section'].lower()

                    # NOTE: Each paragraph carries its own section name, so a multi-paragraph
                    # intro/conclusion contributes one list entry per paragraph.
                    if 'intro' in section_name:
                        introduction.append(paragraph_dict['text'])

                    # Discussion sections are collected together with conclusions.
                    if 'concl' in section_name or 'discu' in section_name:
                        conclusion.append(paragraph_dict['text'])

                # Stop looking at further copies once BOTH sections have been found.
                # NOTE(review): if a copy yields only one of the two, the next copy's
                # paragraphs are appended to the same lists, which may duplicate text
                # across copies - confirm whether that is intended.
                if introduction and conclusion:
                    break

        # One row per paper. Writing one row per (intro paragraph, conclusion paragraph)
        # combination was tried and produced an ~11 GB file, so the paragraphs are joined
        # with a <PARAGRAPH> separator that can be split on later.
        abstract_writer.writerow([cord_uid, title, abstract_text, '<PARAGRAPH>'.join(introduction), '<PARAGRAPH>'.join(conclusion)])

100%|█████████▉| 718492/718493 [10:10<00:00, 1176.02it/s]


In [None]:
# 1.9gb file 
import pandas as pd
data = pd.read_csv('abstract_intro_conc.csv')

In [None]:
# The single 1.9 GB CSV fails to upload to Google Drive, so it is cut into three files.
# part_1 / part_2 are the row positions where the second and third files start:
# the first file gets half of the rows, the remaining half is split in two again.
total_rows = len(data)
part_1 = total_rows // 2
part_2 = part_1 + part_1 // 2

In [None]:
# Write each consecutive row range to its own CSV (index omitted).
for row_range, out_name in (
    (slice(0, part_1), 'abstract_intro_conc_part_1.csv'),
    (slice(part_1, part_2), 'abstract_intro_conc_part_2.csv'),
    (slice(part_2, None), 'abstract_intro_conc_part_3.csv'),
):
    data.iloc[row_range].to_csv(out_name, index = False)

In [None]:
!pip install --upgrade gupload

from google.colab import auth

# Authenticate this Colab session with Google. No PyDrive client is created here;
# the gupload commands below presumably pick up these credentials - TODO confirm.
auth.authenticate_user()

# Upload the three splits; --to is presumably the id of the destination Drive folder.
!gupload --to '1W9R77oTk_DAMxfSjKGOLNj5xFtoC7gLF' abstract_intro_conc_part_1.csv
!gupload --to '1W9R77oTk_DAMxfSjKGOLNj5xFtoC7gLF' abstract_intro_conc_part_2.csv
!gupload --to '1W9R77oTk_DAMxfSjKGOLNj5xFtoC7gLF' abstract_intro_conc_part_3.csv

Uploading file: abstract_intro_conc_part_1.csv
tcmalloc: large alloc 1389166592 bytes == 0x55daa992c000 @  0x7ffb370261e7 0x55da55560a18 0x55da55577e06 0x55da55577ae8 0x55da55580860 0x55da55586f31 0x55da55589fc5 0x55da5552c713 0x55da5552efcc 0x55da55620c0d 0x55da555a30d8 0x55da5559dc35 0x55da5553073a 0x55da5559ed67 0x55da5559e235 0x55da5553073a 0x55da5559f93b 0x55da5559e235 0x55da5553073a 0x55da5559eb0e 0x55da5559e235 0x55da5546fe2c 0x55da555a0318 0x55da5559dc35 0x55da55530fec 0x55da555311f1 0x55da555a0318 0x55da5553065a 0x55da5559ed67 0x55da5559dc35 0x55da55530dd1
tcmalloc: large alloc 2083741696 bytes == 0x55dafc5fc000 @  0x7ffb370261e7 0x55da55560a18 0x55da5552b987 0x55da55641f51 0x55da5552c7ad 0x55da5558aaa2 0x55da55589c43 0x55da5552efcc 0x55da55620c0d 0x55da555a30d8 0x55da5553065a 0x55da5559eb0e 0x55da5559ddcc 0x55da5553073a 0x55da5559eb0e 0x55da5553065a 0x55da5559ed67 0x55da5553065a 0x55da5559ed67 0x55da5559dc35 0x55da5553073a 0x55da5559f93b 0x55da5553065a 0x55da5559eb0e 0x55da55

# Summarization 

We generate synthetic answers using summarization.  
- For each abstract, a summarization model generates a short summary, which is used as the answer.  
- 

In [None]:
# Download processed abstract data
# Each line is a two-step download from docs.google.com:
#   1. the inner wget requests the download page, saving session cookies to /tmp/cookies.txt,
#      and sed pulls the value of the `confirm=` parameter out of the returned page;
#   2. the outer wget repeats the request with that confirm value and the saved cookies,
#      writing the result to the file given by -O.
# The cookie file is deleted after each successful download.
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1PLtGzINFifKS1g2JOA1xCLcIhzkRYuTO' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1PLtGzINFifKS1g2JOA1xCLcIhzkRYuTO" -O abstract_intro_conc_part_1.csv && rm -rf /tmp/cookies.txt
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1RPtgVVrtLWSUQyuhftdauP6geKZxjEBN' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1RPtgVVrtLWSUQyuhftdauP6geKZxjEBN" -O abstract_intro_conc_part_2.csv && rm -rf /tmp/cookies.txt
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=11dpbHv7_foyyQhyBeg9hLOTXMusYtb3j' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=11dpbHv7_foyyQhyBeg9hLOTXMusYtb3j" -O abstract_intro_conc_part_3.csv && rm -rf /tmp/cookies.txt

In [None]:
import pandas as pd
# Load each split and immediately discard every row that has a missing value in any
# column, so only papers with an abstract, introduction and conclusion remain.
abstract_intro_conc_part_1 = pd.read_csv('abstract_intro_conc_part_1.csv').dropna()
abstract_intro_conc_part_2 = pd.read_csv('abstract_intro_conc_part_2.csv').dropna()
abstract_intro_conc_part_3 = pd.read_csv('abstract_intro_conc_part_3.csv').dropna()

In [None]:
!pip install transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained('lrakotoson/scitldr-catts-xsum-ao')
model = AutoModelForSeq2SeqLM.from_pretrained('lrakotoson/scitldr-catts-xsum-ao')
device = "cuda:0"
model = model.to(device)

In [None]:
!pip install tdqm
from tqdm import tqdm

Collecting tdqm
  Downloading tdqm-0.0.1.tar.gz (1.4 kB)
Building wheels for collected packages: tdqm
  Building wheel for tdqm (setup.py) ... [?25l[?25hdone
  Created wheel for tdqm: filename=tdqm-0.0.1-py3-none-any.whl size=1322 sha256=76aab4815a8f79e7fdeb3fabb81201cd0859aa89bd36130d67746953b6495655
  Stored in directory: /root/.cache/pip/wheels/c6/f0/d9/9fa5ff78c0f9d5a0a427bbbb4893c283520ddfccb885ea2205
Successfully built tdqm
Installing collected packages: tdqm
Successfully installed tdqm-0.0.1


In [None]:
!nvidia-smi

Wed Aug  4 23:11:18 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P0    28W /  70W |   2626MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
def createSummaries(df, n = 10000, chunkSize = 10, use_pipeline = True):
    """Summarize the first ``n`` abstracts of ``df`` with the global ``model``/``tokenizer``.

    Args:
        df: DataFrame with an ``'abstract'`` column of strings.
        n: Number of leading rows to summarize. Values larger than ``len(df)`` are
            clamped to ``len(df)``; values <= 0 yield an empty result.
        chunkSize: Number of abstracts handed to the pipeline per call
            (only used when ``use_pipeline`` is True).
        use_pipeline: If True, run a transformers ``summarization`` pipeline on GPU 0
            in chunks; otherwise encode/generate/decode one abstract at a time.

    Returns:
        A list with one entry per summarized abstract, in row order.
        With ``use_pipeline=True`` each entry is whatever the pipeline returns per input
        (callers in this notebook read its ``'summary_text'`` key); with
        ``use_pipeline=False`` each entry is the decoded string.
    """
    import math

    numberChunks_for = lambda rows: math.ceil(rows / chunkSize)

    # Clamp so that asking for more rows than exist neither indexes past the end
    # nor sends empty batches to the summarizer.
    n = max(0, min(n, len(df)))
    numberChunks = numberChunks_for(n)

    # Select the column and the leading rows once instead of once per chunk.
    abstracts = df['abstract'].iloc[:n]

    summary = []
    if use_pipeline:
        from transformers import pipeline
        summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=0) # Device = 0 allows us to use the GPU
        for i in tqdm(range(numberChunks), position = 0, leave = True):
            batch = abstracts.iloc[i*chunkSize:(i+1)*chunkSize].to_list()
            summary.extend(summarizer(batch, truncation = True))
        return summary

    for i in tqdm(range(n), position = 0, leave = True):
        # Inputs longer than 512 tokens are truncated; tensors are moved to GPU 0.
        inputs = tokenizer.encode("summarize: " + abstracts.iloc[i], return_tensors = "pt", max_length = 512, truncation = True).to(0)
        outputs = model.generate(inputs, length_penalty=2.0, num_beams=4, early_stopping=True)
        summary.append(tokenizer.decode(outputs[0]))
    return summary


In [None]:
summaries = createSummaries(abstract_intro_conc_part_1, len(abstract_intro_conc_part_1))

In [None]:
# Each pipeline result is a dict; keep only its 'summary_text' as the answer column.
abstract_intro_conc_part_1['answer'] = [s['summary_text'] for s in summaries]
# The DataFrame index is written as well (no index=False), so it reappears as an
# "Unnamed: 0" column when this file is read back with pd.read_csv.
abstract_intro_conc_part_1.to_csv("abstract_intro_conc_ans_part_1.csv")

In [None]:
summaries = createSummaries(abstract_intro_conc_part_2, len(abstract_intro_conc_part_2))

100%|██████████| 931/931 [38:08<00:00,  2.46s/it]


In [None]:
# Each pipeline result is a dict; keep only its 'summary_text' as the answer column.
abstract_intro_conc_part_2['answer'] = [s['summary_text'] for s in summaries]
# The DataFrame index is written as well (no index=False), so it reappears as an
# "Unnamed: 0" column when this file is read back with pd.read_csv.
abstract_intro_conc_part_2.to_csv("abstract_intro_conc_ans_part_2.csv")

In [None]:
summaries = createSummaries(abstract_intro_conc_part_3, len(abstract_intro_conc_part_3))

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
 64%|██████▍   | 5099/7939 [3:49:31<1:56:22,  2.46s/it]

In [None]:
# Each pipeline result is a dict; keep only its 'summary_text' as the answer column.
abstract_intro_conc_part_3['answer'] = [s['summary_text'] for s in summaries]
# The DataFrame index is written as well (no index=False), so it reappears as an
# "Unnamed: 0" column when this file is read back with pd.read_csv.
abstract_intro_conc_part_3.to_csv("abstract_intro_conc_ans_part_3.csv")

In [None]:
!pip install --upgrade gupload

from google.colab import auth

# Authenticate this Colab session with Google. No PyDrive client is created here;
# the gupload command below presumably picks up these credentials - TODO confirm.
auth.authenticate_user()

# Only part 3 is uploaded by this cell; the part 1 and part 2 uploads are disabled.
# !gupload --to '1W9R77oTk_DAMxfSjKGOLNj5xFtoC7gLF' abstract_intro_conc_ans_part_1.csv
# !gupload --to '1W9R77oTk_DAMxfSjKGOLNj5xFtoC7gLF' abstract_intro_conc_ans_part_2.csv
!gupload --to '1W9R77oTk_DAMxfSjKGOLNj5xFtoC7gLF' abstract_intro_conc_ans_part_3.csv

In [None]:
# NOTE(review): this expects `abstract` to be a DataFrame with an 'abstract' column, but no
# visible cell creates one under that name - presumably left over from an earlier session;
# confirm, or point this at one of the abstract_intro_conc_part_* frames.
abstract.iloc[4465]['abstract']

'The article analyzes the possibilities of commercialization of scientific research results of the human genome and its positive impact on research activities, provided that the information is properly protected. As shown by the legal regulation of scientific research of the human genome in certain states, the secret of private life (privacy) is the main content of the contract practiced in the field of medical services. Genetic information, although similar to medical secrecy when entering into a contract for the provision of medical services, differs in a public element in connection with the use of digital technologies in scientific research. The requirement to protect genome privacy is a mutual obligation of the parties in the contract. This parties do not always give proper attention to such issue. Legal regulation of genomic research will help to formulate a pragmatic attitude to the problem and find a balance between the risk and benefits of human knowledge.'

In [None]:
# Subsetting the dataset
# NOTE(review): scratch cell. It reads a DataFrame named `abstract` and calls a global
# `summarizer`; in the cells visible here `summarizer` only exists as a local variable
# inside createSummaries, so this presumably relied on state from an earlier session - confirm.
import math
chunkSize = 10
numberChunks = math.ceil(len(abstract) / chunkSize)  # computed but not used below
# `section` holds chunk indices, not row numbers: chunk i covers rows
# i*chunkSize .. (i+1)*chunkSize - 1.
section = range(15000,20000,1)
summary = []
for i in section:
    print(i)
    abstract_chunk_summary = summarizer(abstract.iloc[i*chunkSize:(i+1)*chunkSize]['abstract'].to_list(), truncation = True)
    summary.extend(abstract_chunk_summary)

In [None]:
from google.colab import files

# NOTE(review): the summarization cell above iterates chunk indices 15000-19999, i.e. rows
# 150000-199999 at chunkSize 10, while this cell selects rows 50000-99999. Verify that
# `summary` really belongs to the rows selected here before trusting subset.csv.
section = range(50000,100000,1)

# .copy() makes the subset an independent DataFrame, so adding the column below neither
# writes into a slice of `abstract` nor triggers pandas' SettingWithCopyWarning.
abstract_sub = abstract.iloc[section,:].copy()
summaries = [x['summary_text'] for x in summary]

abstract_sub['summary'] = summaries

abstract_sub.to_csv('subset.csv')
files.download('subset.csv')

In [None]:
import torch, gc

# Drop the global summarization pipeline if one exists, then release what it held:
# first Python-side garbage, then PyTorch's cached CUDA memory.
globals().pop('summarizer', None)
gc.collect()
torch.cuda.empty_cache()

# Synthetic QA Generation

- Use the summarized abstracts as answers and the original abstracts as context documents to generate one question per (answer, context) pair.

In [None]:
# !wget https://raw.githubusercontent.com/AMontgomerie/question_generator/master/questiongenerator.py questiongenerator.py 
# !wget https://raw.githubusercontent.com/AMontgomerie/question_generator/master/setup.py setup.py
# !wget https://raw.githubusercontent.com/AMontgomerie/question_generator/master/run_qg.py run_qg.py
# !wget https://raw.githubusercontent.com/AMontgomerie/question_generator/master/requirements.txt requirements.txt




In [None]:
# Download the summarized ("answer") splits for parts 1 and 2 from docs.google.com.
# Inner wget: fetch the download page, save cookies, extract the `confirm=` value with sed.
# Outer wget: repeat the request with that value and the cookies, writing to the -O file.
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1QpNCgwatG8JNFQLoVm3ClMYo9NvEUoHe' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1QpNCgwatG8JNFQLoVm3ClMYo9NvEUoHe" -O abstract_intro_conc_ans_part_1.csv && rm -rf /tmp/cookies.txt 
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1cBWOPOUsMv6cFTTqu4R8dHBGcbFG_N_v' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1cBWOPOUsMv6cFTTqu4R8dHBGcbFG_N_v" -O abstract_intro_conc_ans_part_2.csv && rm -rf /tmp/cookies.txt 

--2021-08-05 04:45:06--  https://docs.google.com/uc?export=download&confirm=rvfZ&id=1QpNCgwatG8JNFQLoVm3ClMYo9NvEUoHe
Resolving docs.google.com (docs.google.com)... 64.233.166.102, 64.233.166.101, 64.233.166.139, ...
Connecting to docs.google.com (docs.google.com)|64.233.166.102|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-08-80-docs.googleusercontent.com/docs/securesc/jpf6o1grpoi4sun1k6tl37mkqubkd3n0/osqgiau2v9bsb2hu2ue4p41oovinot1j/1628138700000/03976892977300334194/05960212144191834893Z/1QpNCgwatG8JNFQLoVm3ClMYo9NvEUoHe?e=download [following]
--2021-08-05 04:45:06--  https://doc-08-80-docs.googleusercontent.com/docs/securesc/jpf6o1grpoi4sun1k6tl37mkqubkd3n0/osqgiau2v9bsb2hu2ue4p41oovinot1j/1628138700000/03976892977300334194/05960212144191834893Z/1QpNCgwatG8JNFQLoVm3ClMYo9NvEUoHe?e=download
Resolving doc-08-80-docs.googleusercontent.com (doc-08-80-docs.googleusercontent.com)... 142.250.13.132, 2a00:1450:400c:c03::84
Connecting

In [None]:
import pandas as pd
abstract_intro_conc_ans_part_1 = pd.read_csv('abstract_intro_conc_ans_part_1.csv')
abstract_intro_conc_ans_part_2 = pd.read_csv('abstract_intro_conc_ans_part_2.csv')
abstract_intro_conc_ans_part_1.head()

Unnamed: 0.1,Unnamed: 0,cord_uid,title,abstract,introduction,conclusion,answer
0,0,ug7v899j,Clinical features of culture-proven Mycoplasma...,OBJECTIVE: This retrospective chart review des...,Mycoplasma pneumoniae is a common cause of upp...,Mycoplasma pneumoniae is one of the most commo...,Mycoplasma pneumoniae infections at King Abdul...
1,2,ejv2xln0,Surfactant protein-D and pulmonary host defense,Surfactant protein-D (SP-D) participates in th...,Surfactant protein-D (SP-D) is a member of the...,There is increasing evidence that SP-D interac...,Surfactant Protein-D participates in the innat...
2,4,9785vg6d,Gene expression in epithelial cells in respons...,Respiratory syncytial virus (RSV) and pneumoni...,RSV and PVM are viruses of the family Paramyxo...,The pneumoviruses RSV and PVM enter respirator...,Respiratory syncytial virus and pneumonia viru...
3,5,zjufx4fo,Sequence requirements for RNA strand transfer ...,Nidovirus subgenomic mRNAs contain a leader se...,The genetic information of RNA viruses is orga...,EAV genome replication is not signi®cantly aff...,Co-variation mutagenesis of equine arteritis v...
4,6,5yhe786e,Debate: Transfusing to normal haemoglobin leve...,Recent evidence suggests that critically ill p...,Anaemia is a common condition in critically il...,The need to reduce the amount of allogeneic bl...,A Restrictive Blood transfusion Strategy for C...


##### Create class for generating QA

In [None]:
!pip install sentencepiece
!pip install transformers
import os
import sys
import math
import numpy as np
import torch
import spacy
import re
import random
import json
import en_core_web_sm
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
)
import sentencepiece


class QuestionGenerator:
    """Generate a question for an (article, answer) pair with a seq2seq model.

    The model input is the string ``"<answer> {answer} <context> {article}"``,
    tokenized and padded/truncated to ``SEQ_LENGTH`` tokens.
    """

    def __init__(self, model_dir=None, max_question_length=None):
        """Load the question-generation tokenizer and model.

        Args:
            model_dir: Accepted for backward compatibility but not used; the
                checkpoint name is fixed below.
            max_question_length: Optional ``max_length`` forwarded to
                ``model.generate``. ``None`` (default) leaves the model's own
                default in place. NOTE(review): questions cut off before their
                '?' are presumably hitting that default - raise this to check.
        """
        QG_PRETRAINED = "iarfmoose/t5-base-question-generator"
        self.ANSWER_TOKEN = "<answer>"
        self.CONTEXT_TOKEN = "<context>"
        self.SEQ_LENGTH = 1024
        self.max_question_length = max_question_length

        # Use the GPU when one is available, otherwise fall back to the CPU.
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.qg_tokenizer = AutoTokenizer.from_pretrained(
            QG_PRETRAINED)
        self.qg_model = AutoModelForSeq2SeqLM.from_pretrained(QG_PRETRAINED)
        self.qg_model.to(self.device)

    def generate(
        self, article, answer
    ):
        """Return ``{"question": <generated text>, "answer": answer}`` for one pair.

        Args:
            article: Context text the question should be answerable from.
            answer: Answer text the question should lead to.
        """
        qg_input = "{} {} {} {}".format(
            self.ANSWER_TOKEN, answer, self.CONTEXT_TOKEN, article
        )

        generated_question = self._generate_question(qg_input)

        qa = self._make_dict(
            generated_question, answer
        )

        return qa

    def _generate_question(self, qg_input):
        """Run the model on an already formatted input string and decode the first output."""
        self.qg_model.eval()
        encoded_input = self._encode_qg_input(qg_input)

        # The input is padded to SEQ_LENGTH, so pass the attention mask explicitly to
        # make sure padding positions are ignored rather than relying on it being inferred.
        generate_kwargs = {
            "input_ids": encoded_input["input_ids"],
            "attention_mask": encoded_input["attention_mask"],
        }
        # Only override the generation length when the caller asked for it.
        if self.max_question_length is not None:
            generate_kwargs["max_length"] = self.max_question_length

        with torch.no_grad():
            output = self.qg_model.generate(**generate_kwargs)
        question = self.qg_tokenizer.decode(
            output[0], skip_special_tokens=True)
        return question

    def _encode_qg_input(self, qg_input):
        """Tokenize ``qg_input`` to exactly SEQ_LENGTH tokens and move it to ``self.device``."""
        return self.qg_tokenizer(
            qg_input,
            padding='max_length',
            max_length=self.SEQ_LENGTH,
            truncation=True,
            return_tensors="pt",
        ).to(self.device)

    def _make_dict(self, question, answer):
        """Bundle a question and its answer into the dict shape returned by ``generate``."""
        qa = {}
        qa["question"] = question
        qa["answer"] = answer
        return qa


def print_qa(qa_list, show_answers=True):
    """Pretty-print numbered questions and, optionally, their answers.

    Args:
        qa_list: Iterable of dicts with a ``"question"`` and an ``"answer"`` key.
            ``"answer"`` is either a plain value (full-sentence answer) or a list
            of multiple-choice dicts with ``"answer"`` and ``"correct"`` keys.
        show_answers: For full-sentence answers, whether to print the answer at
            all. For multiple-choice answers the choices are always listed and
            this only controls whether truthy ``"correct"`` entries are marked.
    """
    for i, qa in enumerate(qa_list):
        # Question numbers from 10 upwards are one character wider, so indent answers by one more.
        space = " " * (3 if i < 9 else 4)

        print("{}) Q: {}".format(i + 1, qa["question"]))

        answer = qa["answer"]

        # print a list of multiple choice answers
        if isinstance(answer, list):
            for j, choice in enumerate(answer):
                if j == 0:
                    label = "{}A: 1.".format(space)
                else:
                    label = "{}{}.".format(space + "   ", j + 1)

                if show_answers:
                    # Third field is empty for wrong choices, which keeps the
                    # trailing separator space of the original layout.
                    print(label, choice["answer"],
                          "(correct)" if choice["correct"] else "")
                else:
                    print(label, choice["answer"])
            print("")

        # print full sentence answers
        elif show_answers:
            print("{}A:".format(space), answer, "\n")




##### Generate QA

In [None]:
!pip install tdqm
from tqdm import tqdm

Collecting tdqm
  Downloading tdqm-0.0.1.tar.gz (1.4 kB)
Building wheels for collected packages: tdqm
  Building wheel for tdqm (setup.py) ... [?25l[?25hdone
  Created wheel for tdqm: filename=tdqm-0.0.1-py3-none-any.whl size=1322 sha256=ec6ff718b99ac05b963f52a3a3d79167b769fb2ee9e362fb7b32dc45828ef031
  Stored in directory: /root/.cache/pip/wheels/c6/f0/d9/9fa5ff78c0f9d5a0a427bbbb4893c283520ddfccb885ea2205
Successfully built tdqm
Installing collected packages: tdqm
Successfully installed tdqm-0.0.1


In [None]:
qa_gen = QuestionGenerator()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=25.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1208.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=39.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=121.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891612736.0, style=ProgressStyle(descri…




In [None]:
def GenerateSyntheticData(df):
    """Generate one question per row of ``df`` using the global ``qa_gen``.

    Args:
        df: DataFrame with an ``'abstract'`` column (context) and an ``'answer'`` column.

    Returns:
        A list of question strings in row order. Each generated text is cut after its
        first '?'. If the text contains no '?', it is returned unchanged instead of
        being reduced to an empty string (``str.find`` returns -1 in that case, which
        would otherwise slice to ``''``).
    """
    questions = []
    for _, row in tqdm(df.iterrows(), total = df.shape[0]):
        generated = qa_gen.generate(row['abstract'], row['answer'])['question']
        end = generated.find('?')
        questions.append(generated if end == -1 else generated[:end + 1])
    return questions

In [None]:
question = GenerateSyntheticData(abstract_intro_conc_ans_part_1)

In [None]:
abstract_intro_conc_ans_part_1['question'] = question
abstract_intro_conc_ans_part_1.to_csv('synthetic_qa_part_1.csv')

In [None]:
abstract_intro_conc_ans_part_1[abstract_intro_conc_ans_part_1['question'] == '']

Unnamed: 0.1,Unnamed: 0,cord_uid,title,abstract,introduction,conclusion,answer,question
3,5,zjufx4fo,Sequence requirements for RNA strand transfer ...,Nidovirus subgenomic mRNAs contain a leader se...,The genetic information of RNA viruses is orga...,EAV genome replication is not signi®cantly aff...,Co-variation mutagenesis of equine arteritis v...,
8,13,mcuixluu,Vaccinia virus infection disrupts microtubule ...,We examined the role of the microtubule cytosk...,Intracellular bacterial and viral pathogens ha...,The size of virus particles is such that they ...,Microtubule cytoskeleton during vaccinia virus...,
17,30,754nln40,Factors affecting translation at the programme...,The ratio between proteins P27 and replicase o...,The principal mechanism of translation is the ...,Since À1 PRF studies are affected by a huge nu...,Fusion of Cocksfoot Mottle Virus Protein P27 w...,
18,37,cl9gpt9w,The influence of locked nucleic acid residues ...,The influence of locked nucleic acid (LNA) res...,Understanding the thermodynamics of nucleic ac...,Oligonucleotide hybridization to RNA has many ...,Stability of 2′-O-methyl RNA/RNA heteroduplexe...,
26,70,xgwbl8em,Antisense-induced ribosomal frameshifting,Programmed ribosomal frameshifting provides a ...,The standard triplet readout of the genetic co...,Several models attempting to explain pseudokno...,Programmed Ribosomal Frameshifting with Antise...,
...,...,...,...,...,...,...,...,...
23874,355382,y3di1dct,Second quantization approach to COVID-19 epidemic,We show how the standard field theoretical lan...,The year 2020 will be remembered in history as...,"We have investigated, using an SIR-type stocha...",SIR-type stochastic model for COVID-19 epidemi...,
23887,356107,wucowgfq,Trend estimation and short-term forecasting of...,"Since the beginning of the COVID-19 pandemic, ...",It is of utmost importance for governments and...,"Our results show that, for a forecast of the n...",Forecasting the Evolution of Cases and Deaths ...,
23888,356111,5hy7af88,Ribonucleocapsid assembly/packaging signals in...,The genomic ssRNA of coronaviruses is packaged...,"To the middle of July 2020, the COVID-19 pande...",The methods developed in this paper are quite ...,Assembly/Packaging Signals of Coronaviruses an...,
23897,356644,vykvl6vg,Application and Comparison of Deep Learning Me...,mRNA vaccines are receiving increased interest...,"Over the last two decades, there has been incr...",Although the creation of stable mRNA molecules...,Graph Convolutional Networks for Predicting St...,


In [None]:
question = GenerateSyntheticData(abstract_intro_conc_ans_part_2)

In [None]:
abstract_intro_conc_ans_part_2['question'] = question
abstract_intro_conc_ans_part_2.to_csv('synthetic_qa_part_2.csv')
# synthetic_data_df = pd.DataFrame(synthetic_data)

# clean_question = []
# for i, e in synthetic_data_df.iterrows():
#   clean_question.append(e['question'][0:e['question'].find('?') + 1]) 

# synthetic_data_df['question'] = clean_question

# synthetic_data_df['context'] = abstract_intro_conc_ans_part_2['abstract']

# synthetic_data_df.to_csv('synthetic_qa_part_2.csv')

In [None]:
# NOTE: per the captured output of this cell, installing gupload replaces the preinstalled
# click 7.1.2 with 7.0 and google-api-python-client 1.12.8 with 1.7.10.
!pip install --upgrade gupload

from google.colab import auth

# Authenticate this Colab session with Google. No PyDrive client is created here;
# the gupload command below presumably picks up these credentials - TODO confirm.
auth.authenticate_user()

# Only part 1 is uploaded by this cell; the part 2 upload is disabled.
!gupload --to '1W9R77oTk_DAMxfSjKGOLNj5xFtoC7gLF' synthetic_qa_part_1.csv
# !gupload --to '1W9R77oTk_DAMxfSjKGOLNj5xFtoC7gLF' synthetic_qa_part_2.csv

Collecting gupload
  Downloading gupload-1.1.0-py3-none-any.whl (4.7 kB)
Collecting click==7.0
  Downloading Click-7.0-py2.py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 5.7 MB/s 
[?25hCollecting google-api-python-client==1.7.10
  Downloading google_api_python_client-1.7.10-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.1 MB/s 
Installing collected packages: google-api-python-client, click, gupload
  Attempting uninstall: google-api-python-client
    Found existing installation: google-api-python-client 1.12.8
    Uninstalling google-api-python-client-1.12.8:
      Successfully uninstalled google-api-python-client-1.12.8
  Attempting uninstall: click
    Found existing installation: click 7.1.2
    Uninstalling click-7.1.2:
      Successfully uninstalled click-7.1.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependen

Uploading file: synthetic_qa_part_1.csv


# Using rag-end2end-retriever (Shamane's)

In [None]:
!git clone https://github.com/Zefty/rag-end2end-retriever.git

Cloning into 'rag-end2end-retriever'...
remote: Enumerating objects: 28, done.[K
remote: Counting objects: 100% (28/28), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 28 (delta 7), reused 28 (delta 7), pack-reused 0[K
Unpacking objects: 100% (28/28), done.


In [None]:
# Install dependencies 
!pip install -r /content/rag-end2end-retriever/requirements.txt

In [None]:
# Run fine-tuning script 
%cd /content/rag-end2end-retriever
!chmod 755 /content/rag-end2end-retriever/test_run/test_finetune.sh
!/content/rag-end2end-retriever/test_run/test_finetune.sh