In [1]:
import os
import json
from datetime import datetime
from datetime import timedelta
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
!pip install transformers
import torch
from transformers import BertForQuestionAnswering
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')



In [2]:
# Location of the cleaned paper table; kept in one named constant so the path
# is easy to spot and change when the notebook runs on another machine.
CLEAN_PDF_CSV_PATH = "/home/jupyter/covid19-challenge/data/clean_doc_pdf.csv"
clean_pdf_df = pd.read_csv(CLEAN_PDF_CSV_PATH)

In [3]:
# Preview the first rows to see which columns the CSV provides.
clean_pdf_df.head()

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,1d7234df1a19a0cf0b4038ceb31e7a97881cdbd7,Virtual application of in situ simulation duri...,"Erich Hanel, E M , Monika Bilic, Kelly Hassall...","Erich Hanel, E M , Monika Bilic, Kelly Hassall...",Abstract\n\nThe coronavirus disease 2019 (COVI...,\n\nRÉSUMÉ L'arrivée de la pandémie causée par...,In situ simulation and its effects on patient ...,"[{'first': 'Erich', 'middle': [], 'last': 'Han...","{'BIBREF0': {'ref_id': 'b0', 'title': 'In situ..."
1,0ced1f946cce007aa319a0ba38aef2c4b14dab0e,BMC Immunology Identification of a novel conse...,"Yanbo Lv, Zhihua Ruan, Li Wang, Bing Ni, Yuzha...","Yanbo Lv (Third Military Medical University, 3...",Abstract\n\nThe spike (S) protein is a major s...,Background\n\nSevere acute respiratory syndrom...,Identification of a novel coronavirus in patie...,"[{'first': 'Yanbo', 'middle': [], 'last': 'Lv'...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Identif..."
2,2a667552f5924857d6eaf9a8104e5f92e236dde9,Emerging Infectious Diseases: A Historical and...,"Gibril Ndow, J Radeino Ambe, Oyewale Tomori","Gibril Ndow, J Radeino Ambe, Oyewale Tomori",,\n\nguide for their recognition and prevention...,"History of yellow fever, G Augustine, , 1909; ...","[{'first': 'Gibril', 'middle': [], 'last': 'Nd...","{'BIBREF0': {'ref_id': 'b0', 'title': 'History..."
3,2f57fc005f74c28402fe57d88144a855ee09efb2,Internet of Things (IoT) applications to fight...,"Ravi Pratap Singh, Mohd Javaid, Abid Haleem, R...",Ravi Pratap Singh (Ambedkar National Institute...,,Introduction\n\nThe Internet of Things (IoT) i...,Internet of things (IoT) applications in ortho...,"[{'first': 'Ravi', 'middle': ['Pratap'], 'last...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Interne..."
4,238a7184d9d9876b23b92b957489b91c98365c39,,,,,"\n\npathogenesis of CIA, we have estabbshed T ...",We utilized the T-cell activating .superantige...,[],"{'BIBREF1': {'ref_id': 'b1', 'title': ""We util..."


In [46]:
# Work on the body text of the first 5000 rows only, as a plain Python list
# whose positions are the article indices used by the later cells.
text = list(clean_pdf_df['text'].iloc[:5000])

## Convert the Corpus to TF-IDF Vectors

In [15]:
# Fit a TF-IDF vocabulary on the articles and encode every article as one row
# of the resulting matrix. The fitted `tfidf` object is reused later so that
# the questions are projected into the same vocabulary.
tfidf = TfidfVectorizer()
corpus_tfidf_matrix = tfidf.fit_transform(text)
# Displayed as (number of articles, vocabulary size).
corpus_tfidf_matrix.shape

(5000, 305977)

In [16]:
# Free-text queries to match against the articles. The position of a query in
# this list is the column index used in `sim_matrix` further down.
questions = [
  'Development of a point-of-care test and rapid bed-side tests',
  'Diagnosing SARS-COV-2 with Nucleic-acid based tech',
  'Diagnosing SARS-COV-2 with antibodies'
]
# `transform` (not `fit_transform`): reuse the vocabulary learned from the
# articles so both matrices share the same columns.
test_tfidf_matrix = tfidf.transform(questions)
# Displayed as (number of questions, vocabulary size).
test_tfidf_matrix.shape

(3, 305977)

In [17]:
# Cosine similarity between every article (rows) and every question (columns).
sim_matrix = cosine_similarity(corpus_tfidf_matrix, test_tfidf_matrix)

In [18]:
# Sanity check: expect (number of articles, number of questions).
sim_matrix.shape

(5000, 3)

In [43]:
# Keep every (article, question) pair whose similarity exceeds the cut-off.
#   sim_articles_list: [article index, question index, similarity] triples
#   sim_articles_dic:  article index (as a string) -> full article text
threshold = 0.25
sim_articles_list = []
sim_articles_dic = {}
for doc_idx, doc_scores in enumerate(sim_matrix):
  for question_idx, score in enumerate(doc_scores):
    if score > threshold:
      sim_articles_list.append([doc_idx, question_idx, score])
      # An article that matches several questions is stored once; the key is
      # simply overwritten with the same text.
      sim_articles_dic[str(doc_idx)] = text[doc_idx]

In [33]:
def _split_sections(doc_index, article_text):
    """Split one article into its heading/paragraph pairs.

    The article text is cut on blank lines ("\n\n") and the pieces are read as
    alternating heading, paragraph, heading, paragraph, ...  A trailing piece
    without a partner is ignored.

    Args:
        doc_index: Identifier of the article; stored under the 'index' key.
        article_text: Full text of the article.

    Returns:
        A dict with 'index' -> doc_index plus one '<n>_<heading>' -> paragraph
        entry per pair, where <n> is the running pair number (it keeps keys
        unique when a heading repeats or is empty).
    """
    pieces = article_text.split("\n\n")
    sections = {'index': doc_index}
    # Even positions are headings, odd positions the paragraph that follows.
    for num, (heading, body) in enumerate(zip(pieces[0::2], pieces[1::2])):
        sections[(str(num) + '_' + heading).strip()] = body
    return sections


# One dict per selected article. The loop variable is deliberately NOT called
# `text`: that name holds the list of all articles and is indexed by other
# cells, so rebinding it here would break them on re-execution.
paragraph_list = [
    _split_sections(doc_index, article_text)
    for doc_index, article_text in sim_articles_dic.items()
]


In [39]:
# Show the selected [article index, question index, similarity] triples.
sim_articles_list

[[401, 2, 0.28245846198505487],
 [502, 2, 0.2817824050878491],
 [1301, 2, 0.2878332130702234],
 [1342, 2, 0.258659755115504],
 [1600, 2, 0.2577630044642292],
 [2456, 2, 0.25295082342974223],
 [3121, 2, 0.37539424715602265],
 [3207, 2, 0.33085976112834203],
 [3545, 2, 0.28369725740931473],
 [3981, 2, 0.2754481287807529],
 [4000, 2, 0.2834126407078308],
 [4098, 2, 0.26281561144474136],
 [4099, 2, 0.2603616069001906],
 [4476, 0, 0.2507892003265474]]

In [32]:
def answer_question(question, answer_text):
    '''
    Return the span that the QA model selects as the answer to `question`
    when reading `answer_text`.

    Uses the module-level `tokenizer` and `model` objects.

    Args:
        question: The question string.
        answer_text: The passage string in which to look for the answer.

    Returns:
        The answer as one string, rebuilt from the tokens between the
        highest-scoring start position and the highest-scoring end position.
        Sub-word pieces (tokens starting with `##`) are glued onto the
        preceding token; all other tokens are joined with a single space.
        If the best end position lies before the best start position, only
        the start token is returned.
    '''
    # ======== Tokenize ========
    # Encode question and passage together as a text pair.
    # NOTE(review): `max_length=500` is meant to cap the pair length; newer
    # transformers releases expect an explicit `truncation=` argument as well
    # -- confirm against the installed version.
    input_ids = tokenizer.encode(question, answer_text, max_length=500)

    # ======== Set Segment IDs ========
    # Everything up to and including the first [SEP] is segment A (the
    # question); the rest is segment B (the passage).
    num_seg_a = input_ids.index(tokenizer.sep_token_id) + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b

    # There should be a segment_id for every input token.
    assert len(segment_ids) == len(input_ids)

    # ======== Evaluate ========
    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]),
                        token_type_ids=torch.tensor([segment_ids]))

    # Index instead of tuple-unpacking: a plain tuple and a dict-like model
    # output object both support [0] / [1], whereas unpacking a dict-like
    # object would yield its keys rather than the score tensors.
    start_scores, end_scores = outputs[0], outputs[1]

    # ======== Reconstruct Answer ========
    # Positions with the highest start and end scores, as plain ints.
    answer_start = int(torch.argmax(start_scores))
    answer_end = int(torch.argmax(end_scores))

    # String versions of the input tokens.
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # NOTE(review): nothing restricts the span to the passage, so the result
    # can be "[CLS]" or contain question tokens; callers may want to treat
    # such answers as "no answer found".
    answer = tokens[answer_start]
    for token in tokens[answer_start + 1:answer_end + 1]:
        if token[0:2] == '##':
            # Sub-word piece: recombine with the previous token.
            answer += token[2:]
        else:
            answer += ' ' + token

    return answer

In [47]:
# Inspect article 3121, the entry with the highest similarity score in
# `sim_articles_list`. `text` must be the list of article bodies here (not a
# single article string) for this index to select an article.
text[3121]

"Results\n\nExpression of SARS-CoV-2 S protein. To enhance expression of the S protein of SARS-CoV-2 in mammalian cells, a codonoptimized cDNA encoding the S protein and 3xFLAG tag was synthesized, and to facilitate incorporation of S protein into lentiviral pseudovirons, the last 19 amino acids containing an endoplasmic reticulum (ER)-retention signal from the cytoplasmic tail of the S protein was removed (Fig. 1a) . The construct was named SARS-CoV-2 S. HEK293T cells were transfected with SARS-CoV-2 S plasmid and expression of SARS-CoV-2 S protein was determined by western blot. There were two major bands, 180 kDa, and 90 kDa, detected by mouse anti-FLAG M2 antibody (Fig. 1b, lane 2) , reflecting the full-length and cleaved S proteins, respectively. The band above 250 kDa likely results from dimeric or trimeric S proteins. Consistent with our previous report 29 , MERS-CoV S protein was detected by polyclonal goat anti-MHV S antibodies AO4 (Fig. 1c) . AO4 also detected SARS-CoV-2 and 

In [34]:
# Ask the third question of every paragraph of every selected article and
# print the article index, the question, and the extracted answer.
question = questions[2]
for sections in paragraph_list:
    for heading, paragraph in sections.items():
        # The 'index' entry holds the article id, not a paragraph.
        if heading != 'index':
            ans = answer_question(question, paragraph)
            print(sections['index'])
            print(question)
            print(ans)

401
Diagnosing SARS-COV-2 with antibodies
human ace2
401
Diagnosing SARS-COV-2 with antibodies
[CLS]
401
Diagnosing SARS-COV-2 with antibodies
[CLS]
401
Diagnosing SARS-COV-2 with antibodies
[CLS] diagnosing sars - cov - 2 with antibodies [SEP]
401
Diagnosing SARS-COV-2 with antibodies
[CLS]
401
Diagnosing SARS-COV-2 with antibodies
[CLS]
401
Diagnosing SARS-COV-2 with antibodies
[CLS]
401
Diagnosing SARS-COV-2 with antibodies
diagnosing sars - cov - 2 with antibodies [SEP]
401
Diagnosing SARS-COV-2 with antibodies
[CLS] diagnosing sars - cov - 2 with antibodies [SEP]
401
Diagnosing SARS-COV-2 with antibodies
[CLS]
401
Diagnosing SARS-COV-2 with antibodies
[CLS] diagnosing sars - cov - 2 with antibodies [SEP]
401
Diagnosing SARS-COV-2 with antibodies
[CLS]
401
Diagnosing SARS-COV-2 with antibodies
diffractable crystals of the sars - cov - 2 - ctd / hace2 complex
401
Diagnosing SARS-COV-2 with antibodies
hace2
401
Diagnosing SARS-COV-2 with antibodies
gisaid : epi _ isl _ 402119
401
Dia