<h3> COVID-19 Question Answering </h3>

### Library Import

In [2]:
import nltk
del nltk.data.path[0]


In [4]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize, pos_tag, sent_tokenize
from sklearn.metrics import accuracy_score
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertModel, BertTokenizer
import torch


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lmanw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lmanw\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lmanw\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lmanw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


### Load Dataset
Dataset diperoleh dari https://github.com/deepset-ai/COVID-QA

In [None]:
import os 
import pandas as pd
doc_json_path = os.path.join("COVID-QA.json")
json_df = pd.read_json(doc_json_path)
json_df = pd.json_normalize(json_df["data"], record_path="paragraphs")
json_df

In [53]:
import os 
print(
  os.path.isfile(os.path.join("COVID-QA.json"))
)

def doc_preprocessing(doc_json_path):
  data = pd.read_json(doc_json_path)
  data = pd.json_normalize(data["data"], record_path="paragraphs")
  new_data = pd.DataFrame(0, index=np.arange(2019), columns=[
    "document_id", "context", "question_id", "question", "answer_start", "answer_text", "is_impossible"
  ])
  idx = 0
  for i in range(len(data["qas"])):
    for j in range(len(data["qas"][i])):
      new_data["document_id"].iloc[idx] = data["document_id"][i]
      new_data["context"].iloc[idx] = data["context"][i]
      new_data["question_id"].iloc[idx] = data["qas"][i][j]["id"]
      new_data["question"].iloc[idx] = data["qas"][i][j]["question"]
      new_data["answer_start"].iloc[idx] = data["qas"][i][j]["answers"][0]["answer_start"]
      new_data["answer_text"].iloc[idx] = data["qas"][i][j]["answers"][0]["text"]
      new_data["is_impossible"].iloc[idx] = data["qas"][i][j]["is_impossible"]
      idx += 1

  return new_data

new_data = doc_preprocessing(doc_json_path)
new_data.to_csv("document_raw.csv", index=False)

True


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [59]:
# generate unique document
unique_docs = new_data[["context", "document_id"]]
unique_docs = unique_docs.drop_duplicates(keep="first")
unique_docs.to_csv("document_complete.csv", index=False)


### Question Understanding
Understand a question means extracting its keywords by their respective POS label.

In [13]:
# define extract_keyword function.
def extract_keyword(question_postag):
  keyword = ""
  for i in range(0, len(question_postag)):
    if question_postag[i][1]=="NNP" or question_postag[i][1]=="NNS" or question_postag[i][1]=="NN" or (question_postag[i][1]=="JJ" and question_postag[i][0].lower()!="many") or question_postag[i][1]=="CD" or question_postag[i][1]=="RBS" or (question_postag[i][1]=="VBN" and question_postag[i][1]!="been") or (question_postag[i][1]=="VBD" and question_postag[i][0].lower()!="was" and question_postag[i][0].lower()!="were") or question_postag[i][1]=="VBG" or (question_postag[i][1]=="VB" and question_postag[i][0].lower()!="be") or question_postag[i][1]=="RB":
      keyword += question_postag[i][0] + " "
  if len(keyword)!=0 and keyword[len(keyword)-1] == " ":
    keyword = keyword[:-1]
  return keyword

In [158]:
def get_keyword_from_question(input_question):
    quest_token = word_tokenize(input_question)
    postag_question = pos_tag(quest_token)
    quest_key = extract_keyword(postag_question)
    # print("Question    : ", input_question)
    # print("POS TAG     :", postag_question)
    # print("Keyword     : ", quest_key)

    return (quest_token, postag_question, quest_key)

In [88]:
get_keyword_from_question("is covid-19 airborne?")

Question  :  is covid-19 airborne?
POS TAG     : [('is', 'VBZ'), ('covid-19', 'JJ'), ('airborne', 'JJ'), ('?', '.')]
Keyword     :  covid-19 airborne


(['is', 'covid-19', 'airborne', '?'],
 [('is', 'VBZ'), ('covid-19', 'JJ'), ('airborne', 'JJ'), ('?', '.')],
 'covid-19 airborne')

### Document Retrieval
Retrieve all related documents based on question keywords.

Load previously prepared documents.

In [62]:
import os
import pandas as pd

docs_train = pd.read_csv("document_complete.csv")
docs_train.head()

Unnamed: 0,context,document_id
0,Functional Genetic Variants in DC-SIGNR Are As...,630
1,Role of S-Palmitoylation on IFITM5 for the Int...,650
2,First Complete Genome Sequence of a French Bov...,1546
3,Species‐specific clinical characteristics of h...,1545
4,One step closer to an experimental infection s...,1552


Clean up text by removing URLs, citations, and digits.

In [64]:
def clean_sentence(text: str) -> str:
  # clean up string for puctuation, url, etc.
  regex_link = ("((http|https)://)(www.)?" +
            "[a-zA-Z0-9@:%._\\+~#?&//=]" +
            "{2,256}\\.[a-z]" +
            "{2,6}\\b([-a-zA-Z0-9@:%" +
            "._\\+~#?&//=]*)")
  regex_citation = ("(\[\d+-\d+\]|\[\d+(, \d+)*\])")
  regex_digits = "[0-9]"

  link_regex = re.compile(regex_link)
  citation_regex = re.compile(regex_citation)
  digits_regex = re.compile(regex_digits)

  sentence = text.replace('\n', " ")
  sentence = link_regex.sub(" ", sentence)
  sentence = citation_regex.sub(" ", sentence)
  sentence = digits_regex.sub(" ", sentence)

  # remove excess space.
  sentence_arr = sentence.split()
  sentence = " ".join(sentence_arr)

  # lemmatization.
  lemmatizer = WordNetLemmatizer()
  sentence_lowercase = sentence.lower()
  tokens = word_tokenize(sentence_lowercase)
  lemmas = [lemmatizer.lemmatize(t) for t in tokens if t not in stopwords.words("english")]
  lemmas_str = " ".join(lemmas)

  return lemmas_str


In [90]:
# docs_clean_train = [clean_data(punc_remove(sentence)) for sentence in docs_train['context']]
docs_train_clean = [clean_sentence(text) for text in docs_train["context"]]
# docs_train_clean[0:2]
docs_train['cleaned_data'] = docs_train_clean
document_clean = pd.DataFrame({
    "document_id": docs_train["document_id"],
    "text": docs_train_clean
})
document_clean.to_csv("document_clean.csv", index=False)

Term frequency-inverse document frequency

In [92]:
# load previously saved document.
import pandas as pd
doc_train = pd.read_csv("document_clean.csv")

tfidfvectorizer = TfidfVectorizer(analyzer="word", stop_words= "english", max_features=2000)
train_text = tfidfvectorizer.fit_transform(doc_train["text"]).toarray()

tfidf_df = pd.DataFrame(train_text, columns=tfidfvectorizer.get_feature_names())
tfidf_df.head(5)



Unnamed: 0,aa,ab,ability,able,absence,abstract,abundance,access,accession,accessory,...,zhang,zinc,zm,zone,zoonotic,µl,µm,μg,μl,μm
0,0.019085,0.0,0.0,0.0,0.0,0.001578,0.005157,0.003987,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.001572,0.00296,0.027181,0.000841,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004999,0.0025,0.007921
2,0.0,0.216544,0.0,0.0,0.0,0.016696,0.0,0.0,0.283367,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.008809,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.006085,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [95]:
vectors = tfidf_df.values.tolist()
doc_train["vector"] = vectors
# doc_train.head(5)
doc_train.to_csv("document_clean_tfidf.csv", index=False)

In [115]:
def parse_array_literal(array_lit):
    array_lit = array_lit.replace("[", "")
    array_lit = array_lit.replace("]", "")
    vals = array_lit.split(",")
    floatvals = [float(a) for a in vals]
    return floatvals

parse_array_literal('[0.019085058375148987, 0.0, 0.0, 0.0, 0.0, 0.0015778062389107314]')

[0.019085058375148987, 0.0, 0.0, 0.0, 0.0, 0.0015778062389107314]

In [96]:
# save tf-df vectorizer instance.
from joblib import dump
dump(tfidfvectorizer, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

Document ranking

In [136]:
import numpy as np

def ranking_ir(doc_library, sentence, tfidf_vectorizer, n_rank=10):
  query = clean_sentence(sentence)
  print("input query %s" % query)

  # instantiate and train tf-idf vectorizer.
  vector = tfidf_vectorizer.transform([sentence]).toarray()

  # ranking documents
  doc_candidates = doc_library.copy()
  doc_candidates["vector"] = doc_candidates["vector"].apply(lambda x: parse_array_literal(x))
  doc_candidates['similarity'] = doc_candidates['vector'].apply(lambda x: cosine_similarity(np.array(vector).reshape(1, -1), np.array(x).reshape(1, -1)).item())
  doc_candidates.sort_values(by='similarity',ascending=False,inplace=True)
  
  return doc_candidates[['document_id','text','similarity']].head(n_rank).reset_index(drop=True)

Find the best document candidates.

In [119]:
import os
import pandas as pd

# load document library.
doc_lib = pd.read_csv(os.path.join("document_clean_tfidf.csv"))

# prepare question and extract its keyword.
input_question = "is covid-19 airborne?"
result = get_keyword_from_question(input_question)
question_keyword = result[2]
print(question_keyword)

# prepare tf-idf vectorizer, either load from existing or train the vectorizer.
import joblib
# tfidf_vectorizer = TfidfVectorizer(analyzer="word", stop_words="english", max_features=1000)
tfidf_vectorizer = joblib.load("tfidf_vectorizer.pkl")
ranking_ir(doc_lib, question_keyword, tfidf_vectorizer)


Question  :  is covid-19 airborne?
POS TAG     : [('is', 'VBZ'), ('covid-19', 'JJ'), ('airborne', 'JJ'), ('?', '.')]
Keyword     :  covid-19 airborne
covid-19 airborne
input query covid- airborne


Unnamed: 0,document_id,text,similarity
0,185,"cdc summary mar , rapidly evolving situation c...",0.687228
1,2450,safe patient transport covid- sha : ec eb bcca...,0.460365
2,188,battle coronavirus disease ( covid- ) : emerge...,0.397438
3,1559,covid- smoking : systematic review evidence sh...,0.314975
4,2522,identification covid- quicker artificial intel...,0.297302
5,2432,factor associated mental health outcome among ...,0.231897
6,2642,first case coronavirus disease ( covid- ) euro...,0.20159
7,2643,responding covid- pandemic complex humanitaria...,0.186667
8,2440,optimization method forecasting confirmed case...,0.163736
9,2527,coronavirus ( covid- ) outbreak smart city net...,0.159254


### Answer Generator

In [123]:
# download model and tokenizer if not exists.
from transformers import BertTokenizer, BertModel
tokenizer = None
model = None
tokenizer_path = os.path.join('tokenizer', 'distilbert-base-uncased/')
if os.path.exists(tokenizer_path):
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path, padding=True, truncation=True, max_length=32)
else:
    tokenizer = BertTokenizer.from_pretrained('distilbert-base-uncased', padding=True, truncation=True, max_length=32)
    tokenizer.save_pretrained(tokenizer_path)

model_path = os.path.join('model', 'distilbert-base-uncased')
if os.path.exists(model_path):
    model = BertModel.from_pretrained(model_path, output_hidden_states=True)
else:
    model = BertModel.from_pretrained('distilbert-base-uncased', output_hidden_states=True)
    model.save_pretrained(os.path.join('model', 'distilbert-base-uncased/'))
    model.eval()

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertModel: ['distilbert.transformer.layer.1.attention.out_lin.weight', 'distilbert.transformer.layer.5.attention.v_lin.bias', 'distilbert.transformer.layer.5.attention.k_lin.weight', 'distilbert.transformer.layer.0.output_layer_norm.bias', 'distilbert.transformer.layer.2.attention.v_lin.weight', 'distilbert.transformer.layer.2.attention.k_lin.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.3.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.5.output_layer_norm.weight', 'distilbert.transformer.layer.4.sa_layer_norm.weight', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.4.attention.v_lin.bias', 'distilbert.t

In [124]:
# Return vector representing question.
# param question - a question/sentence : string
# return tensor representing question/sentence
def get_sentence_vector(question):
  marked_question = ' [CLS] ' + question + ' [SEP] '
  tokenized_question = tokenizer.tokenize(marked_question)
  indexed_question = tokenizer.convert_tokens_to_ids(tokenized_question)
  segment_ids = [1] * len(tokenized_question)
  question_tensor = torch.tensor([indexed_question])
  segment_ids_tensor = torch.tensor([segment_ids])

  # Eval the model.
  outputs = []
  with torch.no_grad():
    outputs = model(question_tensor, segment_ids_tensor)

  # Extract hidden states.
  hidden_states = outputs[2]
  token_embeddings = torch.stack(hidden_states, dim=0)
  token_embeddings = torch.squeeze(token_embeddings, dim=1)
  
  # Extract last four layers to get word embeddings for current question.
  question_vectors = []
  for token in token_embeddings:
    # token[i] means token representation from hidden states i-th layer.
    cat_vectors = torch.cat((token[-1], token[-2], token[-3]))
    question_vectors.append(cat_vectors)

  tensor_stack = torch.stack(question_vectors)
  return tensor_stack

In [125]:
# Calculate similarity
# param s1, s2 - sentence in tensor representation
# return score
def calculate_similarity_BERT(t1: torch.Tensor, t2: torch.Tensor, dim=1):
  # s1 = get_sentence_vector(s1)
  # s2 = get_sentence_vector(s2)
  cos = torch.nn.CosineSimilarity(dim=dim, eps=1e-6)
  tensor_score = cos(t1, t2)
  average_score = torch.mean(tensor_score)
  return average_score

In [190]:
def find_expected_answer_with_keyword(query, dataset):
  dataset = dataset.reset_index()
  question_list = dataset["question"].tolist()
  similarity_scores = [calculate_similarity_BERT(
      get_sentence_vector(get_keyword_from_question(query)[2]),
      get_sentence_vector(get_keyword_from_question(q)[2])
  ) for q in question_list]
  best_score = max(similarity_scores)
  idx = similarity_scores.index(best_score)
  expected_question = dataset.loc[idx]["question"]
  expected_answer = dataset.loc[idx]["answer_text"]
  return expected_question, expected_answer, best_score.item()


In [191]:
# Test answer finder function.

import pandas as pd
import os

doc_lib = pd.read_csv("document_raw.csv")
doc_samples = doc_lib.head(10)
q, a, b = find_expected_answer_with_keyword("what is covid-19", doc_samples)
print(q, a, b)

What is the role of C-C Motif Chemokine Ligand 3 Like 1 (CCL3L1) in mother to child transmission of HIV-1? High copy numbers of CCL3L1, a potent HIV-1 suppressive ligand for CCR5, are associated with higher chemokine production and lower risk of MTCT of HIV-1 among South African infants 0.6540467739105225


### Integration

In [None]:
# Sample questions.
questions = ['what is causing HIV in children?', 'what is MERS-CoV?', 'How to treat COVID19?', 'What causes the outbreak of SARS and MERS?']
# keywords = [extract_keyword(pos_tag(word_tokenize(q))) for q in questions]
keywords_tuples = [get_keyword_from_question(q) for q in questions]
keywords = [a[2] for a in keywords_tuples]
keywords

In [137]:
# find document candidates according to questions.
import pandas as pd
import os
import joblib

tfidf_vectorizer = joblib.load(os.path.join("tfidf_vectorizer.pkl"))
questions = ['what is causing HIV in children?', 'what is MERS-CoV?', 'How to treat COVID19?', 'What causes the outbreak of SARS and MERS?']
doc_candidates = [ranking_ir(doc_lib, sent, tfidf_vectorizer, n_rank=5) for sent in questions]
doc_candidates


input query causing hiv child ?
input query mers-cov ?
input query treat covid ?
input query cause outbreak sars mers ?


[   document_id                                               text  similarity
 0         1580  isothermal amplification using chemical heatin...    0.277293
 1          630  functional genetic variant dc-signr associated...    0.228400
 2         1656  improved pharmacological structural property h...    0.149732
 3         1686  nucleolar protein trafficking response hiv- ta...    0.135041
 4         1730  architectural insight inovirus-associated vect...    0.128037,
    document_id                                               text  similarity
 0         1741  mers coronavirus : diagnostics , epidemiology ...    0.791921
 1         1671  host resilience emerging coronaviruses sha : f...    0.622524
 2         1551  demographic variation mers-cov infection among...    0.554982
 3         2459  credible evidence supporting claim laboratory ...    0.414739
 4         2486  potential rapid diagnostics , vaccine therapeu...    0.411049,
    document_id                                   

In [195]:
# find answers.
import pandas as pd
import os

doc_lib = pd.read_csv("document_raw.csv")
for i in range(len(questions)):
    document_ids = doc_candidates[i]["document_id"].values.tolist()
    qa_candidates = []
    for doc_id in document_ids:
        selected_doc = doc_lib[doc_lib["document_id"] == doc_id]
        q, a, b = find_expected_answer_with_keyword(questions[i], selected_doc)
        qa_candidates.append([q, a, b])
    qa_df = pd.DataFrame(qa_candidates, columns=["question", "answer", "score"])
    sorted = qa_df.sort_values(by=["score"])
    print(qa_df.loc[0])


question    What percentage of patients do not return for ...
answer                                              35 to 50%
score                                                0.641516
Name: 0, dtype: object
question    What are recommended when URT sampling is to  ...
answer       an oropharyngeal and throat swab or a nasopha...
score                                                0.822367
Name: 0, dtype: object
question         What is RANBP2?
answer      nuclear pore protein
score                   0.812116
Name: 0, dtype: object
question    What symptoms appear among the  confirmed case...
answer      fever, cough and upper respiratory tract (URT)...
score                                                0.988275
Name: 0, dtype: object
