In [13]:
import pandas as pd

import numpy as np
import scipy as sp

from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

In [133]:
import gensim
import sklearn

In [134]:
print(pd.__version__)
print(np.__version__)
print(sp.__version__)
print(gensim.__version__)
print(sklearn.__version__)

2.2.3
1.26.4
1.13.1
4.3.3
1.5.2


In [5]:
df_medquad = pd.read_csv("MedQuad-MedicalQnADataset.csv")
df_medquad.head()

Unnamed: 0,qtype,Question,Answer
0,susceptibility,Who is at risk for Lymphocytic Choriomeningiti...,LCMV infections can occur after exposure to fr...
1,symptoms,What are the symptoms of Lymphocytic Choriomen...,LCMV is most commonly recognized as causing ne...
2,susceptibility,Who is at risk for Lymphocytic Choriomeningiti...,Individuals of all ages who come into contact ...
3,exams and tests,How to diagnose Lymphocytic Choriomeningitis (...,"During the first phase of the disease, the mos..."
4,treatment,What are the treatments for Lymphocytic Chorio...,"Aseptic meningitis, encephalitis, or meningoen..."


In [6]:
df_pubmed = pd.read_csv("PubMed_20k_RCT/train.csv")
df_pubmed.head()

Unnamed: 0,abstract_id,line_id,abstract_text,line_number,total_lines,target
0,24293578,24293578_0_12,To investigate the efficacy of 6 weeks of dail...,0,12,OBJECTIVE
1,24293578,24293578_1_12,A total of 125 patients with primary knee OA w...,1,12,METHODS
2,24293578,24293578_2_12,Outcome measures included pain reduction and i...,2,12,METHODS
3,24293578,24293578_3_12,Pain was assessed using the visual analog pain...,3,12,METHODS
4,24293578,24293578_4_12,Secondary outcome measures included the Wester...,4,12,METHODS


# Preprocessing

In [7]:
df_pubmed = df_pubmed.sort_values(by=['line_id'])
df_pubmed = df_pubmed.groupby('abstract_id').agg({'abstract_text':' '.join}).reset_index()
df_pubmed.head()

Unnamed: 0,abstract_id,abstract_text
0,24219770,The purpose of this study was to determine the...
1,24219814,Repeated courses of intravenous ( IV ) aminogl...
2,24219842,To determine whether the Pediatric Asthma Cont...
3,24219852,Despite the benefits of endoscopic nasobiliary...
4,24219882,We assessed the impact of hot flashes and vari...


In [8]:
# add qa id to medquad
df_medquad['qa_id'] = range(1, len(df_medquad) + 1)
df_medquad = df_medquad[['qa_id', 'Question', 'Answer', 'qtype']]
df_medquad.head()

Unnamed: 0,qa_id,Question,Answer,qtype
0,1,Who is at risk for Lymphocytic Choriomeningiti...,LCMV infections can occur after exposure to fr...,susceptibility
1,2,What are the symptoms of Lymphocytic Choriomen...,LCMV is most commonly recognized as causing ne...,symptoms
2,3,Who is at risk for Lymphocytic Choriomeningiti...,Individuals of all ages who come into contact ...,susceptibility
3,4,How to diagnose Lymphocytic Choriomeningitis (...,"During the first phase of the disease, the mos...",exams and tests
4,5,What are the treatments for Lymphocytic Chorio...,"Aseptic meningitis, encephalitis, or meningoen...",treatment


# Word Embedding

In [11]:
model_path = 'GoogleNews-vectors-negative300.bin'
word2vec = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [14]:
def preprocess(sentence):
    return [word for word in sentence.lower().split() if word in word2vec]

def get_sentence_vector(sentence):
    words = preprocess(sentence)
    if words:
        return np.mean([word2vec[word] for word in words], axis=0)
    else:
        return np.zeros(word2vec.vector_size)

In [21]:
df_pubmed['abstract_vector'] = df_pubmed['abstract_text'].apply(get_sentence_vector)
df_medquad['question_vector'] = df_medquad['Question'].apply(get_sentence_vector)

In [23]:
df_pubmed

Unnamed: 0,abstract_id,abstract_text,abstract_vector
0,24219770,The purpose of this study was to determine the...,"[-0.026591245, 0.056547694, -0.0050117737, 0.0..."
1,24219814,Repeated courses of intravenous ( IV ) aminogl...,"[-0.03645952, 0.07120251, 0.021919908, 0.11220..."
2,24219842,To determine whether the Pediatric Asthma Cont...,"[-0.0047280365, 0.041275, -0.00969958, 0.08767..."
3,24219852,Despite the benefits of endoscopic nasobiliary...,"[-0.003629674, 0.07113389, 0.027456231, 0.0854..."
4,24219882,We assessed the impact of hot flashes and vari...,"[-0.020188205, 0.07125171, -0.013315455, 0.093..."
...,...,...,...
14995,26521572,To explore the impacts on the cognitive level ...,"[0.009622574, 0.053992737, 0.025816005, 0.0657..."
14996,26521581,To observe the efficacy of acupuncture on pain...,"[-0.019650908, 0.06335896, 0.009455694, 0.0609..."
14997,26521582,To explore the clinical effect of Erlong Xizhu...,"[0.009212655, 0.03145588, 0.010136922, 0.06303..."
14998,26521589,To observe the myocardial protective effect of...,"[-0.023100011, 0.06358117, 0.022341112, 0.0516..."


In [24]:
df_medquad

Unnamed: 0,qa_id,Question,Answer,qtype,question_vector
0,1,Who is at risk for Lymphocytic Choriomeningiti...,LCMV infections can occur after exposure to fr...,susceptibility,"[-0.011978149, 0.038289387, 0.054300945, 0.098..."
1,2,What are the symptoms of Lymphocytic Choriomen...,LCMV is most commonly recognized as causing ne...,symptoms,"[0.017163087, 0.12464599, 0.05180664, 0.102880..."
2,3,Who is at risk for Lymphocytic Choriomeningiti...,Individuals of all ages who come into contact ...,susceptibility,"[-0.011978149, 0.038289387, 0.054300945, 0.098..."
3,4,How to diagnose Lymphocytic Choriomeningitis (...,"During the first phase of the disease, the mos...",exams and tests,"[0.034423828, 0.15397136, 0.036051434, 0.01643..."
4,5,What are the treatments for Lymphocytic Chorio...,"Aseptic meningitis, encephalitis, or meningoen...",treatment,"[-0.011118571, 0.09565226, 0.08443197, 0.07938..."
...,...,...,...,...,...
16402,16403,What are the symptoms of Familial visceral myo...,What are the signs and symptoms of Familial vi...,symptoms,"[-0.006930881, 0.0062052407, -0.021565756, 0.0..."
16403,16404,What is (are) Pseudopelade of Brocq ?,Pseudopelade of Brocq (PBB) is a slowly progre...,information,"[0.073349, -0.03970337, 0.19335938, 0.04766845..."
16404,16405,What are the symptoms of Pseudopelade of Brocq ?,What are the signs and symptoms of Pseudopelad...,symptoms,"[0.027374268, 0.09770203, 0.074645996, 0.11676..."
16405,16406,What are the treatments for Pseudopelade of Br...,Is there treatment or a cure for pseudopelade ...,treatment,"[-0.008605957, 0.06829834, 0.109228514, 0.0857..."


# Information Retrieval

In [124]:
def find_top_k_abstracts(query_vector, abstract_vectors_col, k=3, df=df_pubmed):
    cosine_scores = cosine_similarity([query_vector], np.array(df[abstract_vectors_col]).tolist())[0]
    top_k_indices = np.argsort(cosine_scores)[-k:][::-1]
    return df.iloc[top_k_indices]['abstract_id'].values, cosine_scores[top_k_indices]

def find_top_k_answers(query_vector, question_vectors_col, k=3, df=df_medquad):
    cosine_scores = cosine_similarity([query_vector], np.array(df_medquad[question_vectors_col]).tolist())[0]
    top_k_indices = np.argsort(cosine_scores)[-k:][::-1]
    return df.iloc[top_k_indices]['qa_id'].values, cosine_scores[top_k_indices]

def get_answers(qa_ids):
    return df_medquad[df_medquad['qa_id'].isin(qa_ids)][['qa_id', 'Answer']]

def get_abstracts(abstract_ids):
    return df_pubmed[df_pubmed['abstract_id'].isin(abstract_ids)][['abstract_id', 'abstract_text']]

def query_top_k_answers_and_abstracts(query, k=3):
    query_vector = get_sentence_vector(query)

    qa_ids, question_scores = find_top_k_answers(query_vector, 'question_vector', k)
    abstract_ids, abstract_scores = find_top_k_abstracts(query_vector, 'abstract_vector', k)

    answers_df = get_answers(qa_ids)
    abstracts_df = get_abstracts(abstract_ids)

    answers_df['Similarity Score'] = question_scores
    abstracts_df['Similarity Score'] = abstract_scores

    return answers_df, abstracts_df

In [129]:
queries = [
    "What is the incubation period of COVID-19?",
    "Cure for fever?",
    "Who is at risk for Lymphocytic Choriomeningit",
]

In [130]:
for query in queries:
    print(f"Query: {query}")
    answers_df, abstracts_df = query_top_k_answers_and_abstracts(query)
    print("Top Answers:")
    display(answers_df)
    print("Top Abstracts:")
    display(abstracts_df)
    print("\n")

Query: What is the incubation period of COVID-19?
Top Answers:


Unnamed: 0,qa_id,Answer,Similarity Score
123,124,Yersiniosis is an infectious disease caused by...,0.667042
252,253,On this Page General Information What is vanco...,0.647081
12772,12773,Paramyotonia congenita is an inherited conditi...,0.637043


Top Abstracts:


Unnamed: 0,abstract_id,abstract_text,Similarity Score
2728,24479729,Most randomised clinical trials ( RCTs ) testi...,0.664326
10374,25267326,We recently completed a randomized controlled ...,0.655015
14054,25881022,Specialized Early Intervention services ( SEI ...,0.653418




Query: Cure for fever?
Top Answers:


Unnamed: 0,qa_id,Answer,Similarity Score
1081,1082,Most people with myotonia congenita dont requi...,0.604811
9647,9648,These resources address the diagnosis or manag...,0.604055
10887,10888,These resources address the diagnosis or manag...,0.604055


Top Abstracts:


Unnamed: 0,abstract_id,abstract_text,Similarity Score
1774,24384878,To evaluate long-term cure rates and late comp...,0.543007
2596,24467861,Meaning-focused coping may be at the core of a...,0.525999
4792,24673608,Long-duration beta-lactam antibiotics are used...,0.518751




Query: Who is at risk for Lymphocytic Choriomeningit
Top Answers:


Unnamed: 0,qa_id,Answer,Similarity Score
0,1,LCMV infections can occur after exposure to fr...,1.0
2,3,Individuals of all ages who come into contact ...,1.0
4379,4380,Hemolytic anemia can affect people of all ages...,0.951077


Top Abstracts:


Unnamed: 0,abstract_id,abstract_text,Similarity Score
4377,24637951,Previous studies suggest cross-sectional assoc...,0.81117
4948,24686885,There is little evidence to inform the targete...,0.781331
11980,25467619,To determine the perceived risk of type 2 diab...,0.774183




