### The purpose of this notebook is to develop a search tool that searches African/African American coronavirus-related publications and provides related publications including the title, authors, abstract, and top 5 related sentences. 

In [4]:
import numpy as np 
import pandas as pd 
import os
import re
import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot")
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
import plotly.express as px
from PIL import Image

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import Word2Vec
from gensim.parsing.preprocessing import preprocess_string
from gensim.summarization.textcleaner import get_sentences
from gensim.summarization import summarize
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.corpora import Dictionary

import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
from IPython.display import display, HTML

import warnings 
warnings.filterwarnings('ignore')
import time
from collections import Counter
from tqdm import tqdm

  import pandas.util.testing as tm


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
cord_africa = pd.read_csv('/content/drive/My Drive/nss_data_science/covid_query/data/cord_africa.csv')

In [0]:
# Concatenating title, abstract and body_text
cord_search = cord_africa.loc[(cord_africa.title.notna()&cord_africa.authors.notna()&cord_africa.abstract.notna()), :].reset_index(drop=True)

cord_search["paper"] = cord_search.title + " " + cord_search.abstract + " " + cord_search.body_text

In [0]:
# Cleaning and preprocessing (tokenization and filtering)
cord_search["paper_token"] = cord_search.paper.apply(preprocess_string)

Doc2Vec Model

In [0]:
# Preparing inputs to the doc2vec model
n_papers = cord_search.shape[0]
cord_search["paper_tag"] = [TaggedDocument(words=cord_search.paper_token[i],tags=[i]) for i in range(n_papers)]

In [0]:
# Training and saving the doc2vec model
model_doc2vec = Doc2Vec(cord_search.paper_tag.values, dm=1, vector_size=200, window=5, min_count=1, workers=8, epochs=5)

Finding n closest papers to search

In [0]:
# A function to detect tags in texts
def TagInText(text, tag_list):
    tagIn = False
    for tag in tag_list:
        match = re.match(".* {}.*|.* {}.*|.* {}.*".format(tag.lower(), tag.upper(), tag.capitalize()), text)
        if match:
            tagIn=True
    
    return tagIn

Training a word to vec model

In [0]:
model_word2vec = Word2Vec(sentences=cord_search.paper_token, size=100,window=5,min_count=1, seed=1,  sg =0)
def SimilarWords(tags):
    tag_list = preprocess_string(tags)
    similar_words_list = []
    for tag in tag_list:
        #try is important here to avoid errors if a word is not in the vocabulary
        try:
            similar_words = model_word2vec.wv.most_similar(tag.lower())
            similar_words = [word_sim[0] for word_sim in similar_words]
            similar_words_list = similar_words_list + similar_words
        except:
            pass
    
    return list(set(similar_words_list)) #using set allows to filter out duplicates

In [0]:
#A function that finds indexes of papers matching to the search
def SearchIndex(search, data, column, model_doc2vec, tags=None, topn=10, seed=0):
    
    """
    Take a search string and return list of line indexes of n closest papers
    
    search: string containing the query
    data: papers dataframe in the right format
    column: column in "data" containing papers texts
    model_doc2vec: a Doc2Vec model trained on papers from "data"
    tags: string with tag words
    topn: maximum number of results to be returned,
    seed: an integer for random seed; useful for the doc2vec model. 
    """
        
    n = data.shape[0]
    model_doc2vec.random.seed(seed)
    search = preprocess_string(search)
    search_vec = model_doc2vec.infer_vector(search)
    similar_text_index = model_doc2vec.docvecs.most_similar([search_vec], topn = n)
    similar_text_index = [index_sim[0] for index_sim in similar_text_index  if index_sim[0] in data.index]
    
    if tags: 
        tag_list = preprocess_string(tags)
        similar_words = SimilarWords(tags)
        tag_list = tag_list + similar_words
        
        filter_papers_index = []

        #finding articles containing tags
        tagged = data[column].apply(lambda paper: TagInText(paper,tag_list))
        tagged_papers_index = data[tagged].index
        
        similar_text_index = [index for index in similar_text_index if index in tagged_papers_index]

    
    similar_text_index = similar_text_index[:topn]

    
    
    return similar_text_index

In [13]:
def f(Search):
    search.value = str(Search)
    
text = 'Enter search here:'
search = widgets.Text()
interact(f,Search=search)
button = widgets.Button(description="Run")
output = widgets.Output()
display(button, output)

def on_button_clicked(Search):
    with output:
        print(str(search.value))

button.on_click(on_button_clicked)


interactive(children=(Text(value='', description='Search'), Output()), _dom_classes=('widget-interact',))

Button(description='Run', style=ButtonStyle())

Output()

In [14]:
search

Text(value='what is covid risk', description='Search')

In [15]:
search = str(search.value)
tags = nltk.word_tokenize(search)
similar_tags = SimilarWords(search)
tags = tags + similar_tags
tags = (', '.join(str(x) for x in tags))
tags

'what, is, covid, risk, chanc, true, awar, situat, worst, impact, occup, ongo, contributori, hazard, evd, burden, consequ, vulner, instanc, pandem, agriculturerel, danger, underestim, actual'

In [16]:
select_indexes = SearchIndex(search=search, data=cord_search, column="paper", model_doc2vec=model_doc2vec, topn=5)
select_indexes

[184, 492, 845, 318, 403]

In [17]:
select_indexes = SearchIndex(search=search, data=cord_search, column="paper", model_doc2vec=model_doc2vec, topn=5)
select_indexes

##For each relevant paper, find the top n sentences matching the search (in the body text)
#generating a dataframe for sentences
sentences_df = pd.DataFrame(columns=["paper_index", "sentence"])

for paper_index in select_indexes:

    sentences_list = list(get_sentences(cord_search.loc[paper_index, "body_text"]))
    n_sent = len(sentences_list)
    new_sent_df = pd.DataFrame(list(zip([paper_index]*n_sent, sentences_list)) , columns=["paper_index", "sentence"])
    sentences_df = sentences_df.append(new_sent_df)
sentences_df

Unnamed: 0,paper_index,sentence
0,184,outbreaks.
1,184,Nigeria's current national health systems cann...
2,184,This has grim implications for Nigeria especia...
3,184,The provision of quarantine or isolation facil...
4,184,Conclusion: There is an urgent need to put in ...
...,...,...
26,403,The drug should be active in cells of the nerv...
27,403,"For example, the unfavorable toxicity profile ..."
28,403,"In addition, a lesson learned from treating ot..."
29,403,Given the time needed to develop and approve n...


In [0]:
#sentences preprocessing by filtering out non relevant sentences 

pattern = ".*doi.*|.*http.*|.*copyright.*|.*author.*|.*license.*|.*without permission.*|.*CC-BY-NC-ND.*|.*author.*|.*funder.*|.*medrxiv.*|.*fig.*|.*all rights.*"
sentences_df["filtered"] = sentences_df["sentence"].apply(lambda sent: True if re.match(pattern, sent.lower()) else False)

sentences_df = sentences_df.loc[~sentences_df.filtered,:]
sentences_df = sentences_df.reset_index(drop=True)

sentences_df.head()

sentences_df["sentence_token"] = sentences_df.sentence.apply(preprocess_string)

In [0]:
sentences_df = sentences_df.dropna().reset_index(drop=True)

In [20]:
#function to get the relevant sentence
#It is possible to not get any relevant sentence cause we only keep the ones that contain at least one tag or one similar word. 
def RelevantSent(search, papers_indexes, topn_sent=5, tags=None):
    """
    This is a function that look for more relevant sentences in a list of papers given a search.
    It returns a dictionary which keys are papers indexes and values are list of top n relevant sentences
    """
    model_doc2vec.random.seed(1)
    search = preprocess_string(search)
    search_vec = model_doc2vec.infer_vector(search)

    sentences_df["sent_vec"] = sentences_df.sentence_token.apply(model_doc2vec.infer_vector).values
    sentences_df["cos_similarity"] = sentences_df.sent_vec.apply(lambda sent_vec: cosine_similarity(search_vec.reshape(-1,1), sent_vec.reshape(-1,1))[0][0])

    topn_sent_dict = {}
    
    if tags:
        tags = tags
            
    for paper_index in papers_indexes:
        sent_paper_order = sentences_df.loc[sentences_df.paper_index==paper_index, :]
        sent_paper_order = sent_paper_order.sort_values(by="cos_similarity", axis=0, ascending=False).reset_index(drop=True)

        
        #keeping only sentences with at least one tag word
        sent_list_tag = []
        if tags:
            sent_list = list(sent_paper_order.loc[:10000, "sentence"])
            for sent in sent_list:
                tagIn = TagInText(sent, tags)
                if tagIn is True:
                    sent_list_tag.append(sent)

        else:
            sent_list_tag = list(sent_paper_order.loc[:topn_sent-1, "sentence"])
            
        topn_sent_dict[paper_index] = sent_list_tag[:topn_sent]
        
    return topn_sent_dict

top_sent = RelevantSent(search, select_indexes, topn_sent=5, tags=tags)
paper_index = select_indexes[3]
top_sent[paper_index]

['We declare no other competing interests.',
 'The study should also be interpreted in light of the fast-evolving nature of the COVID-19 outbreak.',
 'However, as of Feb 25, 2020, more than 40 countries would have been capacitated to accurately diagnose COVID-19 infection, thanks to the coordination efforts of AFTCOR.',
 'Because mitigating the potential spread of COVID-19 in Africa will require rapid detection and containment, the laboratory work streams of AFTCOR, Africa CDC, and WHO are working closely to expeditiously scale up diagnostic testing capacity linked to enhanced surveillance and monitoring-eg, at the beginning of February, only two countries in Africa had the diagnostic capacity to test for COVID-19.',
 '2 The greatest concern for public health experts is whether COVID-19 will become a pandemic, with sustained year-round transmission, similar to influenza, as is now being observed in several countries.']

In [21]:
paper_index

318

In [22]:
select_indexes

[184, 492, 845, 318, 403]

In [0]:
def SearchDisplayPaper(search, model_doc2vec, topn=len(top_sent), topn_sent=5, tags=None):
    """
    Take a request and return most relevant papers, with their titles, authors, abstracts and most relevant sentences
    
    search: string containing the query
    model_doc2vec: a Doc2Vec model trained on papers from "data"ay2
    tags: string with tag words
    topn: maximum number of results to be returned,
    seed: an integer for random seed; useful for the doc2vec model. 
    """
    
    select_indexes = SearchIndex(search=search, data=cord_search, column="paper", model_doc2vec=model_doc2vec, tags=tags, topn=topn)
    top_sent = RelevantSent(search, select_indexes, topn_sent=topn_sent, tags=tags)

    if tags:
        tag_list = tags.split()
        similar_words = SimilarWords(tags)
            
    summary = ""
    
    for i,index in enumerate(select_indexes):
        title, authors, abstract, doi = list(cord_search.loc[index, ["title", "authors", "abstract", "doi"]])
        
        sentences = top_sent[index]
        top_sentences = [
                """
                <h4 style="text-align: justify;"><span style="color: #000000;">{sent}</span></span></h4>
                """.format(nb=i+1, sent=sent) for i, sent in enumerate(sentences) 
                ]

        top_sentences = " ".join(top_sentences)
            
        summary = summary + """
        <p>&nbsp;</p>
        <h3><span style="background-color: #ffffff;"><a href="https://doi.org/{doi}" target="_blank">{title}</a></span></span></h3>
        <p><strong><span style="background-color: #ffffff;"><span style="background-color: #ffffff;">{authors} </span></span></p>
        <p>&nbsp;</p>
        
        """.format(nb=i+1, title=title, authors=authors, abstract=abstract, doi=doi)
        
        
        summary = summary + top_sentences
        
    summary = HTML(summary)
    
    return display(summary)

In [24]:
SearchDisplayPaper(search, model_doc2vec=model_doc2vec, topn=len(top_sent), topn_sent=5, tags=tags)