In [1]:
!pip install rank_bm25 nltk

Collecting rank_bm25
  Downloading https://files.pythonhosted.org/packages/d2/e4/38d03d6d5e2deae8d2838b81d6ba2742475ced42045f5c46aeb00c5fb79c/rank_bm25-0.2.tar.gz
Building wheels for collected packages: rank-bm25
  Building wheel for rank-bm25 (setup.py): started
  Building wheel for rank-bm25 (setup.py): finished with status 'done'
  Stored in directory: C:\Users\ankur\AppData\Local\pip\Cache\wheels\6f\0c\1f\78945dd6a5478bbcdb50d73ac96ae5af2ffcdfcd374fd9b1bf
Successfully built rank-bm25
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2


In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path, PurePath
import pandas as pd
import requests
from requests.exceptions import HTTPError, ConnectionError
from ipywidgets import interact
import ipywidgets as widgets
from rank_bm25 import BM25Okapi
import nltk
from nltk.corpus import stopwords
nltk.download("punkt")
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ankur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
from ipywidgets import interact
import ipywidgets as widgets
import pandas as pd

def set_column_width(ColumnWidth, MaxRows):
    """Update the pandas display limits and report the new settings.

    ColumnWidth becomes ``display.max_colwidth`` and MaxRows becomes
    ``display.max_rows``; a confirmation line is printed afterwards.
    """
    display_options = pd.options.display
    display_options.max_colwidth = ColumnWidth
    display_options.max_rows = MaxRows
    print('Set pandas dataframe column width to', ColumnWidth, 'and max rows to', MaxRows)
    
# Build two integer sliders and bind them to set_column_width through interact;
# the trailing semicolon keeps the call's return value out of the cell output.
interact(set_column_width, 
         ColumnWidth=widgets.IntSlider(min=50, max=400, step=50, value=200),
         MaxRows=widgets.IntSlider(min=50, max=500, step=100, value=100));

interactive(children=(IntSlider(value=200, description='ColumnWidth', max=400, min=50, step=50), IntSlider(val…

In [4]:
# Directory holding the dataset files (a relative path, resolved against the working directory)
input_dir = PurePath('2020-03-13')

# Load the all-sources metadata CSV; the two ID columns named in `dtype` are read as strings
# (presumably so identifiers are not parsed as numbers -- confirm)
metadata = pd.read_csv(input_dir / 'all_sources_metadata_2020-03-13.csv', 
                      dtype={'Microsoft Academic Paper ID': str,
                             'pubmed_id': str})

# Convert the doi to a url
def doi_url(d):
    """Turn a DOI value into an http URL.

    Args:
        d: DOI string. May be a bare DOI ('10.1000/xyz'), a resolver path
            ('doi.org/10.1000/xyz') or an already complete http(s) URL.

    Returns:
        A URL string. Complete http(s) URLs are returned unchanged, resolver
        paths get an 'http://' scheme, and anything else is appended to
        'http://doi.org/' (so an empty string yields 'http://doi.org/').
    """
    # A value that already carries a scheme must not be prefixed again,
    # otherwise it would become 'http://doi.org/http://...'.
    if d.startswith(('http://', 'https://')):
        return d
    if d.startswith('doi.org'):
        return f'http://{d}'
    return f'http://doi.org/{d}'
# Missing DOIs are filled with '' first so doi_url always receives a string
metadata.doi = metadata.doi.fillna('').apply(doi_url)

# Fall back to the paper title wherever the abstract is null
metadata.abstract = metadata.abstract.fillna(metadata.title)

In [5]:
len(metadata)

29500

In [8]:
metadata.describe()

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text
count,17420,29500,29130,29500,27337,16730,17692,29130,18248,28554,17791,1134,1236,17420
unique,17398,4,24654,22203,23222,12607,15,24753,4499,23860,1732,872,1223,2
top,72a5640aa0c307fbe171ca7ad55d3fda48b53988,PMC,Articles of Significant Interest Selected from This Issue by the Editors,http://doi.org/,PMC6224041,30408032,CC BY,Articles of Significant Interest Selected from This Issue by the Editors,2020,"['Ehrt, Christiane', 'Brinkjost, Tobias', 'Koch, Oliver']",PLoS One,3006645647,#454,True
freq,4,27337,67,3143,35,35,11575,67,1148,35,2204,15,2,13219


In [9]:
# Some papers are duplicated since they were collected from separate sources. Thanks Joerg Rings
# A row is flagged only when both title and abstract are present AND the same
# (title, abstract) pair was already seen in an earlier row (pandas' default keep='first').
duplicate_paper = ~(metadata.title.isnull() | metadata.abstract.isnull()) & (metadata.duplicated(subset=['title', 'abstract']))
# Drop the flagged rows and renumber the index from 0; note this rebinds `metadata`.
metadata = metadata[~duplicate_paper].reset_index(drop=True)

In [10]:
len(metadata)

25133

In [11]:
# Create Data Classes for the Research Dataset and Papers

def get(url, timeout=6):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Value passed to ``requests.get`` as its ``timeout`` (seconds).

    Returns:
        The response text on success, or None when the request failed; in the
        failure case a diagnostic message is printed instead of raising.
    """
    try:
        r = requests.get(url, timeout=timeout)
        # requests raises HTTPError only from raise_for_status(); without this
        # call the HTTPError handler below could never run.
        r.raise_for_status()
        return r.text
    except ConnectionError:
        print(f'Cannot connect to {url}')
        print(f'Remember to turn Internet ON in the Kaggle notebook settings')
    except HTTPError as err:
        # Read the response from the exception itself; the attribute holding
        # the numeric code is `status_code` (Response has no `status`).
        response = err.response
        print('Got http error', response.status_code, response.text)
    except requests.exceptions.Timeout:
        # Read timeouts are not ConnectionError subclasses, so handle them here.
        print(f'Timed out after {timeout} seconds waiting for {url}')
    return None

class DataHolder:
    '''
    Thin notebook-friendly wrapper that delegates to a pandas dataframe
    '''

    def __init__(self, data: pd.DataFrame):
        # The frame is stored as given (no copy is made here)
        self.data = data

    def __len__(self):
        '''Number of rows in the wrapped frame'''
        row_count = len(self.data)
        return row_count

    def __getitem__(self, item):
        '''Label-based lookup, delegated to the frame's .loc'''
        frame = self.data
        return frame.loc[item]

    def head(self, n: int):
        '''New DataHolder over a copy of the first n rows'''
        leading_rows = self.data.head(n).copy()
        return DataHolder(leading_rows)

    def tail(self, n: int):
        '''New DataHolder over a copy of the last n rows'''
        trailing_rows = self.data.tail(n).copy()
        return DataHolder(trailing_rows)

    def _repr_html_(self):
        '''HTML rendering of the wrapped frame (used by notebook display)'''
        frame = self.data
        return frame._repr_html_()

    def __repr__(self):
        '''Plain-text rendering of the wrapped frame'''
        return repr(self.data)


class ResearchPapers:
    '''
    Collection view over the metadata frame, one row per paper
    '''

    def __init__(self, metadata: pd.DataFrame):
        self.metadata = metadata

    def __len__(self):
        '''Number of rows in the metadata frame'''
        return len(self.metadata)

    def __getitem__(self, item):
        '''Positional (iloc) lookup; the selected row is wrapped in a Paper'''
        row = self.metadata.iloc[item]
        return Paper(row)

    def head(self, n):
        '''New ResearchPapers over a copy of the first n rows, index renumbered from 0'''
        leading = self.metadata.head(n).copy()
        return ResearchPapers(leading.reset_index(drop=True))

    def tail(self, n):
        '''New ResearchPapers over a copy of the last n rows, index renumbered from 0'''
        trailing = self.metadata.tail(n).copy()
        return ResearchPapers(trailing.reset_index(drop=True))

    def abstracts(self):
        '''The abstract column with null entries removed'''
        abstract_column = self.metadata.abstract
        return abstract_column.dropna()

    def titles(self):
        '''The title column with null entries removed'''
        title_column = self.metadata.title
        return title_column.dropna()

    def _repr_html_(self):
        '''HTML rendering of the metadata frame (used by notebook display)'''
        frame = self.metadata
        return frame._repr_html_()
    
class Paper:
    
    '''
    A single research paper, held as a one-column frame ('Value') of its fields
    '''
    def __init__(self, item):
        # `item` is one row of a papers/results frame; null fields become ''
        self.paper = item.to_frame().fillna('')
        self.paper.columns = ['Value']
    
    def doi(self):
        '''
        Value of the paper's doi field
        '''
        return self.paper.loc['doi'].values[0]
    
    def html(self):
        '''
        Load the paper from doi.org and display as HTML. Requires internet to be ON
        '''
        text = get(self.doi())
        return widgets.HTML(text)
    
    def text(self):
        '''
        Load the paper from doi.org and display as text. Requires Internet to be ON
        '''
        text = get(self.doi())
        return text
    
    def abstract(self):
        '''
        Value of the paper's abstract field
        '''
        return self.paper.loc['abstract'].values[0]
    
    def title(self):
        '''
        Value of the paper's title field
        '''
        return self.paper.loc['title'].values[0]
    
    def authors(self, split=False):
        '''
        Get the paper's authors.

        Returns [] when the authors field is empty. Otherwise returns the raw
        authors string when split is False, or a list of author names when
        split is True. Two stored formats are understood: a Python list
        literal ("['A, B', 'C, D']") and a ';'-separated string.
        '''
        authors = self.paper.loc['authors'].values[0]
        if not authors:
            return []
        if not split:
            return authors
        if authors.startswith('['):
            # Parse the list literal properly so apostrophes inside names
            # (O'Brien) and double-quoted entries survive intact.
            import ast
            try:
                parsed = ast.literal_eval(authors)
            except (ValueError, TypeError, SyntaxError):
                parsed = None
            if isinstance(parsed, (list, tuple)):
                return [str(a).strip() for a in parsed]
            # Not a well-formed literal: fall back to manual splitting
            authors = authors.lstrip('[').rstrip(']')
            return [a.strip().replace("\'", "") for a in authors.split("\',")]
        
        # Todo: Handle cases where author names are separated by ","
        return [a.strip() for a in authors.split(';')]
        
    def _repr_html_(self):
        '''
        HTML rendering of the field/value frame (used by notebook display)
        '''
        return self.paper._repr_html_()
    

papers = ResearchPapers(metadata)

In [14]:
papers[11].authors(split=True)

['Quilty, Billy J',
 'Clifford, Sam',
 'group2, CMMID nCoV working',
 'Flasche, Stefan',
 'Eggo, Rosalind M']

In [15]:
papers[11]

Unnamed: 0,Value
sha,4e550e034ccca6fa2a91e481ddba24db67bc9ae5
source_x,CZI
title,Effectiveness of airport screening at detecting travellers infected with novel coronavirus (2019-nCoV)
doi,http://doi.org/10.2807/1560-7917.ES.2020.25.5.2000080
pmcid,
pubmed_id,
license,cc-by
abstract,We simulated 100 2019-nCoV infected travellers planning to board a flight who would pose a risk for seeding transmission in a new region. The duration of travel was considered as the flight time p...
publish_time,2020
authors,"Quilty, Billy J; Clifford, Sam; group2, CMMID nCoV working; Flasche, Stefan; Eggo, Rosalind M"


In [17]:
papers[0].html()

HTML(value='<!DOCTYPE html>\n<html lang="en" class="no-js">\n<head>\n    <meta charset="UTF-8"/>\n    <meta ht…

In [19]:
papers[0].text()[:1000]

'<!DOCTYPE html>\n<html lang="en" class="no-js">\n<head>\n    <meta charset="UTF-8"/>\n    <meta http-equiv="X-UA-Compatible" content="IE=edge"/>\n    <meta name="viewport" content="width=device-width, initial-scale=1">\n    <meta name="access" content="Yes">\n    \n\n    <meta name="journal_id" content="134"/>\n\n    <meta name="dc.title" content="Angiotensin-converting enzyme 2 (ACE2) as a SARS-CoV-2 receptor: molecular mechanisms and potential therapeutic target"/>\n\n    <meta name="dc.source" content="Intensive Care Medicine 2020"/>\n\n    <meta name="dc.format" content="text/html"/>\n\n    <meta name="dc.publisher" content="Springer"/>\n\n    <meta name="dc.date" content="2020-03-03"/>\n\n    <meta name="dc.type" content="BriefCommunication"/>\n\n    <meta name="dc.language" content="En"/>\n\n    <meta name="dc.copyright" content="2020 The Author(s)"/>\n\n    <meta name="dc.rightsAgent" content="journalpermissions@springernature.com"/>\n\n    <meta name="dc.description" content="

In [20]:
papers.head(2)

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text
0,c630ebcdf30652f0422c3ec12a00b50241dc9bd9,CZI,Angiotensin-converting enzyme 2 (ACE2) as a SARS-CoV-2 receptor: molecular mechanisms and potential therapeutic target,http://doi.org/10.1007/s00134-020-05985-9,,32125455.0,cc-by-nc,Angiotensin-converting enzyme 2 (ACE2) as a SARS-CoV-2 receptor: molecular mechanisms and potential therapeutic target,2020,"Zhang, Haibo; Penninger, Josef M.; Li, Yimin; Zhong, Nanshan; Slutsky, Arthur S.",Intensive Care Med,2002765492,#3252,True
1,53eccda7977a31e3d0f565c884da036b1e85438e,CZI,Comparative genetic analysis of the novel coronavirus (2019-nCoV/SARS-CoV-2) receptor ACE2 in different populations,http://doi.org/10.1038/s41421-020-0147-1,,,cc-by,Comparative genetic analysis of the novel coronavirus (2019-nCoV/SARS-CoV-2) receptor ACE2 in different populations,2020,"Cao, Yanan; Li, Lin; Feng, Zhimin; Wan, Shengqing; Huang, Peide; Sun, Xiaohui; Wen, Fang; Huang, Xuanlin; Ning, Guang; Wang, Weiqing",Cell Discovery,3003430844,#1861,True


In [21]:
papers.head(2).abstracts()

0    Angiotensin-converting enzyme 2 (ACE2) as a SARS-CoV-2 receptor: molecular mechanisms and potential therapeutic target
1       Comparative genetic analysis of the novel coronavirus (2019-nCoV/SARS-CoV-2) receptor ACE2 in different populations
Name: abstract, dtype: object

In [22]:
papers.head(2).titles()

0    Angiotensin-converting enzyme 2 (ACE2) as a SARS-CoV-2 receptor: molecular mechanisms and potential therapeutic target
1       Comparative genetic analysis of the novel coronavirus (2019-nCoV/SARS-CoV-2) receptor ACE2 in different populations
Name: title, dtype: object

## BM25 search index:

In [23]:
from rank_bm25 import BM25Okapi

In [24]:
# Text preprocessing
# stopwords.words() reads the NLTK 'stopwords' corpus, which must be present locally;
# fetch it here so the notebook also runs on a machine that has never downloaded it.
nltk.download('stopwords', quiet=True)
# De-duplicated English stop words, kept as a list; tokenize() filters tokens against it.
english_stopwords = list(set(stopwords.words('english')))

def strip_characters(text):
    """Remove punctuation that should not take part in tokenization.

    Parentheses, colons, commas, semicolons, periods, curly quotes, question
    marks, percent signs, angle brackets and straight apostrophes are deleted;
    forward slashes are replaced by a space. Hyphens are left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Raw-string character class: writing the pattern in a normal string with
    # backslash escapes such as "\(" relies on invalid string escape sequences.
    t = re.sub(r'[():,;.’”“?%><]', '', text)
    t = t.replace('/', ' ')
    t = t.replace("'", '')
    return t

def clean(text):
    """Lower-case ``text`` and then strip punctuation with strip_characters."""
    lowered = text.lower()
    return strip_characters(lowered)

def tokenize(text):
    """Split ``text`` into a de-duplicated list of search tokens.

    Tokens come from ``nltk.word_tokenize``. A token is kept when it is longer
    than one character, is not in ``english_stopwords`` and passes the numeric
    filters below. Duplicates are removed through a set, so the order of the
    returned list is not meaningful.

    Args:
        text: Already cleaned text.

    Returns:
        List of unique tokens.
    """
    words = nltk.word_tokenize(text)
    unique_tokens = {
        word for word in words
        if len(word) > 1
        and word not in english_stopwords
        # Compare the length by value; `is not 4` tested object identity,
        # which only works by accident of small-integer caching.
        and not (word.isnumeric() and len(word) != 4)
        # NOTE(review): this clause rejects every numeric token, so the
        # 4-digit exception above has no effect -- confirm whether
        # 4-digit numbers (e.g. years) were meant to be kept.
        and (not word.isnumeric() or word.isalpha())
    }
    return list(unique_tokens)

def preprocess(text):
    """Run the full text pipeline: clean() followed by tokenize()."""
    return tokenize(clean(text))

class SearchResults:
    '''
    A frame of search hits, optionally narrowed to a chosen set of columns
    '''

    def __init__(self, 
                 data: pd.DataFrame,
                 columns = None):
        # A falsy `columns` (None or an empty list) keeps the frame unchanged
        self.results = data[columns] if columns else data

    def __len__(self):
        '''Number of hits'''
        return len(self.results)

    def __getitem__(self, item):
        '''Label-based (.loc) lookup of one hit, wrapped in a Paper'''
        row = self.results.loc[item]
        return Paper(row)

    def _repr_html_(self):
        '''HTML rendering of the hits frame (used by notebook display)'''
        frame = self.results
        return frame._repr_html_()

# Default set of columns shown in search results; used as the `columns` default
# by the index classes defined below.
SEARCH_DISPLAY_COLUMNS = ['title', 'abstract', 'doi', 'authors', 'journal']

class WordTokenIndex:
    '''
    Keyword index over a corpus frame: each row's abstract and title are
    preprocessed into a token list, and search returns rows sharing any token
    '''

    def __init__(self, 
                 corpus: pd.DataFrame, 
                 columns=SEARCH_DISPLAY_COLUMNS):
        self.corpus = corpus
        self.columns = columns
        # Null abstracts/titles count as empty text when building the search string
        abstracts = corpus.abstract.fillna('')
        titles = corpus.title.fillna('')
        token_lists = (abstracts + ' ' + titles).apply(preprocess)
        self.index = token_lists.to_frame()
        self.index.columns = ['terms']
        self.index.index = corpus.index

    def search(self, search_string):
        '''
        Return the rows whose token list contains at least one query token.
        The corpus index is exposed in the results as a 'paper' column.
        '''
        search_terms = preprocess(search_string)

        def shares_a_term(terms):
            return any(term in terms for term in search_terms)

        hit_mask = self.index.terms.apply(shares_a_term)
        hits = self.corpus[hit_mask].copy()
        hits = hits.reset_index().rename(columns={'index':'paper'})
        return SearchResults(hits, self.columns + ['paper'])

In [25]:
# https://pypi.org/project/rank-bm25/
class RankBM25Index(WordTokenIndex):
    '''
    BM25-ranked search built on the token lists prepared by WordTokenIndex
    '''
    
    def __init__(self, corpus: pd.DataFrame, columns=SEARCH_DISPLAY_COLUMNS):
        super().__init__(corpus, columns)
        # One token list per corpus row, in corpus order
        self.bm25 = BM25Okapi(self.index.terms.tolist())
        
    def search(self, search_string, n=4):
        '''
        Score every document against the query and return the best matches.

        search_string: free-text query; it goes through preprocess() first.
        n: maximum number of rows to return.

        Returns a SearchResults holding the display columns plus a 'Score'
        column, best score first; rows whose score is not positive are dropped.
        '''
        search_terms = preprocess(search_string)
        doc_scores = self.bm25.get_scores(search_terms)
        # Positions of the n highest scores, highest first
        ind = np.argsort(doc_scores)[::-1][:n]
        # Take an explicit copy before adding a column: assigning onto the
        # chained iloc[...][...] selection triggers SettingWithCopyWarning.
        results = self.corpus.iloc[ind][self.columns].copy()
        results['Score'] = doc_scores[ind]
        results = results[results.Score > 0]
        return SearchResults(results.reset_index(), self.columns + ['Score'])
    
bm25 = RankBM25Index(metadata.head(100))
bm25.search('cruise')

Unnamed: 0,title,abstract,doi,authors,journal,Score
0,Backcalculating the Incidence of Infection with COVID-19 on the Diamond Princess,"To understand the time-dependent risk of infection on a cruise ship, the Diamond Princess, I estimated the incidence of infection with novel coronavirus (COVID-19). The epidemic curve of a total o...",http://doi.org/10.3390/jcm9030657,"Nishiura, Hiroshi",J Clin Med,3.303773
1,Estimation of the reproductive number of Novel Coronavirus (COVID-19) and the probable outbreak size on the Diamond Princess cruise ship: A data-driven analysis,"Backgrounds Up to February 16, 2020, 355 cases have been confirmed as having COVID-19 infection on the Diamond Princess cruise ship. It is of crucial importance to estimate the reproductive number...",http://doi.org/10.1016/j.ijid.2020.02.033,"Zhang, Sheng; Diao, MengYuan; Yu, Wenbo; Pei, Lei; Lin, Zhaofen; Chen, Dechang",International Journal of Infectious Diseases,2.635377
2,COVID-19 outbreak on the Diamond Princess cruise ship: estimating the epidemic potential and effectiveness of public health countermeasures,"Cruise ships carry a large number of people in confined spaces with relative homogeneous mixing. On 3 February, 2020, an outbreak of COVID-19 on cruise ship Diamond Princess was reported with 10 i...",http://doi.org/10.1093/jtm/taaa030,"Rocklöv, J.; Sjödin, H.; Wilder-Smith, A.",Journal of Travel Medicine,2.477118


In [26]:
bm25.search('sars-cov-2')

Unnamed: 0,title,abstract,doi,authors,journal,Score
0,SARS-CoV-2 infection in children: Transmission dynamics and clinical characteristics,SARS-CoV-2 infection in children: Transmission dynamics and clinical characteristics,http://doi.org/10.1016/j.jfma.2020.02.009,"Cao, Qing; Chen, Yi-Ching; Chen, Chyi-Liang; Chiu, Cheng-Hsun",Journal of the Formosan Medical Association,2.617278
1,A potential role for integrins in host cell entry by SARS-CoV-2,A potential role for integrins in host cell entry by SARS-CoV-2,http://doi.org/10.1016/j.antiviral.2020.104759,"Sigrist, Christian; Bridge, Alan; Le Mercier, Philippe",Antiviral Research,2.617278
2,"Differential diagnosis of illness in patients under investigation for the novel coronavirus (SARS-CoV-2), Italy, February 2020","Differential diagnosis of illness in patients under investigation for the novel coronavirus (SARS-CoV-2), Italy, February 2020",http://doi.org/10.2807/1560-7917.ES.2020.25.8.2000170,"Bordi, Licia; Nicastri, Emanuele; Scorzolini, Laura; Caro, Antonino Di; Capobianchi, Maria Rosaria; Castilletti, Concetta; Lalle, Eleonora; group, on behalf of INMI COVID-19 study; Centers2, Colla...",Eurosurveillance,2.531772
3,Angiotensin-converting enzyme 2 (ACE2) as a SARS-CoV-2 receptor: molecular mechanisms and potential therapeutic target,Angiotensin-converting enzyme 2 (ACE2) as a SARS-CoV-2 receptor: molecular mechanisms and potential therapeutic target,http://doi.org/10.1007/s00134-020-05985-9,"Zhang, Haibo; Penninger, Josef M.; Li, Yimin; Zhong, Nanshan; Slutsky, Arthur S.",Intensive Care Med,2.531772


In [27]:
# search engine on 10000 records
bm25_index = RankBM25Index(metadata.head(10000))

In [28]:
results = bm25_index.search('cruise ship')
results

Unnamed: 0,title,abstract,doi,authors,journal,Score
0,"COVID-19 in 2 Persons with Mild Upper Respiratory Symptoms on a Cruise Ship, Japan","We describe 2 cases of COVID-19 in patients with mild upper respiratory symptoms. Both patients worked on a cruise ship quarantined off the coast of Japan. One patient had persistent, low-grade up...",http://doi.org/10.3201/eid2606.200452,"Arashiro, Takeshi; Furukawa, Keiichi; Nakamura, Akira",Emerging infectious diseases,19.385938
1,From the frontlines of COVID-19 – How prepared are we as obstetricians: a commentary,Abstract The World Health Organization (WHO) has declared the outbreak of novel coronavirus (2019-nCoV) ? now known as Coronavirus Disease (COVID-19)1 - as a global health emergency. Singapore cur...,http://doi.org/10.1111/1471-0528.16192,"Chua, Monica Shi Qi; Lee, Jill Cheng Sim; Sulaiman, Suzanna; Tan, Hak Koon",BJOG: An International Journal of Obstetrics & Gynaecology,18.493467
2,COVID-19—New Insights on a Rapidly Changing Epidemic,"Since first reported in Wuhan, China, in late December 2019, the outbreak of the novel coronavirus now known as SARS-CoV-2 (severe acute respiratory syndrome coronavirus 2) has spread globally. As...",http://doi.org/10.1001/jama.2020.3072,"del Rio, Carlos; Malani, Preeti N.",JAMA,17.679552
3,Backcalculating the Incidence of Infection with COVID-19 on the Diamond Princess,"To understand the time-dependent risk of infection on a cruise ship, the Diamond Princess, I estimated the incidence of infection with novel coronavirus (COVID-19). The epidemic curve of a total o...",http://doi.org/10.3390/jcm9030657,"Nishiura, Hiroshi",J Clin Med,15.195548


In [29]:
results[3].title()

'Backcalculating the Incidence of Infection with COVID-19 on the Diamond Princess'

In [30]:
# Each entry pairs a task question (shown to the user) with the space-separated
# keywords that show_task passes to the search index for that task.
tasks = [('What is known about transmission, incubation, and environmental stability?', 
        'transmission incubation environment coronavirus'),
        ('What do we know about COVID-19 risk factors?', 'risk factors'),
        ('What do we know about virus genetics, origin, and evolution?', 'genetics origin evolution'),
        ('What has been published about ethical and social science considerations','ethics ethical social'),
        ('What do we know about diagnostics and surveillance?','diagnose diagnostic surveillance'),
        ('What has been published about medical care?', 'medical care'),
        ('What do we know about vaccines and therapeutics?', 'vaccines vaccine vaccinate therapeutic therapeutics')] 
# Two-column frame (Task, Keywords) built from the pairs above
tasks = pd.DataFrame(tasks, columns=['Task', 'Keywords'])

In [31]:
# Research Papers for each task
def show_task(Task):
    """Print the task question and return up to 200 search hits for its keywords.

    The keywords are looked up in the ``tasks`` frame by exact match on the
    Task column and passed to ``bm25_index.search``.
    """
    print(Task)
    matching_rows = tasks[tasks.Task == Task]
    keywords = matching_rows.Keywords.values[0]
    return bm25_index.search(keywords, n=200)
    
results = interact(show_task, Task = tasks.Task.tolist());

interactive(children=(Dropdown(description='Task', options=('What is known about transmission, incubation, and…

In [32]:
# Creating an Autocomplete Search bar
from IPython.display import display

def search_papers(SearchTerms: str):
    """Search ``bm25_index`` for up to 10 hits, display them when any exist, and return them."""
    search_results = bm25_index.search(SearchTerms, n=10)
    has_hits = len(search_results) > 0
    if has_hits:
        display(search_results)
    return search_results

searchbar = widgets.interactive(search_papers, SearchTerms='cruise ship')
searchbar

interactive(children=(Text(value='cruise ship', description='SearchTerms'), Output()), _dom_classes=('widget-i…

In [33]:
searchbar.result[0]

Unnamed: 0,Value
title,"COVID-19 in 2 Persons with Mild Upper Respiratory Symptoms on a Cruise Ship, Japan"
abstract,"We describe 2 cases of COVID-19 in patients with mild upper respiratory symptoms. Both patients worked on a cruise ship quarantined off the coast of Japan. One patient had persistent, low-grade up..."
doi,http://doi.org/10.3201/eid2606.200452
authors,"Arashiro, Takeshi; Furukawa, Keiichi; Nakamura, Akira"
journal,Emerging infectious diseases
Score,19.3859
