<p align="center">
  <img width="1000" height="250" src="https://inlovewithcode.files.wordpress.com/2017/02/screen-shot-2017-02-27-at-5-35-14-pm.png?w=1032&h=&zoom=2"> 
</p>

In [None]:
pip install scispacy

In [3]:
import timeit
from datetime import datetime

import numpy as np 
import pandas as pd

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import scispacy
import spacy

from scipy.spatial.distance import jensenshannon

import joblib

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from IPython.display import HTML, display

from ipywidgets import interact, Layout, HBox, VBox
import ipywidgets as widgets
from IPython.display import clear_output

from tqdm import tqdm

In [6]:
# metadata: one row per paper, keeping only the first row for each `sha`
sources = (
    pd.read_csv('2020-03-13/all_sources_metadata_2020-03-13.csv')
      .drop_duplicates(subset=['sha'])
)

def doi_url(d):
    '''
    Turns a DOI reference into a URL.

    d: DOI as a string; either a bare DOI (e.g. '10.1101/842740'), a reference starting
       with 'doi.org' / 'dx.doi.org', or an already complete 'http://' / 'https://' URL

    Returns the URL as a string. Blank input returns '' so that a paper without a DOI
    does not end up pointing at the bare resolver address.
    '''
    d = d.strip()
    if not d:
        return ''

    lowered = d.lower()
    # complete URLs (including https, which used to get a second prefix) are kept as they are
    if lowered.startswith(('http://', 'https://')):
        return d
    if lowered.startswith(('doi.org', 'dx.doi.org')):
        return f'http://{d}'
    return f'http://doi.org/{d}'

# Missing DOIs become '' before every entry is passed through doi_url.
sources['doi'] = sources['doi'].fillna('').apply(doi_url)

# Full texts: the first row and the first column of the raw CSV are dropped.
papers = (
    pd.read_csv("2020-03-13/covid19-medical-paperscsv/kaggle_covid-19_open_csv_format.csv")
      .iloc[1:, 1:]
      .reset_index(drop=True)
)

# merge frames: left-join the metadata onto the papers (doc_id == sha), taking from
# `sources` only the columns that `papers` does not already have
cols_to_use = sources.columns.difference(papers.columns)
all_data = papers.merge(sources[cols_to_use], left_on='doc_id', right_on='sha', how='left')

# change to string, there are also some numeric values
all_data['title'] = all_data['title'].astype(str)

all_data.head(2)

Unnamed: 0,doc_id,source,title,abstract,text_body,Microsoft Academic Paper ID,WHO #Covidence,authors,doi,has_full_text,journal,license,pmcid,publish_time,pubmed_id,sha,source_x
0,30457ee3ce0001f33938fbc246b4ce4eacd74f5d,BIORXIV,ZODIAC: database-independent molecular formula...,The condent high-throughput identication of sm...,"For all ve datasets, we observe that ZODIAC ou...",,,"Ludwig, M.; Nothias, L.-F.; Dührkop, K.; Koes...",http://doi.org/10.1101/842740,True,,See https://www.biorxiv.org/about-biorxiv,,2019-11-16,,30457ee3ce0001f33938fbc246b4ce4eacd74f5d,biorxiv
1,5796fdc80a85b6e634a279c57711356055b65880,BIORXIV,Title: Close contacts and household transmissi...,,Confirmed cases of infection with the SARS-CoV...,,,,http://doi.org/10.1101/2020.03.02.20029868,True,,See https://www.medrxiv.org/submit-a-manuscript,,,,5796fdc80a85b6e634a279c57711356055b65880,medrxiv


In [8]:
all_texts = all_data.text_body

In [9]:
# example snippet
all_texts[0][:500]

'For all ve datasets, we observe that ZODIAC outperforms SIRIUS, often substantially decreasing molecular formula annotation error rates (Fig. 1, left) . We rst consider the dendroides dataset, for which improvements are most distinctive: This dataset contains many larger compounds, and 75 % of the ground truth compounds have an m/z of 605 or higher ( Supplementary Fig. 6 ).\n Hence, this dataset is particularly challenging for molecular formula assignment. Out of the 201 ground truth compounds, t'

### Latent Dirichlet Allocation:

In [None]:
# medium model
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_md-0.2.4.tar.gz
    
pip install en_core_sci_md

In [14]:
# medium model
import en_core_sci_md
# Load the pipeline with tagger, parser and NER switched off.
nlp = en_core_sci_md.load(disable=["tagger", "parser", "ner"])
# NOTE(review): presumably raised so that very long full texts are accepted — confirm the limit is sufficient
nlp.max_length = 2000000

In [15]:
def spacy_tokenizer(sentence):
    '''
    sentence: raw text to tokenize

    Returns the lemmas of all tokens that are not number-like, stop words, punctuation or whitespace.
    '''
    lemmas = []
    for token in nlp(sentence):
        # number-like tokens are dropped as well (e.g. from references [1], etc.)
        if token.like_num or token.is_stop or token.is_punct or token.is_space:
            continue
        lemmas.append(token.lemma_)
    return lemmas

def print_top_words(model, feature_names, n_top_words):
    '''
    model: object whose components_ holds one row of term weights per topic
    feature_names: vocabulary, indexed by the column position in components_
    n_top_words: how many of the highest-weighted terms to list per topic

    Prints one "Topic #i: ..." line per topic (each preceded by an empty line), followed by a final empty line.
    '''
    for topic_idx, weights in enumerate(model.components_):
        # positions ordered by descending weight, cut to the requested number
        best = weights.argsort()[::-1][:n_top_words]
        top_terms = " ".join(feature_names[i] for i in best)
        print("\nTopic #%d: " % topic_idx + top_terms)
    print()

In [16]:
# Additional stop words on top of the ones the model already knows
customize_stop_words = [
    'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure',
    'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'fig', 'fig.', 'al.',
    'di', 'la', 'il', 'del', 'le', 'della', 'dei', 'delle', 'una', 'da', 'dell', 'non', 'si',
]

# Flag the vocabulary entry of each of them as a stop word
for w in customize_stop_words:
    lexeme = nlp.vocab[w]
    lexeme.is_stop = True

In [17]:
tf_vectorizer = CountVectorizer(tokenizer = spacy_tokenizer)
tf = tf_vectorizer.fit_transform(tqdm(all_texts))
tf.shape

100%|████████████████████████████████████████████████████████████████████████████| 13202/13202 [10:26<00:00, 22.09it/s]


(13202, 641988)

In [18]:
joblib.dump(tf_vectorizer, 'tf_vectorizer.csv')
joblib.dump(tf, 'tf.csv')

['tf.csv']

In [19]:
tf_vectorizer = joblib.load('tf_vectorizer.csv')
tf = joblib.load('tf.csv')

In [20]:
# most frequent words
word_count = pd.DataFrame({'word': tf_vectorizer.get_feature_names(), 'count': np.asarray(tf.sum(axis=0))[0]})

word_count.sort_values('count', ascending=False).set_index('word')[:20].sort_values('count', ascending=True).plot(kind='barh')

<matplotlib.axes._subplots.AxesSubplot at 0x2935cb6f320>

In [23]:
# this takes long! you can skip this and load the model from the output file
lda_tf = LatentDirichletAllocation(n_components=50, random_state=0, n_jobs=-1) # TODO: tune n_components: how many topics make sense?
lda_tf.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=50, n_jobs=-1,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [24]:
stop = timeit.default_timer()
# `start1` is first assigned in a later cell, so after Restart & Run All it does not
# exist yet when this cell runs; report that instead of failing with a NameError.
if 'start1' in globals():
    print(datetime.now() - start1)
else:
    print('start1 is not defined yet - no elapsed time to report')

Time:  511.2954499000007
0:08:31.304491


In [25]:
joblib.dump(lda_tf, 'lda.csv')

['lda.csv']

In [27]:
# Load from output file
lda_tf = joblib.load('lda.csv') 

### Discovered Topics:

In [28]:
tfidf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda_tf, tfidf_feature_names, 25)


Topic #0: cell infection mouse lung response expression cytokine level increase virus macrophage infect viral immune inflammatory study show disease neutrophil inflammation human airway epithelial type induce

Topic #1: cell virus replication culture lasv show vimentin lipid result viral effect fusion assay membrane infect medium observe acid study line treatment level infection contain rna

Topic #2: datum method test system pathogen set analysis result provide sample database detection information performance base user tool use sensitivity approach identify available include search allow

Topic #3: model datum numb time r case estimate = value individual rate parameter network t transmission distribution epidemic result method population contact probability infection different infect

Topic #4: s. bacterium bacterial p. aureus b. oil strain mrsa antimicrobial reaction product light phage suis activity pathogen aeruginosa synthesis plant show result l. essential antibacterial

Topic 

Topic #44: sample pcr assay detection detect primer test virus dna positive rna c result study reaction method amplification rt-pcr sequence viral specimen ° clinical perform sensitivity

Topic #45: host species virus genome population genetic sequence pathogen strain tree analysis rate mutation evolutionary datum viral diversity evolution size model selection numb study human phylogenetic

Topic #46: bind virus ebov hiv-1 receptor gp entry infection cell hiv lectin viral glycoprotein surface human antibody show gp120 fusion neutralize acid activity study target glycans

Topic #47: de facemask à pro en tbsv mnv que eef1a y el des les 3cl norovirus ly6e app para los 3c raft hla-a o un se

Topic #48: particle surface concentration sample air temperature nanoparticle aerosol size high solution time increase device filter method result water show flow nm material nps system condition

Topic #49: exposure ar formaldehyde behavior intention bladder cuprizone muscle behavioral group study cam

In [None]:
viz = pyLDAvis.sklearn.prepare(lda_tf, tf, tf_vectorizer)
# Save first and keep the display call as the last expression of the cell: the value of
# a non-final expression is discarded by the notebook, so the visualisation was not rendered.
pyLDAvis.save_html(viz, 'lda.html')
pyLDAvis.display(viz)

<p align="center">
  <img width="1000" height="250" src="pyLDAvis.JPG"> 
</p>

In [65]:
# slow, load from output file instead
topic_dist = pd.DataFrame(lda_tf.transform(tf))

In [66]:
topic_dist.to_csv('topic_dist.csv', index=False)

In [67]:
# Load from output file
topic_dist = pd.read_csv('topic_dist.csv')
topic_dist.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,4e-06,4e-06,0.037869,0.460793,4e-06,4e-06,4e-06,4e-06,4e-06,4e-06,...,4e-06,4e-06,4e-06,4e-06,0.007218,4e-06,4e-06,4e-06,0.005632,4e-06
1,4.7e-05,4.7e-05,4.7e-05,0.074049,4.7e-05,4.7e-05,4.7e-05,4.7e-05,4.7e-05,4.7e-05,...,0.266759,4.7e-05,4.7e-05,4.7e-05,4.7e-05,4.7e-05,4.7e-05,4.7e-05,4.7e-05,4.7e-05
2,2.4e-05,2.4e-05,2.4e-05,0.020718,2.4e-05,2.4e-05,2.4e-05,2.4e-05,2.4e-05,2.4e-05,...,2.4e-05,0.347702,2.4e-05,2.4e-05,2.4e-05,2.4e-05,2.4e-05,2.4e-05,2.4e-05,2.4e-05
3,1e-05,1e-05,1e-05,0.009038,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05,...,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05
4,1.2e-05,1.2e-05,1.2e-05,0.396789,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,...,0.055275,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05


In [68]:
topic_dist.shape

(13202, 50)

### "Nearest" Papers (in Topic Space):

In [69]:
def get_k_nearest_docs(doc_dist, k=5, use_jensenshannon=True, topic_distributions=None):
    '''
    doc_dist: topic distribution (sums to 1) of one article
    k: number of nearest articles to return
    use_jensenshannon: if True use the Jensen–Shannon distance, otherwise the (faster) Euclidean distance
    topic_distributions: DataFrame with one row of topic weights per article; defaults to the global topic_dist
    
    Returns the index of the k nearest articles (as by Jensen–Shannon divergence/ Euclidean distance in topic space). 
    Articles at a distance of exactly 0 (such as the query article itself) are left out.
    '''
    if topic_distributions is None:
        topic_distributions = topic_dist

    # Compare by position instead of by label: the frame read back from CSV has string column
    # names while a freshly computed distribution has integer labels, and label alignment
    # between the two would produce nothing but NaN.
    target = np.asarray(doc_dist, dtype=float)

    if use_jensenshannon:
        # raw=True hands plain arrays to the lambda, avoiding one Series construction per row
        distances = topic_distributions.apply(lambda row: jensenshannon(row, target), axis=1, raw=True)
    else:
        diff = topic_distributions.values - target
        distances = pd.Series(np.sqrt(np.square(diff).sum(axis=1)), index=topic_distributions.index) # euclidean distance (faster)
        
    return distances[distances != 0].nsmallest(n=k).index

### Search for papers related to a chosen one:

In [70]:
def recommendation(doc_id, k=5):
    '''
    Prints the title of the paper given by doc_id and displays the titles of the k papers
    that are closest (topic-wise) to it, linked to their DOI where one is available.

    doc_id: value of the doc_id column that identifies the query paper
    k: number of related papers to show

    Raises IndexError if no paper has the given doc_id.
    '''
    from html import escape  # local import: titles and links are escaped before they go into the HTML
    
    is_query = all_data.doc_id == doc_id
    if not is_query.any():
        raise IndexError(f'no paper with doc_id {doc_id!r}')

    print(all_data.title[is_query].values[0])
    recommended = get_k_nearest_docs(topic_dist[is_query].iloc[0], k)
    recommended = all_data.iloc[recommended]
    
    entries = []
    for link, title in recommended[['doi', 'title']].values:
        label = escape(str(title))
        # The metadata is left-merged, so a paper without a match may carry NaN instead of
        # a DOI string; such papers (and blank DOIs) are listed without a link.
        if isinstance(link, str) and link:
            entries.append('<a href="' + escape(link, quote=True) + '" target="_blank">' + label + '</a>')
        else:
            entries.append(label)
    display(HTML('<br/>'.join(entries)))

In [71]:
start1 = datetime.now()

In [72]:
recommendation('a137eb51461b4a4ed3980aa5b9cb2f2c1cf0292a', k=5)

The effect of inhibition of PP1 and TNFα signaling on pathogenesis of SARS coronavirus


In [73]:
recommendation('90b5ecf991032f3918ad43b252e17d1171b4ea63', k=5)

The role of absolute humidity on transmission rates of the COVID-19 outbreak


In [74]:
recommendation('c04c7fb330a409a00f67040dde0f83b3da88eacb', k=5)

Potential inhibitors for 2019-nCoV coronavirus M protease from clinically approved medicines


In [75]:
recommendation('36521caf90f471c9da1a4e84f8562440d73ead9a', k=10)

Estimation of the epidemic properties of the 2019 novel coronavirus: A mathematical modeling study


In [76]:
print(datetime.now()-start1)

0:00:15.058968


### Widget: Pick a COVID-19-Paper:
Without doc_id

In [77]:
def related_papers():
    '''
    Creates a widget where you can select one of many papers about covid-19 and then displays related articles from the whole dataset.
    '''
    # Case-insensitive so that spelling variants such as "2019-nCoV" are matched as well;
    # na=False treats a missing text body as "no match" instead of breaking the boolean mask.
    # TODO: are there other names that should be matched?
    is_covid = all_data.text_body.str.contains('COVID-19|SARS-CoV-2|2019-nCov', case=False, na=False)
    covid_papers = all_data.loc[is_covid, ['doc_id', 'title']]
    # papers sharing a title collapse into a single entry here (the last one wins)
    title_to_id = covid_papers.set_index('title')['doc_id'].to_dict()
    
    def main_function(bullet, k=5):
        # nothing is selected when there are no matching papers
        if bullet is None:
            return
        recommendation(title_to_id[bullet], k)

    kWidget = widgets.IntSlider(value=10, description='k', max=50, min=1)

    bulletW = widgets.Select(options=list(title_to_id), layout=Layout(width='80%', height='200px'), description='Paper:')

    widget = widgets.interactive(main_function, bullet=bulletW, k=kWidget)

    display(widget)

In [78]:
related_papers()

interactive(children=(Select(description='Paper:', layout=Layout(height='200px', width='80%'), options=("Title…

### Browse topics:

In [79]:
def relevant_articles(tasks, k=3):
    '''
    For every text in tasks, prints the text and displays the k papers whose topic
    distribution is closest to the one inferred for that text.

    tasks: a single text or a list of texts
    k: number of papers to show per text
    '''
    from html import escape  # local import: titles and links are escaped before they go into the HTML

    tasks = [tasks] if isinstance(tasks, str) else list(tasks)
    if not tasks:
        return
    
    tasks_tf = tf_vectorizer.transform(tasks)
    tasks_topic_dist = pd.DataFrame(lda_tf.transform(tasks_tf))

    for index, bullet in enumerate(tasks):
        print(bullet)
        recommended = get_k_nearest_docs(tasks_topic_dist.iloc[index], k)
        recommended = all_data.iloc[recommended]

        entries = []
        for link, title in recommended[['doi', 'title']].values:
            label = escape(str(title))
            # The metadata is left-merged, so a paper without a match may carry NaN instead of
            # a DOI string; such papers (and blank DOIs) are listed without a link.
            if isinstance(link, str) and link:
                entries.append('<a href="' + escape(link, quote=True) + '" target="_blank">' + label + '</a>')
            else:
                entries.append(label)
        display(HTML('<br/>'.join(entries)))

In [80]:
## What is known about transmission, incubation, and environmental stability?
task1 = ["Range of incubation periods for the disease in humans (and how this varies across age and health status) and how long individuals are contagious, even after recovery.",
                            "Prevalence of asymptomatic shedding and transmission (e.g., particularly children).",
                            "Seasonality of transmission.",
                            "Physical science of the coronavirus (e.g., charge distribution, adhesion to hydrophilic/phobic surfaces, environmental survival to inform decontamination efforts for affected areas and provide information about viral shedding).",
                            "Persistence and stability on a multitude of substrates and sources (e.g., nasal discharge, sputum, urine, fecal matter, blood).",
                            "Persistence of virus on surfaces of different materials (e,g., copper, stainless steel, plastic).",
                            "Natural history of the virus and shedding of it from an infected person",
                            "Implementation of diagnostics and products to improve clinical processes",
                            "Disease models, including animal models for infection, disease and transmission",
                            "Tools and studies to monitor phenotypic change and potential adaptation of the virus",
                            "Immune response and immunity",
                            "Effectiveness of movement control strategies to prevent secondary transmission in health care and community settings",
                            "Effectiveness of personal protective equipment (PPE) and its usefulness to reduce risk of transmission in health care and community settings",
                            "Role of the environment in transmission"
                           ]

In [81]:
relevant_articles(task1, 5)

Range of incubation periods for the disease in humans (and how this varies across age and health status) and how long individuals are contagious, even after recovery.


Prevalence of asymptomatic shedding and transmission (e.g., particularly children).


Seasonality of transmission.


Physical science of the coronavirus (e.g., charge distribution, adhesion to hydrophilic/phobic surfaces, environmental survival to inform decontamination efforts for affected areas and provide information about viral shedding).


Persistence and stability on a multitude of substrates and sources (e.g., nasal discharge, sputum, urine, fecal matter, blood).


Persistence of virus on surfaces of different materials (e,g., copper, stainless steel, plastic).


Natural history of the virus and shedding of it from an infected person


Implementation of diagnostics and products to improve clinical processes


Disease models, including animal models for infection, disease and transmission


Tools and studies to monitor phenotypic change and potential adaptation of the virus


Immune response and immunity


Effectiveness of movement control strategies to prevent secondary transmission in health care and community settings


Effectiveness of personal protective equipment (PPE) and its usefulness to reduce risk of transmission in health care and community settings


Role of the environment in transmission


In [82]:
## What do we know about COVID-19 risk factors?
task2 = ['Data on potential risks factors',
'Smoking, pre-existing pulmonary disease',
'Co-infections (determine whether co-existing respiratory/viral infections make the virus more transmissible or virulent) and other co-morbidities',
'Neonates and pregnant women',
'Socio-economic and behavioral factors to understand the economic impact of the virus and whether there were differences.',
'Transmission dynamics of the virus, including the basic reproductive number, incubation period, serial interval, modes of transmission and environmental factors', 
'Severity of disease, including risk of fatality among symptomatic hospitalized patients, and high-risk patient groups',
'Susceptibility of populations',
'Public health mitigation measures that could be effective for control']

In [83]:
relevant_articles(task2, 5)

Data on potential risks factors


Smoking, pre-existing pulmonary disease


Co-infections (determine whether co-existing respiratory/viral infections make the virus more transmissible or virulent) and other co-morbidities


Neonates and pregnant women


Socio-economic and behavioral factors to understand the economic impact of the virus and whether there were differences.


Transmission dynamics of the virus, including the basic reproductive number, incubation period, serial interval, modes of transmission and environmental factors


Severity of disease, including risk of fatality among symptomatic hospitalized patients, and high-risk patient groups


Susceptibility of populations


Public health mitigation measures that could be effective for control


In [None]:
## What do we know about virus genetics, origin, and evolution?

In [84]:
task3 = ['Real-time tracking of whole genomes and a mechanism for coordinating the rapid dissemination of that information to inform the development of diagnostics and therapeutics and to track variations of the virus over time.',
'Access to geographic and temporal diverse sample sets to understand geographic distribution and genomic differences, and determine whether there is more than one strain in circulation. Multi-lateral agreements such as the Nagoya Protocol could be leveraged.',
'Evidence that livestock could be infected (e.g., field surveillance, genetic sequencing, receptor binding) and serve as a reservoir after the epidemic appears to be over.',
'Evidence of whether farmers are infected, and whether farmers could have played a role in the origin.',
'Surveillance of mixed wildlife- livestock farms for SARS-CoV-2 and other coronaviruses in Southeast Asia.',
'Experimental infections to test host range for this pathogen.',
'Animal host(s) and any evidence of continued spill-over to humans',
'Socioeconomic and behavioral risk factors for this spill-over',
'Sustainable risk reduction strategies']

In [85]:
relevant_articles(task3, 5)

Real-time tracking of whole genomes and a mechanism for coordinating the rapid dissemination of that information to inform the development of diagnostics and therapeutics and to track variations of the virus over time.


Access to geographic and temporal diverse sample sets to understand geographic distribution and genomic differences, and determine whether there is more than one strain in circulation. Multi-lateral agreements such as the Nagoya Protocol could be leveraged.


Evidence that livestock could be infected (e.g., field surveillance, genetic sequencing, receptor binding) and serve as a reservoir after the epidemic appears to be over.


Evidence of whether farmers are infected, and whether farmers could have played a role in the origin.


Surveillance of mixed wildlife- livestock farms for SARS-CoV-2 and other coronaviruses in Southeast Asia.


Experimental infections to test host range for this pathogen.


Animal host(s) and any evidence of continued spill-over to humans


Socioeconomic and behavioral risk factors for this spill-over


Sustainable risk reduction strategies


### Widget- Pick a Task:

In [86]:
def relevant_articles_for_task():
    '''
    Creates a widget to pick a task and one of its bullet points, and displays the k most
    relevant articles for the selected bullet point.
    '''
    tasks={'Task 1': task1,'Task 2': task2, 'Task 3': task3}

    def main_function(bullet, task, k=5):
        if bullet not in tasks[task]:
            # The task was just switched, so `bullet` still belongs to the previous task.
            # Swap in the bullets of the new task instead of searching for the stale one;
            # the resulting change of selection is expected to trigger this function again.
            bulletW.options = tasks[task]
            return
        relevant_articles([bullet], k)

    kWidget = widgets.IntSlider(value=5, description='k', max=30, min=1)

    taskW = widgets.Select(options=list(tasks), description='Task:')
    init = taskW.value
    bulletW = widgets.Select(options=tasks[init], layout=Layout(width='60%', height='200px'), description='Bullet:')

    widget = widgets.interactive(main_function, task=taskW, bullet=bulletW, k=kWidget)

    # All children but the last are the input widgets, the last one is the output area.
    # NOTE(review): the inputs are expected in the order of main_function's signature (bullet, task, k) — confirm
    first, second, third = widget.children[:-1]
    controls = HBox([VBox([second, third]), first], layout = Layout(flex_flow='row wrap'))
    output = widget.children[-1]
    display(VBox([controls, output]))

In [87]:
relevant_articles_for_task()

VBox(children=(HBox(children=(VBox(children=(Select(description='Task:', options=('Task 1', 'Task 2', 'Task 3'…

### Widget- Free Text Search:

In [88]:
def relevant_articles_for_text():    
    '''
    Creates a free-text search widget: a text area, a slider for k and a "Search" button
    that displays the k most relevant articles for the entered text.
    '''
    textW = widgets.Textarea(
        value='',
        placeholder='Type something',
        description='Text:',
        disabled=False,
        layout=Layout(width='80%', height='200px')
    )

    kWidget = widgets.IntSlider(value=15, description='k', max=30, min=1)

    button = widgets.Button(description="Search")

    # Results are rendered into their own Output widget, so the input widgets stay in
    # place and the output of the button callback is reliably captured below them.
    results = widgets.Output()

    def on_button_clicked(b):
        results.clear_output()
        # a blank query carries no information to search with
        if not textW.value.strip():
            return
        with results:
            relevant_articles(textW.value, kWidget.value)

    button.on_click(on_button_clicked)

    display(VBox([textW, HBox([kWidget, button]), results]))

In [89]:
relevant_articles_for_text()

VBox(children=(Textarea(value='coronavirus medicine treatment cure', description='Text:', layout=Layout(height…

coronavirus medicine treatment cure


In [None]:
stop = timeit.default_timer()
print(datetime.now()-start1)