In [3]:
import sys
sys.path.append('../')

import random
import numpy as np
from corona_nlp.indexing import PaperIndexing, biorxiv_medrxiv
from corona_nlp.preprocessing import load_papers_with_text

In [4]:
biomed = PaperIndexing(biorxiv_medrxiv)
print(biomed)

<COVID-19(biorxiv_medrxiv, papers=803)>


In [5]:
# obtain five randomly chosen papers from the biomed directory
samples = list(range(1, biomed.num_papers))
indices = random.sample(samples, k=5)

# load each paper and its content from abstract and body_text keys
batch = load_papers_with_text(covid=biomed,
                              indices=indices,
                              keys=('abstract', 'body_text'))

print(f'number of papers: {len(batch)}')
print(f'keys: {batch[0].keys()}')

number of papers: 5
keys: dict_keys(['id', 'title', 'texts'])


In [6]:
# text size for all papers (paper-id, text-length)
[(p, len(batch[i]['texts'])) for i, p in enumerate(indices)]

[(557, 61), (342, 62), (705, 29), (49, 35), (369, 42)]

### Document Similarity

> Find the most similar titles given a query (e.g., title, keywords)

In [10]:
from david.cosine import SimilarDocuments

k = 802
samples = list(range(1, biomed.num_papers))
clusters = random.sample(samples, k)
papers = biomed.load_papers(clusters)

titles_cluster = []
for paper in papers:
    title = paper['metadata']['title']
    titles_cluster.append(title)

sd = SimilarDocuments(titles_cluster, ngram=(1, 3))
sd.learn_vocab()
title_query = titles_cluster[5]
print('title:', title_query)

title: CORRELATION OF MRNA DELIVERY TIMING AND PROTEIN EXPRESSION IN LIPID-BASED TRANSFECTION


In [23]:
# lets find the most similar titles to the following tasks 
# posted on the kaggle challege
task_queries = ['Smoking, pre-existing pulmonary disease',
                'Co-infections (determine whether co-existing respiratory/viral '
                'infections make the virus more transmissible or virulent) and other co-morbidities',
                'Neonates and pregnant women',
                'Socio-economic and behavioral factors to understand the economic impact of the virus '
                'and whether there were differences.']

top_k = 2  # number of similar titles per query (2x4=8:queries)
sd.add_query(task_queries, clear_first=True)
similar_clusters = list(sd.iter_similar(top_k))

similar_papers = []
for idx in similar_clusters:
    paper_id, score, title = (idx['doc_id'].tolist(),
                              idx['sim'], idx['text'])
    similar_papers.append(paper_id)
    print(f'paper_id={paper_id}, score={score}\n * title: {title}\n')

paper_id=572, score=0.1286
 * title: Transmission interval estimates suggest pre-symptomatic spread of COVID-19

paper_id=555, score=0.1207
 * title: Existing host range mutations constrain further emergence of RNA viruses

paper_id=95, score=0.149
 * title: Attenuation of influenza A virus disease severity by viral co-infection in a mouse model 2 3 Short Title: Pathogenesis of influenza viral co-infection 4 5

paper_id=96, score=0.1239
 * title: Chronic infections can shape epidemic exposure: Pathogen co-occurrence networks in the Serengeti lions

paper_id=150, score=0.16
 * title: SARS-CoV-2 and SARS-CoV Spike-RBD Structure and Receptor Binding Comparison and Potential Implications on Neutralizing Antibody and Vaccine Development

paper_id=51, score=0.1172
 * title: CORONAVIRUS IN PREGNANCY AND DELIVERY: RAPID REVIEW AND EXPERT CONSENSUS

paper_id=564, score=0.126
 * title: Decoupling the effects of nutrition, age and behavioral caste 2 on honey bee physiology and immunity

paper_id=

> Similar sentences relative to the list of top-k similar-papers:

In [31]:
similar_sentences = []
# load all the sentences from the paper indices (paper-ids)
for batch in load_papers_with_text(biomed, similar_papers):
    for texts in batch['texts']:
        similar_sentences.append(texts)

print(f'* sentences: {len(similar_sentences)}, * text:')
print(similar_sentences[0])

* sentences: 272, * text:
Background: Imagery techniques have been used as essential parts of diagnostic workup for patients suspected for 2019-nCoV infection, Multiple studies have reported the features of chest computed tomography (CT) scans among a number of 2019-nCoV patients. Method: Study Identification was carried out in databases (PubMed, Embase and Cochrane Library) to identify published studies examining the diagnosis, the 2019 novel coronavirus (2019-nCoV). Heterogeneity among reported prevalence was assessed by computing p-values of Cochrane Q-test and I 2 -statics. The pooled prevalence of treatment failure was carried out with a fixed effects meta-analysis model, generating the pooled 95% confidence interval. A random-effect model was used to pool the results since this model could incorporate the heterogeneity of the studies and therefore proved a more generalized result.


## Question Answering

---

- Sentence similary computation:

    - metric : cosine
    - matrix : TFIDF

> Let's now use the sentences and `QAAM` (based on Transformers Q&A pipeline and really basic metrics!) to fetch queries in question "form":

In [74]:
from qaam import QAAM

qaam = QAAM(0.1, top_k=60, metric='cosine', mode='tfidf', lemmatize=False)
qaam.texts_from_doc(similar_sentences)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




In [75]:
entities = qaam.common_entities(None, lower=True, lemma=True)
entities[:10]

[('pol ii', 55),
 ('china', 54),
 ('cc - by - nc - nd 4.0 international', 39),
 ('2', 35),
 ('one', 30),
 ('1', 29),
 ('3', 26),
 ('two', 24),
 ('first', 23),
 ('φx174', 21)]

In [77]:
qaam.answer("What are some pre-existing pulmonary disease risk factors?", render=True)

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 244.12it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 1863.31it/s]


In [80]:
question = "What reports are similar to case 23 and case 24 reports?"
qaam.answer(question, render=True)

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 83.31it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 4262.50it/s]


In [89]:
question = "Why images important elements of clinical workflow?"
qaam.answer(question, render=True)

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 82.43it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 2078.45it/s]


In [88]:
question = "Is there any economic impact of the virus?"
qaam.answer(question, render=True)

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 87.92it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 2057.04it/s]


In [86]:
question = "What are the transmission dynamics of the virus?"
qaam.answer(question, render=True)

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 82.25it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 2656.30it/s]


In [90]:
question = "What measures could be effective for controling the virus?"
qaam.answer(question, render=True)

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 104.32it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 2659.67it/s]


In [92]:
question = "Do existing infections make COVID-19 more viral?"
qaam.answer(question, render=True)

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 148.86it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 3688.92it/s]


### Clustering with Scikit-Learn and SentenceTransformers

> Clustering using a transformers wrapper API `SentenceTransformers` and scikit-learn's `AgglomerativeClustering`

In [97]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering

embedder = SentenceTransformer('bert-base-nli-mean-tokens')

100%|██████████| 405M/405M [00:40<00:00, 10.1MB/s] 


In [98]:
k = 802
samples = list(range(1, biomed.num_papers))
clusters = random.sample(samples, k)
papers = biomed.load_papers(clusters)

titles_cluster = []
for paper, cluster in zip(papers, clusters):
    title = paper['metadata']['title']
    titles_cluster.append((title, cluster))

titles, clusters = zip(*titles_cluster)
num_clusters = len(set(titles_cluster))
titles_embedding = embedder.encode(titles)

ac = AgglomerativeClustering(n_clusters=num_clusters)
ac.fit(titles_embedding)
cluster_assignment = ac.labels_

In [103]:
clustered_titles = [[] for i in range(num_clusters)]
for title_id, cluster_id in enumerate(cluster_assignment):
    title_id = titles_cluster[title_id]
    clustered_titles[cluster_id].append(title_id)
    
for i, cluster in enumerate(clustered_titles[:20]):
    print(f'cluster<({i + 1})>')
    for row in cluster:
        print(f'paper_id: {row[1]} - {row[0]}\n')

cluster<(1)>
paper_id: 613 - The network structure and eco-evolutionary dynamics of CRISPR-induced immune diversification

cluster<(2)>
paper_id: 292 - Recombination and convergent evolution led to the emergence of 2019 Wuhan coronavirus

cluster<(3)>
paper_id: 284 - Catalysis, inhibition and dynamics of Zika NS2B-NS3pro Unique properties of Zika NS2B-NS3pro complexes as decoded by experiments and MD simulations Catalysis, inhibition and dynamics of Zika NS2B-NS3pro

cluster<(4)>
paper_id: 298 - The role of post-Golgi transport pathways and sorting motifs in the plasmodesmal targeting of the movement protein (MP) of Ourmia melon virus (OuMV)

cluster<(5)>
paper_id: 149 - Machine learning using intrinsic genomic signatures for rapid classification of novel pathogens: COVID-19 case study

cluster<(6)>
paper_id: 793 - Homologous Recombination as an Evolutionary Force in 1 African Swine Fever Viruses

cluster<(7)>
paper_id: 222 - Single-cell RNA expression profiling of ACE2, the putative r

### K-mean clustering and SentenceTransformers

In [104]:
from sklearn.cluster import KMeans

titles_embeddings = embedder.encode(titles)

n_clusters = 5
clustering_model = KMeans(n_clusters=n_clusters)
clustering_model.fit(titles_embeddings)
cluster_assignment = clustering_model.labels_

clustered_titles = [[] for i in range(n_clusters)]
for title_id, cluster_id in enumerate(cluster_assignment):
    cluster = titles_cluster[title_id]
    clustered_titles.append(cluster)

for i, cluster in enumerate(clustered_titles[:20]):
    if cluster:
        title, paper_id = cluster[0], cluster[1]      
        print(f'<cluster={i+1}, paper_id={paper_id}>')
        print(f'title: {title}')

<cluster=6, paper_id=768>
title: Realtime 2-5A kinetics suggests interferons β and  evade global arrest of translation by RNase L
<cluster=7, paper_id=330>
title: Identification of the relative timing of infectiousness 1 and symptom onset for outbreak control 2
<cluster=8, paper_id=152>
title: Title: Interventions targeting air travellers early in the pandemic may delay local outbreaks of SARS-CoV-2
<cluster=9, paper_id=663>
title: Contacts in context: large-scale setting-specific social mixing matrices from the BBC Pandemic project
<cluster=10, paper_id=224>
title: Structure-Guided Mutagenesis Alters Deubiquitinating Activity 2 and Attenuates Pathogenesis of a Murine Coronavirus
<cluster=11, paper_id=387>
title: Metagenomic Nanopore sequencing of influenza virus 1 direct from clinical respiratory samples 2 3
<cluster=12, paper_id=736>
title: Pangolin homology associated with 2019-nCoV
<cluster=13, paper_id=661>
title: The influenza A virus endoribonuclease PA-X usurps host mRNA proce