In [1]:
from typing import List

import random
import numpy as np
import pandas as pd
from conv19_papers import CovidPapers

# sources
biorxiv_medrxiv = 'CORD-19-research-challenge/2020-03-13/biorxiv_medrxiv/biorxiv_medrxiv/'
comm_use_subset = 'CORD-19-research-challenge/2020-03-13/comm_use_subset/comm_use_subset/'

In [2]:
biomed = CovidPapers(biorxiv_medrxiv)
biomed

<COVID-19(biorxiv_medrxiv, papers=803)>

In [3]:
def load_papers_with_text(conv: CovidPapers, indices: List[int]):
    """For every paper grab its title and all the available texts."""
    if not isinstance(conv, CovidPapers):
        raise ValueError(
            f'{type(conv)} is not an instance of CovidPapers.')

    batch = []
    papers = conv.load_papers(indices=indices)
    for i, paper in zip(indices, papers):
        title = paper['metadata']['title']
        texts = []
        for key in ('abstract', 'body_text', 'back_matter'):
            sequences = [x['text'] for x in paper[key]]
            for string in sequences:
                if len(string) == 0 and string in texts:
                    continue
                texts.append(string)
        batch.append({'id': i, 'title': title, 'texts': texts})
    return batch

In [4]:
# lets start with 5 random papers (samples)
samples = list(range(1, biomed.num_papers))
indices = random.sample(samples, k=5)

# each paper is loaded with its title and all texts
batch = load_papers_with_text(biomed, indices)

print(f'number of papers: {len(batch)}')
print(f'keys: {batch[0].keys()}')

number of papers: 5
keys: dict_keys(['id', 'title', 'texts'])


In [5]:
# number of text documents per paper index
[(p, len(batch[i]['texts'])) for i, p in enumerate(indices)]

[(520, 29), (726, 43), (476, 18), (637, 15), (753, 34)]

In [8]:
from david.cosine import SimilarDocuments

max_k = 802
samples = list(range(1, biomed.num_papers))
clusters = random.sample(samples, max_k)
papers = biomed.load_papers(clusters)

titles_cluster = []
for paper in papers:
    title = paper['metadata']['title']
    titles_cluster.append(title)

sd = SimilarDocuments(titles_cluster, ngram=(1, 3))
sd.learn_vocab()

# lets find the most similar titles given one title from the cluster
title_query = titles_cluster[5]
print(title_query)

Clinical Characteristics on 25 Discharged Patients with COVID-19 Virus Returning


In [25]:
task_1 = [
    'Smoking, pre-existing pulmonary disease',
    'Co-infections (determine whether co-existing respiratory/viral infections make the virus more transmissible or virulent) and other co-morbidities',
    'Neonates and pregnant women',
    'Socio-economic and behavioral factors to understand the economic impact of the virus and whether there were differences.',]

sd.add_query(task_1, clear_first=True)

In [26]:
top_k = 40
similar_clusters = list(sd.iter_similar(top_k+1))

similar_papers = []
for c in similar_clusters[1: top_k+1]:
    paper_id, score, title = c['doc_id'], c['sim'], c['text']
    paper_id = paper_id.tolist() # items are instance of numpy.int64
    if paper_id not in similar_papers:
        similar_papers.append(paper_id)
    print('<paper_id={}, score={}>\n * title: {}\n'.format(paper_id, score, title))

sd.clear_queries()

<paper_id=653, score=0.1207>
 * title: Existing host range mutations constrain further emergence of RNA viruses

<paper_id=537, score=0.0971>
 * title: TITLE: Pulmonary Metagenomic Sequencing Suggests Missed Infections in Immunocompromised AFFILIATIONS

<paper_id=274, score=0.0946>
 * title: Title: The impact of persistent bacterial bronchitis on the pulmonary microbiome of

<paper_id=482, score=0.071>
 * title: Aberrant pathogenic GM-CSF + T cells and inflammatory CD14 + CD16 + monocytes 1 in severe pulmonary syndrome patients of a new coronavirus 2 3

<paper_id=385, score=0.039>
 * title: Case fatality rate of novel coronavirus disease 2019 in China 1

<paper_id=91, score=0.0382>
 * title: Acute Myocardial Injury of Patients with Coronavirus Disease 2019

<paper_id=575, score=0.0342>
 * title: Hypokalemia and Clinical Implications in Patients with Coronavirus Disease 2019 (COVID-19)

<paper_id=513, score=0.0342>
 * title: Association of Population migration and Coronavirus Disease 20

In [27]:
similar_batch = load_papers_with_text(biomed, similar_papers)

In [28]:
# flatten list of string sequences from all the papers
similar_texts = []
for batch in similar_batch:
    for texts in batch['texts']:
        similar_texts.append(texts)

In [29]:
similar_texts[0]

'Nothing to declare.'

In [13]:
from qaam import QAAM

Using TensorFlow backend.


In [40]:
qaam = QAAM(.1, top_k=20, metric='cosine', mode='tfidf')

qaam.texts_from_doc(similar_texts)
entities = qaam.common_entities(None, lower=True, lemma=True)
entities[:10]

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




[('1', 286),
 ('2', 205),
 ('3', 169),
 ('two', 155),
 ('zika', 146),
 ('rna', 125),
 ('first', 124),
 ('4', 123),
 ('5', 121),
 ('wuhan', 119)]

In [41]:
question = "What are the symptoms in relation to death?"
qaam.answer(question, render=True)

[38;5;4mℹ * Removed 0 tokens from 9680[0m


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 91.82it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 2822.55it/s]


In [42]:
question = "What are the pre-existing pulmonary disease risk factors?"
qaam.answer(question, render=True)

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 98.12it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 2003.97it/s]


In [43]:
question = "Do viral infections make the virus more transmissible?"
qaam.answer(question, render=True)

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 77.99it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 5115.00it/s]


In [44]:
question = "What is the economic impact of the virus?"
qaam.answer(question, render=True)

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 50.63it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 4275.54it/s]


In [47]:
question = "What about transmission dynamics of the virus?"
qaam.answer(question, render=True)

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 86.65it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 1898.73it/s]


In [48]:
question = "What measures could be effective for control?"
qaam.answer(question, render=True)

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 73.06it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 3901.68it/s]


In [69]:
question = "What about Neonates and pregnant women?"
qaam.answer(question, render=True)

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 50.01it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 5957.82it/s]


In [75]:
question = "Do existing infections make COVID-19 more viral?"
qaam.answer(question, render=True)

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 120.31it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 2126.93it/s]


## Clustering 2

> Clustering using a transformers wrapper API `SentenceTransformers` and scikit-learn's `AgglomerativeClustering`

In [49]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering

embedder = SentenceTransformer('bert-base-wikipedia-sections-mean-tokens')

100%|██████████| 405M/405M [01:35<00:00, 4.25MB/s] 


In [50]:
max_k = 802
samples = list(range(1, biomed.num_papers))
clusters = random.sample(samples, max_k)
papers = biomed.load_papers(clusters)

titles_cluster = []
for paper, cluster in zip(papers, clusters):
    title = paper['metadata']['title']
    titles_cluster.append((title, cluster))
    
titles, clusters = zip(*titles_cluster)

In [54]:
num_clusters = len(set(titles_cluster))

titles_embedding = embedder.encode(titles)
ac = AgglomerativeClustering(n_clusters=num_clusters)
ac.fit(titles_embedding)
cluster_assignment = ac.labels_

In [57]:
clustered_titles = [[] for i in range(num_clusters)]

for title_id, cluster_id in enumerate(cluster_assignment):
    title_id = titles_cluster[title_id]
    clustered_titles[cluster_id].append(title_id)
    
for i, cluster in enumerate(clustered_titles):
    print(f'cluster<({i+1})>')
    for row in cluster:
        print(f'paper_id: {row[1]} - {row[0]}')
    print('')

cluster<(1)>
paper_id: 656 - Title 1 Choanoflagellate transfection illuminates their cell biology and the ancestry of animal septins 2 3 Short Title 4 Robust transfection in choanoflagellates 5 6 Authors 7

cluster<(2)>
paper_id: 1 - Public Exposure to Live Animals, Behavioural Change, and Support in Containment Measures in response to COVID-19 Outbreak: a population-based cross sectional survey in China

cluster<(3)>
paper_id: 2 - Classification: Biological Sciences Minor: Microbial Biology Site specific target binding controls RNA cleavage efficiency by the Kaposi's sarcoma-associated herpesvirus endonuclease SOX

cluster<(4)>
paper_id: 654 - SARS-CoV-2 sensitive to type I interferon pretreatment

cluster<(5)>
paper_id: 314 - Kinetic Analysis of Bacteriophage Sf6 Binding to Outer Membrane Protein A Using Whole Virions

cluster<(6)>
paper_id: 732 - Trypsin treatment unlocks barrier for zoonotic coronaviruses infection. Running Title: Trypsin unlocks barrier for zoonotic CoV infection


In [59]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

embedder = SentenceTransformer('bert-base-nli-mean-tokens')

100%|██████████| 405M/405M [01:04<00:00, 6.28MB/s] 


In [68]:
titles_embeddings = embedder.encode(titles)

# perform k-mean clustering
n_clusters = 5
clustering_model = KMeans(n_clusters=n_clusters)
clustering_model.fit(titles_embeddings)
cluster_assignment = clustering_model.labels_

clustered_titles = [[] for i in range(n_clusters)]
for title_id, cluster_id in enumerate(cluster_assignment):
    cluster = titles_cluster[title_id]
    clustered_titles.append(cluster)

for i, cluster in enumerate(clustered_titles):
    if cluster:
        title, paper_id = cluster[0], cluster[1]      
        print(f'<cluster={i+1}, paper_id={paper_id}>')
        print(f'title: {title}')

<cluster=6, paper_id=569>
title: A PROOF OF CONCEPT FOR A SYNDROMIC SURVEILLANCE SYSTEM BASED ON ROUTINE AMBULANCE RECORDS IN THE SOUTH-WEST OF ENGLAND, FOR THE INFLUENZA SEASON A PREPRINT
<cluster=7, paper_id=339>
title: MUC5AC drives COPD exacerbation severity through amplification of virus-induced airway inflammation
<cluster=8, paper_id=643>
title: Kin and group selection are both flawed but useful data analysis tools
<cluster=9, paper_id=479>
title: A preliminary study on serological assay for severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) in 238 2 admitted hospital patients
<cluster=10, paper_id=123>
title: PhyloFold: Precise and Swift Prediction of RNA Secondary Structures to Incorporate Phylogeny among Homologs Associate Editor: XXXXXXX Received on XXXXX; revised on XXXXX; accepted on XXXXX
<cluster=11, paper_id=576>
title: Genotypic diversity, circulation patterns, and co-detections among rhinoviruses in Queensland, 2001
<cluster=12, paper_id=227>
title: Genetic 