In [1]:
import os
import pandas as pd
import gc

import tensorflow_hub as hub
from scipy.spatial.distance import cdist

import pytextrank
import spacy

In [7]:
path = 'C:/Users/bill/Documents/projects/data/covid19/open_research/clean_csv/'
clean_comm = pd.read_csv(os.path.join(path, 'clean_comm_use.csv'), nrows=5000)
clean_comm['source'] = 'clean_comm'
#clean_pmc=pd.read_csv(path+"clean_pmc.csv")
#clean_pmc['source']='clean_pmc'
biox = pd.read_csv(os.path.join(path, 'biorxiv_clean.csv'))
biox['source'] = 'biorx'

all_articles=pd.concat([biox, clean_comm])
all_articles.fillna("Unknown", inplace=True)
all_articles.head()

# clean up
del biox, clean_comm
gc.collect()

# load sentence embedding
module_url = 'C:/Users/bill/Documents/projects/data/sents/universal_sentence_encoder_large'
embed = hub.load(module_url)

def get_top_similar(sentence, sentence_list, embed_vectors, top):
    similarity_row = cdist(embed_vectors, embed([sentence]).numpy()[0].reshape(1, -1), 'cosine').reshape(-1)
    results = similarity_row.argsort()
    return [ (results[i], sentence_list[results[i]]) for i in range(top + 1) ]

# find similar sentences
sentence_list = all_articles.title.values.tolist()
embed_vectors = embed(sentence_list).numpy()
sentence = 'Role of the environment in transmission'
similar = get_top_similar(sentence, sentence_list, embed_vectors, 10)
print('similar title to {}'.format(sentence))
for sent in similar:
    print('- {}'.format(sent[1]))
print('\n')
    
# extract title and abstract from similar articles
ind, title = list(map(list, zip(*similar)))
titles = []
texts = []
for i in ind:
    titles.append(all_articles.iloc[i]['title'])
    texts.append(all_articles.iloc[i]['abstract'])
    
import re
def clean(txt):
    txt=re.sub(r'\n','',txt)
    txt=re.sub(r'\([^()]*\)','',txt)
    txt=re.sub(r'https?:\S+\sdoi','',txt)
    return txt

texts = list(map(clean, texts))
text_list = ' '.join(texts)
print('after joining all the similar abstracts:')
print(text_list[1:300] + ' ...')

similar title to Role of the environment in transmission
- Aerobiology and Its Role in the Transmission of Infectious Diseases
- Regulatory Role of Small Nucleolar RNAs in Human Diseases
- Bioaerosols Play a Major Role in the Nasopharyngeal Microbiota Content in Agricultural Environment
- Environmental Health Outdoor environments and human pathogens in air
- Transmission of Infectious Diseases En Route to Habitat Hotspots
- The role of absolute humidity on transmission rates of the COVID-19 outbreak
- Role of fomites in SARS transmission during the largest hospital outbreak in Hong Kong
- Microglia Play a Major Role in Direct Viral-Induced Demyelination
- The role of respiratory viruses in the etiology of bacterial pneumonia An ecological perspective
- Divergent Roles of Autophagy in Virus Infection
- Title: Transmission potential of COVID-19 in South Korea


after joining all the similar abstracts
bstractAerobiology plays a fundamental role in the transmission of infectious diseases. 

In [4]:
nlp = spacy.load("en_core_web_sm")
tr = pytextrank.TextRank(logger=None)
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

doc = nlp(text_list)

print("pipeline", nlp.pipe_names)
print("elapsed time: {} ms".format(tr.elapsed_time))

pipeline ['tagger', 'parser', 'ner', 'textrank']
elapsed time: 130.84745407104492 ms


In [8]:
for phrase in doc._.phrases[:10]:
    #print("{}".format(phrase.text))
    print(phrase.chunks)

[virus infection, virus infection]
[lower respiratory viral infection]
[upper respiratory viral infection]
[disease transmission, disease transmission]
[airborne infectious diseases]
[infection control practitioners]
[viral co-infection]
[respiratory viruses, respiratory viruses, respiratory viruses]
[infection risk]
[MHV infection]
