In [164]:
import pandas as pd
import glob
import json
import numpy as np 
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gila0000\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gila0000\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gila0000\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [165]:
# random functions
def get_random():
    files = glob.glob(r'CORD-19-research-challenge/**/pdf_json/*.json', recursive=True)
    random_files = np.random.choice(files, int(len(files)*.1))
    return random_files

In [166]:
# Write a file reader a file and tokenize the data.

def reader ():
    File = get_random()
    dataframe = {"title": [],"text_abstract":[], "text_body": []}
    dataframe = pd.DataFrame.from_dict(dataframe)
    for i,file in enumerate (File):
        tuples = {"title": None, "text_abstract": None,"text_body": None}
        with open(file) as json_data:
            
            data = json.load(json_data)
        
            tuples['title']=data['metadata']['title']
               
            abstract_text= []
            body_text = []
        
            for a in data['abstract']:
                abstract_text.append(a['text'])
                
       
            for b in data['body_text']:
                body_text.append(b['text'])
            

            body = "\n ".join(body_text)
            abstract = "\n". join(abstract_text)
            tuples["text_abstract"] = abstract
            tuples['text_body']=body 
            dataframe = dataframe.append(tuples, ignore_index=True)

    return dataframe

In [167]:
df = reader()

In [168]:
stop_words = nltk.corpus.stopwords.words('english')
stop_words
stop_words[:20]
len(stop_words)

179

In [174]:
def normalize(txt):
    #txt = reader()
    txt = re.sub(r'[^a-zA-Z0-9\s]', ' ', str(txt), re.I|re.A)
    txt = re.sub("(^|\W)\d+($|\W)", " ", txt)   #remove whitespace and numbers
    txt = txt.replace('title', '')       #remove 'title'
    txt = re.sub('[!#?%*&$)@^(.,-=+:";]', '', txt)       #remove punctuation
    txt = re.sub(r"\b[a-zA-Z]\b",'',txt)        #remove single letters
    txt = re.sub(r'\d+', '', txt)
    txt = re.sub(r'\\b[A-Z a-z 0-9._ - ]*[@](.*?)[.]{1,3} \\b', '', txt)#remove email
    txt = txt.replace('introduction', '')       #remove 'introduction'
    txt = txt.replace('text', '')
    txt = txt.replace('background', '')         #remove 'background'
    txt = txt.replace('abstract', '') 
    txt = txt.replace('\\n', ' ')   
    txt = txt.replace('\n', ' ')
    txt = txt.replace('///', ' ') 
    txt = txt.replace("'", '')
    txt = re.sub(r'^\w\w?$', '',txt) 
    txt = txt.lower()
    return txt

In [175]:
from nltk.stem import WordNetLemmatizer
def wordtokenizing(txt):
    wordnet_lemmatizer = WordNetLemmatizer()
    word = normalize(txt)
    tokens = nltk.word_tokenize(word)
    word_tokens = [t for t in tokens if t not in stop_words]
    lemmatized_word = [wordnet_lemmatizer.lemmatize(word) for word in word_tokens]
    return ' '.join(word_tokens)


In [176]:
normalize_corpus = np.vectorize(wordtokenizing, otypes=[np.ndarray])

In [177]:
norm_corpus = normalize_corpus(list(df['text_body']))

In [178]:
tf = TfidfVectorizer(preprocessor= wordtokenizing,stop_words=stop_words)
tfidf_matrix = tf.fit_transform(norm_corpus)
tfidf_matrix .shape

(4594, 263345)

In [None]:
from sklearn import metrics
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt

# run kmeans with many different k
distortions = []
K = range(1, 50)
for k in K:
    km = KMeans(n_clusters=k, random_state=42).fit(r)
    km.fit(r)
    distortions.append(sum(np.min(cdist(X_reduced, k_means.cluster_centers_, 'euclidean'), axis=1)) / tfidf_matrix.shape[0])


X_line = [K[0], K[-1]]
Y_line = [distortions[0], distortions[-1]]

# Plot the elbow
plt.plot(K, distortions, 'b-')
plt.plot(X_line, Y_line, 'r')
plt.xlabel('Number of cluster')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show() 

In [197]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=10,init='k-means++')
km.fit(tfidf_matrix)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [180]:
%time
%timeit
df['kmeans_cluster'] = km.labels_

Wall time: 0 ns


In [232]:
from sklearn.metrics import silhouette_score
score_max = -1 #this is the minimum possible score
for k in range(1,10):
    silhout= silhouette_score(tfidf_matrix,  df['kmeans_cluster'])
    print ("For n_clusters = {}, silhouette score is {})".format(k,silhout))

For n_clusters = 1, silhouette score is 0.016029273587150627)
For n_clusters = 2, silhouette score is 0.016029273587150627)
For n_clusters = 3, silhouette score is 0.016029273587150627)
For n_clusters = 4, silhouette score is 0.016029273587150627)
For n_clusters = 5, silhouette score is 0.016029273587150627)
For n_clusters = 6, silhouette score is 0.016029273587150627)
For n_clusters = 7, silhouette score is 0.016029273587150627)
For n_clusters = 8, silhouette score is 0.016029273587150627)
For n_clusters = 9, silhouette score is 0.016029273587150627)


In [252]:
text_clusters = (df[['kmeans_cluster', 'text_body']]
                  .sort_values(by=['kmeans_cluster', 'text_body'], 
                               ascending=False)
                  .groupby('kmeans_cluster').head(20))

In [257]:
topn_features = 9
ordered_centroids = km.cluster_centers_.argsort()[:, ::-1]
ordered_centroids

array([[ 35975, 252151, 141888, ..., 151979, 151978,      0],
       [ 37783,  37609, 146025, ..., 164159, 164158,      0],
       [174775,  50635, 183608, ..., 168811, 168810,      0],
       ...,
       [101310, 193394,  34876, ..., 166309, 166308,      0],
       [144409,  50562, 144415, ..., 173157, 173156,      0],
       [191341, 252151, 205598, ..., 162336, 162335, 131672]], dtype=int64)

In [258]:
feature_names = tf.get_feature_names()

In [259]:
for cluster_num in range(10):
    key_features = [feature_names[index] 
                        for index in ordered_centroids[cluster_num, :topn_features]]
    texts =text_clusters[text_clusters['kmeans_cluster'] == cluster_num]['text_body'].values.tolist()
    print('CLUSTER #'+str(cluster_num+1))
    print('-'*20)
    print('Key Features:', key_features)
    print('Text :', texts)
    print('-'*80)

IndexError: list index out of range

In [None]:
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
for cluster_num in range(0,10):
    texts =text_clusters[text_clusters['kmeans_cluster'] == cluster_num]['text_body'].values.tolist()
    print('CLUSTER #'+str(cluster_num+1))
    print('-'*80)
    
       
    text_summary = wordtokenizing(texts)

    tf = TfidfVectorizer()
    text_summary  = [text_summary ]
    tf_mat = tf.fit_transform(text_summary)

    matrix = (tf_mat*tf_mat.T)
    nx_form =nx.from_scipy_sparse_matrix(matrix)

    ranks =nx.pagerank(nx_form)

    similairity = np.zeros([len(text_summary), len(text_summary)])
    from sklearn.metrics.pairwise import cosine_similarity
    cosin = cosine_similarity(fitting[0:1], fitting)
    trans = cosin  * cosin .T

    nx_graph = nx.from_numpy_array(trans)
    s = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((s[i],n) for i,n in enumerate(text_summary)), reverse=True)
    print("Indexes of top ranked_sentence order are ", ranked_sentences)




In [None]:
for i in range( len(texts)):
    print(ranked_sentences[i][1])
   

In [231]:
for cluster_num in range(0,10):
    print('CLUSTER #'+str(cluster_num+1))
    print('-'*80)
    text_summary.append(" ".join(ranked_sentences[cluster_num][1]))

    # output the summarize texr
    print("Summarize Text: \n", ". ".join(text_summary))

CLUSTER #1
--------------------------------------------------------------------------------
Summarize Text: 
CLUSTER #2
--------------------------------------------------------------------------------


IndexError: list index out of range

In [219]:
for cluster_num in range(0,10):
    key_features = [feature_names[index] 
                        for index in ordered_centroids[cluster_num, :topn_features]]
    texts =text_clusters[text_clusters['kmeans_cluster'] == cluster_num]['text_body'].values.tolist()
    print('CLUSTER #'+str(cluster_num+1))
 
    print('-'*80)
    nlp = en_core_web_sm.load()

# add PyTextRank to the spaCy pipeline
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    doc = nlp(str(texts))
        
     
    for token in doc._.phrases:
        print(token.text, '|', token.rank)

   

CLUSTER #1
--------------------------------------------------------------------------------
sars infection | 0.06696761625398835
sars patients | 0.06695687963014531
sars cases | 0.06570966553878833
sars | 0.06419141548548374
acute sars infections | 0.06312412696317964
acute sars patients | 0.06311440722591405
adult sars patients | 0.06305159318433111
pediatric sars patients | 0.06304142903615192
sars transmission | 0.06277620664328966
probable sars patients | 0.062441194992379316
new sars cases | 0.062364973247293114
recovering sars patients | 0.06236222377135184
cumulative sars cases | 0.06177574479923795
severe sars disease | 0.06144655495180787
sars outbreaks | 0.06111526627320037
sars specimens | 0.060867130569732884
sars accumulative cases | 0.06079878037008257
sars severity | 0.060792221777831806
acute sars | 0.060510827239188575
pediatric sars | 0.06042161202762328
sars coronavirus | 0.06021711261085862
sars vaccine development | 0.06006946515436451
sars disease enhancement | 0.

permissive cell-lines | 0.021104597928064135
the specific virus | 0.021094757574577756
mers-cov. | 0.021092783525554635
large numbers | 0.02109177293119916
healthy controls | 0.02107986503067409
day | 0.02106109287773683
human and veterinary vaccines | 0.021032982522542525
chimeric mers vlps | 0.020978485140801122
crystallographic data collection | 0.020966068701439328
a comprehensive gene expression profile | 0.02094432589819889
an infectious tuberculosis patient | 0.020934077622417543
efficient human-tohuman transmission | 0.02093364312335901
macrophage or epithelial cell origin | 0.020933313538684266
mers-s | 0.02093285502561501
rna | 0.0209270716045271
smaller number | 0.020925920590369657
humanto-human transmission capacity | 0.020924134066622066
respiratory failure | 0.02091994824767677
respiratory route | 0.02091573274811982
specific technology platforms | 0.02090795192466476
spot-forming cells | 0.020887902625196415
the mers-cov cellular receptor | 0.02085290415448317
dpp4 | 0.

personal protective equipment.\n data | 0.014120214962916829
level | 0.014102703314849425
slaughterhouse personnel | 0.01409125026378925
neither the infection status | 0.014081874701970242
the x-ray crystal structure | 0.01407863928935609
the high press coverage | 0.014075699539348174
strains | 0.014064502720965712
hand gloves | 0.014058660453529298
the highest peak | 0.014047941706550621
the available data | 0.014047511609930648
s1 domain | 0.014046280017794073
the hong kong outbreak | 0.014042680967524155
a lower reproduction number | 0.01402068651230366
recombinant baculovirus m1-st | 0.014017323691567902
a significant respiratory impairment | 0.014015616142800348
chronic renal failure | 0.014002919613932914
an attractive model | 0.014002376304141447
an alternative model | 0.014001489499548977
a valid model | 0.013999936686116276
polytron pt2100 tissue grinders | 0.013997760638628405
laboratory investigation | 0.013991211429288924
family and hospital outbreaks | 0.013987609253090009

ifn-β | 0.009399564256493501
the same gap procedure | 0.009397109465904434
the interaction patterns | 0.009395906307624734
the present strategy | 0.009387449036422918
physical examination | 0.00938431510785054
asn229gln mutant | 0.009383989196266661
healthy women | 0.009382750428713876
surface plasmon resonance | 0.009379604789576064
the gap-derived severity | 0.00937727689278852
the prolonged suspension period | 0.009375374257017757
seat assignment proximity | 0.009371416180925362
intranasal inoculation | 0.0093645986741111
the low-titer baculoviruses | 0.009354129656368749
the tested length | 0.009335091245788726
droplets | 0.00932731015217909
combinational ribavirin | 0.00932583563513003
key components | 0.009320614029159364
grand island | 0.009318433244630681
proprietary adjuvants | 0.009314649339637588
sore throat | 0.009314566436421317
peptides | 0.00930753419794678
the envelop spike | 0.009306961026605233
the first meeting | 0.009306406960524214
6 following transfection | 0.0093

as37 | 0.0065737512054573136
embase | 0.006568599591888696
the total cu-\n dn | 0.006555951886079203
immunostaining techniques | 0.00655406320954307
an spike s1 | 0.006552998174704704
the crystallographic software | 0.006551030232504143
immunohistochemistry | 0.00654536647804523
endotracheal intubations.\n | 0.006529830471143335
the common marmoset | 0.006522420373806136
hdpp4 | 0.006522267587343614
the baculovirus platform | 0.006516859238722278
c407 | 0.006511254843564785
subpleural consolidation | 0.006510748865931733
the log ratios | 0.006508277049909015
the causative agent | 0.006505243368286288
this novel pathogen | 0.006501679397359893
a "global threat | 0.0064968345733297135
stomach | 0.006496315306281493
monocyte-tropic chemokines | 0.006494909388366311
the heptad repeat 2 region | 0.006494382261715711
the cellular exudates | 0.0064936458881447254
a relatively flat surface | 0.006491802793722901
ad5-hdpp4 transduction | 0.006491480324624357
gastrointestinal bleeding | 0.006487

our understanding | 0.004871826573488568
the tube | 0.004870586666172939
the airline industry | 0.00486560955888496
the patient\s nasopharyngeal aspirate | 0.004865147583033329
cambridgeshire | 0.004859409425277652
version 2 chips | 0.004856117671798239
pubmed | 0.004853405885378574
hae.\n | 0.004850363139366102
aa | 0.0048495879377647485
dc-sign | 0.004848693616061558
publication | 0.004848562050511669
a suspension | 0.004846835724440406
the exam cost | 0.00484089266088129
som | 0.004836701922200499
a symmetric matrix | 0.004835849006406735
a paradigm shift | 0.004835523999155795
oman | 0.004828560927724526
conjunctiva | 0.004827974387819491
mhv | 0.004825028212340931
the us centers | 0.004824348160711952
the mainland of china | 0.004821858043822069
the core | 0.004821567038174287
an overall estimate | 0.004817706656622622
this 52-gene molecular signature.\n | 0.00481474237639549
cotransfection | 0.004809093169950495
the united states of america | 0.004808205604144503
hydration | 0.00

no consensus | 0.0015025927066692705
the rapidity | 0.0015025927066692705
| december | 0.0015025927066692705
3 box | 0.0015025927066692705
a diameter | 0.0015025927066692705
a continuum | 0.0015025927066692705
(i.e. nebulizers | 0.0013098272266451486
the in vitro ligation | 0.0012973614596099303
to 10 000fold | 0.0012901292377453055
/million pbmcs.\n | 0.0012844287838811304
the 11.6 nm | 0.0012782653209291279
3-4 kg | 0.001164408530260257
(103) nization | 0.0011631494964273327
10 mm hepes | 0.001090543828908797
the troops-hcws | 0.001090543828908797
the . https | 0.001090543828908797
the . https://doi.org/10 | 0.001090543828908797
approximately * 157 kda | 0.0010528428333764408
31-33).\n portions | 0.001023206739949861
about 5 or 6.4 d[11 | 0.0009290465804967544
(vol/vol | 0.0008854112435394085
the 1-12-year-olds | 0.0008756567015796192
(that is, quarantine | 0.0007602584557811476
it | 0.0
CLUSTER #2
--------------------------------------------------------------------------------
other

sick bats | 0.032483007550369536
human pathogen species richness | 0.03247505591589143
attenuated viral vaccines | 0.03245367366177087
migratory bats | 0.03244808971073661
viral metagenomics | 0.032446593265158925
viral contexts | 0.03242812236498417
hoary bats | 0.03242761083772098
serotine bats | 0.0324272673009895
affected bats | 0.03242586472286371
similar viral loads | 0.0324168623737259
viral therapy | 0.03241456760974182
picivorous bats | 0.03240340668220641
frugivorous bats | 0.032400550444386965
high inter-species entropy | 0.03239872405444938
endangered bats | 0.032386522168369854
viral genomics | 0.03237455125313294
viral stocks | 0.03236400534691321
german bats | 0.03235895906792603
viral threats | 0.032351891929150056
appropriate pcr primer pairs | 0.03233157360674045
many cells | 0.032319418821335044
known rabid bats | 0.032316835280373955
other bacterial species | 0.032283354980402565
non-adapted hosts | 0.03226975702677794
captive egyptian fruit bats | 0.032197039015489

rna transfection di rnas | 0.024156410280433138
human areas | 0.02415404220675621
infectious diseases | 0.02415369382730801
the inserted intergenic sequence.\n virus-specific rnas | 0.024152652641823497
other nidoviruses | 0.02414441836839845
poly(a)-binding protein | 0.024134876913659967
association analysis | 0.024130926253198775
clinical disease | 0.024123196247221357
rna-rna | 0.024119025210621498
cross-annealing.\n a reference sequence | 0.024118756864049632
bacterial disease | 0.02410725255283766
flavivirus genome replication | 0.024105162203092282
a subgenomic rna | 0.024097324786904656
dna-protein interaction study | 0.024089072104780514
other drugs | 0.024067941043500112
dna synthesis | 0.024066824593970016
other taxa | 0.02406352108217263
internal reference genes | 0.02406215628990387
an upstream or downstream rna sequence | 0.024057854048004357
steadystate gene expression levels | 0.024057021860911663
the genomic di rna | 0.024026359938733626
other individuals | 0.0240191688

bat-associated zoonoses | 0.021003005072734747
expected number | 0.02099165234747133
a previously published lassa virus | 0.020990073257623556
infected boa constrictor tissues | 0.020989736574625047
avian proventricular dilatation disease | 0.02098952609441168
larger body mass | 0.02098034875667257
a complete viral database | 0.02097467655746602
northern lower level | 0.020971659353587514
newcastle disease poultry | 0.020962144182942136
genome fragments | 0.020945677562365178
specific positions | 0.020945237794300332
bud blight disease | 0.02094144594218652
the partial s protein region | 0.02094089743548837
the 1918 spanish influenza virus | 0.020940293470182666
inactivated phylogenic analysis | 0.0209379722260557
different biological properties | 0.020929657562586846
higher assembly accuracies | 0.02092872590002043
the non-amplification nucleic acid detection methods | 0.02092785924030493
trsv replication | 0.020918421452434798
small rnas | 0.020916844467290658
different incubation te

new geographic areas.\n | 0.017267219459775923
similar size | 0.01726162326902308
fig 3f | 0.01725855422643283
reaction products | 0.017252830216412444
support data structures | 0.017248108395817176
rna quantity | 0.017246172584592722
di-nrui subgenomic rnas | 0.017230864746923005
large quantities | 0.01721715080007461
complimentary sequencing adapters | 0.01721370547223628
cov vaccines | 0.017211186071026664
a small ring domain-containing zinc binding protein | 0.017208889997399245
the high replication fidelity | 0.01720573150275024
(vero-76 cells | 0.017198913234608634
synonymous, non-synonymous or non-coding mutations.\n | 0.017187056963557268
an rna clean and concentrator column | 0.017185331401443505
various cellular processes | 0.017182897831644936
generalized additive models | 0.017182672628370522
region-specific terms | 0.017182024710396177
a full-length complementary negative-strand rna | 0.0171795191899764
a novel natural host | 0.017176820544216147
pcr and real-time pcr test

important functional consequences | 0.014501830658306472
molecular mechanisms | 0.014496441904449313
the human immune system | 0.014496283374699737
0.2% diethyldithiocarbamate | 0.014495768489769386
sgmrna levels | 0.014495241042139497
several european countries | 0.014493748331045409
naked-rumped tomb bats | 0.014493167158453977
habitat types | 0.01449161563022455
0.3% sds | 0.014490758187570282
1114 silver-haired bats | 0.0144906262696436
many modern characteristics | 0.014490378724045593
the annulated tree boa samples | 0.014487335472800894
known or unknown disease markers | 0.014482151385248249
10% bleach | 0.014481905186669171
71% reduction).\n | 0.014481905186669171
5% w.r.t | 0.014481905186669171
5% co | 0.014481905186669171
low-quality, lowcomplexity, and host-derived sequences | 0.014481249741769816
cavv orf1b expression | 0.014475396124245546
these pedv strains | 0.014474835940189579
respiratory, gastrointestinal and other infections | 0.014474158966494012
each given zoogeogr

additional bands | 0.012599908023700009
the v9 region | 0.012598649943471244
roost size | 0.012598628350598221
sic and lic samples | 0.012594586136254495
unique barcodes | 0.012590065094861876
the ribosomebinding region | 0.012574459550028307
the 3=-proximal region | 0.012573736157190304
the nterminal region | 0.012573611478191065
horizontal transmission | 0.012571849022815997
warmer colors highlight areas | 0.012567926603200507
zoonotic spread.\n | 0.012565713673013321
pig producers | 0.012564170997403777
the estimated number | 0.01256293559557765
the low quality ends | 0.012557147729256462
significant k-values | 0.012553917421015168
this pooled sample | 0.012552865536934953
suckling piglets | 0.012551710134755756
pairwise distances | 0.012548208789321877
shared evolutionary history | 0.012543415925187247
the observed number | 0.01254295634771427
experimental evidence | 0.012542510778316718
species-specific markers | 0.012541657827589484
the cavv replication apparatus | 0.012540544957

most two other accessions | 0.011090338752939464
association | 0.011089723539176637
temperature | 0.011087540946177227
di-d20 stui | 0.011081233022649531
definition k-mer | 0.011081194049444213
three-step pcr | 0.011079620818191705
significant hurdles | 0.011079402344084749
evidence | 0.011077290760412951
step k-mers | 0.011075653075530087
real-time pcr reliability | 0.011074184359171013
low-stringency wash solution | 0.011073361309101113
numerous examples | 0.01107302502132445
varying proportions | 0.011069866618526096
(rva) reference genomes | 0.011067223466915885
live-vector vaccines | 0.011066180638899432
cdna clones | 0.011065938886387773
the associated rat population | 0.01106390259063443
terminating base | 0.011056581757918373
asian influenza | 0.011054683747071845
the input data | 0.011053682501934633
the allele effects pattern | 0.011053555779245888
the median eqtl-gene distance | 0.0110523304868852
local regulations | 0.011049448027733218
conventional kobuvirus-negative pigle

the last k-mer occurrence | 0.009574832922400241
the culture scale | 0.00956903490089323
the year.\n strand-specific rt-qpcr | 0.009568028967275398
the 18-nucleotide-long analysis | 0.009563604967122157
ribosome binding | 0.009557800721495605
the mean isolation rates | 0.009557046336047282
various and unexpected sizes | 0.009555117636592695
active rabies | 0.009550129164380042
de novo the published ones | 0.009546944511129871
an ancestry-based approach | 0.009545699675934782
avian myeloblastosis | 0.00954412004551339
a hitachi h-7500 transmission electron microscope | 0.009539902708741886
blastn search options | 0.009539586786281848
the first generation | 0.00953617577060066
amino | 0.009534752670711242
etiological diagnosis | 0.009534444196779773
avian bornavirus | 0.00953196214926882
≤100 copies/reaction | 0.009531799528723515
experimental section.\n nucleotide collection | 0.009531119962814152
proximal readthough element-distal readthrough element | 0.009527479870263391
mapping | 0.

a functional t7 promoter | 0.008254258537831568
the body wall | 0.008253459694130338
eukaryotic organisms | 0.008252067803919235
min | 0.008250526703969883
titer determination | 0.008248602519643435
barcodes | 0.008243110250862912
diarrheic episodes | 0.00824247180980868
an r library | 0.008241134408173973
all positive pools | 0.008238384002600888
two characteristic zinc-binding domains | 0.008236276150921602
vitamin e | 0.008235124716290001
archival (2014) fecal samples | 0.008234457211628885
sybr green fluorescence | 0.00823298278024656
genotype probabilities | 0.008229045535518438
(4) allergic airway disease | 0.00822495092499438
global dissemination | 0.008223202915392965
nutritional needs | 0.008222928650154296
sow milk | 0.008220547578164332
budding stage | 0.00821913170174959
a similar report | 0.00821626511363168
putative rfs | 0.008216259742970786
targeted-approaches | 0.008215165256315334
adult workers | 0.008213019210909982
iucn status | 0.008211455847075425
the established 

pedv propagation | 0.0071034087896845385
enough room | 0.007101093399294367
the extreme end | 0.007099448554791761
the endemic presence | 0.0070992314675534934
gradual accumulation | 0.0070985415317979484
a stable mutator phenotype | 0.00709808450779177
the cycling process | 0.00709728046215474
nonspecific annealing | 0.007096655453353514
sars-cov-2 | 0.007096336995155128
our complete dataset | 0.007093514823065889
heavy economic losses | 0.007093480688041495
application | 0.007091153706896249
the family roniviridae | 0.007089420368220214
fatal hepatitis | 0.007087825451925949
an rgd motif-dependent integrin-mediated entry pathway | 0.0070851425071617015
the north american outbreak | 0.007084102248074918
belgian rivers | 0.007082903084041833
a simple test | 0.007081127407101694
caudovirales and families siphoviridae | 0.007079659873363846
cdna | 0.007078076968764028
spectroscopy.\n library preparation | 0.007078017247752136
fdi-sacii | 0.007077711631793951
a defined ratio | 0.007077586

space occupation | 0.006222754505001253
(c ) founder | 0.0062165424901849415
this arna proteindetection technique | 0.006215929993562682
the ngs platforms | 0.006215436268564856
max-effort | 0.00621404645652133
a robust investigative platform | 0.0062130052386566775
particular emphasis | 0.0062128405446163605
cap methylation | 0.006210245517626872
the amplified 603-nt fragments | 0.006209522145664785
the easy adaptation | 0.0062092341853985
symptom onset | 0.006208216368374299
the major advantage | 0.0062076158219081115
em bed812 resin | 0.006204598312712531
a 0.4 m concentration | 0.006204004954298277
the internal controls/markers | 0.0062025128210806725
dogs | 0.006202243094122633
heuristic branch | 0.006200706311578921
testing | 0.0061993523287489465
oligonucleotides | 0.006197766519799924
the decreasing costs | 0.006197193205508175
flavivirus circularization | 0.006192083784812209
these or similar terms | 0.006190401125686003
the true example | 0.006190332617240312
transcript signa

any thread or process parallelism | 0.005229822459249883
pv39 | 0.0052292854190090655
dilution endpoints | 0.005228202636074026
compression scheme | 0.005228109612459825
the divisive amplicon denoising algorithm | 0.0052279937129008055
a local aquarium | 0.005227215394072688
the early lineages | 0.005226605320359352
the all mammal dataset | 0.005225242599785529
parsimony seed | 0.0052243871455478555
success | 0.005222859029855658
abscesses | 0.0052227542499821535
our best-fit gams | 0.005221355668867209
column de novo | 0.005219595158219588
nonhuman pathogens | 0.00521779457673197
vaccination | 0.00521524608022096
the computationally expensive steps | 0.0052149385097040365
premix ex taq | 0.005214467472750079
fine adjustments | 0.005213538304706495
pasture statistics | 0.0052132532437464095
organs | 0.005212499797134525
branches | 0.005212325972213387
the "small" nidoviruses | 0.00521092707029997
the phenotypic qtl | 0.0052093136859089565
return | 0.005208536735887859
the n-terminal po

a glycoprotein precursor | 0.004508699190734443
variance | 0.004507985976029812
nextera xt | 0.004507871405998907
their sole source | 0.004505660482671524
even brain microvascular injury | 0.004498799713649871
km820707 | 0.004495389762039044
each richness map | 0.004491923097704514
the international committee | 0.004490484784022514
topical administration | 0.004489724930818738
the road map | 0.004488197782784547
perissodactyla | 0.004488029745000326
a dry cotton rectal swab | 0.004487981143729209
only the first four digits | 0.004487720921506425
macronyssu | 0.004485804466293058
the renal lesion | 0.004484753811514319
the following year | 0.0044840506826392835
west caucasian | 0.004483506703982743
adenovirus | 0.004482610303402031
(1) nsp14 exon | 0.004482215631109916
marthaler | 0.004478446533839216
sepsis | 0.0044773671992564875
the mouse diversity | 0.004475885453752032
more work | 0.004475874073157689
the gel | 0.004475848087568821
bydv | 0.00447572305811784
the successful syntheti

mh098953 | 0.0037666226021126677
trimethoprim sulfa | 0.0037665459673452027
abb, ont | 0.0037656749365782617
letters | 0.0037651381557171405
trial | 0.0037633674931914452
bristol-myers squibb | 0.0037605035484601732
lonza | 0.0037603147664754155
cardiac junctin | 0.0037597816517498527
wallace | 0.003759315664110275
i.e. σ x | 0.0037585968919283054
either an expanded tropism | 0.003757720909346549
e.d. | 0.003756936117797178
pseudomonas | 0.003756790304282346
reli 4816 scanner | 0.0037544343797801336
inactivation | 0.0037538031014872964
ped | 0.003753753423703183
de-pe | 0.0037491172628996206
f 1 hybrids | 0.003748152113254471
rna-1 | 0.0037479973431814835
the basis | 0.003747833355912903
sic | 0.0037478150671535915
artibeus jamaicensis | 0.0037473869832898377
starting | 0.0037467385920657474
cultures.\n immunofluorescence | 0.0037456327113836703
mendelian | 0.003745366577175547
incorporation | 0.003745313641819831
genbank | 0.0037450910816237003
their genetically modified derivatives |

pockets | 0.003084723097496033
the sarbecovirus subspecies | 0.0030839939622760533
diamus | 0.003083892615581017
calves | 0.0030835023934915867
more importantly, a wholesale effort | 0.003081145001356782
relatedness | 0.0030811318425897514
the haplotype inference | 0.0030805066094713256
the haplotype reconstructions | 0.0030805066094713256
cyclin | 0.003079083936353179
spike | 0.0030789776691536033
igh-2 | 0.003078871914569351
suppress | 0.0030788405670625716
3a | 0.0030786381500408737
1 u udg | 0.003077297178652039
pc21a | 0.003075181861019358
hadziavdic | 0.003073313728059356
the big dye terminator | 0.0030727356447723727
rnalater | 0.0030721760892765478
alpha | 0.00307144912245065
sgmrna-n | 0.0030709697199375165
dinchuk | 0.0030701254186102324
chesler | 0.0030699775444779157
callahan | 0.0030699775444779157
jaitin | 0.0030699775444779157
dutp | 0.00306967172296889
gambian | 0.003069351805148313
reptiles | 0.0030682918490382593
arthropoda | 0.003067394379177798
a proliferative glome

sgrnas | 0.0025958506179755113
viz | 0.0025957685446744566
bip | 0.0025955177550316096
nod/ltj | 0.0025952242832026763
a retrospective view | 0.0025950691699702304
economies | 0.002594794664904276
psiblast | 0.0025946514054144528
arenaviridae | 0.0025941876593707047
eukarya | 0.0025941046915207017
materials and methods | 0.0025934613005766482
a gradient | 0.0025934224481698187
speciation | 0.002593400083464972
awareness | 0.002592444188429102
faeces | 0.002592019478158727
the filipino rainforests | 0.00259200561135397
officials | 0.002590352999063622
entrance | 0.0025898371352600824
hampers | 0.002588867621364159
the genus carmovirus | 0.002586882655608634
cavally | 0.002586693748765765
tswv | 0.002586308027178428
nepovirus | 0.0025861146239468022
(atcc no. ccl-81 | 0.0025858170157781533
sk | 0.0025857585341018097
longevity | 0.0025853661465776614
gammacoronavirus/avcov/chicken | 0.0025852172015749016
supplementation | 0.002584984507316855
raxml | 0.0025847414393945357
\cophenetic\ | 0

both axes | 0.0018479081308914467
the algae | 0.001845684775459444
the island | 0.0018453768136819633
(elb | 0.0018429734352067534
mm hepes | 0.0018343416739851895
the tops | 0.001828082881650221
the ⊕ (xor) operator | 0.0018230185094179096
/y | 0.0018229423439369825
the sequence(s) sur-ciency | 0.0018203144191487916
a paramyxovirus | 0.001817222412305618
(iv) villages | 0.0018138108486291285
the topic | 0.001808339748771973
the matchability | 0.0018068227853248078
-label | 0.0018062241727556192
a draft | 0.001801934709667689
this chapter | 0.0017989802258130576
a 2-to-3-fold decrease | 0.0017898127183859176
the tombusvirus | 0.0017895371839403154
the robustness | 0.0017878798412164017
e.g. cingulata | 0.0017868838974712106
(serotec | 0.0017824127531066145
nsp16 2\-o-mtase | 0.0017800510462207377
16 kb | 0.0017777585137820611
a highdensity | 0.0017739766040500257
their matching | 0.0017713182216291825
(dar | 0.0017712954495264172
the onset | 0.0017693314196362876
16s/18s rrna | 0.00176

respiratory virus infections | 0.060234501930960964
respiratory viral infections | 0.056358858655291805
virus infection | 0.05627271137701644
common respiratory viral infections | 0.05474026042561311
viral acute respiratory infections | 0.05473337356223829
study patients | 0.054599485692485135
nipah virus infection | 0.053250439771988546
respiratory infections | 0.053195283875667465
nipah virus patients | 0.05241594175909804
other viral infections | 0.05226283935242169
lower respiratory infections | 0.052099380514032505
viral upper respiratory tract infections | 0.05202891305291267
respiratory tract infection | 0.05154836388010621
viral infections | 0.05135248660915194
other respiratory viruses | 0.05119177354604187
rv respiratory infection | 0.050910268423749884
infl uenza virus infection | 0.05079962750179784
lower respiratory tract influenza infections | 0.050723059335397266
lower respiratory tract infection | 0.050662874337973784
acute respiratory tract infection | 0.05044943720504

deceased case-patients | 0.028394418425400192
the non-ari patients | 0.028384088561696958
clinical improvement | 0.02837669145712276
case-patients | 0.028372533776721505
hospitalized cases | 0.028370348855940995
adult asthma exacerbations necessitating hospital care | 0.028365600701314567
33 children ages | 0.028360382379058555
emergency room treatment | 0.02835707952164452
other common forms | 0.0283535654745304
severe acute respiratory syndrome-associated coronavirus | 0.02830497456689881
pathogenic viral dissemination | 0.02828296567311533
clinical outcomes | 0.02826797324925006
the other co-viruses | 0.02823586163778885
post-mortem stored chinese lung tissue samples | 0.028225795754612
different clinical presentations | 0.028217732104190586
the disease.\n hiv infection | 0.02821357972465413
rna detection | 0.028211669273336924
the median patient age | 0.028191985172706557
other specimens | 0.02818278610359383
other sot recipients | 0.028181077568461277
cases | 0.028179350477381925


metropolitan and non-metropolitan government non-teaching hospitals | 0.022594524134916978
70% overall coverage | 0.022591697578789992
varicella-zoster virus | 0.02258604113215113
these virus particles | 0.022584992846494525
pneumonia admissions | 0.02258214955370694
the study area | 0.02258034933352327
poorly conducted studies | 0.022575726154746435
homeopathic vaccines | 0.02257030171791647
novel a(h1n1) virus | 0.022566475136220777
local penicillin resistance rates | 0.022563234820420344
avian and novel swine viruses | 0.022554223020633608
non-typhoidal salmonella spp | 0.022530858573400116
oral polio vaccine | 0.022529407950011505
pandemic vaccines | 0.022527296224128412
the novel virus | 0.02252684557896828
a prospective blinded study | 0.02252217934921637
group b rv | 0.022521709367620574
the primary study | 0.022521064806312375
administrative health data | 0.02251867510972093
severe thrombocytopenia | 0.02251492515992586
epstein-barr virus | 0.022509831759276754
larger groups | 

adverse effects | 0.017675871030127698
recommended standard practice | 0.017665662544218048
uncontrolled, clinical reports | 0.017662161273431374
most trials | 0.017660681912932272
x-ray examination | 0.017657360414404622
the reference age group | 0.01765126973402114
influenza vaccination status | 0.01764934226654319
distinguish infants | 0.017644875838003844
bal collection).\n study cohort characteristics | 0.017640164476939505
hiv-infected mothers | 0.01763314749710026
human subjects | 0.017628902870067784
more than one virus | 0.017626020639435924
many government health clinics | 0.017624212162356597
steroid doses | 0.01761403259287151
the pandemic a(h1n1) virus | 0.017609789831459385
oral antibiotics | 0.017606628890279516
rt-pcr products | 0.017599410030418684
the improved detection rate | 0.017599193356143468
transmission and/or disease severity | 0.017597859749882774
232,391 treatment acceptability | 0.01758689208056431
both treatment arms | 0.01758689208056431
hospital stay | 0

hrv-c | 0.014599594443545357
n p | 0.014597357542719385
pneumonia hospitalisations | 0.01459574648000938
promising evidence | 0.01459357544402502
the other hand | 0.014593269607814381
chronic bilateral enlargement | 0.01459298429829225
viral, bacterial (aerobic and anaerobic cultures | 0.014585448825094684
oxidant stress response | 0.014577637313362344
pegylated recombinant human megakaryocyte growth and development factor | 0.01457395614270375
a licensed seasonal trivalent influenza vaccine | 0.014569316651431711
household contacts | 0.014567599774068803
childhood enuresis | 0.014565485071467415
iv penicillin | 0.014556636833072532
tympanostomy tube failure | 0.01455467933189123
iv amphotericin b infusions | 0.014553212769909278
household transmission | 0.01455094277234321
conventional medicine | 0.014549845687181907
pandemic a(h1n1) disease | 0.014545123449232289
culture-based methods | 0.014542326298263498
a retrospective, single-center study | 0.014537603938195856
a high percentage

the dominating pathogens | 0.012154882571476028
medication costs | 0.01215281059433002
vitamin a | 0.012151840142690088
immediate improvement | 0.012151743497398993
the wilcoxon ranksum test | 0.012151615279679328
a lower frequency | 0.012147860832973263
general practitioners | 0.012142630792337985
multiple occasions | 0.012140005790092038
duration | 0.012134644314320472
iv chloramphenicol | 0.012133659006012548
allograft rejection | 0.012132191941761343
remote areas | 0.012129301525654456
encephalitis-like illnesses | 0.012127863229341913
our test panel | 0.01212722995708631
poor parent-infant interaction | 0.012126863220816563
better sensitivity | 0.012126176859898644
a statewide laboratory dataset and hospital morbidity records | 0.012125189322107455
areas | 0.012123704610145784
f | 0.012119532915725276
cervical/ perinatal transmission | 0.012119271144282677
iv acyclovir | 0.012115723183809574
homeopathic preparations | 0.012115332805064704
prolonged shedding | 0.012115324179638984


the summer months | 0.0103347230337474
continuous variables | 0.010334101854392813
a central laboratory | 0.010330769058248332
baseline antibody titers | 0.010328823191485682
prior records | 0.010327151712329126
combinations | 0.010326977140445767
increases | 0.010320818988569637
airway reactivity | 0.010313937348647427
3 large retrospective series | 0.010313639955521587
approximately 10 5 median tissue culture infectious doses | 0.010310917760993302
social services | 0.010310742370027196
complications | 0.010310023531088593
long-term complications | 0.010308369935752529
only scientifically proven therapies | 0.01030530955868139
an open lung biopsy | 0.010304555915914646
an open label trial | 0.010301426841696876
another significant complication | 0.010300266764137499
ml | 0.010293226422141841
chinese | 0.010292374124222695
ev)-5′ noncoding region | 0.010286164461484082
more than 1500 cells | 0.010282926217500795
infected allografts | 0.010282188316763589
the antiretroviral use | 0.010

hearing loss | 0.00853473138860299
the recommended prophylaxis | 0.008533507296593357
251 a large-scale randomized trial | 0.008532685291045318
the most common deficiency | 0.008532420756907988
a nonimmunocompromised population | 0.008528886728420521
organisms | 0.008527457234069993
the most common serotypes | 0.008525634216778308
good accessibility | 0.008525116392343733
michael nissen | 0.00852498023229696
ecmo support | 0.008521116931081048
haemophilus influenzae | 0.008520618071478126
parental confidence | 0.008518878180171472
household quarantine | 0.008517090158562751
minute | 0.008514628275943112
s. norwalk | 0.008513784728969414
the immunocompromised and normal host | 0.008513333377851913
parental concerns | 0.008508604188220489
single | 0.008508578368288479
the major outcomes | 0.008507952302636125
allograft injury | 0.00850447916808363
identification | 0.00850032801066197
an immune-mediated disease | 0.008498376294425927
the most common coinfection | 0.008498032463184379
59 d

northern california | 0.006954850166971579
the 19valent technique | 0.00695212335181119
the assay | 0.006951669304496796
the naturopathic ear drops | 0.006950017856157884
monitoring | 0.0069498341679117
full disclosure | 0.006948935345550978
trace elements | 0.006946102801691551
more than 36 hours | 0.006942948593335401
the seasonal distribution | 0.006940595379103605
271 continuous research | 0.006939417811767487
127 a placebo-controlled, double-blind trial | 0.006937758624255021
figure s12 | 0.006937741733792
specificity | 0.0069370713188772306
autogenic training | 0.006931010977669091
the prospective open label | 0.00692876131915255
more than 48 h | 0.006928110596910412
behavioral conditioning | 0.0069272474009055215
ampicillin powder | 0.006922177588408979
major histocompatibility complex-ii | 0.006921437684033145
special consideration | 0.006920977973847112
any coexisting condition | 0.006920932458022475
the previous seasons | 0.00692053638550644
ϫ70њ c. collection | 0.00691551286

clinicaltrials.gov | 0.005709989689166877
⁄ | 0.005709989689166877
fli5670_hrva10_argentina_2010.71 | 0.005709989689166877
jx129408 | 0.005709989689166877
ab548898 | 0.005709989689166877
ab548899 | 0.005709989689166877
fli6051_hrva51_venezuela_2010.71 | 0.005709989689166877
cytokine | 0.005709989689166877
nct00562484 | 0.005709989689166877
pro-\n performing aerosol-generating procedures | 0.005708103843476264
watery eyes | 0.0057067745341151666
an abdominal computed tomography scan | 0.005706254049943647
fdr adjustment | 0.0057046692018343555
the second experiment | 0.005704215994129542
the northern and southern regions | 0.005704143089151512
mucosa | 0.0057021820439521415
infantile gastroenteritis | 0.005701332761764745
visual screening | 0.005697812106894889
professional and lay services | 0.005696569876731746
a polyvalent antibody | 0.0056915549878571985
a cdna synthesis procedure | 0.005690826167552659
myeloablative regimens | 0.005690685812799124
the individual features | 0.005687

tmp-smx | 0.00450870453598734
espoo | 0.004507469552927968
an animal experiment | 0.004506225054511061
tympanostomy | 0.00450420069437955
italy | 0.004498540486873611
sanofi-pasteur | 0.004497053283446839
the season | 0.004494553763662207
interview | 0.004493142990663843
parkville | 0.004493028515997402
an annual basis | 0.004492872204675246
bangladesh | 0.004490733383180621
csl ltds | 0.004487863325633178
"therapeutic orphans | 0.00448761686585012
citrus | 0.0044861253714753935
popularity.\n naturopathy | 0.004483648471792245
fdr | 0.004482005874273037
one step | 0.004481213182678407
26 \n nasal swabs | 0.004479429201563456
only the baseline specimen | 0.004478154421286813
a varied aetiology | 0.004477356302637909
aliquots | 0.004475543464872866
antihistamine | 0.004474576076625049
periungual desquamation | 0.004470897500760681
the production | 0.0044702061010071965
frequent, often bloody stools | 0.004470164797938312
laval | 0.004470058390012348
the two major digestive channels | 0.0

the literature | 0.003735031303827276
asia | 0.0037339870185352634
an open and sincere relationship | 0.0037335638380717627
bp)-is481 | 0.003730672829798403
purulent | 0.0037292083646063273
digoxigenin deoxy-uridine-5´-triphosphate | 0.0037276840467008457
researchers | 0.003727650463625329
keratoconjunctivitis | 0.0037216229187272323
dextromethorphan | 0.003720040138346374
atlanta | 0.0037187666770770187
3rd | 0.003718145206399537
109\n infantile colic | 0.0037180822951052694
the ability | 0.0037168466938691954
u/l. he | 0.0037135361311042858
reporting | 0.0037124996909107142
which components | 0.00371197417653871
levofloxacin | 0.0037105519938497172
rm | 0.003708312405320065
pcp | 0.003707274265379042
micu | 0.0037035107081648624
astrovirus | 0.003703167994736833
flaws | 0.0037026777685057145
the impact | 0.0037025980267304075
probenacid | 0.003701621526024976
calculation | 0.0037007939307313682
the precise route | 0.0036991213937186415
sne | 0.0036990258815687506
11 participating chi

postimmunization | 0.0027733044419707546
cpn)-16s-rrna | 0.0027719008710240254
infectiousness | 0.0027717247718788886
hivnoninfected | 0.0027714147514934255
disparity | 0.0027711041044711005
causation | 0.002770698438873255
(mild illness | 0.0027699650143146255
melbourne | 0.0027690131707118083
the tricyclic antidepressants | 0.002768722017664677
2 mg/ kgl | 0.0027682199519912494
hrvs | 0.002765828186987155
≈2 | 0.0027643845742910417
the road constellation | 0.002763612386838386
diarrhoea | 0.0027632446570081894
minnesota | 0.002761994877492115
colorado | 0.002761994877492115
(e.g., placebo herb | 0.002760252052948346
territories | 0.0027596427271582196
belgian | 0.0027581938760576213
a well-established cutoff | 0.002758127692516479
stata | 0.002757324513541623
the program | 0.0027568955575145854
mic | 0.0027563778536802115
the picture | 0.002755735454868523
dermatologists | 0.002755289396319152
mycobacterial | 0.002753940856215576
only one process | 0.002751405264402985
utensils | 0.0

public health preparedness | 0.05257457752399208
public health emergencies | 0.05187813294946772
other public health laws | 0.05019122425899112
public health preparedness capacity | 0.05008956177985917
public health disasters | 0.0499536518093788
public health care agencies | 0.04993569806771592
public health | 0.04926344234906828
potential public health emergencies | 0.04909223325197736
health risks | 0.048977233152065966
health care | 0.04832588378640533
health emergencies | 0.04823653082123437
public health system | 0.048195562612571234
global public health | 0.04803665976482011
human health care providers | 0.0480127253883783
global health preparedness | 0.04790435459600431
public health action | 0.04757678582823526
public health nurses | 0.047525003651570946
health care providers | 0.04747611401762843
new health risks | 0.047473545683563906
public health issues | 0.047414290026710264
public health agencies | 0.047379712813110014
many public health programs | 0.047226221282485215
o

the following:\n • public health director • environmental health specialist | 0.024363668399107943
20 other zoonotic arthropodborne infections | 0.02435755111622705
significant global resources | 0.02434331442499802
patient fl ow | 0.02432184121352542
poorer people | 0.024303150968980163
disasters | 0.024279162514093337
significant bites | 0.024240543796526248
patients and care providers | 0.024238413495040946
human-animal contact | 0.02423047989602558
their school disaster planning committee | 0.024228848440779556
previous studies | 0.024217999501437045
the local health-care system | 0.024217625756781273
afflicted countries | 0.024217549020588286
future disasters | 0.024199781092945974
the state and territorial public health workforce | 0.024146554867363303
national action plans | 0.02405721304944089
human beings | 0.024055286698847555
a health workforce | 0.024047337274736208
further studies | 0.024000748801174476
west african countries | 0.023993242319337183
public health preparedne

an airborne viral infection | 0.01714512609030142
economic impact | 0.017140034592728778
non-u.s. origin | 0.017132763757772236
most press reports | 0.017118164969665804
developing storms | 0.01711362135868399
a healthy human-animal bond | 0.017109895321737175
asymptomatic or mild disease | 0.01710622936846018
recent foreign travel | 0.01709985518590917
a public registry | 0.01709707974790428
chinese leaders | 0.017095916411609798
economic activity | 0.01708756625896288
several prevention and public health fund programs | 0.017087153689645133
valuable information | 0.017085341839203457
patients needs | 0.017084709200342028
a regular risk | 0.017076097628671894
stockpiling supplies | 0.01707525419277494
greater coordination | 0.01706681634085285
natural resources | 0.017066307509820543
pet store workers | 0.017063458730120336
possible surgical intervention | 0.017062936978774
the emergency indicators | 0.017059525896192553
industry groups | 0.017055834788459557
a fire emergency | 0.0170

the national blood supply | 0.013493837541659936
preventive measures | 0.01348307697795818
national healthcare safety network | 0.013474537694238727
additional investigation | 0.013474383410610136
physical injury | 0.013463552853662707
systematic analysis | 0.013462663949081292
equitable access | 0.013460394893827872
regional and sub-regional organizations | 0.0134590839300384
the local cdc | 0.013458556606927276
additional illness | 0.01345488142028557
a human host | 0.013454405783675846
vector control | 0.013449620510471567
numerous changes | 0.013446868705294827
the world bank data | 0.013445023841785947
council on global health crises | 0.01344326695880839
current vaccinations | 0.013442803139892599
professional societies | 0.013440583866456017
cdc\s action level | 0.01342251775257789
regional and sub-regional networks | 0.013420428175278314
central and western china | 0.01342008810943356
avian species | 0.01341936392161696
long term | 0.013402315037656692
individual researchers | 

the optimal prevention | 0.011191605001695384
the increased requirement | 0.011184756271352715
elevated liver function tests | 0.011184091780414364
urban planners | 0.01118320840446603
opportunities | 0.011181260839792389
severe pain.\n | 0.01117889312085105
suspected illness | 0.011175765012411347
data-based estimates | 0.011171363086339635
vaccination | 0.01116628228858089
the median times | 0.011161754748821447
growth promoters | 0.011159840932562367
a functional community coalition | 0.011157932939519409
poor and rural populations | 0.011151936335168871
personal career goals | 0.01114947817958544
ongoing illnesses | 0.011147358186873195
compendium of measures to prevent disease associated with animals in public settings | 0.011142476548498034
measures"\n − "active surveillance | 0.01113827827339113
the new students | 0.011137190709840681
the increasing pressure | 0.011129752425954702
figures | 0.011129335680681723
a traveling medical technician | 0.011129307138420277
psychological 

their own physical and emotional needs | 0.009122796113239146
critical self-reflection | 0.00911753998170123
all the prevention and control work | 0.009116558221753838
the get smart program | 0.009114952863481515
allied healthcare | 0.009111478019725509
addition | 0.009106683609238332
gis experts | 0.009105870138943713
diagnostic and preventive approaches | 0.009101648808646265
the interdisciplinary degree | 0.009101191956591441
a great deal | 0.009100423741573537
collection | 0.009099515167150725
cellphone-only households | 0.00909898754029341
the main issues | 0.009098620322872814
a general decreasing trend | 0.009098594655599167
full details | 0.00909715352075357
ongoing analgesics | 0.009096862033610091
the first 5-year plan | 0.009095903528317519
a social scientific synthesis | 0.00909539211510112
the same prison unit | 0.009092982120318237
equipment | 0.009086886521949437
more than one center | 0.009080944841979384
the lower end | 0.009080072462900315
impervious body suit | 0.009

drug diversion investigators | 0.007460195945642029
the possible gain | 0.007456933869575622
rats | 0.007454986541588172
improperly cooked poultry products | 0.007452291789731672
a stratified cluster sampling strategy | 0.007451295381878257
peter piot | 0.007449686813274913
gay men | 0.007449432925546706
triage | 0.007443519333846054
helsinki school of economics | 0.007443481275348604
healthier lifestyles | 0.007439334085197211
numerous subdisciplines | 0.007439220891879894
a very real issue | 0.007438102374088044
assessments.\n a major weakness | 0.007431892023758274
(e.g., documented cases | 0.007430775724881602
the antimicrobial value chain | 0.007428737375174356
exotic and wildlife pets | 0.007428417417991006
a "right" answer.\n ethical policies | 0.0074259114649160626
both the medical status | 0.0074252853697721185
weather prediction | 0.007421878847505819
teams | 0.0074216230162296015
torch | 0.007421576493921726
how do spatial designs impact behavior | 0.007411790564634117
assur

fetal complications | 0.005889717783929759
org/pets/pet_care/dog_care/stay_dog_bite_free/index | 0.005889070820009857
avian | 0.005888750880471792
no recent stay | 0.00588814182625301
an integral part | 0.005885960290630748
a particular subfield | 0.005882487661774939
mapping | 0.00587893334914202
the global outbreak alert and response network | 0.005878871774647283
geneva | 0.005875060237765222
gene transfer | 0.005872303688250426
frustration | 0.005872245224687798
sporothrix | 0.005872245224687798
hairdo | 0.005872245224687798
a university\s host region | 0.005870754228766777
the cultural geographer | 0.005869093391430352
χ 2 tests.\n a multivariate logistic regression model | 0.00586739183203386
the significantly lower mortality | 0.005867235925688203
the diverse networks | 0.005864791222842132
macroeconomic stability | 0.0058635676101353095
consultation | 0.005861604727169503
a confidential internal document | 0.005860257401352509
those products | 0.005859818472741615
the relative 

the executive directors | 0.0047202387039904925
polymicrobial flora | 0.004719349370156516
fig | 0.004717146559295705
railway tracks | 0.004712450276363544
nd | 0.004711826619140027
allergy | 0.004710812340294869
the normative framework | 0.004710252905909125
an airborne route | 0.004709455457954301
• humane society | 0.004708770819130505
chapter | 0.004707460533327452
a daily basis | 0.004703633802778376
utilitarian | 0.004702609565373486
cocoa | 0.004698817570856504
the sierra leone\ | 0.004698372530131411
the latest updates | 0.004698129800414288
a qualitative description | 0.004694838157342291
variations | 0.0046935325569779605
the third millennium | 0.004689654596393008
lcmv | 0.004689002450015905
whistle | 0.004688254430492046
excellence | 0.004687828076913356
november | 0.004683468121792767
contraction | 0.004682865373188903
who\s mandate | 0.004681722218169803
the tainted syringes | 0.004680408856097851
chances | 0.00467954345275054
a superficial and unsatisfying discussion | 0

streptococcus | 0.0036079776834417408
hamsters | 0.003607727973529557
territorial | 0.0036072462287528705
fraser | 0.003607028176008774
lebanese | 0.0036061351825099983
its functions | 0.0036057950708197335
(b-cell lines | 0.003602565948594705
an additional $170 million package | 0.003601447961354594
(t = −2.3 | 0.0036006198599262155
the extreme partisanship | 0.003596883631381812
versus | 0.0035963122509777602
cauchemez | 0.0035944347549131984
inciardi | 0.0035944347549131984
caregivers | 0.0035924576898229777
hawaii | 0.003592123788992689
accordance | 0.003591865129181049
its 2011 report-"the world | 0.0035888875311918156
wife | 0.003587576953202835
electrolytes | 0.003587130380555278
civets | 0.003584272962349455
more extensive information.\n | 0.0035809652795141815
concepts.\n | 0.0035800161545042775
discharge | 0.0035788556500785003
lagos | 0.003578197114613156
this.\n | 0.003576623795344008
appropriateness | 0.0035763383754565726
georgia | 0.0035754528973206768
the following webs

avma | 0.0022743107960063675
exposure.\n | 0.0022743107960063675
http://www.hsus | 0.0022743107960063675
htm | 0.0022743107960063675
http://www.cdc.gov/ncipc/duip/biteprevention.htm.\n | 0.0022743107960063675
regularly.\n | 0.0022743107960063675
reasons.\n | 0.0022743107960063675
organizations.\n | 0.0022743107960063675
difference?\n | 0.0022743107960063675
queue.\n | 0.0022743107960063675
bounds | 0.0022743107960063675
utility.\n | 0.0022743107960063675
it.\n | 0.0022743107960063675
how?\n | 0.0022743107960063675
despair | 0.0022743107960063675
speed | 0.0022743107960063675
numbers.\n | 0.0022743107960063675
weather.gov/crp/hurricane_harvey | 0.0022743107960063675
nigeria | 0.0022743107960063675
"\n | 0.0022743107960063675
year.\n | 0.0022743107960063675
threat:\n | 0.0022743107960063675
jacob | 0.0022743107960063675
ppe.html | 0.0022743107960063675
p=0.60 | 0.0022743107960063675
paid.\n | 0.0022743107960063675
the marketing | 0.0022729005137936266
the delivery | 0.002272209538067275


ValueError: [E088] Text of length 1721497 exceeds maximum of 1000000. The v2.x parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.

In [225]:
for cluster_num in range(0,10):
    key_features = [feature_names[index] 
                        for index in ordered_centroids[cluster_num, :topn_features]]
    texts =text_clusters[text_clusters['kmeans_cluster'] == cluster_num]['text_body'].values.tolist()
    print('CLUSTER #'+str(cluster_num+1))
 
    print('-'*80)
    nlp = en_core_web_sm.load()

# add PyTextRank to the spaCy pipeline
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    doc = nlp(str(texts))

    for sent in doc._.textrank.summary(limit_phrases=20, limit_sentences=5):
        print(sent)



CLUSTER #1
--------------------------------------------------------------------------------
To determine an association between seating proximity to the SARS patient and transmission of SARS, a study that included all airline passengers seated within four rows (i.e., front, back, and same row) of the index patient (4) was conducted.
IgG. The rest of 33 RNA specimens from SARS patients were labelled as recovering SARS (RS).
The only two unexplained specimens, RS18 and RS71 from the same patient, may represent a unique biological variability, accounting for the misclassification using this 52-gene molecular signature.\n To test the efficacy of using these 52 genes as the molecular signature for the severity of SARS patients, we identified a significant correlation (P < 1 × 10 -6 ) between the derived rank of SARS severity and the number of days after the onset of disease (Fig. 4a) .
The increase rate of SARS cases is expected to decrease with the cumulative SARS cases, which corresponds 

ValueError: [E088] Text of length 1721497 exceeds maximum of 1000000. The v2.x parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.

In [76]:
str1 = ''.join(texts)