In [123]:
from tqdm import tqdm
import scispacy
import spacy
import json
import re
import nltk
from operator import itemgetter

In [124]:
with open("../cleaned_data_json/sample.json") as f:
    data=json.load(f)

In [125]:
#Merged abstract and body-text of a paper as doc
text_data = [dt["abstract"].lower() for dt in data["text"] if dt['abstract']]

In [126]:
#Took 2000 Research Papers for modelling
print("TOTAL DOCS : ", len(text_data))

TOTAL DOCS :  1774


In [127]:
import unicodedata
def remove_accent_chars(text):
    text = unicodedata.normalize('NFKD',text).encode('ascii','ignore').decode('utf-8','ignore')
    return text

In [128]:
def remove_special_characters(text, remove_digits=False):
    """This takes text as input and then finds whether each character is not a-z A-Z 0-9 and replaces them with nothing """
    pattern = r'[^a-zA-z\s]' if remove_digits else r'[^0-9a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

In [129]:
stop_words_found = "preprint license copyright author/funder word count text https doi figure holder data cases medrxiv biorxiv made time may study high total population number granted used using also international fig cc-by-nc-nd rights reserved peer-reviewed et al. medrxiv copyright auther/funder copyright copyrights pre print preprint = fig fig. figure "
stop_words_found = stop_words_found.split() #Best way to keep stop words
stop_words_found.extend(["abstract","perpetuity","authorfunder","license"])

In [132]:
def cleaner(doc):
    return " ".join(map(str.lower,(map(str,([token.lemma_ for token in doc if not token.is_stop and not token.is_space and not token.is_punct and not token.like_url and not token.like_email and token.text not in stop_words_found])))))
nlp = spacy.load("en_core_sci_sm", disable = ['tagger', 'parser','ner'], max_length=2000000)
nlp.add_pipe(cleaner,name="cleaner",first=True)
nlp.add_pipe(remove_accent_chars,name='accent_char_removal',after='cleaner')
nlp.add_pipe(remove_special_characters,name='remove_special_char',after='accent_char_removal')

In [133]:
text_data_cleaned = list(nlp.pipe(text_data))

In [134]:
text_data_cleaned[0]

'194 22 5168 23 24 25 reuse allow permission 27 positive strand rna genome picornaviruses comprise single large open read 28 frame flank 5 3 untranslated region utrs footandmouth disease virus fmdv 29 unusually large 5 utr 13 kb contain structural domain include 30 internal ribosome entry site ires facilitate initiation translation cisacting 31 replication element cre characterise structure 5 terminal 360 nucleotide 32 stemloop variable length polyctract approximately 100 200 nucleotide series 33 tandemly repeat pseudoknots pk investigate structure pk 34 selective 2 hydroxyl acetylation analyse primer extension shape analysis 35 determine contribution genome replication mutation deletion experiment 36 shape mutation experiment confirm importance previously predict pk 37 structure function deletion experiment show pk essential 38 replication provide genome competitive advantage 39 replicons fulllength genome lack pk replication competent infectious 40 virus rescue genome contain pk copy

In [135]:
"""Getting Top n-grams from corpus"""
def flatten_corpus(corpus):
    return " ".join([d.strip() for d in corpus])

In [136]:
def get_top_n_grams(corpus,ngram_val=1,limit=10):
    text_data_flatten = flatten_corpus(corpus)
    tokens = [token.text for token in nlp.tokenizer(text_data_flatten)]
    ngrams = compute_ngrams(tokens, ngram_val)
    ngrams_freq_dist = nltk.FreqDist(ngrams)
    sorted_ngrams_fd = sorted(ngrams_freq_dist.items(),key=itemgetter(1), reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq) for text, freq in sorted_ngrams]
    return sorted_ngrams

In [137]:
get_top_n_grams(text_data_cleaned,ngram_val=2,limit=20)

[('95 ci', 139),
 ('public health', 131),
 ('reuse allow', 128),
 ('allow permission', 128),
 ('acute respiratory', 107),
 ('respiratory syndrome', 105),
 ('severe acute', 87),
 ('novel coronavirus', 86),
 ('infectious disease', 79),
 ('covid19 pandemic', 74),
 ('covid19 patient', 73),
 ('immune response', 72),
 ('coronavirus disease', 72),
 ('rna virus', 68),
 ('s protein', 67),
 ('amino acid', 67),
 ('40 available', 66),
 ('viral infection', 62),
 ('t cell', 60),
 ('influenza virus', 59)]

In [138]:
get_top_n_grams(text_data_cleaned,ngram_val=3,limit=20)

[('reuse allow permission', 128),
 ('severe acute respiratory', 82),
 ('acute respiratory syndrome', 72),
 ('allow permission display', 55),
 ('coronavirus disease 2019', 52),
 ('respiratory syndrome coronavirus', 50),
 ('40 available display', 46),
 ('middle east respiratory', 29),
 ('ccby 40 available', 28),
 ('east respiratory syndrome', 27),
 ('coronavirus 2 sarscov2', 25),
 ('polymerase chain reaction', 24),
 ('respiratory syndrome sars', 23),
 ('respiratory tract infection', 23),
 ('intensive care unit', 22),
 ('novel coronavirus 2019ncov', 21),
 ('disease 2019 covid19', 21),
 ('public health emergency', 21),
 ('syndrome coronavirus 2', 21),
 ('novel coronavirus sarscov2', 20)]

In [139]:
text_data_flatten = flatten_corpus(text_data_cleaned)
search = '95 ci'
print(len(search))

5


In [140]:
for i in range(0,len(text_data_flatten),5):
    if text_data_flatten[i:i+5] == search:
        print(text_data_flatten[i-20:i+28])

8 95 ci 144 214 346 95 ci 281 417 scenario asymp
5 ci 12 21and hr 23 95 ci 16 32 respectively arr
5 95 ci 12 19 hr 18 95 ci 13 25 respectively dem
vely dementia hr 12 95 ci 09 18 hr 18 95 ci 11 2
3 95 ci 10 17 hr 17 95 ci 12 25 respectively dia
vely diabetes hr 15 95 ci 13 19 hr 16 95 ci 11 2
4 95 ci 12 26 hr 16 95 ci 12 21 respectively cop
i 14 25 death hr 11 95 ci 07 17 previous use ace
sessment score sofa 95 ci 1374 2860 p  0001 whit
d 21 r0 estimate 44 95 ci 39 49 generalize growt
usehold contact 136 95 ci 47 295 nonhousehold fa
cvd hypertension 44 95 ci 264 747 37 95 ci 222 5
 saturation  88 699 95 ci 45 110 ddimer2500 69 9
2 admission hr 0704 95 ci 0546 0909 1 decrease p
uctive drop 232 076 95 ci 066 086 infect case es
p0012 ct value 0158 95 ci 0025 0987 p0048 pii 19
ial hospital hr 073 95 ci 054 099 illness hr 066
io low midlands 111 95 ci 107 114 high north eas
n difference 424 gl 95 ci 620 228 p0001 associat
eral involvement 84 95 ci 081 088 commonly invol
tial ct examination 

In [141]:
"""POINTWISE MUTUAL INFORMATION"""

'POINTWISE MUTUAL INFORMATION'

In [142]:
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures

In [143]:
tokens = [token.text for token in nlp.tokenizer(text_data_flatten)]
finder = BigramCollocationFinder.from_documents([tokens])

In [144]:
bigram_measures = BigramAssocMeasures()
finder.nbest(bigram_measures.raw_freq, 10)

[('95', 'ci'),
 ('public', 'health'),
 ('allow', 'permission'),
 ('reuse', 'allow'),
 ('acute', 'respiratory'),
 ('respiratory', 'syndrome'),
 ('severe', 'acute'),
 ('novel', 'coronavirus'),
 ('infectious', 'disease'),
 ('covid19', 'pandemic')]

In [145]:
finder.nbest(bigram_measures.pmi, 10)

[('00049', '254'),
 ('0042', '0078'),
 ('0067', '0105'),
 ('0069', '0707'),
 ('0076', '0473'),
 ('0079', 's1a'),
 ('0102', '0982'),
 ('0165', '24279505520'),
 ('0167', '5877950950'),
 ('0176', '3098')]

In [146]:
from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

In [147]:
finder = TrigramCollocationFinder.from_documents([tokens])
trigram_measures = TrigramAssocMeasures()
finder.nbest(trigram_measures.pmi, 10)

[('0069', '0707', 'p0011'),
 ('05ml', 'intra', '025ml'),
 ('0987', 'p0048', 'pii'),
 ('11525', '24053', '240167'),
 ('1187', '3079', 'p0008'),
 ('1206', '4580', 'p0012'),
 ('1279', '92927', 'p0029'),
 ('1354', '648', '2091'),
 ('13[12', 'inappetence', '11[10'),
 ('143173', '647', '112173')]

In [154]:
from gensim import corpora,models

In [155]:
tokenizer = spacy.load("en_core_sci_sm")
def get_tfidf_key_phrases(text_data_cleaned,top_n=10):
    text_noun_phrases = []
    for tdc in text_data_cleaned:
        noun_phrase = []
        for np in tokenizer(tdc).noun_chunks:
            noun_phrase.append(np.text)
        text_noun_phrases.append(noun_phrase)
    dictionary = corpora.Dictionary(text_noun_phrases)
    corpus = [dictionary.doc2bow(chunk) for chunk in text_noun_phrases]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    weighted_phrases = {dictionary.get(idx): value for doc in corpus_tfidf for idx, value in doc}
    weighted_phrases = sorted(weighted_phrases.items(),key=itemgetter(1), reverse=True)
    weighted_phrases = [(term, round(wt, 3)) for term, wt in weighted_phrases]
    return weighted_phrases[:top_n]

In [156]:
get_tfidf_key_phrases(text_data_cleaned, top_n=100)

[('pulmonary sample individual', 1.0),
 ('word', 1.0),
 ('covid19 sarscov2', 1.0),
 ('reliable estimate covid19 mortality crucial aid control strategy assess effectiveness intervention',
  1.0),
 ('motivation dna metabarcoding com', 1.0),
 ('severe acute respiratory syndrome coronavirus 2 sarscov2 disease', 1.0),
 ('reuse allow', 0.925),
 ('science', 0.925),
 ('pandemic spread', 0.734),
 ('rok bimodal distribution high morbidity', 0.707),
 ('standard gaussian distribution peak morbidity', 0.707),
 ('44 95 ci 39 49 generalize growth model', 0.707),
 ('estimate reproduction', 0.707),
 ('responsible current sarscov2', 0.707),
 ('zoonotic coronavirus cov infection', 0.707),
 ('1 inflammatory storm', 0.707),
 ('monocyte centric immune interaction 2 reveal singlecell analysis', 0.707),
 ('covid19 outbreak', 0.707),
 ('globally risk infection', 0.707),
 ('100 country report', 0.707),
 ('rapid global spread coronavirus disease covid19 unprecedented outbreak',
  0.707),
 ('1 precipitation seaso

In [157]:
import gensim

In [158]:
text_tokens = []
for doc in text_data_cleaned:
    tokens = []
    for t in tokenizer(doc):
        if len(t.text) == 1 or len(list(set(t.text))) == 1:
            pass
        else:
            tokens.append(t.text)
    text_tokens.append(tokens)

In [159]:
print(text_tokens[:2])

[['194', '5168', '23', '24', '25', 'reuse', 'allow', 'permission', '27', 'positive', 'strand', 'rna', 'genome', 'picornaviruses', 'comprise', 'single', 'large', 'open', 'read', '28', 'frame', 'flank', 'untranslated', 'region', 'utrs', 'footandmouth', 'disease', 'virus', 'fmdv', '29', 'unusually', 'large', 'utr', '13', 'kb', 'contain', 'structural', 'domain', 'include', '30', 'internal', 'ribosome', 'entry', 'site', 'ires', 'facilitate', 'initiation', 'translation', 'cisacting', '31', 'replication', 'element', 'cre', 'characterise', 'structure', 'terminal', '360', 'nucleotide', '32', 'stemloop', 'variable', 'length', 'polyctract', 'approximately', '100', '200', 'nucleotide', 'series', 'tandemly', 'repeat', 'pseudoknots', 'pk', 'investigate', 'structure', 'pk', '34', 'selective', 'hydroxyl', 'acetylation', 'analyse', 'primer', 'extension', 'shape', 'analysis', '35', 'determine', 'contribution', 'genome', 'replication', 'mutation', 'deletion', 'experiment', '36', 'shape', 'mutation', 'exp

In [160]:
bigram = gensim.models.Phrases(text_tokens, min_count=20, threshold=50,delimiter=b'_') # higher threshold fewer phrases.
bigram_model = gensim.models.phrases.Phraser(bigram)

In [161]:
norm_corpus_bigrams = [bigram_model[doc] for doc in text_tokens]

In [162]:
dictionary = gensim.corpora.Dictionary(norm_corpus_bigrams)
print('Sample word to number mappings:', list(dictionary.items())[:15])
print('Total Vocabulary Size:', len(dictionary))

Sample word to number mappings: [(0, '100'), (1, '13'), (2, '194'), (3, '200'), (4, '23'), (5, '24'), (6, '25'), (7, '27'), (8, '28'), (9, '29'), (10, '30'), (11, '31'), (12, '32'), (13, '34'), (14, '35')]
Total Vocabulary Size: 15378


In [165]:
dictionary.filter_extremes(no_below=3, no_above=0.9)

In [166]:
print('Total Vocabulary Size:', len(dictionary))

Total Vocabulary Size: 2792


In [167]:
bow_corpus = [dictionary.doc2bow(text) for text in norm_corpus_bigrams]

In [169]:
print(bow_corpus[1][:50])
print([(dictionary[idx],frequency) for idx,frequency in bow_corpus[1][:50]])

[(0, 1), (24, 2), (50, 1), (56, 1), (68, 1), (85, 1), (99, 1), (100, 2), (101, 1), (102, 1), (103, 1), (104, 2), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 1), (111, 1), (112, 1), (113, 1), (114, 1), (115, 1), (116, 1), (117, 1), (118, 1), (119, 2), (120, 1), (121, 1), (122, 1), (123, 1), (124, 1), (125, 1), (126, 1), (127, 2), (128, 1), (129, 1), (130, 1), (131, 1), (132, 1), (133, 1), (134, 1), (135, 1), (136, 1), (137, 1), (138, 1), (139, 1), (140, 1), (141, 1), (142, 1)]
[('100', 1), ('analysis', 2), ('genome', 1), ('investigate', 1), ('presence', 1), ('show', 1), ('40_available', 1), ('affect', 2), ('assess', 1), ('available', 1), ('avoid', 1), ('base', 2), ('cause', 1), ('channel', 1), ('clearly', 1), ('consider', 1), ('continue', 1), ('coronavirus', 1), ('country', 1), ('currently', 1), ('display', 1), ('dissemination', 1), ('distress', 1), ('economy', 1), ('effort', 1), ('enormous', 1), ('epidemic', 2), ('examine', 1), ('exponentially', 1), ('find', 1), ('genetic',

In [170]:
print("Total number of papers",len(bow_corpus))

Total number of papers 1774


In [172]:
TOTAL_TOPICS = 10
lda_model = gensim.models.LdaModel(corpus=bow_corpus, id2word=dictionary,chunksize=1740, alpha='auto',eta='auto', random_state=42,
                                    iterations=500, num_topics=TOTAL_TOPICS,passes=20, eval_every=None)

In [173]:
for topic_id, topic in lda_model.print_topics(num_topics=10, num_words=20):
    print('Topic #'+str(topic_id+1)+':')
    print(topic)
    print()

Topic #1:
0.016*"system" + 0.014*"de" + 0.011*"development" + 0.009*"model" + 0.009*"la" + 0.009*"clinical" + 0.009*"provide" + 0.008*"approach" + 0.008*"design" + 0.007*"trial" + 0.007*"process" + 0.007*"network" + 0.006*"propose" + 0.006*"describe" + 0.006*"analysis" + 0.006*"need" + 0.005*"critical" + 0.005*"method" + 0.005*"develope" + 0.005*"information"

Topic #2:
0.021*"virus" + 0.018*"protein" + 0.015*"cell" + 0.011*"viral" + 0.011*"human" + 0.009*"infection" + 0.008*"host" + 0.008*"sarscov2" + 0.007*"coronavirus" + 0.006*"rna" + 0.005*"identify" + 0.005*"bind" + 0.005*"sequence" + 0.005*"result" + 0.005*"bat" + 0.005*"expression" + 0.004*"gene" + 0.004*"genome" + 0.004*"show" + 0.004*"target"

Topic #3:
0.023*"disease" + 0.013*"infection" + 0.011*"sarscov2" + 0.010*"transmission" + 0.010*"model" + 0.010*"outbreak" + 0.007*"individual" + 0.006*"infect" + 0.006*"spread" + 0.005*"covid19" + 0.005*"new" + 0.005*"virus" + 0.005*"result" + 0.005*"test" + 0.005*"risk" + 0.005*"viral"

In [177]:
import numpy as np
topics_coherences = lda_model.top_topics(bow_corpus,topn=20)
avg_coherence_score = np.mean([score[1] for score in topics_coherences])
print("avg coherence score: ",avg_coherence_score)

avg coherence score:  -2.515884063670638


In [178]:
topics_with_wts = [item[0] for item in topics_coherences]
print('LDA Topics with Weights')
print('='*50)
for idx, topic in enumerate(topics_with_wts):
    print('Topic #'+str(idx+1)+':')
    print([(term, round(wt, 3)) for wt, term in topic])
    print()

LDA Topics with Weights
Topic #1:
[('model', 0.041), ('epidemic', 0.023), ('estimate', 0.018), ('covid19', 0.018), ('outbreak', 0.015), ('china', 0.013), ('spread', 0.013), ('report', 0.01), ('parameter', 0.009), ('infection', 0.009), ('predict', 0.009), ('rate', 0.009), ('day', 0.009), ('dynamic', 0.008), ('transmission', 0.008), ('country', 0.008), ('2020', 0.008), ('prediction', 0.007), ('result', 0.007), ('method', 0.007)]

Topic #2:
[('disease', 0.023), ('infection', 0.013), ('sarscov2', 0.011), ('transmission', 0.01), ('model', 0.01), ('outbreak', 0.01), ('individual', 0.007), ('infect', 0.006), ('spread', 0.006), ('covid19', 0.005), ('new', 0.005), ('virus', 0.005), ('result', 0.005), ('test', 0.005), ('risk', 0.005), ('viral', 0.005), ('health', 0.004), ('early', 0.004), ('human', 0.004), ('day', 0.004)]

Topic #3:
[('virus', 0.021), ('protein', 0.018), ('cell', 0.015), ('viral', 0.011), ('human', 0.011), ('infection', 0.009), ('host', 0.008), ('sarscov2', 0.008), ('coronavirus

In [179]:
print('LDA Topics without Weights')
print('='*50)
for idx, topic in enumerate(topics_with_wts):
    print('Topic #'+str(idx+1)+':')
    print([term for wt, term in topic])
    print()

LDA Topics without Weights
Topic #1:
['model', 'epidemic', 'estimate', 'covid19', 'outbreak', 'china', 'spread', 'report', 'parameter', 'infection', 'predict', 'rate', 'day', 'dynamic', 'transmission', 'country', '2020', 'prediction', 'result', 'method']

Topic #2:
['disease', 'infection', 'sarscov2', 'transmission', 'model', 'outbreak', 'individual', 'infect', 'spread', 'covid19', 'new', 'virus', 'result', 'test', 'risk', 'viral', 'health', 'early', 'human', 'day']

Topic #3:
['virus', 'protein', 'cell', 'viral', 'human', 'infection', 'host', 'sarscov2', 'coronavirus', 'rna', 'identify', 'bind', 'sequence', 'result', 'bat', 'expression', 'gene', 'genome', 'show', 'target']

Topic #4:
['patient', 'covid19', 'age', 'infection', 'clinical', 'disease', 'high', '95_ci', 'rate', 'group', 'sarscov2', 'year', 'severe', 'risk', 'coronavirus', 'include', 'day', 'factor', 'respiratory', 'death']

Topic #5:
['virus', 'sequence', 'sample', 'rna', 'gene', 'detect', 'viral', 'assay', 'method', 'dete