In [21]:
import os
import numpy as np
import pandas as pd
import gensim
from gensim import models
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [9]:
# load data
path = 'c:/Users/bill/Documents/projects/data/covid19/open_research'
all_sources = pd.read_csv(os.path.join(path, 'metadata.csv'), low_memory=False)

In [41]:
stop=set(stopwords.words('english'))

#stem = PorterStemmer()
lem = WordNetLemmatizer()

def to_tokens(sentence):
    words=[ w for w in word_tokenize(sentence) if (w not in stop) ]
    #words=[ stem.stem(lem.lemmatize(w)) for w in words if len(w) > 2 ]
    words=[ lem.lemmatize(w) for w in words if len(w) > 2 ]
    return words

corpus = []
for news in all_sources['title'].dropna()[:5000]:
    corpus.append(to_tokens(news))

dic = gensim.corpora.Dictionary(corpus)
# (token, count) for each word in the sentence
bow_corpus = [ dic.doc2bow(doc) for doc in corpus ]

In [40]:
for i, v in enumerate(bow_corpus[:5]):
    print(i, v)

0 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]
1 [(6, 1), (7, 1), (8, 1)]
2 [(2, 1), (3, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)]
3 [(16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1)]
4 [(4, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1)]


In [42]:
lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 4, 
                                   id2word = dic,                                    
                                   passes = 10,
                                   workers = 2)

In [43]:
# should identified topics
lda_model.show_topics()

[(0,
  '0.035*"Chapter" + 0.009*"Viral" + 0.008*"Diseases" + 0.008*"The" + 0.006*"Infections" + 0.006*"Virus" + 0.004*"Infectious" + 0.004*"virus" + 0.004*"Respiratory" + 0.003*"China"'),
 (1,
  '0.023*"virus" + 0.012*"cell" + 0.012*"protein" + 0.012*"coronavirus" + 0.010*"infection" + 0.007*"disease" + 0.006*"antibody" + 0.006*"infectious" + 0.005*"gastroenteritis" + 0.005*"porcine"'),
 (2,
  '0.013*"The" + 0.010*"respiratory" + 0.009*"syndrome" + 0.007*"acute" + 0.007*"Chapter" + 0.006*"health" + 0.006*"SARS" + 0.005*"virus" + 0.005*"severe" + 0.005*"patient"'),
 (3,
  '0.008*"Chapter" + 0.008*"Acute" + 0.007*"Respiratory" + 0.005*"infection" + 0.005*"volume" + 0.005*"Subject" + 0.005*"virus" + 0.005*"viral" + 0.005*"RNA" + 0.004*"Disease"')]

In [44]:
# base corpus is 2, and return topic score with the top 10 keywords
for index, score in sorted(lda_model[bow_corpus[2]], key=lambda x: -1 * x[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.9214218854904175	 
Topic: 0.008*"Chapter" + 0.008*"Acute" + 0.007*"Respiratory" + 0.005*"infection" + 0.005*"volume" + 0.005*"Subject" + 0.005*"virus" + 0.005*"viral" + 0.005*"RNA" + 0.004*"Disease"

Score: 0.027481980621814728	 
Topic: 0.023*"virus" + 0.012*"cell" + 0.012*"protein" + 0.012*"coronavirus" + 0.010*"infection" + 0.007*"disease" + 0.006*"antibody" + 0.006*"infectious" + 0.005*"gastroenteritis" + 0.005*"porcine"

Score: 0.025571420788764954	 
Topic: 0.013*"The" + 0.010*"respiratory" + 0.009*"syndrome" + 0.007*"acute" + 0.007*"Chapter" + 0.006*"health" + 0.006*"SARS" + 0.005*"virus" + 0.005*"severe" + 0.005*"patient"

Score: 0.025524664670228958	 
Topic: 0.035*"Chapter" + 0.009*"Viral" + 0.008*"Diseases" + 0.008*"The" + 0.006*"Infections" + 0.006*"Virus" + 0.004*"Infectious" + 0.004*"virus" + 0.004*"Respiratory" + 0.003*"China"


In [49]:
# predict
sentences = 'covid19 is a deadly virus that causes respiratory infection'
for index, score in sorted(lda_model[dic.doc2bow(to_tokens(sentences))], key=lambda x: -1 * x[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.8395504951477051	 
Topic: 0.023*"virus" + 0.012*"cell" + 0.012*"protein" + 0.012*"coronavirus" + 0.010*"infection" + 0.007*"disease" + 0.006*"antibody" + 0.006*"infectious" + 0.005*"gastroenteritis" + 0.005*"porcine"

Score: 0.058635156601667404	 
Topic: 0.013*"The" + 0.010*"respiratory" + 0.009*"syndrome" + 0.007*"acute" + 0.007*"Chapter" + 0.006*"health" + 0.006*"SARS" + 0.005*"virus" + 0.005*"severe" + 0.005*"patient"

Score: 0.05129947513341904	 
Topic: 0.008*"Chapter" + 0.008*"Acute" + 0.007*"Respiratory" + 0.005*"infection" + 0.005*"volume" + 0.005*"Subject" + 0.005*"virus" + 0.005*"viral" + 0.005*"RNA" + 0.004*"Disease"

Score: 0.050514884293079376	 
Topic: 0.035*"Chapter" + 0.009*"Viral" + 0.008*"Diseases" + 0.008*"The" + 0.006*"Infections" + 0.006*"Virus" + 0.004*"Infectious" + 0.004*"virus" + 0.004*"Respiratory" + 0.003*"China"


In [24]:
# converting from count to tfidf
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [30]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, 
                                   num_topics = 4, 
                                   id2word = dic,                                    
                                   passes = 10,
                                   workers = 2)

In [31]:
# base corpus is 2, and return topic score with the top 10 keywords
for index, score in sorted(lda_model_tfidf[corpus_tfidf[2]], key=lambda x: -1 * x[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.7727094888687134	 
Topic: 0.006*"volume" + 0.005*"Contents" + 0.005*"index" + 0.004*"Subject" + 0.003*"virus" + 0.003*"Index" + 0.002*"Response" + 0.002*"SARS" + 0.002*"coronavirus" + 0.002*"disease"

Score: 0.08062256872653961	 
Topic: 0.006*"Respiratory" + 0.006*"Chapter" + 0.005*"Infections" + 0.005*"Viral" + 0.004*"Acute" + 0.003*"The" + 0.003*"Diseases" + 0.003*"protein" + 0.002*"virus" + 0.002*"Severe"

Score: 0.07675193250179291	 
Topic: 0.003*"virus" + 0.003*"The" + 0.003*"infection" + 0.002*"Chapter" + 0.002*"coronavirus" + 0.002*"protein" + 0.002*"feline" + 0.002*"infectious" + 0.002*"patient" + 0.002*"bronchitis"

Score: 0.06991596519947052	 
Topic: 0.004*"Chapter" + 0.003*"The" + 0.003*"health" + 0.003*"SARS" + 0.002*"Diseases" + 0.002*"Health" + 0.002*"disease" + 0.002*"respiratory" + 0.002*"Infectious" + 0.002*"public"
