In [1]:
import nltk; nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
pip install pyLDAvis

Collecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 3.2MB/s 
Collecting funcy
[?25l  Downloading https://files.pythonhosted.org/packages/ce/4b/6ffa76544e46614123de31574ad95758c421aae391a1764921b8a81e1eae/funcy-1.14.tar.gz (548kB)
[K     |████████████████████████████████| 552kB 19.5MB/s 
Building wheels for collected packages: pyLDAvis, funcy
  Building wheel for pyLDAvis (setup.py) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-2.1.2-py2.py3-none-any.whl size=97711 sha256=c7785f7077aabf99c9e2ce93ccfa7e34a1edb505a000f735a7c78c5d96a2b7e1
  Stored in directory: /root/.cache/pip/wheels/98/71/24/513a99e58bb6b8465bae4d2d5e9dba8f0bef8179e3051ac414
  Building wheel for funcy (setup.py) ... [?25l[?25hdone
  Created wheel for funcy: filename=funcy-1.14-py2.py3-none-any.whl size=32042 sha256=506668b3

In [3]:
import re
import numpy as np
import pandas as pd
from pprint import pprint


# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [4]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [5]:
# mount to Google drive
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/Shared drives/493B')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# Import Dataset

In [6]:
raw_data = pd.DataFrame()

file_names = ['COVID_2', 'COVID_3', 'COVID_4', 'COVID_5']

for file in file_names:
    file_name = file + '.csv'
    text = pd.read_csv(file_name, usecols=['text'])
    raw_data = pd.concat([raw_data,text],axis=0)

In [7]:
raw_data.head(3)

Unnamed: 0,text
0,"""The study found that kidney dysfunction occur..."
1,Oh man! North Korea's first confirmed Coronavi...
2,Bill Gates: “Now we also face an immediate cri...


# Data pre-processing

In [8]:
tweets = raw_data.text.values.tolist()
tweets = [re.sub(r'\b(\w*[Cc][Oo][Vv][Ii][Dd]\w*)\b', '', tweet) for tweet in tweets]
pprint(tweets[:1])


['"The study found that kidney dysfunction occurs in 3 to 10 percent of novel '
 'coronavirus (-19) infection. In addition, acute damage to the kidneys occurs '
 'in seven percent of patients." ']


In [9]:
def tweets_to_words(sentences):
  for sentence in sentences:
    yield(gensim.utils.simple_preprocess(str(sentence), deacc = True)) # deacc = True removes punctuations

tweets_words = list(tweets_to_words(tweets))


In [10]:
print(tweets_words[:4])

[['the', 'study', 'found', 'that', 'kidney', 'dysfunction', 'occurs', 'in', 'to', 'percent', 'of', 'novel', 'coronavirus', 'infection', 'in', 'addition', 'acute', 'damage', 'to', 'the', 'kidneys', 'occurs', 'in', 'seven', 'percent', 'of', 'patients'], ['oh', 'man', 'north', 'korea', 'first', 'confirmed', 'coronavirus', 'patient', 'shot', 'dead', 'report', 'https', 'www', 'ibtimes', 'sg', 'apq', 'via', 'ibtimessg'], ['bill', 'gates', 'now', 'we', 'also', 'face', 'an', 'immediate', 'crisis', 'in', 'the', 'past', 'week', 'has', 'started', 'behaving', 'lot', 'like', 'the', 'once', 'in', 'century', 'pathogen', 'we', 've', 'been', 'worried', 'about', 'hope', 'it', 'not', 'that', 'bad', 'but', 'we', 'should', 'assume', 'it', 'will', 'be', 'until', 'we', 'know', 'otherwise'], ['let', 'stick', 'to', 'reality', 'there', 'still', 'no', 'cure', 'for', 'the', 'following', 'viruses', 'hiv', 'aids', 'herpes', 'rotavirus', 'sars', 'ebola', 'the', 'common', 'flu', 'dengue', 'rabies', 'but', 'we', 're',

In [11]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(tweets_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[tweets_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[tweets_words[0]]])



['the', 'study', 'found', 'that', 'kidney', 'dysfunction', 'occurs', 'in', 'to', 'percent', 'of', 'novel', 'coronavirus', 'infection', 'in', 'addition', 'acute', 'damage', 'to', 'the', 'kidneys', 'occurs', 'in', 'seven', 'percent', 'of', 'patients']


In [12]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for tweet in texts:
        doc = nlp(" ".join(tweet)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [13]:
# Remove Stop Words
tweets_words_nostops = remove_stopwords(tweets_words)

# Form Bigrams
tweets_words_bigrams = make_bigrams(tweets_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
tweets_lemmatized = lemmatization(tweets_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(tweets_lemmatized[:1])

[['study', 'find', 'kidney', 'dysfunction', 'occur', 'percent', 'novel', 'infection', 'addition', 'acute', 'damage', 'kidney', 'occur', 'percent', 'patient']]


In [14]:
# Create Dictionary
id2word = corpora.Dictionary(tweets_lemmatized)

# Create Corpus
texts = tweets_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 2), (9, 1), (10, 2), (11, 1)]]


In [19]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=3, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [20]:
# Print the Keyword in the 5 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.022*"com" + 0.017*"go" + 0.014*"make" + 0.012*"pandemic" + 0.011*"amp" + '
  '0.010*"year" + 0.010*"many" + 0.010*"thank" + 0.010*"due" + 0.009*"world"'),
 (1,
  '0.022*"death" + 0.018*"case" + 0.015*"day" + 0.015*"test" + 0.015*"say" + '
  '0.014*"may" + 0.013*"new" + 0.012*"today" + 0.011*"die" + 0.011*"patient"'),
 (2,
  '0.022*"people" + 0.016*"help" + 0.014*"get" + 0.014*"work" + 0.013*"time" + '
  '0.011*"take" + 0.009*"need" + 0.009*"home" + 0.009*"lockdown" + '
  '0.008*"government"')]


In [21]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=tweets_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.218318866092357

Coherence Score:  0.25623117585968375


In [22]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis