### 1. Data Preprocessing

Topic modeling only the original forum post, not including the replies.

In [1]:
import pandas as pd
d = pd.read_csv("data/covid-19_discussions/2020-07-29.csv")

Store the thread titles into a dictionary format where the key is
is the index and the value is forum title.

In [8]:
# Complexity for this loop is less then O(n)
d_titles = dict()
i = 0
counter = 0

while(counter < d.size - 1):
    try:
        err = False
        # get thread_name which is the thread title 
        title = d["thread_name"][counter].lower()
        
        # get number of replies so we can quickly skip to the next post
        replies = int(d["replies"][counter])
        
    except:
        err = True
        
    # if there is no replies skip to the next post
    if(replies == 0):
        counter += 1
        d_titles[i] = title
        i += 1
    # if there are replies update index variable
    elif(err == False):
        counter += replies
        d_titles[i] = title
        i += 1
    else:
        counter += 1

Create a pandas data frame of the data

In [9]:
df = pd.DataFrame(list(d_titles.values()),columns = ['Title'], index = list(d_titles.keys()) ) 
df.head()

Unnamed: 0,Title
0,about the covid-19 discussions category
1,poll: just a few questions about your experien...
2,have you lowered your rates due to covid-19?
3,"no sales at all, maybe (covid-19) is the reason?"
4,support for smbs and freelancers during the sp...


### 2. Tokenizing the data and converting it into a document-term matrix.

We declare function to pull out nouns and/or adjactives from a string of text
Source: https://github.com/adashofdata/nlp-in-python-tutorial

In [11]:
import nltk
from nltk import word_tokenize, pos_tag, punkt
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

In [57]:
# For looking at only nouns
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

# For looking at only nouns AND adjactives 
def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

Apply the functions above to exract nouns, adjactives etc.

In [94]:
# For looking at only nouns
# data_clean = pd.DataFrame(df.Title.apply(nouns))

# For looking at only nouns AND adjactives
data_clean = pd.DataFrame(df.Title.apply(nouns_adj))

# For looking at all text
# data_clean = pd.DataFrame(df.Title)

data_clean.head()

Unnamed: 0,Title
0,covid-19 discussions category
1,poll few questions experiences covid-19
2,rates due
3,sales covid-19 reason
4,support smbs freelancers spread covid-19


Create the document-term matrix using the CountVectorizer library

In [95]:
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said', 'covid', '19', 'covid-19', 'virus', 'corona', 'coronavirus']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

cvn = CountVectorizer(stop_words=stop_words, max_df=.8)
#cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_clean.Title)
data_dtm = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtm.index = data_clean.index
data_dtm.head()

Unnamed: 0,addresses,advantage,advise,affected,agree,alright,annihilation,anybody,apr,bangladesh,...,unemployment,update,vaccine,video,want,wealth,weeks,weird,work,world
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 3. Latent Dirichlet Allocation (LDA) for Topic Modeling

In [96]:
from gensim import matutils, models
import scipy.sparse

Convert the document-term matrix into term-document-matrix by taking the transpose

In [97]:
# One of the required inputs is a term-document matrix
tdm = data_dtm.transpose()
tdm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,132,133,134,135,136,137,138,139,140,141
addresses,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
advantage,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
advise,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
affected,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
agree,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus

Topic Modeling with LDA

In [98]:
# Create the gensim corpus (required input for the LDA model)
corpus = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(tdm))

# Create the vocabulary dictionary (required input for the LDA model)
id2word = dict((v, k) for k, v in cvn.vocabulary_.items())

In [93]:
lda = models.LdaModel(corpus=corpus, num_topics=5, id2word=id2word, passes=150)
lda.print_topics()

[(0,
  '0.035*"sales" + 0.035*"game" + 0.035*"gig" + 0.024*"work" + 0.024*"stay" + 0.024*"poll" + 0.024*"situation" + 0.024*"vaccine" + 0.013*"safe" + 0.013*"topic"'),
 (1,
  '0.036*"disease" + 0.025*"safe" + 0.025*"days" + 0.025*"world" + 0.025*"lockdown" + 0.024*"orders" + 0.013*"clearance" + 0.013*"order" + 0.013*"sales" + 0.013*"responders"'),
 (2,
  '0.128*"fiverr" + 0.038*"times" + 0.038*"freelancers" + 0.029*"effect" + 0.029*"orders" + 0.029*"order" + 0.020*"period" + 0.020*"fund" + 0.020*"quarantine" + 0.011*"market"'),
 (3,
  '0.023*"days" + 0.023*"jobs" + 0.023*"freelancing" + 0.023*"stress" + 0.023*"problems" + 0.013*"work" + 0.013*"quarantine" + 0.013*"isolation" + 0.013*"period" + 0.013*"mar"'),
 (4,
  '0.053*"new" + 0.033*"pandemic" + 0.033*"covid19" + 0.023*"home" + 0.023*"health" + 0.022*"stayy" + 0.022*"productive" + 0.022*"gigs" + 0.012*"impact" + 0.012*"quarantine"')]

## Topic Breakdown for all text:

<div>Topic 0 ==> </div>
<div>Topic 1 ==> </div>
<div>Topic 2 ==> </div>
<div>Topic 3 ==> </div>
<div>Topic 4==> </div>

### 4. Topic Identification for each post

In [99]:
# Identify which topics each transcript contains
corpus_transformed = lda[corpus]

Here we can see the probability distribution of each topic by post

In [100]:
for i in range(len(corpus_transformed)):
    print(corpus_transformed[i])

[(0, 0.06668125), (1, 0.066681564), (2, 0.066678904), (3, 0.73327786), (4, 0.06668041)]
[(0, 0.051369786), (1, 0.050011024), (2, 0.050009064), (3, 0.050010484), (4, 0.79859966)]
[(0, 0.5999086), (1, 0.10002476), (2, 0.10002031), (3, 0.10002354), (4, 0.10002283)]
[(0, 0.7327602), (1, 0.06695559), (2, 0.06667422), (3, 0.066934854), (4, 0.06667514)]
[(0, 0.040007643), (1, 0.04035999), (2, 0.8396177), (3, 0.04000742), (4, 0.040007204)]
[(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)]
[(0, 0.100011215), (1, 0.10001145), (2, 0.1000094), (3, 0.10001089), (4, 0.599957)]
[(0, 0.100008085), (1, 0.10124691), (2, 0.59872955), (3, 0.10000785), (4, 0.100007616)]
[(0, 0.03333925), (1, 0.033392813), (2, 0.033542637), (3, 0.03358357), (4, 0.86614174)]
[(0, 0.040006652), (1, 0.040315725), (2, 0.040005587), (3, 0.040051065), (4, 0.839621)]
[(0, 0.040007737), (1, 0.83997095), (2, 0.040006496), (3, 0.040007513), (4, 0.04000729)]
[(0, 0.7312725), (1, 0.06668111), (2, 0.06667853), (3, 0.067698814), (4, 0.0

In [102]:
# We select the highest probability of each topic for each post
topics_by_post = []
for i in range(len(corpus_transformed)):
    max = -1
    for j in range(len(corpus_transformed[i])):
        if corpus_transformed[i][j][1] > max:
            max = corpus_transformed[i][j][1]
            topic_number = j
    topics_by_post. append(topic_number)

Finally we can see each forum post with their assigned topic

In [103]:
list(zip(topics_by_post, data_dtm.index))

[(3, 0),
 (4, 1),
 (0, 2),
 (0, 3),
 (2, 4),
 (0, 5),
 (4, 6),
 (2, 7),
 (4, 8),
 (4, 9),
 (1, 10),
 (0, 11),
 (3, 12),
 (2, 13),
 (0, 14),
 (3, 15),
 (2, 16),
 (3, 17),
 (3, 18),
 (4, 19),
 (0, 20),
 (0, 21),
 (3, 22),
 (2, 23),
 (4, 24),
 (0, 25),
 (3, 26),
 (0, 27),
 (4, 28),
 (1, 29),
 (2, 30),
 (4, 31),
 (3, 32),
 (0, 33),
 (0, 34),
 (0, 35),
 (0, 36),
 (2, 37),
 (4, 38),
 (3, 39),
 (2, 40),
 (4, 41),
 (1, 42),
 (2, 43),
 (0, 44),
 (2, 45),
 (0, 46),
 (1, 47),
 (4, 48),
 (0, 49),
 (1, 50),
 (0, 51),
 (0, 52),
 (1, 53),
 (2, 54),
 (2, 55),
 (0, 56),
 (3, 57),
 (3, 58),
 (2, 59),
 (2, 60),
 (2, 61),
 (4, 62),
 (0, 63),
 (0, 64),
 (0, 65),
 (1, 66),
 (0, 67),
 (4, 68),
 (0, 69),
 (2, 70),
 (0, 71),
 (3, 72),
 (3, 73),
 (1, 74),
 (4, 75),
 (0, 76),
 (3, 77),
 (1, 78),
 (0, 79),
 (2, 80),
 (0, 81),
 (2, 82),
 (0, 83),
 (3, 84),
 (3, 85),
 (1, 86),
 (0, 87),
 (1, 88),
 (1, 89),
 (0, 90),
 (4, 91),
 (2, 92),
 (3, 93),
 (4, 94),
 (4, 95),
 (1, 96),
 (2, 97),
 (3, 98),
 (0, 99),
 (2, 100),