### 1. Data Preprocessing

Topic modeling only the original forum post, not including the replies.

In [1]:
import pandas as pd
d = pd.read_csv("data/fiverr_tips/2020-10-11.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Store the thread titles into a dictionary format where the key is
is the index and the value is forum title.

In [2]:
# Complexity for this loop is less then O(n)
d_titles = dict()
i = 0
counter = 0

while(counter < d.size - 1):
    try:
        err = False
        # get thread_name which is the thread title 
        title = d["thread_name"][counter].lower()
        
        # get number of replies so we can quickly skip to the next post
        replies = int(d["replies"][counter])
        
    except:
        err = True
        
    # if there is no replies skip to the next post
    if(replies == 0):
        counter += 1
        d_titles[i] = title
        i += 1
    # if there are replies update index variable
    elif(err == False):
        counter += replies
        d_titles[i] = title
        i += 1
    else:
        counter += 1

Create a pandas data frame of the data

In [3]:
df = pd.DataFrame(list(d_titles.values()),columns = ['Title'], index = list(d_titles.keys()) ) 
df.head()

Unnamed: 0,Title
0,about the fiverr tips category
1,i am not getting order after get some order ca...
2,how to write a offer in buyer request
3,are you a new seller? this is how you get more...
4,important tips for new sellers for getting sta...


### 2. Tokenizing the data and converting it into a document-term matrix.

We declare function to pull out nouns and/or adjactives from a string of text
Source: https://github.com/adashofdata/nlp-in-python-tutorial

In [4]:
import nltk
from nltk import word_tokenize, pos_tag, punkt
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# For looking at only nouns
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

# For looking at only nouns AND adjactives 
def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

Apply the functions above to exract nouns, adjactives etc.

In [14]:
# For looking at only nouns
# data_clean = pd.DataFrame(df.Title.apply(nouns))

# For looking at only nouns AND adjactives
data_clean = pd.DataFrame(df.Title.apply(nouns_adj))

# For looking at all text
# data_clean = pd.DataFrame(df.Title)
data_clean = data_clean.iloc[:400000]
data_clean.head()

Unnamed: 0,Title
0,fiverr category
1,i order order cancelation
2,offer buyer request
3,new seller more orders
4,important tips new sellers


Create the document-term matrix using the CountVectorizer library

In [15]:
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said', 'covid', '19', 'covid-19', 'virus', 'corona', 'coronavirus']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

cvn = CountVectorizer(stop_words=stop_words, max_df=.8)
#cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_clean.Title)
data_dtm = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtm.index = data_clean.index
data_dtm.head()

Unnamed: 0,00,000,01,10,100,101,10th,11,13,15,...,zealand,zero,zerooo,zip,zone,zones,zoom,écrit,оn,іmрасtѕ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 3. Latent Dirichlet Allocation (LDA) for Topic Modeling

In [16]:
from gensim import matutils, models
import scipy.sparse

Convert the document-term matrix into term-document-matrix by taking the transpose

In [17]:
# One of the required inputs is a term-document matrix
tdm = data_dtm.transpose()
tdm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,399990,399991,399992,399993,399994,399995,399996,399997,399998,399999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus

Topic Modeling with LDA

In [19]:
# Create the gensim corpus (required input for the LDA model)
corpus = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(tdm))

# Create the vocabulary dictionary (required input for the LDA model)
id2word = dict((v, k) for k, v in cvn.vocabulary_.items())

In [20]:
lda = models.LdaModel(corpus=corpus, num_topics=5, id2word=id2word, passes=150)
lda.print_topics()

  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)


[(0,
  '0.000*"seller" + 0.000*"orders" + 0.000*"help" + 0.000*"level" + 0.000*"new" + 0.000*"need" + 0.000*"problem" + 0.000*"days" + 0.000*"month" + 0.000*"things"'),
 (1,
  '0.000*"buyer" + 0.000*"order" + 0.000*"request" + 0.000*"work" + 0.000*"review" + 0.000*"rating" + 0.000*"question" + 0.000*"feedback" + 0.000*"delivery" + 0.000*"customer"'),
 (2,
  '0.001*"fiverr" + 0.000*"gigs" + 0.000*"sales" + 0.000*"account" + 0.000*"money" + 0.000*"free" + 0.000*"business" + 0.000*"paypal" + 0.000*"best" + 0.000*"payment"'),
 (3,
  '0.000*"buyers" + 0.000*"sellers" + 0.000*"new" + 0.000*"tips" + 0.000*"logo" + 0.000*"tip" + 0.000*"good" + 0.000*"great" + 0.000*"advice" + 0.000*"design"'),
 (4,
  '0.001*"gig" + 0.000*"video" + 0.000*"page" + 0.000*"website" + 0.000*"facebook" + 0.000*"search" + 0.000*"extras" + 0.000*"portfolio" + 0.000*"description" + 0.000*"service"')]

## Topic Breakdown for all text:

<div>Topic 0 ==> </div>
<div>Topic 1 ==> </div>
<div>Topic 2 ==> </div>
<div>Topic 3 ==> </div>
<div>Topic 4==> </div>

### 4. Topic Identification for each post

In [21]:
# Identify which topics each transcript contains
corpus_transformed = lda[corpus]

Here we can see the probability distribution of each topic by post

In [None]:
for i in range(len(corpus_transformed)):
    print(corpus_transformed[i])

In [102]:
# We select the highest probability of each topic for each post
topics_by_post = []
for i in range(len(corpus_transformed)):
    max = -1
    for j in range(len(corpus_transformed[i])):
        if corpus_transformed[i][j][1] > max:
            max = corpus_transformed[i][j][1]
            topic_number = j
    topics_by_post. append(topic_number)

Finally we can see each forum post with their assigned topic

In [103]:
list(zip(topics_by_post, data_dtm.index))

[(3, 0),
 (4, 1),
 (0, 2),
 (0, 3),
 (2, 4),
 (0, 5),
 (4, 6),
 (2, 7),
 (4, 8),
 (4, 9),
 (1, 10),
 (0, 11),
 (3, 12),
 (2, 13),
 (0, 14),
 (3, 15),
 (2, 16),
 (3, 17),
 (3, 18),
 (4, 19),
 (0, 20),
 (0, 21),
 (3, 22),
 (2, 23),
 (4, 24),
 (0, 25),
 (3, 26),
 (0, 27),
 (4, 28),
 (1, 29),
 (2, 30),
 (4, 31),
 (3, 32),
 (0, 33),
 (0, 34),
 (0, 35),
 (0, 36),
 (2, 37),
 (4, 38),
 (3, 39),
 (2, 40),
 (4, 41),
 (1, 42),
 (2, 43),
 (0, 44),
 (2, 45),
 (0, 46),
 (1, 47),
 (4, 48),
 (0, 49),
 (1, 50),
 (0, 51),
 (0, 52),
 (1, 53),
 (2, 54),
 (2, 55),
 (0, 56),
 (3, 57),
 (3, 58),
 (2, 59),
 (2, 60),
 (2, 61),
 (4, 62),
 (0, 63),
 (0, 64),
 (0, 65),
 (1, 66),
 (0, 67),
 (4, 68),
 (0, 69),
 (2, 70),
 (0, 71),
 (3, 72),
 (3, 73),
 (1, 74),
 (4, 75),
 (0, 76),
 (3, 77),
 (1, 78),
 (0, 79),
 (2, 80),
 (0, 81),
 (2, 82),
 (0, 83),
 (3, 84),
 (3, 85),
 (1, 86),
 (0, 87),
 (1, 88),
 (1, 89),
 (0, 90),
 (4, 91),
 (2, 92),
 (3, 93),
 (4, 94),
 (4, 95),
 (1, 96),
 (2, 97),
 (3, 98),
 (0, 99),
 (2, 100),