# Topic Modeling (Latent Dirichlet Allocation - LDA)

In [34]:
import pandas as pd
import gensim
import gensim.corpora as corpora
import pyLDAvis.gensim_models
import pyLDAvis
import pickle
import numpy as np

### Load data

In [35]:
clean_data = pd.read_csv("../data/preprocessed/bumble_preprocessed.csv")
clean_data['at'] = pd.to_datetime(clean_data['at'] )
clean_data['score'] = pd.to_numeric(clean_data['score'] )
clean_data.head(3)

Unnamed: 0,at,score,content,clean_content,sentiment
0,2022-03-28 23:33:04,5,Bumble rocks 👍🤞,bumble rock,0.0
1,2022-03-28 23:23:30,1,Just a cash grab. Congrats you have 2 new like...,cash grab congrats new like let swipe right fi...,0.071
2,2022-03-28 23:10:12,1,"Terrible, l have lost total faith in this app,...",terrible lost total faith app promised contact...,-0.118


### Build model

In [36]:
# Words to remove
extra_stopwords = [
    'app', 'bumble', 
    'get','got','make','would','want','need','see',
    'let','also','use','keep','say',
    'back','even','ever','every','still','really','lot'
    'account','profile', 'user','swipe', 'people',
    'one',
    'time',"month","months","years","week","date","days","day",'year',"hours",
    'someone',"everyone","another",'nothing', "anything",'everything',"something"
    ]

"""
match
day
message
one
first
good
pay
update
service
like
month
time
money
cool
customer
phone
week
subscription
woman
feature
connection
never
support
"""
# Create Dictionary
data_words = list(map(lambda x: [w for w in x.split() if w not in extra_stopwords], clean_data['clean_content']))
id2word = corpora.Dictionary(data_words) 

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_words]

In [37]:
# Build LDA model
num_topics = 5
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)

# # Save model
model_path = "./ldaModel_topics-"+str(num_topics)
with open(model_path, 'wb') as f:
    pickle.dump(lda_model, f)

In [38]:
# Prepare visualization

LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
model_path = "./ldaModelvis_topics-"+str(num_topics)
with open(model_path, 'wb') as f:
    pickle.dump(LDAvis_prepared, f)


pyLDAvis.enable_notebook()
LDAvis_prepared

  default_term_info = default_term_info.sort_values(
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = Loose

### Performance metrics

In [39]:
coherence = gensim.models.CoherenceModel(model=lda_model, corpus=corpus, dictionary=id2word, coherence='u_mass').get_coherence()
perplexity = lda_model.log_perplexity(corpus)
print(f"Coherence: {coherence}")
print(f"Perplexity: {perplexity}")

Coherence: -3.1359113085104164
Perplexity: -7.537240888035917


### Get top words for each topic

In [40]:
# Choose number of words
num_words = 30

In [41]:
# Order to match visualization
order = pyLDAvis.PreparedData.to_dict(LDAvis_prepared)['topic.order']
topD = pd.DataFrame({'old': order, 'new': range(1,len(order)+1)})
topD.sort_values(by=['old'], inplace = True)

num_topics =lda_model.num_topics
a = lda_model.show_topics(num_topics=num_topics,formatted=False,num_words=num_words)
b = lda_model.top_topics(corpus,dictionary=id2word,topn=num_words) # This orders the topics in the decreasing order of coherence score

topic2skillb = {}
topic2csb = {}
topic2skilla = {}
topic2csa = {}

cnt =1

for ws in b:
    wset = set(w[1] for w in ws[0])
    topic2skillb[cnt] = wset
    topic2csb[cnt] = ws[1]
    cnt +=1

for ws in a:
    wset = set(w[0]for w in ws[1])
    topic2skilla[ws[0]+1] = wset

for i in range(1,num_topics+1):
    j = 1
    cont = True
    while j < num_topics+1 and cont:
        if topic2skillb[j]==topic2skilla[i]:
            topic2csa[i] = topic2csb[j]
            cont = False
        j+=1
        

finalData = pd.DataFrame([],columns=['Topic','words'])
finalData['Topic']=topD['new']
finalData['Topic'] = finalData['Topic'].apply(lambda x: 'Topic'+str(x))
finalData['words']=topic2skilla.values()
finalData['cs'] = topic2csa.values()
finalData.sort_values(by='cs',ascending=False,inplace=True)

In [42]:
finalData

Unnamed: 0,Topic,words,cs
0,Topic1,"{like, easy, great, dating, seems, work, know,...",-2.938695
4,Topic5,"{like, great, free, dating, feature, apps, men...",-3.008517
3,Topic4,"{like, could, paying, charged, free, dating, f...",-3.294687
1,Topic2,"{like, free, support, work, know, paid, notifi...",-3.309188
2,Topic3,"{like, deleted, add, email, great, facebook, r...",-3.479934


### Get representative reviews for each topic

In [43]:
# Document - Topic probability matrix
lda_output = lda_model[corpus]
df_document_topic = pd.DataFrame(np.zeros((len(clean_data),lda_model.num_topics)))
df_document_topic.columns = df_document_topic.columns+1
c = 0
for o in lda_output:
    idx = list(map(lambda x: int(topD.loc[topD['old']==x[0]+1,'new']), o))
    val = list(map(lambda x: round(x[1],3), o))
    df_document_topic.loc[c,idx] = val
    c+=1

In [44]:
# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
max_proba = np.max(df_document_topic.values, axis = 1)

# Styling
def color_green(val):
    color = 'green' if val > .15 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .15 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)

Unnamed: 0,1,2,3,4,5
0,0.1,0.103,0.1,0.101,0.596
1,0.762,0.019,0.019,0.019,0.182
2,0.015,0.942,0.015,0.015,0.015
3,0.595,0.1,0.102,0.101,0.102
4,0.011,0.011,0.954,0.011,0.011
5,0.026,0.026,0.472,0.025,0.451
6,0.01,0.959,0.01,0.01,0.01
7,0.595,0.1,0.102,0.101,0.102
8,0.041,0.448,0.041,0.43,0.041
9,0.041,0.041,0.34,0.041,0.538


In [45]:
rep_docs_lda = pd.DataFrame()
n_examples = 3

for c in df_document_topic.columns:
    aux = df_document_topic[c].copy()
    aux.sort_values(ascending=False, inplace = True)
    ix = aux.index.values
    ix = ix[:n_examples]
    rep_docs_lda[c] = clean_data.loc[ix,'clean_content'].values
    print("Topic ", c)
    for i in np.arange(n_examples):
        print(rep_docs_lda[c][i], end="\n\n")

Topic  1
concept fine doubt many profile actually active sure everyone look good mean much actively using app swipe right profile every day lucky get match every couple week time come empty even get match time either miss due time window ignore unmatch disappear within hour two wish exaggerating actual conversation match year multiple photo different angle including full body shot witty profile verified real reason getting least odd match like concept disappointed execution seems might better tinder least get odd match actually interact people time

app waste time span day swiped right roughly around woman different woman day find matched neither ever took kind initiative talk communicate back guy feel app designed seems window shopping app woman guy able start conversation hope girl think attractive enough talk first place willing spend entire day swiping right woman day week gone different girl likely reply back incredibly good looking guy six pack app probably otherwise bother waste