# Topic Modeling (Latent Dirichlet Allocation - LDA)

In [1]:
import pandas as pd
import gensim
import gensim.corpora as corpora
import pyLDAvis.gensim_models
import pyLDAvis
import pickle
import numpy as np
import matplotlib.pyplot as plt

### Load data

In [2]:
clean_data = pd.read_csv("../data/preprocessed/bumble_preprocessed.csv")
clean_data['at'] = pd.to_datetime(clean_data['at'] )
clean_data['score'] = pd.to_numeric(clean_data['score'] )
clean_data.head(3)

Unnamed: 0,at,score,content,clean_content,sentiment,language,my
0,2022-03-28 23:33:04,5,Bumble rocks 👍🤞,bumble rock,0.0,eng,2022-03
1,2022-03-28 23:23:30,1,Just a cash grab. Congrats you have 2 new like...,cash grab congrats new like let swipe right fi...,0.071,eng,2022-03
2,2022-03-28 23:10:12,1,"Terrible, l have lost total faith in this app,...",terrible lost total faith app promised contact...,-0.118,lnc,2022-03


### Build model

In [4]:
# Words to remove
extra_stopwords = ['app', 'bumble', 'account', 'profile', 'swipe', 'people', 'match', 'like', 'get']

# Create Dictionary
data_words = list(map(lambda x: [w for w in x.split() if w not in extra_stopwords], clean_data['clean_content']))
id2word = corpora.Dictionary(data_words) 

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_words]

In [52]:
# Build LDA model
num_topics = 5
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)

# Save model
model_path = "./ldaModel_topics-"+str(num_topics)
with open(model_path, 'wb') as f:
    pickle.dump(lda_model, f)

In [53]:
# Prepare visualization

LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
model_path = "./ldaModelvis_topics-"+str(num_topics)
with open(model_path, 'wb') as f:
    pickle.dump(LDAvis_prepared, f)


pyLDAvis.enable_notebook()
LDAvis_prepared

  by='saliency', ascending=False).head(R).drop('saliency', 1)


### Performance metrics

In [54]:
coherence = gensim.models.CoherenceModel(model=lda_model, corpus=corpus, dictionary=id2word, coherence='u_mass').get_coherence()
perplexity = lda_model.log_perplexity(corpus)
print(f"Coherence: {coherence}")
print(f"Perplexity: {perplexity}")

Coherence: -2.699859157848088
Perplexity: -7.508360510361607


### Get top words for each topic

In [8]:
# Choose number of words
num_words = 10

In [10]:
# Order to match visualization
order = pyLDAvis.PreparedData.to_dict(LDAvis_prepared)['topic.order']
topD = pd.DataFrame({'old': order, 'new': range(1,len(order)+1)})
topD.sort_values(by=['old'], inplace = True)

num_topics =lda_model.num_topics
a = lda_model.show_topics(num_topics=num_topics,formatted=False,num_words=num_words)
b = lda_model.top_topics(corpus,dictionary=id2word,topn=num_words) # This orders the topics in the decreasing order of coherence score

topic2skillb = {}
topic2csb = {}
topic2skilla = {}
topic2csa = {}

cnt =1

for ws in b:
    wset = set(w[1] for w in ws[0])
    topic2skillb[cnt] = wset
    topic2csb[cnt] = ws[1]
    cnt +=1

for ws in a:
    wset = set(w[0]for w in ws[1])
    topic2skilla[ws[0]+1] = wset

for i in range(1,num_topics+1):
    j = 1
    cont = True
    while j < num_topics+1 and cont:
        if topic2skillb[j]==topic2skilla[i]:
            topic2csa[i] = topic2csb[j]
            cont = False
        j+=1
        

finalData = pd.DataFrame([],columns=['Topic','words'])
finalData['Topic']=topD['new']
finalData['Topic'] = finalData['Topic'].apply(lambda x: 'Topic'+str(x))
finalData['words']=topic2skilla.values()
finalData['cs'] = topic2csa.values()
finalData.sort_values(by='cs',ascending=False,inplace=True)

In [88]:
finalData.to_csv('./topWords_topics.csv')

In [None]:
# Reload model
num_topics = 5
model_path = "./ldaModel_topics-"+str(num_topics)
with open(model_path, 'rb') as f:
    lda_model = pickle.load(f)

In [None]:
model_path = "./ldaModelvis_topics-"+str(num_topics)
with open(model_path, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

### Get representative reviews for each topic

In [11]:
# Document - Topic probability matrix
lda_output = lda_model[corpus]
df_document_topic = pd.DataFrame(np.zeros((len(clean_data),lda_model.num_topics)))
df_document_topic.columns = df_document_topic.columns+1
c = 0
for o in lda_output:
    idx = list(map(lambda x: int(topD.loc[topD['old']==x[0]+1,'new']), o))
    val = list(map(lambda x: round(x[1],3), o))
    df_document_topic.loc[c,idx] = val
    c+=1

In [59]:
# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
max_proba = np.max(df_document_topic.values, axis = 1)

# Styling
def color_green(val):
    color = 'green' if val > .15 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .15 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)

Unnamed: 0,1,2,3,4,5
0,0.102,0.101,0.1,0.101,0.595
1,0.017,0.412,0.017,0.536,0.017
2,0.013,0.013,0.013,0.949,0.013
3,0.069,0.068,0.068,0.727,0.068
4,0.012,0.012,0.012,0.233,0.731
5,0.026,0.025,0.026,0.898,0.026
6,0.0,0.0,0.0,0.964,0.0
7,0.069,0.068,0.068,0.727,0.068
8,0.041,0.041,0.041,0.836,0.041
9,0.325,0.554,0.04,0.041,0.04


In [60]:
rep_docs_lda = pd.DataFrame()
n_examples = 10

for c in df_document_topic.columns:
    aux = df_document_topic[c].copy()
    aux.sort_values(ascending=False, inplace = True)
    ix = aux.index.values
    ix = ix[:n_examples]
    rep_docs_lda[c] = clean_data.loc[ix,'content'].values
    print("Topic ", c)
    for i in np.arange(n_examples):
        print(rep_docs_lda[c][i], end="\n\n")

Topic  1
The concept is fine, but I doubt many of the profiles are actually active. 
Sure, everyone looks good, but that doesn't mean much, when most of them 
aren't actively using the app. I can swipe right on 40-50 profiles every 
day, and I'm lucky to get 1-2 matches every couple of weeks. Most of the 
time I come up empty. Even when I do get matches, 90% of the time they 
either miss it, due to the time out window, ignore it, or unmatch/disappear 
within an hour or two. I wish that I was exaggerating, but I haven't had an 
actual conversation with a match in over a year. I have multiple photos 
from different angles, including full body shots, a witty profile, and am 
verified as real, so there's no reason why I shouldn't be getting at least 
the odd match here or there. I like the concept, but I'm disappointed with 
the execution. It seems I might be better off with Tinder, where at least I 
get the odd match, and actually interact with people most of the time.

UPDATE: Im having 

### Get topic for each review

In [12]:
dominant_topic = np.argmax(df_document_topic.values, axis=1)
clean_data['dominant_topic'] = dominant_topic

In [18]:
clean_data.to_csv('../data/preprocessed/bumble_preprocessed.csv', index = False)