In [1]:
%reload_kedro

2020-02-27 13:25:24,511 - root - INFO - ** Kedro project Dynamic Topic Modeling
2020-02-27 13:25:24,512 - root - INFO - Defined global variable `context` and `catalog`


In this notebook, we introduce topic modeling methods through the [UN General Debates Dataset from Kaggle](https://www.kaggle.com/unitednations/un-general-debates), using [gensim](https://radimrehurek.com/gensim/).

In [2]:
import pandas as pd
from tqdm import tqdm
import pickle
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import CoherenceModel, LdaModel
from gensim.corpora import Dictionary, MmCorpus

%matplotlib inline

In [3]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

In [22]:
pd.set_option('display.max_colwidth', 10000)

def most_relevant_docs(topic_number, lda_model, n, corpus):
    my_ids = np.linspace(start=1, stop=len(docs), num=len(docs))
    tops = sorted(zip(my_ids, corpus), reverse=True, key=lambda x: abs(dict(x[1]).get(topic_number, 0.0)))[:n]
    return([int(x[0]) for x in tops])

def topic_per(topic_number, lda_model, thres, corpus):
    cnt = 0
    for x in range(len(corpus)):
        for topic, pr in corpus[x]:
            if topic == topic_number and pr > thres:
                cnt+=1
    return("{0:.0%}".format(cnt / len(corpus)))

def topic_display(lda_model, num_docs, thres, corpus):
    topics = lda_model.show_topics(formatted=False, num_topics=20)
    df = pd.DataFrame([[idx, [x for x,y in words], "{0:.0%}".format(lda_model.alpha[idx]), most_relevant_docs(idx, lda_model, num_docs, corpus), topic_per(idx, lda_model, thres, corpus)] for idx,words in topics]).sort_index().drop(columns=[0])
    df.columns = ['Most relevant words', 'Prior', 'Most relevant docs', 'docs with probability > thres']
    return(df)

# Data

In [4]:
docs=pd.read_csv("../reviews_tinder_cleaned.csv")
print(docs.shape)
docs.head()

(362188, 4)


Unnamed: 0,review,stars,version,date
0,"Can't log in, even uninstalled it and tried to...",1.0,11.8.1,2020-02-14T11:06:04
1,"After long work, it stopped starting and began...",1.0,11.8.1,2020-02-14T10:58:04
2,Tinder is full of profiles offering SEX SERVIC...,2.0,11.8.1,2020-02-14T10:42:06
3,This was my experience with tinder. Even if so...,1.0,11.8.1,2020-02-14T10:23:38
4,trash. keep getting banned for no reason. read...,1.0,,2020-02-14T10:12:01


In [5]:
docs["date"] = docs["date"].apply(lambda x: x[:7])

In [6]:
docs = docs.reset_index().drop(columns=['stars', 'version', 'index'])
docs = docs.rename(columns={"review": "text", "date": "timestamp"})

# Preprocessing

In [7]:
from src.dynamic_topic_modeling.pipelines.data_processing.preprocess import preprocess_dataset

In [8]:
extreme_no_below=20
extreme_no_above=0.7
enable_bigram=True
min_bigram_count=20
basic_word_analysis=True

In [None]:
res = preprocess_dataset(docs, extreme_no_below, extreme_no_above,
                                                               enable_bigram, min_bigram_count, basic_word_analysis)

# Training

In [10]:
print('Number of unique tokens: %d' % len(res['dictionary']))
print('Number of documents: %d' % len(res['corpus']))

Number of unique tokens: 6841
Number of documents: 362188


In [38]:
# Set training parameters.
num_topics = 3
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = res['dictionary'][0]  # This is only to "load" the dictionary.
id2word = res['dictionary'].id2token

%time model = LdaModel(corpus=res['corpus'], \
                       id2word=id2word, \
                       chunksize=chunksize, \
                       alpha='auto', \
                       eta='auto', \
                       iterations=iterations, \
                       num_topics=num_topics, \
                       passes=passes, \
                       eval_every=eval_every)

CPU times: user 14min 26s, sys: 1.75 s, total: 14min 27s
Wall time: 14min 24s


In [39]:
%time topic_display(model, 2, 0.2, res['corpus'])

CPU times: user 4.71 s, sys: 136 ms, total: 4.85 s
Wall time: 3.02 s


Unnamed: 0,Most relevant words,Prior,Most relevant docs,docs with probability > thres
0,"[match, app, time, get, message, got, still, work, even, day]",117%,"[1, 3487]",0%
1,"[app, like, people, good, one, pay, get, profile, great, see]",192%,"[226507, 325690]",3%
2,"[tinder, account, money, banned, gold, use, month, reason, even, dont]",104%,"[325311, 222542]",2%


# Visualisation

In [42]:
import pyLDAvis
import pyLDAvis.gensim as gensimvis
pyLDAvis.enable_notebook()

In [43]:
vis_data = gensimvis.prepare(model, res['corpus'], res['dictionary']) # MDS available : pcoa, tsne, mmds
pyLDAvis.display(vis_data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [44]:
pyLDAvis.save_html(vis_data, "data/07_model_output/tinder_vis_lda.html")

# Linkedin

In [45]:
docs=pd.read_csv("../reviews_linkedin_cleaned.csv")
print(docs.shape)
docs.head()

(90091, 4)


Unnamed: 0,review,stars,version,date
0,No issues.,5.0,4.1.401,2020-02-16T07:08:34
1,"Installed on my device automatically, can't be removed or shut off. Stop forcing your app on people who don't want it.",1.0,1.0.0,2020-02-16T06:17:11
2,Very convenient and easy touse,5.0,4.1.405,2020-02-16T05:55:58
3,Excellant platform for B2B B2C professional opportunities .,5.0,,2020-02-16T05:55:19
4,Good to use.,1.0,4.1.388,2020-02-16T04:43:57


In [None]:
docs["date"] = docs["date"].apply(lambda x: x[:7])
docs = docs.reset_index().drop(columns=['stars', 'version', 'index'])
docs = docs.rename(columns={"review": "text", "date": "timestamp"})

res = preprocess_dataset(docs, extreme_no_below, extreme_no_above, enable_bigram, min_bigram_count, basic_word_analysis)

In [49]:
# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = res['dictionary'][0]  # This is only to "load" the dictionary.
id2word = res['dictionary'].id2token

%time model = LdaModel(corpus=res['corpus'], \
                       id2word=id2word, \
                       chunksize=chunksize, \
                       alpha='auto', \
                       eta='auto', \
                       iterations=iterations, \
                       num_topics=num_topics, \
                       passes=passes, \
                       eval_every=eval_every)

CPU times: user 3min 5s, sys: 67.8 ms, total: 3min 6s
Wall time: 3min 5s


## Result for 3 topics

In [48]:
topic_display(model, 2, 0.2, res['corpus'])

Unnamed: 0,Most relevant words,Prior,Most relevant docs,docs with probability > thres
0,"[good, job, professional, nice, people, linkedin, application, one, experience, business]",102%,"[25857, 47225]",32%
1,"[app, great, best, work, love, useful, helpful, excellent, connection, thanks]",78%,"[29524, 30112]",1%
2,"[linkedin, update, use, time, even, phone, notification, account, message, post]",126%,"[504, 1666]",1%


## Result for 10 topics

In [50]:
topic_display(model, 2, 0.2, res['corpus'])

Unnamed: 0,Most relevant words,Prior,Most relevant docs,docs with probability > thres
0,"[linkedin, use, notification, account, message, post, need, please, profile, want]",303%,"[25857, 47225]",32%
1,"[nice, awesome, bad, informative, download, must, user friendly, wonderful, connecting, interface]",67%,"[29524, 30112]",1%
2,"[really, help, thanks, career, easy use, team, posting, human, fantastic, date]",69%,"[504, 1666]",1%
3,"[get, people, like, new, many, way, find, connection, contact, know]",165%,"[65181, 17693]",5%
4,"[app, good, great, job, professional, best, one, application, useful, experience]",357%,"[71267, 71569]",0%
5,"[work, well, job search, edit, ok, location, upload resume, resume, properly, apply]",62%,"[64224, 12598]",1%
6,"[update, time, even, phone, working, able, using, always, issue, page]",260%,"[15279, 17531]",0%
7,"[change, service, anything, happy, response, developer, others, resource, level, document]",67%,"[17506, 23216]",2%
8,"[love, uninstall, cant, wrong, video, sent, perfect, reply, suck, fine]",71%,"[2, 606]",0%
9,"[helpful, year, lot, getting, day, better, got, month, used, think]",108%,"[73100, 2]",0%
