### Big thanks to Susan Li for the awesome tutorial on LDA
[https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24]

This Notebook was inspired by the Medium article above and is written by Advith Chegu for HackDown 2020. Some more sources:

[https://medium.com/@osas.usen/topic-extraction-from-tweets-using-lda-a997e4eb0985]

[https://www.kaggle.com/therohk/million-headlines/data]

In [36]:
import pandas as pd
DATASET_COLUMNS  = ["sentiment", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
data = pd.read_json('News_Category_Dataset_v2.json', lines=True)
data['to_vec'] = data[['category', 'headline', 'short_description']].agg(' '.join, axis=1)
data.set_index('headline', drop=True, append=False, inplace=False, verify_integrity=False)
data['index'] = data.index
documents = data

In [37]:
print(len(documents))
print(documents[:5])

200853
           authors       category       date  \
0  Melissa Jeltsen          CRIME 2018-05-26   
1    Andy McDonald  ENTERTAINMENT 2018-05-26   
2       Ron Dicker  ENTERTAINMENT 2018-05-26   
3       Ron Dicker  ENTERTAINMENT 2018-05-26   
4       Ron Dicker  ENTERTAINMENT 2018-05-26   

                                            headline  \
0  There Were 2 Mass Shootings In Texas Last Week...   
1  Will Smith Joins Diplo And Nicky Jam For The 2...   
2    Hugh Grant Marries For The First Time At Age 57   
3  Jim Carrey Blasts 'Castrato' Adam Schiff And D...   
4  Julianna Margulies Uses Donald Trump Poop Bags...   

                                                link  \
0  https://www.huffingtonpost.com/entry/texas-ama...   
1  https://www.huffingtonpost.com/entry/will-smit...   
2  https://www.huffingtonpost.com/entry/hugh-gran...   
3  https://www.huffingtonpost.com/entry/jim-carre...   
4  https://www.huffingtonpost.com/entry/julianna-...   

                              

In [5]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/advithchegu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [38]:
stemmer = SnowballStemmer("english")
stop_words = ['http']

In [39]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3 and token not in stop_words and '@' not in token:
            result.append(lemmatize_stemming(token))
    return result

In [40]:
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['Laura', 'Paddison']


 tokenized and lemmatized document: 
['laura', 'paddison']


In [41]:
processed_docs = documents['to_vec'].map(preprocess)
processed_docs[:10]

0    [crime, mass, shoot, texa, week, leav, husband...
1    [entertain, smith, join, diplo, nicki, world, ...
2    [entertain, hugh, grant, marri, time, actor, l...
3    [entertain, carrey, blast, castrato, adam, sch...
4    [entertain, julianna, marguli, use, donald, tr...
5    [entertain, morgan, freeman, devast, sexual, h...
6    [entertain, donald, trump, lovin, mcdonald, ji...
7    [entertain, watch, amazon, prime, week, great,...
8    [entertain, mike, myer, reveal, like, fourth, ...
9    [entertain, watch, hulu, week, get, recent, ac...
Name: to_vec, dtype: object

In [42]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 america
1 children
2 crime
3 husband
4 kill
5 leav
6 mass
7 shoot
8 texa
9 week
10 cours


In [43]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [44]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(121, 1), (187, 1), (323, 1), (949, 1), (959, 1), (3684, 1), (4346, 1)]

In [45]:
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 121 ("impact") appears 1 time.
Word 187 ("women") appears 1 time.
Word 323 ("face") appears 1 time.
Word 949 ("number") appears 1 time.
Word 959 ("intern") appears 1 time.
Word 3684 ("handi") appears 1 time.
Word 4346 ("inevit") appears 1 time.


In [46]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.28118952113002166),
 (1, 0.27998207942225095),
 (2, 0.27737831958734893),
 (3, 0.3605228811876239),
 (4, 0.30418734124514596),
 (5, 0.2856753625347595),
 (6, 0.4125666899870586),
 (7, 0.30562807249758883),
 (8, 0.37797091116476994),
 (9, 0.23453113858671673)]


In [47]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=40, id2word=dictionary, passes=2, workers=2)

In [48]:
print(lda_model)

LdaModel(num_terms=11070, num_topics=40, decay=0.5, chunksize=2000)


In [49]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.039*"click" + 0.038*"yoga" + 0.033*"shop" + 0.023*"publish" + 0.023*"sit" + 0.022*"wine" + 0.021*"breath" + 0.020*"entir" + 0.018*"sexual" + 0.018*"blogger"
Topic: 1 
Words: 0.065*"busi" + 0.023*"social" + 0.021*"compani" + 0.021*"tech" + 0.018*"bank" + 0.015*"collect" + 0.014*"memori" + 0.014*"impact" + 0.013*"network" + 0.012*"develop"
Topic: 2 
Words: 0.183*"well" + 0.157*"food" + 0.031*"stress" + 0.028*"holiday" + 0.016*"eat" + 0.015*"best" + 0.011*"tradit" + 0.010*"time" + 0.010*"way" + 0.009*"fast"
Topic: 3 
Words: 0.050*"american" + 0.034*"risk" + 0.021*"opportun" + 0.019*"approach" + 0.018*"countri" + 0.016*"relat" + 0.014*"signific" + 0.014*"rat" + 0.014*"higher" + 0.013*"mayb"
Topic: 4 
Words: 0.054*"guid" + 0.043*"father" + 0.040*"obama" + 0.035*"bride" + 0.019*"middleton" + 0.017*"british" + 0.016*"bear" + 0.016*"giant" + 0.015*"royal" + 0.014*"coast"
Topic: 5 
Words: 0.043*"game" + 0.040*"super" + 0.040*"season" + 0.035*"bowl" + 0.034*"natur" + 0.032*"pl

In [50]:
processed_docs[4310]

['impact', 'number', 'handi', 'intern', 'women', 'face', 'inevit']

In [51]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.4780633747577667	 
Topic: 0.065*"busi" + 0.023*"social" + 0.021*"compani" + 0.021*"tech" + 0.018*"bank" + 0.015*"collect" + 0.014*"memori" + 0.014*"impact" + 0.013*"network" + 0.012*"develop"

Score: 0.25011584162712097	 
Topic: 0.102*"women" + 0.046*"girl" + 0.024*"medit" + 0.020*"shoe" + 0.020*"note" + 0.019*"woman" + 0.018*"outfit" + 0.016*"interest" + 0.015*"march" + 0.014*"highlight"

Score: 0.15618090331554413	 
Topic: 0.029*"blue" + 0.027*"requir" + 0.023*"buy" + 0.018*"economi" + 0.018*"paper" + 0.016*"print" + 0.015*"januari" + 0.014*"jump" + 0.014*"meat" + 0.013*"task"


In [52]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.4782792329788208	 
Topic: 0.065*"busi" + 0.023*"social" + 0.021*"compani" + 0.021*"tech" + 0.018*"bank" + 0.015*"collect" + 0.014*"memori" + 0.014*"impact" + 0.013*"network" + 0.012*"develop"

Score: 0.24996843934059143	 
Topic: 0.102*"women" + 0.046*"girl" + 0.024*"medit" + 0.020*"shoe" + 0.020*"note" + 0.019*"woman" + 0.018*"outfit" + 0.016*"interest" + 0.015*"march" + 0.014*"highlight"

Score: 0.1561124622821808	 
Topic: 0.029*"blue" + 0.027*"requir" + 0.023*"buy" + 0.018*"economi" + 0.018*"paper" + 0.016*"print" + 0.015*"januari" + 0.014*"jump" + 0.014*"meat" + 0.013*"task"


In [54]:
unseen_document = 'Donald Trump is a loser'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.5012917518615723	 Topic: 0.038*"wall" + 0.035*"street" + 0.035*"presid" + 0.032*"fit" + 0.030*"addict"
Score: 0.26119643449783325	 Topic: 0.038*"manag" + 0.032*"card" + 0.026*"cocktail" + 0.024*"spirit" + 0.023*"thanksgiv"
