# Topic Modeling (Using gensim)


In [5]:
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel

from pprint import pprint

import spacy

import pickle
import re
import pyLDAvis
import pyLDAvis.gensim

import matplotlib.pyplot as plt
import pandas as pd
import nltk
import numpy as np

## Data Preparation

In [6]:
# Toy data set: each record pairs a raw sentence with its hand-assigned topic.
# Keeping text and label side by side makes the alignment obvious; the two
# parallel lists used by the rest of the notebook are derived below.
labelled_docs = [
    ('The sky is blue and beautiful.', 'weather'),
    ('Love this blue and beautiful sky!', 'weather'),
    ('The quick brown fox jumps over the lazy dog.', 'animals'),
    ("A king's breakfast has sausages, ham, bacon, eggs, toast and beans", 'food'),
    ('I love green eggs, ham, sausages and bacon!', 'food'),
    ('The brown fox is quick and the blue dog is lazy!', 'animals'),
    ('The sky is very blue and the sky is very beautiful today', 'weather'),
    ('The dog is lazy but the brown fox is quick!', 'animals'),
]
corpus = [sentence for sentence, _ in labelled_docs]
labels = [topic for _, topic in labelled_docs]

# Shared text-processing resources used by normalize_document():
# the English stop-word list shipped with NLTK ...
stop_words = nltk.corpus.stopwords.words('english')
# ... and NLTK's WordPunctTokenizer for splitting cleaned text into tokens.
wpt = nltk.WordPunctTokenizer()


def normalize_document(doc):
    """Clean one raw document and return it as a space-joined token string.

    The document is stripped of every character that is not an ASCII letter
    or whitespace, lower-cased, trimmed, tokenized with the module-level
    ``wpt`` tokenizer, and filtered against the module-level ``stop_words``
    collection.

    Args:
        doc: Raw document text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so a positional `re.I | re.A` would be read as
    # "replace at most 258 matches" and the flags would never take effect.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower().strip()
    # tokenize the cleaned document
    tokens = wpt.tokenize(doc)
    # drop stop words, then rebuild the document from what is left
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(filtered_tokens)


# Element-wise wrapper around the single-document cleaner, so a whole
# collection of documents can be normalized with one call.
normalize_corpus = np.vectorize(pyfunc=normalize_document)

# Clean every raw sentence; ending the cell on the bare name displays the array.
norm_corpus = normalize_corpus(corpus)
norm_corpus

array(['sky blue beautiful', 'love blue beautiful sky',
       'quick brown fox jumps lazy dog',
       'kings breakfast sausages ham bacon eggs toast beans',
       'love green eggs ham sausages bacon',
       'brown fox quick blue dog lazy', 'sky blue sky beautiful today',
       'dog lazy brown fox quick'], dtype='<U51')

In [7]:
# Split each normalized document on single spaces to get its token list.
norm_corpus_tokens = [document.split(" ") for document in norm_corpus]

# Vocabulary built from the tokenized documents (token <-> integer id).
id2word = Dictionary(norm_corpus_tokens)

# Term-document frequency: encode every document as bag-of-words pairs.
# NOTE: this rebinds `corpus`, which held the raw sentences up to this point.
corpus = list(map(id2word.doc2bow, norm_corpus_tokens))
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1)]]


In [9]:
# Human-readable bag-of-words: replace each token id with its word.
# The slice covers the first three documents so the code agrees with the
# three-document output recorded for this cell.
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:3]]

[[('beautiful', 1), ('blue', 1), ('sky', 1)],
 [('beautiful', 1), ('blue', 1), ('sky', 1), ('love', 1)],
 [('brown', 1),
  ('dog', 1),
  ('fox', 1),
  ('jumps', 1),
  ('lazy', 1),
  ('quick', 1)]]

## LDA Model Building

In [10]:
# Fit an LDA topic model on the toy bag-of-words corpus.
# Hyper-parameters are collected in one place so they are easy to tweak.
toy_lda_settings = dict(
    num_topics=10,
    random_state=0,  # fixed seed
    chunksize=100,
    alpha='auto',
    per_word_topics=True,
)
lda_model = LdaModel(corpus=corpus, id2word=id2word, **toy_lda_settings)

# Show the keyword mix of the topics, then apply the model to the corpus.
toy_topics = lda_model.print_topics()
pprint(toy_topics)
doc_lda = lda_model[corpus]

[(0,
  '0.058*"ham" + 0.058*"sausages" + 0.058*"bacon" + 0.058*"eggs" + '
  '0.058*"toast" + 0.058*"fox" + 0.058*"lazy" + 0.058*"dog" + 0.058*"brown" + '
  '0.058*"quick"'),
 (1,
  '0.050*"blue" + 0.050*"beautiful" + 0.050*"sky" + 0.050*"quick" + '
  '0.050*"brown" + 0.050*"dog" + 0.050*"lazy" + 0.050*"fox" + 0.050*"love" + '
  '0.050*"eggs"'),
 (2,
  '0.050*"sky" + 0.050*"blue" + 0.050*"beautiful" + 0.050*"lazy" + 0.050*"dog" '
  '+ 0.050*"fox" + 0.050*"quick" + 0.050*"brown" + 0.050*"love" + '
  '0.050*"eggs"'),
 (3,
  '0.124*"dog" + 0.124*"quick" + 0.124*"fox" + 0.124*"brown" + 0.124*"blue" + '
  '0.124*"lazy" + 0.065*"beautiful" + 0.065*"love" + 0.065*"sky" + '
  '0.006*"sausages"'),
 (4,
  '0.050*"blue" + 0.050*"sky" + 0.050*"quick" + 0.050*"beautiful" + '
  '0.050*"love" + 0.050*"brown" + 0.050*"lazy" + 0.050*"dog" + 0.050*"fox" + '
  '0.050*"ham"'),
 (5,
  '0.050*"sky" + 0.050*"blue" + 0.050*"beautiful" + 0.050*"lazy" + 0.050*"dog" '
  '+ 0.050*"quick" + 0.050*"brown" + 0.050*"f

In [11]:
# Display the weighted keywords that make up each topic of the toy model,
# then apply the model to the bag-of-words corpus.
topic_keywords = lda_model.print_topics()
pprint(topic_keywords)
doc_lda = lda_model[corpus]

[(0,
  '0.058*"ham" + 0.058*"sausages" + 0.058*"bacon" + 0.058*"eggs" + '
  '0.058*"toast" + 0.058*"fox" + 0.058*"lazy" + 0.058*"dog" + 0.058*"brown" + '
  '0.058*"quick"'),
 (1,
  '0.050*"blue" + 0.050*"beautiful" + 0.050*"sky" + 0.050*"quick" + '
  '0.050*"brown" + 0.050*"dog" + 0.050*"lazy" + 0.050*"fox" + 0.050*"love" + '
  '0.050*"eggs"'),
 (2,
  '0.050*"sky" + 0.050*"blue" + 0.050*"beautiful" + 0.050*"lazy" + 0.050*"dog" '
  '+ 0.050*"fox" + 0.050*"quick" + 0.050*"brown" + 0.050*"love" + '
  '0.050*"eggs"'),
 (3,
  '0.124*"dog" + 0.124*"quick" + 0.124*"fox" + 0.124*"brown" + 0.124*"blue" + '
  '0.124*"lazy" + 0.065*"beautiful" + 0.065*"love" + 0.065*"sky" + '
  '0.006*"sausages"'),
 (4,
  '0.050*"blue" + 0.050*"sky" + 0.050*"quick" + 0.050*"beautiful" + '
  '0.050*"love" + 0.050*"brown" + 0.050*"lazy" + 0.050*"dog" + 0.050*"fox" + '
  '0.050*"ham"'),
 (5,
  '0.050*"sky" + 0.050*"blue" + 0.050*"beautiful" + 0.050*"lazy" + 0.050*"dog" '
  '+ 0.050*"quick" + 0.050*"brown" + 0.050*"f

## Compute the Coherence Scores

In [22]:
# Score the toy model's topics with the 'c_v' coherence measure.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=norm_corpus_tokens,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.38707259196030064


## Visualization

In [14]:
# Topic-distance visualization for the toy model, rendered inline.
# NOTE(review): newer pyLDAvis releases appear to expose this adapter as
# `pyLDAvis.gensim_models` - confirm the installed version still provides
# `pyLDAvis.gensim`.
pyLDAvis.enable_notebook()
toy_vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
toy_vis

## Another Dataset

In [15]:
# Location of the airline-tweets n-gram CSV; replace with a local file name
# if you have downloaded the data set.
dset_url = 'https://archive.org/download/misc-dataset/airline-tweets-ngram.csv'
airline_df = pd.read_csv(dset_url)

# Join the 1-, 2- and 3-gram columns into one comma-separated string per row.
gram_columns = ['gram1_word_gt_1', 'gram2_word_gt_1', 'gram3_word_gt_1']
unigrams, bigrams, trigrams = (
    airline_df[column].astype('string') for column in gram_columns
)
airline_df['grams'] = unigrams + ',' + bigrams + ',' + trigrams

# Collapse every run of commas and/or whitespace into a single space.
airline_df['grams'] = (
    airline_df['grams']
    .astype('string')
    .replace(r'[,\s]+', r' ', regex=True)
)

# Discard rows that contain a missing value in any column.
airline_df = airline_df.dropna()

# One token list per remaining row.
airline_grams = [row_text.split() for row_text in airline_df['grams'].tolist()]

print(airline_grams[0:1])

[['seriously', 'would', 'pay', 'flight', 'seat', 'play', 'really', 'bad', 'thing', 'fly', 'va', 'would_pay', 'pay_flight', 'flight_seat', 'seat_not', 'not_play', 'really_bad', 'bad_thing', 'thing_fly', 'fly_va', 'bad_thing_fly']]


In [16]:
# Vocabulary built from the airline token lists (token <-> integer id).
airline_id2word = Dictionary(airline_grams)

# Bag-of-words encoding of every token list.
airline_corpus = list(map(airline_id2word.doc2bow, airline_grams))

# Quick check: look at the first encoded document.
print(airline_corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1)]]


In [17]:
# Quick check: translate the ids of the first document back into
# (word, frequency) pairs.
[
    [(airline_id2word[token_id], count) for token_id, count in bow]
    for bow in airline_corpus[:1]
]

[[('bad', 1),
  ('bad_thing', 1),
  ('bad_thing_fly', 1),
  ('flight', 1),
  ('flight_seat', 1),
  ('fly', 1),
  ('fly_va', 1),
  ('not_play', 1),
  ('pay', 1),
  ('pay_flight', 1),
  ('play', 1),
  ('really', 1),
  ('really_bad', 1),
  ('seat', 1),
  ('seat_not', 1),
  ('seriously', 1),
  ('thing', 1),
  ('thing_fly', 1),
  ('va', 1),
  ('would', 1),
  ('would_pay', 1)]]

In [18]:
# Fit an LDA topic model on the airline bag-of-words corpus.
# Hyper-parameters are collected in one place so they are easy to tweak.
airline_lda_settings = dict(
    num_topics=10,
    random_state=100,  # fixed seed
    update_every=1,
    chunksize=100,
    alpha='auto',
    per_word_topics=True,
)
airline_lda_model = LdaModel(
    corpus=airline_corpus,
    id2word=airline_id2word,
    **airline_lda_settings,
)

In [19]:
# Display the weighted keywords that make up each airline topic,
# then apply the model to the airline bag-of-words corpus.
airline_topic_keywords = airline_lda_model.print_topics()
pprint(airline_topic_keywords)
doc_lda = airline_lda_model[airline_corpus]

[(0,
  '0.058*"call" + 0.034*"cannot" + 0.033*"get" + 0.025*"help" + 0.023*"flight" '
  '+ 0.023*"phone" + 0.021*"still" + 0.018*"make" + 0.018*"wait" + '
  '0.015*"call_back"'),
 (1,
  '0.166*"flight" + 0.077*"cancel" + 0.043*"flightled" + '
  '0.042*"cancel_flightled" + 0.025*"tomorrow" + 0.022*"cancel_flight" + '
  '0.019*"dfw" + 0.015*"need" + 0.014*"miss" + 0.014*"home"'),
 (2,
  '0.053*"get" + 0.044*"back" + 0.031*"flight_cancel" + 0.028*"work" + '
  '0.024*"us" + 0.024*"flight_cancel_flightled" + 0.022*"thank" + 0.022*"you" '
  '+ 0.019*"know" + 0.018*"let"'),
 (3,
  '0.070*"hold" + 0.037*"weather" + 0.030*"please" + 0.023*"email" + '
  '0.022*"hour" + 0.021*"response" + 0.015*"rebook" + 0.015*"info" + '
  '0.014*"do" + 0.013*"dm"'),
 (4,
  '0.035*"hour" + 0.028*"delay" + 0.027*"one" + 0.023*"time" + 0.021*"bad" + '
  '0.021*"even" + 0.016*"ever" + 0.015*"line" + 0.014*"would" + 0.014*"me"'),
 (5,
  '0.037*"gate" + 0.033*"change" + 0.027*"late" + 0.026*"agent" + '
  '0.023*"leav

In [20]:
# Score the airline model's topics with the 'c_v' coherence measure.
# Value recorded from the original run: Coherence Score:  0.3016957987636324
airline_coherence_model_lda = CoherenceModel(
    model=airline_lda_model,
    texts=airline_grams,
    dictionary=airline_id2word,
    coherence='c_v',
)
airline_coherence_lda = airline_coherence_model_lda.get_coherence()
print('\nCoherence Score: ', airline_coherence_lda)


Coherence Score:  0.3016957987636324


In [21]:
# Topic-distance visualization for the airline model, rendered inline.
# NOTE(review): newer pyLDAvis releases appear to expose this adapter as
# `pyLDAvis.gensim_models` - confirm the installed version still provides
# `pyLDAvis.gensim`.
pyLDAvis.enable_notebook()
airline_vis = pyLDAvis.gensim.prepare(airline_lda_model, airline_corpus, airline_id2word)
airline_vis