<a href="https://colab.research.google.com/github/anshupandey/xebia_training_data/blob/main/Case_Study_Topic_modelling_using_LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation and Setup

In [None]:
# install below packages if not available
!pip install gensim
!pip install spaCy



In [None]:
# download the corpus for nltk and spacy's english language model
# Of the three NLTK resources fetched here, only "stopwords" is read by the
# cells visible in this notebook; "punkt" and "wordnet" do not appear to be
# used later -- TODO confirm before removing them.
import nltk
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")
# Fetch spaCy's English model through the `en` shortcut name.
!python -m spacy download en

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [None]:
!pip install pyLDAvis

Collecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 2.8MB/s 
Collecting funcy
[?25l  Downloading https://files.pythonhosted.org/packages/ce/4b/6ffa76544e46614123de31574ad95758c421aae391a1764921b8a81e1eae/funcy-1.14.tar.gz (548kB)
[K     |████████████████████████████████| 552kB 48.7MB/s 
Building wheels for collected packages: pyLDAvis, funcy
  Building wheel for pyLDAvis (setup.py) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-2.1.2-py2.py3-none-any.whl size=97711 sha256=637c5bddd0921c8cab1ee7e859a81c8b670094853bf0db671e5e816d4cab9b8a
  Stored in directory: /root/.cache/pip/wheels/98/71/24/513a99e58bb6b8465bae4d2d5e9dba8f0bef8179e3051ac414
  Building wheel for funcy (setup.py) ... [?25l[?25hdone
  Created wheel for funcy: filename=funcy-1.14-py2.py3-none-any.whl size=32042 sha256=fcdcdde5

# importing packages

In [None]:
import re
import numpy as np
import pandas as pd

# GENSIM
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import spacy

# importing data

In [None]:
# importing newsgroup data
# Fetch the 20 Newsgroups corpus through scikit-learn (it is downloaded when not
# already cached) and keep only the list of raw post texts in `data`.
from sklearn import datasets
emaildata = datasets.fetch_20newsgroups()
data = emaildata.data

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [None]:
# Number of posts that were loaded.
len(data)

11314

In [None]:
# Print the raw text of the post at index 11000 to see what an unprocessed
# document (headers, quoted replies, body) looks like.
print(data[11000])

From: yamauchi@ces.cwru.edu (Brian Yamauchi)
Subject: DC-X: Choice of a New Generation (was Re: SSRT Roll-Out Speech)
Organization: Case Western Reserve University
Lines: 27
Distribution: world
NNTP-Posting-Host: yuggoth.ces.cwru.edu
In-reply-to: jkatz@access.digex.com's message of 21 Apr 1993 22:09:32 -0400

In article <1r4uos$jid@access.digex.net> jkatz@access.digex.com (Jordan Katz) writes:

>		   Speech Delivered by Col. Simon P. Worden,
>			The Deputy for Technology, SDIO
>
>	Most of you, as am I, are "children of the 1960's."  We grew
>up in an age of miracles -- Inter-Continental Ballistic Missiles,
>nuclear energy, computers, flights to the moon.  But these were
>miracles of our parent's doing. 

>                          Speech by Pete Worden
>          Delivered Before the U.S. Space Foundation Conference

>     I'm embarrassed when my generation is compared with the last
>generation -- the giants of the last great space era, the 1950's
>and 1960's.  They went to the moon - 

# Data Cleaning

In [None]:
# NLTK's English stop-word list plus a few extra words specific to this corpus
# (presumably e-mail header and boilerplate tokens -- confirm the choice).
from nltk.corpus import stopwords
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

In [None]:
# Strip noise from every post before tokenizing. The patterns are raw strings so
# that `\S` and `\s` reach the regex engine as written instead of being parsed
# as (invalid) string escape sequences, which newer Python versions warn about.

# Remove Emails: any run of non-space characters containing "@", plus at most
# one whitespace character right after it
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters: every whitespace run (newlines, tabs, spaces)
# collapses into a single space
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub(r"'", "", sent) for sent in data]

## Tokenize words and Clean-up text

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize documents, yielding one list of tokens per document.

    Each item of ``sentences`` is coerced with ``str()`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (per gensim's
    documentation, ``deacc`` strips accent marks from the tokens).
    """
    tokenize = gensim.utils.simple_preprocess
    for raw_doc in sentences:
        tokens = tokenize(str(raw_doc), deacc=True)
        yield tokens

# Materialize the generator so the tokenized posts can be indexed and reused.
data_words = list(sent_to_words(data))

# Peek at the first tokenized post.
print(data_words[:1])

[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]


# Creating Bigram and Trigram Models

In [None]:
# Build the bigram and trigram models
# `bigram` is trained on the tokenized posts; `trigram` is trained on the same
# posts after the bigram model has been applied to them.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
# (the bigram phraser runs first, then the trigram phraser on its output)
print(trigram_mod[bigram_mod[data_words[0]]])



['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting_host', 'rac_wam_umd_edu', 'organization', 'university', 'of', 'maryland_college_park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


# Remove Stopwords, Make Bigrams and Lemmatize

In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop every token listed in ``stop_words``.

    Parameters
    ----------
    texts : iterable
        Documents to filter. Each one is passed through ``str()`` and then
        ``simple_preprocess`` before its tokens are filtered.

    Returns
    -------
    list of list of str
        One filtered token list per input document, in input order.
    """
    # Snapshot the module-level stop-word list as a set once per call, so each
    # membership test is a hash lookup instead of a scan of the whole list.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Apply the bigram phraser ``bigram_mod`` to every document; return the results as a list."""
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

def make_trigrams(texts):
    """Run every document through ``bigram_mod`` and then ``trigram_mod``; return the results as a list."""
    def phrase(tokens):
        # Bigram phraser first; the trigram phraser sees its output.
        return trigram_mod[bigram_mod[tokens]]

    return list(map(phrase, texts))

def lemmatization(texts, allowed_postags=None):
    """Lemmatize tokenized documents, keeping only tokens with selected POS tags.

    See https://spacy.io/api/annotation for the tag inventory.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents. Each one is joined with single spaces and run
        through the module-level spaCy pipeline ``nlp``.
    allowed_postags : container of str, optional
        Values of ``token.pos_`` to keep. Defaults to
        ``('NOUN', 'ADJ', 'VERB', 'ADV')``.

    Returns
    -------
    list of list of str
        For each document, the ``lemma_`` of every kept token, in order.
    """
    # A None sentinel replaces the former list default so that no mutable
    # object is shared between calls.
    if allowed_postags is None:
        allowed_postags = ('NOUN', 'ADJ', 'VERB', 'ADV')
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [None]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Load spaCy's English model with the parser and NER components disabled
# (lemmatization below only reads `lemma_` and `pos_`, so they are skipped for efficiency).
# The model is loaded by its package name `en_core_web_sm` instead of the `en`
# shortcut: the download step reports the model as loadable under that name,
# and the package name does not depend on a shortcut link having been created.
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['where', 's', 'thing', 'car', 'nntp_poste', 'host', 'umd', 'organization', 'university', 'maryland_college', 'park', 'line', 'wonder', 'anyone', 'could', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'bricklin', 'door', 'really', 'small', 'addition', 'front_bumper', 'separate', 'rest', 'body', 'know', 'anyone', 'tellme', 'model', 'name', 'engine', 'specs', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']]


# Topic Modeling

In [None]:
# Create Dictionary
# (built from the lemmatized token lists)
dictionary = corpora.Dictionary(data_lemmatized)

# Create Corpus
# `texts` is just another name for the lemmatized documents.
texts = data_lemmatized

# Term Document Frequency
# Each document becomes a bag-of-words; the printed sample below shows
# (token id, count) pairs.
corpus = [dictionary.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 5), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 2), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1)]]


In [None]:
# Build LDA model
# Trains a 20-topic model on the bag-of-words corpus, using `dictionary` to map
# ids back to words. A fixed `random_state` is passed, presumably so repeated
# runs give the same topics -- confirm against the gensim documentation.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

# View the topics in LDA model

In [None]:
# Print the keywords of the topics (the model was built with num_topics=20;
# the output lists 10 weighted keywords per topic)
print(lda_model.print_topics())
# Apply the trained model to the whole corpus; `doc_lda` is not read by any
# later cell visible in this notebook.
doc_lda = lda_model[corpus]

[(0, '0.036*"team" + 0.036*"game" + 0.020*"sale" + 0.019*"play" + 0.015*"hockey" + 0.012*"year" + 0.008*"nhl" + 0.008*"trade" + 0.008*"wing" + 0.007*"steven"'), (1, '0.040*"book" + 0.020*"belief" + 0.020*"atheist" + 0.018*"church" + 0.015*"pin" + 0.015*"slave" + 0.014*"sphere" + 0.012*"character" + 0.010*"lord" + 0.009*"headache"'), (2, '0.015*"choose" + 0.012*"input" + 0.011*"sin" + 0.010*"notice" + 0.009*"eat" + 0.009*"cd" + 0.009*"food" + 0.009*"material" + 0.008*"signal" + 0.008*"external"'), (3, '0.035*"not" + 0.022*"write" + 0.022*"do" + 0.020*"would" + 0.020*"line" + 0.019*"organization" + 0.017*"be" + 0.017*"article" + 0.014*"get" + 0.014*"know"'), (4, '0.533*"ax" + 0.008*"rlk" + 0.006*"cub" + 0.005*"echo" + 0.004*"tufts_university" + 0.004*"stl" + 0.004*"pitcher" + 0.004*"pit" + 0.004*"lk" + 0.003*"differential"'), (5, '0.027*"israel" + 0.015*"israeli" + 0.011*"jew" + 0.009*"lebanese" + 0.009*"arab" + 0.009*"jewish" + 0.009*"war" + 0.008*"death" + 0.008*"kill" + 0.007*"attack"

In [None]:
# Same topic listing, shown as the cell's output value instead of via print().
lda_model.print_topics()

[(0,
  '0.036*"team" + 0.036*"game" + 0.020*"sale" + 0.019*"play" + 0.015*"hockey" + 0.012*"year" + 0.008*"nhl" + 0.008*"trade" + 0.008*"wing" + 0.007*"steven"'),
 (1,
  '0.040*"book" + 0.020*"belief" + 0.020*"atheist" + 0.018*"church" + 0.015*"pin" + 0.015*"slave" + 0.014*"sphere" + 0.012*"character" + 0.010*"lord" + 0.009*"headache"'),
 (2,
  '0.015*"choose" + 0.012*"input" + 0.011*"sin" + 0.010*"notice" + 0.009*"eat" + 0.009*"cd" + 0.009*"food" + 0.009*"material" + 0.008*"signal" + 0.008*"external"'),
 (3,
  '0.035*"not" + 0.022*"write" + 0.022*"do" + 0.020*"would" + 0.020*"line" + 0.019*"organization" + 0.017*"be" + 0.017*"article" + 0.014*"get" + 0.014*"know"'),
 (4,
  '0.533*"ax" + 0.008*"rlk" + 0.006*"cub" + 0.005*"echo" + 0.004*"tufts_university" + 0.004*"stl" + 0.004*"pitcher" + 0.004*"pit" + 0.004*"lk" + 0.003*"differential"'),
 (5,
  '0.027*"israel" + 0.015*"israeli" + 0.011*"jew" + 0.009*"lebanese" + 0.009*"arab" + 0.009*"jewish" + 0.009*"war" + 0.008*"death" + 0.008*"kill"

# Compute Model Coherence Score

In [None]:
# Compute Coherence Score
# Uses the 'c_v' coherence measure, computed from the trained model, the
# lemmatized texts and the dictionary.
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.5017680246997409


# Visualize the topics-keywords

In [None]:
# Visualize the topics
# enable_notebook() is called first so the visualization can render inline;
# the bare `vis` on the last line makes it the cell's displayed output.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis

# LDA Mallet Model

You can often achieve better topic quality with the LDA Mallet model.

In [None]:
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
import os

# Location of the MALLET launcher. Set the MALLET_PATH environment variable to
# point at a local install without editing the notebook; otherwise the
# original hard-coded location is used.
mallet_path = os.environ.get(
    "MALLET_PATH",
    r"D:\AI\jpmc_mumbai\mallet-2.0.8\mallet-2.0.8\bin\mallet",  # update this path
)

# NOTE(review): MALLET's Windows launcher reads MALLET_HOME; when it is unset
# the wrapper can fail with a FileNotFoundError for its temporary
# *_state.mallet.gz file. Default it to the install root (two levels above
# bin/mallet) without overriding a value the user already set -- confirm this
# matches the local layout.
mallet_home = os.path.dirname(os.path.dirname(mallet_path))
if mallet_home:
    os.environ.setdefault("MALLET_HOME", mallet_home)

ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=dictionary)


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\anshu\\AppData\\Local\\Temp\\93da81_state.mallet.gz'

In [None]:
# Show Topics
print(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
# The dictionary built earlier in this notebook is named `dictionary`; the
# previous `id2word` name is never defined here and raised a NameError.
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=dictionary, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

# Finding the optimal number of topics for LDA

Choosing a ‘k’ that marks the end of a rapid growth of topic coherence usually offers meaningful and interpretable topics. Picking an even higher value can sometimes provide more granular sub-topics.

If you see the same keywords being repeated in multiple topics, it’s probably a sign that the ‘k’ is too large.

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Train one LDA Mallet model per candidate topic count and score each with c_v coherence.

    Candidate topic counts are ``range(start, limit, step)``, so ``limit`` itself
    is not tried. The Mallet launcher location is read from the module-level
    ``mallet_path``.

    Parameters:
    ----------
    dictionary : Gensim dictionary, used as ``id2word`` for the model and for the coherence model
    corpus : Gensim corpus the models are trained on
    texts : List of input texts used for the coherence computation
    limit : Exclusive upper bound on the number of topics
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models, one per candidate topic count
    coherence_values : Coherence values, aligned index-by-index with the model list
    """
    fitted_models = []
    scores = []
    for topic_count in range(start, limit, step):
        mallet_lda = gensim.models.wrappers.LdaMallet(
            mallet_path,
            corpus=corpus,
            num_topics=topic_count,
            id2word=dictionary,
        )
        fitted_models.append(mallet_lda)

        scorer = CoherenceModel(
            model=mallet_lda,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v',
        )
        scores.append(scorer.get_coherence())

    return fitted_models, scores

In [None]:
# Can take a long time to run.
# Trains and scores one model for each topic count in range(2, 40, 6).
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=data_lemmatized, start=2, limit=40, step=6)

In [None]:
# Show graph
# NOTE: these values must match the start/limit/step passed to
# compute_coherence_values, otherwise x and coherence_values will not line up.
limit=40; start=2; step=6;
x = range(start, limit, step)
# Label the line itself and let legend() pick the label up. The previous call
# passed ("coherence_values"), which is a plain string (parentheses without a
# comma do not make a tuple), so the legend did not show the intended label.
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()