# **Topic Modeling and Latent Dirichlet Allocation in Python**


In this article we disscuss topic modeling of twitter data using Latent Dirichlet Allocation algorithm.

In [5]:


import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(text, stem=True):
    result = lemmatizer.lemmatize(text, pos='v')
    if stem:
        result = stemmer.stem(result)
    return result
    
def preprocess(text):
    
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3 and len(token) < 40:
            result.append(lemmatize_stemming(token, stem=False))
    return " ".join(result)


In [7]:
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = [preprocess(d) for d in dataset.data]


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print "Topic %d:" % (topic_idx)
        print " ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]])


no_features = 1000

# Data cleaning

import re
from nltk.corpus import stopwords


# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 20

# Run NMF
#nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=15, learning_method='online', learning_decay=.9, learning_offset=50.,random_state=0).fit(tf)

no_top_words = 10
#display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
space launch nasa satellite program earth orbit science data project
Topic 1:
world jews greek value school jewish like come event learn
Topic 2:
price sell offer power include like ship sale good port
Topic 3:
stephanopoulos health study medical center research report disease patients doctor
Topic 4:
team play game league hockey score players season period player
Topic 5:
file program windows window version available include server source write
Topic 6:
card problem scsi work windows speed memory know drive video
Topic 7:
year think like time start good mark know appear better
Topic 8:
game stuff delete bank trade soon blue probably cross season
Topic 9:
time israel bike right like israeli water attack know grind
Topic 10:
chip encryption government key clipper security phone public technology privacy
Topic 11:
leave home start hear look face turn stop little tire
Topic 12:
people think right know want work like time state need
Topic 13:
light black engine white john hole rad

In [178]:
import numpy as np
np.unique(dataset.target)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [179]:
y_pred_lda = lda.transform(tf)

In [180]:
y_pred_lda_top = [i.argsort()[0] for i in y_pred_lda]

In [181]:
dataset.target

array([17,  0, 17, ...,  9,  4,  9])

In [182]:
y_pred_nmf = nmf.transform(tfidf)

In [183]:
y_pred_nmf_top = [i.argsort()[0] for i in y_pred_nmf]

In [184]:
import pandas as pd
y_pred_nmf_comp = pd.DataFrame(np.column_stack([dataset.target, y_pred_nmf_top]), columns=['y_true', 'y_pred'])
y_pred_lda_comp = pd.DataFrame(np.column_stack([dataset.target, y_pred_lda_top]), columns=['y_true', 'y_pred'])

In [185]:
y_pred_nmf_comp.groupby('y_true').std().mean()

y_pred    5.031132
dtype: float64

In [4]:
y_pred_lda_comp.groupby('y_true').std()

NameError: name 'y_pred_lda_comp' is not defined

In [120]:
y_pred_rand_comp = pd.DataFrame(np.column_stack([dataset.target, np.random.randint(20, size=dataset.target.shape)]), columns=['y_true', 'y_pred'])

In [3]:
y_pred_rand_comp.groupby('y_true').std().mean()

NameError: name 'y_pred_rand_comp' is not defined

In [77]:
np.random.randint(2, size=(10,))

array([0, 1, 0, 0, 0, 1, 0, 1, 1, 1])

In [188]:
y_pred_lda_comp['y_pred'].value_counts()

13    1727
8     1543
3     1348
4     1217
0     1145
14    1029
15     655
19     601
11     404
10     392
18     299
16     220
1      212
2      162
5      101
7       91
6       90
17      45
9       32
12       1
Name: y_pred, dtype: int64

In [2]:
y_pred_lda_comp['y_true'].value_counts()

NameError: name 'y_pred_lda_comp' is not defined

In [1]:
y_pred_lda_comp['y_pred']

NameError: name 'y_pred_lda_comp' is not defined