<a href="https://colab.research.google.com/github/andrybrew/pythondataanalytics/blob/master/005_topic_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Topic Modeling

In machine learning and natural language processing, a topic model is a type of statistical model for discovering the abstract "topics" that occur in a collection of documents. Topic modeling is a frequently used text-mining tool for discovery of hidden semantic structures in a text body.

Install Library

In [0]:
# Install pyLDAvis
! pip install pyLDAvis

Import Libraries and Modules

In [0]:
# `from __future__` imports must be the first statement of a module; keeping
# this one at the top lets the cell also run when exported as a plain script.
from __future__ import print_function

# Import Libraries
import nltk
import os
import numpy as np
import pyLDAvis

# pyLDAvis 3.4 renamed the `pyLDAvis.sklearn` submodule to `pyLDAvis.lda_model`.
# Import whichever one exists and expose it under the old name, so that code
# calling `pyLDAvis.sklearn.prepare(...)` works on both old and new releases.
try:
    import pyLDAvis.sklearn
except ImportError:
    import pyLDAvis.lda_model
    pyLDAvis.sklearn = pyLDAvis.lda_model

# Render pyLDAvis visualisations inline in the notebook.
pyLDAvis.enable_notebook()

# Import Modules
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from matplotlib import pyplot as plt

Clone the Helper Library and Data from the GitHub Repository

In [0]:
# Clone Data from Github
! git clone https://github.com/dianrdn/tm

# Set Data Directory
os.chdir('tm')

In [0]:
# Import Data
# Download NLTK's stop-word corpus.
nltk.download('stopwords')

# Corpus file name, relative to the current working directory.
data_file = 'berita_batubara.csv'

# Topic-model settings -- depends on the purpose of analytics.
n_topics = 4      # number of LDA components to fit
Top_Topics = 5    # passed to TS.print_Topics when listing topics
Top_Words = 5     # passed to TS.print_Topics when listing words per topic

# Document-frequency cut-offs handed to CountVectorizer -- can be adjusted.
max_df = 0.75
min_df = 10

In [0]:
# Import Library
# Kept separate from the main import cell because it has to run after the
# working directory has been switched with os.chdir('tm').
# NOTE(review): assumes MyLib is shipped inside the cloned `tm` repository and
# is importable from the current working directory -- confirm.
import MyLib as TS

Topic Modeling

In [0]:
# Load the corpus named by `data_file` through the MyLib helper.
# NOTE(review): the result is used as a sequence of documents (len(), [0],
# CountVectorizer input); the exact return type of TS.LoadTxt is not visible here.
Tweets = TS.LoadTxt(data_file) 
# Sanity check: report how many documents were loaded and show the first one.
print('Total loaded tweets = {0}'.format(len(Tweets)))
print(Tweets[0])

In [0]:
# Build the term-frequency document-term matrix (vector space model).
# Tokens are runs of 3+ ASCII letters, lower-cased and accent-stripped; terms
# above `max_df` or below `min_df` document frequency are dropped.
# NOTE(review): scikit-learn's built-in English stop-word list is applied, but
# the corpus file name suggests Indonesian text -- confirm this is intended.
tf_vectorizer = CountVectorizer(
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    max_df=max_df,
    min_df=min_df,
)
dtm_tf = tf_vectorizer.fit_transform(Tweets)

# Vocabulary, indexed by column of `dtm_tf`.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# .tolist() keeps `tf_terms` a plain list of str, as the old method returned.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_terms = tf_vectorizer.get_feature_names_out().tolist()
else:
    tf_terms = tf_vectorizer.get_feature_names()

# Release the raw corpus. Re-running this cell therefore requires re-running
# the cell that loads `Tweets` first.
del Tweets
print('Done Calculating VSM ... ', flush = True)

In [0]:
# LDA Topics
# Configure the model first, then fit it on the document-term matrix.
# random_state is fixed so repeated runs give the same topics.
lda_tf = LatentDirichletAllocation(
    n_components=n_topics,
    learning_method='online',
    random_state=0,
)
lda_tf.fit(dtm_tf)
print('Done LDA topics ... ', flush = True)

In [0]:
# Project every document into topic space (one row per document).
vsm_topics = lda_tf.transform(dtm_tf)

# Dominant topic of each document: index of the largest weight in its row,
# shifted by one so that topics are numbered from 1.
doc_topic = [weights.argmax() + 1 for weights in tqdm(vsm_topics)]

# How many distinct topics actually dominate at least one document.
print('In total there are {0} major topics, distributed as follows'.format(len(set(doc_topic))))

# Distribution of documents over their dominant topic.
plt.hist(np.array(doc_topic), alpha=0.5)
plt.show()

# List the topics and their words via the MyLib helper.
print('Printing top {0} Topics, with top {1} Words:'.format(Top_Topics, Top_Words))
TS.print_Topics(lda_tf, tf_terms, Top_Topics, Top_Words)

In [0]:
# Interactive topic visualisation built from the fitted model, the
# document-term matrix and the vectorizer. Warnings emitted here can be ignored.
# Rendering may take a few minutes; afterwards hover the mouse over the topics
# to explore them.
pyLDAvis.sklearn.prepare(
    lda_tf,
    dtm_tf,
    tf_vectorizer,
)