In [None]:
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import pyLDAvis.lda_model
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore', category=FutureWarning)

RANDOM_SEED = 42

## Step 1: load data

In [None]:
# Path to the root of the cloned repo. When left empty, the data path below
# is resolved relative to the current working directory.
directory = ''

# get corpus
# Build the path with os.path.join so an empty `directory` does not collapse
# into a path anchored at the filesystem root ('/data/processed/...').
# NOTE: pd.read_pickle unpickles arbitrary Python objects -- only load files
# from a source you trust.
fp = os.path.join(directory, 'data', 'processed', 'topic_modeling_corpus.pkl')
docs = pd.read_pickle(fp)

# prepare train and test set: hold out 20% of the documents, seeded so the
# split is the same on every run
train_data, test_data = train_test_split(docs, test_size=0.2, random_state=RANDOM_SEED)

## Step 2: modeling with LDA

In [26]:
# vectorize: bag-of-n-grams term counts
# (optional knobs currently disabled: preprocessor=custom_preprocessor,
#  max_features=20000)
tf_vectorizer = CountVectorizer(
    lowercase=True,         # text is lowercased before tokenizing
    ngram_range=(2, 3),     # bigrams and trigrams only; unigrams are excluded
    min_df=1,               # int -> absolute count: keep terms seen in >= 1 doc
    max_df=0.90,            # float -> proportion: drop terms in > 90% of docs
    stop_words='english',   # built-in English stop-word list
)

# The vocabulary is learned from the training split only; the test split is
# projected onto that same vocabulary.
tf_doc_train = tf_vectorizer.fit_transform(train_data)
tf_doc_test = tf_vectorizer.transform(test_data)
tf_feature_names = tf_vectorizer.get_feature_names_out()

# train LDA
n_topics = 4
lda = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=RANDOM_SEED,
).fit(tf_doc_train)
topic_models = lda.components_

# scoring on the held-out documents
score = lda.score(tf_doc_test)
perplexity = lda.perplexity(tf_doc_test)
print(f'{score = }', f'{perplexity = }', sep='\n')

26 null records dropped out of 3895
score = -23528.688577070636
perplexity = 1375118646231556.5


In [27]:
# visualize and save report
pyLDAvis.enable_notebook()

# Build the interactive panel from the fitted model, the training
# document-term matrix and the vectorizer; mds='tsne' selects t-SNE for the
# topic layout.
panel = pyLDAvis.lda_model.prepare(lda, tf_doc_train, tf_vectorizer, mds='tsne')

# Join path components instead of formatting them into a string, so an empty
# `directory` yields a relative 'reports/...' path rather than '/reports/...'
# at the filesystem root. Create the folder if it is missing so the save does
# not fail on a fresh clone.
reports_dir = os.path.join(directory, 'reports')
os.makedirs(reports_dir, exist_ok=True)

# The output path is kept as a plain str (a filename) for save_html.
pyLDAvis.save_html(panel, os.path.join(reports_dir, f'lda_{n_topics}.html'))