In [1]:
import pandas as pd
import numpy as np
import pickle
from pprint import pprint
import matplotlib.pyplot as plt
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import HdpModel
from gensim.test.utils import datapath
import logging
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)

# load data
dictionary = Dictionary.load("data/dictionary.pkl")
temp_file = open("data/train.pkl", "rb")
train = pickle.load(temp_file)
temp_file.close()
train_reviews = train["tokens"].tolist()
train_corpus = MmCorpus("data/train_corpus.mm")
test_corpus = MmCorpus("data/test_corpus.mm")

2021-05-27 16:35:31,894 : INFO : loading Dictionary object from data/dictionary.pkl
2021-05-27 16:35:31,907 : INFO : Dictionary lifecycle event {'fname': 'data/dictionary.pkl', 'datetime': '2021-05-27T16:35:31.907068', 'gensim': '4.0.1', 'python': '3.9.5 (default, May  4 2021, 03:36:27) \n[Clang 12.0.0 (clang-1200.0.32.29)]', 'platform': 'macOS-11.4-x86_64-i386-64bit', 'event': 'loaded'}
2021-05-27 16:35:33,185 : INFO : loaded corpus index from data/train_corpus.mm.index
2021-05-27 16:35:33,186 : INFO : initializing cython corpus reader from data/train_corpus.mm
2021-05-27 16:35:33,187 : INFO : accepted corpus with 82443 documents, 17978 features, 5286179 non-zero entries
2021-05-27 16:35:33,189 : INFO : loaded corpus index from data/test_corpus.mm.index
2021-05-27 16:35:33,190 : INFO : initializing cython corpus reader from data/test_corpus.mm
2021-05-27 16:35:33,191 : INFO : accepted corpus with 1665 documents, 17974 features, 103674 non-zero entries


In [2]:
# parameters
chunksize = 5000
max_chunks = 32  # force a 2nd pass
# alpha (2nd level concentration) 1
# gamma (first level concentration) 1 
# eta default symmetric (topic dirichlet) .01
# decay/offset at defaults 1/64 (kappa/tau in online hdp paper)

# load dictionary
temp = dictionary[0]
id2word = dictionary.id2token

# train model
model = HdpModel(corpus=train_corpus,
                 id2word=id2word,
                 chunksize=chunksize,
                 max_chunks=max_chunks,
                 random_state=271)

model.save(datapath("hdp"))

2021-05-27 16:35:53,173 : INFO : (0, '0.004*character + 0.004*like + 0.003*really + 0.003*time + 0.003*love + 0.002*series + 0.002*would + 0.002*get + 0.002*first + 0.002*much')
2021-05-27 16:35:53,190 : INFO : (1, '0.001*character + 0.001*like + 0.001*life + 0.001*really + 0.001*first + 0.001*thing + 0.001*good + 0.001*much + 0.001*time + 0.001*would')
2021-05-27 16:35:53,205 : INFO : (2, '0.001*character + 0.001*like + 0.001*get + 0.001*really + 0.001*time + 0.001*much + 0.001*love + 0.001*could + 0.001*want + 0.001*well')
2021-05-27 16:35:53,218 : INFO : (3, '0.001*character + 0.001*like + 0.001*love + 0.001*really + 0.001*series + 0.001*would + 0.001*much + 0.001*first + 0.001*know + 0.001*reading')
2021-05-27 16:35:53,231 : INFO : (4, '0.001*character + 0.001*like + 0.001*though + 0.001*really + 0.001*time + 0.001*love + 0.001*much + 0.001*would + 0.001*interesting + 0.001*could')
2021-05-27 16:35:53,244 : INFO : (5, '0.001*like + 0.001*character + 0.001*really + 0.001*get + 0.001

In [3]:
# top topics
model.print_topics(50)

2021-05-27 16:43:48,129 : INFO : (0, '0.009*character + 0.008*like + 0.006*really + 0.006*time + 0.006*love + 0.005*would + 0.005*series + 0.005*get + 0.005*much + 0.005*first')
2021-05-27 16:43:48,139 : INFO : (1, '0.007*character + 0.006*like + 0.005*time + 0.005*really + 0.005*would + 0.004*love + 0.004*series + 0.004*first + 0.004*much + 0.004*life')
2021-05-27 16:43:48,150 : INFO : (2, '0.008*character + 0.006*like + 0.006*really + 0.005*series + 0.005*time + 0.005*love + 0.004*would + 0.004*good + 0.004*first + 0.004*much')
2021-05-27 16:43:48,160 : INFO : (3, '0.005*character + 0.004*like + 0.004*really + 0.003*series + 0.003*love + 0.003*time + 0.003*good + 0.002*would + 0.002*first + 0.002*much')
2021-05-27 16:43:48,171 : INFO : (4, '0.004*character + 0.003*like + 0.003*really + 0.002*time + 0.002*series + 0.002*love + 0.002*good + 0.002*much + 0.002*would + 0.002*reading')
2021-05-27 16:43:48,182 : INFO : (5, '0.002*character + 0.002*like + 0.002*really + 0.002*series + 0.001

[(0,
  '0.009*character + 0.008*like + 0.006*really + 0.006*time + 0.006*love + 0.005*would + 0.005*series + 0.005*get + 0.005*much + 0.005*first'),
 (1,
  '0.007*character + 0.006*like + 0.005*time + 0.005*really + 0.005*would + 0.004*love + 0.004*series + 0.004*first + 0.004*much + 0.004*life'),
 (2,
  '0.008*character + 0.006*like + 0.006*really + 0.005*series + 0.005*time + 0.005*love + 0.004*would + 0.004*good + 0.004*first + 0.004*much'),
 (3,
  '0.005*character + 0.004*like + 0.004*really + 0.003*series + 0.003*love + 0.003*time + 0.003*good + 0.002*would + 0.002*first + 0.002*much'),
 (4,
  '0.004*character + 0.003*like + 0.003*really + 0.002*time + 0.002*series + 0.002*love + 0.002*good + 0.002*much + 0.002*would + 0.002*reading'),
 (5,
  '0.002*character + 0.002*like + 0.002*really + 0.002*series + 0.001*good + 0.001*love + 0.001*time + 0.001*would + 0.001*get + 0.001*much'),
 (6,
  '0.002*like + 0.002*character + 0.001*really + 0.001*series + 0.001*time + 0.001*love + 0.001*

In [4]:
# must convert to closest LDA for topic coherence
ldamodel = model.suggested_lda_model()

# topic coherence
print("Topic Coherence")
top_topics = ldamodel.top_topics(texts=train_reviews, dictionary=dictionary, coherence="c_v")
avg_topic_coherence = sum([t[1] for t in top_topics]) / len(top_topics)
print(avg_topic_coherence)
print("Topic Top Words")
pprint(top_topics)

2021-05-27 16:43:48,685 : INFO : using symmetric eta at 0.006666666666666667
2021-05-27 16:43:48,688 : INFO : using serial LDA version on this node
2021-05-27 16:43:48,907 : INFO : using ParallelWordOccurrenceAccumulator(processes=7, batch_size=64) to estimate probabilities from sliding windows


Topic Coherence


2021-05-27 16:43:52,670 : INFO : 11 batches submitted to accumulate stats from 704 documents (-15205 virtual)
2021-05-27 16:43:53,417 : INFO : 36 batches submitted to accumulate stats from 2304 documents (-43691 virtual)
2021-05-27 16:43:54,002 : INFO : 55 batches submitted to accumulate stats from 3520 documents (-69160 virtual)
2021-05-27 16:43:55,306 : INFO : 94 batches submitted to accumulate stats from 6016 documents (-126419 virtual)
2021-05-27 16:43:55,894 : INFO : 113 batches submitted to accumulate stats from 7232 documents (-155406 virtual)
2021-05-27 16:43:56,638 : INFO : 138 batches submitted to accumulate stats from 8832 documents (-191651 virtual)
2021-05-27 16:43:57,235 : INFO : 155 batches submitted to accumulate stats from 9920 documents (-213470 virtual)
2021-05-27 16:43:57,537 : INFO : 165 batches submitted to accumulate stats from 10560 documents (-229325 virtual)
2021-05-27 16:43:57,641 : INFO : 170 batches submitted to accumulate stats from 10880 documents (-23396

0.6368722954815487
Topic Top Words
[([(7.801161150757461e-05, 'diligently'),
   (7.703946746095791e-05, 'rescuer'),
   (7.630659918964916e-05, 'croft'),
   (7.627046922293335e-05, 'kaden'),
   (7.552926268794547e-05, 'unhealthy'),
   (7.484306301581785e-05, 'pours'),
   (7.48356502299699e-05, 'robin_hobb'),
   (7.479076128793252e-05, 'af'),
   (7.459412034739048e-05, 'whatnot'),
   (7.439321056041964e-05, 'leading_lady'),
   (7.436168577711504e-05, 'adept'),
   (7.424134372721199e-05, 'promote'),
   (7.422945612042655e-05, 'specializes'),
   (7.420813411892975e-05, 'fateful_day'),
   (7.416083965828669e-05, 'lyric'),
   (7.409385725680654e-05, 'precursor'),
   (7.397924667052928e-05, 'blaze'),
   (7.379260042038553e-05, 'proof'),
   (7.368796188228128e-05, 'rhythm'),
   (7.366610016459545e-05, 'nixon')],
  0.7378498636729951),
 ([(8.03044730559989e-05, 'cater'),
   (8.00257066535848e-05, 'rusty'),
   (7.836907364012753e-05, 'eldest'),
   (7.780499691741906e-05, 'dripping'),
   (7.73049