In [7]:
import random
from collections import defaultdict

import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.datasets import fetch_20newsgroups

from hlda_utils import *
from hlda_final import HLDA_Node, HierarchicalLDA

In [8]:
# Download (or load from the local cache) the training split of 20 Newsgroups,
# with headers, footers and quoted replies removed from each post.
newsgroups_data = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)

# Keep only the leading 2000 posts; the very same slice is applied to the
# targets so that documents and labels stay aligned.
first_docs = slice(None, 2000)
raw_docs = newsgroups_data.data[first_docs]
labels = newsgroups_data.target[first_docs]
target_names = newsgroups_data.target_names

# Report the size of the working subset.
print(
    f"Number of documents: {len(raw_docs)}",
    f"Number of categories: {len(target_names)}",
    sep="\n",
)

Number of documents: 2000
Number of categories: 20


In [9]:
# Run the shared preprocessing pipeline over the raw documents and their labels.
# NOTE(review): stop_words=None presumably defers to the pipeline's own default
# stop-word handling — confirm in hlda_utils.
(
    filtered_docs,
    filtered_labels,
    vocab,
    word2idx,
    idx2word,
    corpus,
) = full_preprocessing_pipeline(
    raw_docs,
    labels,
    stop_words=None,
    stemmer=PorterStemmer(),
    lemmatizer=WordNetLemmatizer(),
    min_word_length=2,
    min_freq=5,
)

# Summarise how many tokens each surviving document kept.
doc_lengths = list(map(len, filtered_docs))
print(
    "",
    "After filtering docs:",
    f"  Avg length: {np.mean(doc_lengths):.2f} tokens",
    f"  Median length: {np.median(doc_lengths)}",
    f"  Min length: {np.min(doc_lengths)}",
    f"  Max length: {np.max(doc_lengths)}",
    sep="\n",
)

Number of documents after filtering empty ones: 1940
Number of labels after filtering: 1940

Vocabulary size (words with freq >= 5): 4805
Sample vocabulary words: ['aa', 'aaa', 'ab', 'abbrevi', 'abc', 'abil', 'abl', 'abolish', 'abort', 'abraham']
Preprecessing done

After filtering docs:
  Avg length: 90.41 tokens
  Median length: 42.0
  Min length: 1
  Max length: 4780


In [10]:
# Fix the global RNG state so this stochastic Gibbs run can be repeated after a
# kernel restart.
# NOTE(review): assumes the sampler draws from the stdlib `random` and/or the
# NumPy global generator — confirm in hlda_final; if it owns a private
# generator, seed that one instead.
random.seed(42)
np.random.seed(42)

# Three-level hLDA run with gamma=0.001 and alpha=10.
hlda_model5 = HierarchicalLDA(
    corpus=corpus,
    vocabulary=vocab,
    gamma=0.001,
    eta=0.01,
    alpha=10,
    levels=3,
)

# topic_display_interval equals the iteration count, so the topic tree is
# reported once, after the final sweep.
hlda_model5.gibbs_sampling(
    iterations=2000,
    topic_display_interval=2000,
    top_n_words=8,
    show_word_counts=True,
)

Starting Hierarchical LDA sampling

10% done
20% done
30% done
40% done
50% done
60% done
70% done
80% done
90% done
100% done
*********************The 1 result**************************
topic=0 level=0 (docs=1940): also (461), first (295), mani (293), two (285), even (254), may (238), call (224), part (213)
    topic=1 level=1 (docs=412): would (262), one (244), use (222), get (193), like (189), know (181), think (136), time (134)
        topic=2 level=2 (docs=19): armenian (115), said (86), peopl (83), u (81), say (72), woman (52), one (51), child (49)
        topic=3 level=2 (docs=3): period (42), pp (37), play (34), power (33), scorer (24), pt (23), philadelphia (15), calgari (15)
        topic=4 level=2 (docs=3): mv (58), ah (49), sq (35), q (32), zv (31), hz (30), ri (29), xte (24)
        topic=6 level=2 (docs=16): avail (83), widget (59), includ (54), version (54), support (45), server (44), motif (39), sun (38)
        topic=13 level=2 (docs=2): father (44), son (35), spirit (

In [11]:
# Fix the global RNG state so this stochastic Gibbs run can be repeated after a
# kernel restart.
# NOTE(review): assumes the sampler draws from the stdlib `random` and/or the
# NumPy global generator — confirm in hlda_final; if it owns a private
# generator, seed that one instead.
random.seed(42)
np.random.seed(42)

# Three-level hLDA run with gamma=0.05 and alpha=20.
hlda_model6 = HierarchicalLDA(
    corpus=corpus,
    vocabulary=vocab,
    gamma=0.05,
    eta=0.01,
    alpha=20,
    levels=3,
)

# topic_display_interval equals the iteration count, so the topic tree is
# reported once, after the final sweep.
hlda_model6.gibbs_sampling(
    iterations=2000,
    topic_display_interval=2000,
    top_n_words=8,
    show_word_counts=True,
)

Starting Hierarchical LDA sampling

10% done
20% done
30% done
40% done
50% done
60% done
70% done
80% done
90% done
100% done
*********************The 1 result**************************
topic=0 level=0 (docs=1940): mani (285), year (273), first (258), take (249), call (244), sure (209), differ (205), ask (203)
    topic=1 level=1 (docs=487): would (323), use (318), one (281), like (228), get (215), know (182), also (146), say (143)
        topic=2 level=2 (docs=15): armenian (115), peopl (92), said (83), u (72), say (57), one (57), start (52), child (49)
        topic=3 level=2 (docs=6): period (42), pp (37), play (37), power (32), pt (27), scorer (24), calgari (15), vancouv (15)
        topic=4 level=2 (docs=2): mv (58), ah (48), sq (35), q (32), zv (31), hz (30), ri (29), xte (24)
        topic=6 level=2 (docs=12): avail (80), widget (58), version (54), includ (51), server (44), support (42), sun (39), sourc (36)
        topic=7 level=2 (docs=14): drive (31), scsi (28), hd (22), ide

In [12]:
# Read the node counter stored on the HLDA_Node class itself rather than on any
# single model instance.
# NOTE(review): being class-level, this presumably keeps counting across every
# model built in the current kernel session — confirm in hlda_final.
created_node_count = HLDA_Node.total_created_nodes
print(created_node_count)

288


In [None]:
# Fix the global RNG state so this stochastic Gibbs run can be repeated after a
# kernel restart.
# NOTE(review): assumes the sampler draws from the stdlib `random` and/or the
# NumPy global generator — confirm in hlda_final; if it owns a private
# generator, seed that one instead.
random.seed(42)
np.random.seed(42)

# Three-level hLDA run with gamma=10 and alpha=5.
hlda_model7 = HierarchicalLDA(
    corpus=corpus,
    vocabulary=vocab,
    gamma=10,
    eta=0.01,
    alpha=5,
    levels=3,
)

# topic_display_interval equals the iteration count, so the topic tree is
# reported once, after the final sweep.
hlda_model7.gibbs_sampling(
    iterations=2000,
    topic_display_interval=2000,
    top_n_words=8,
    show_word_counts=True,
)

## Synthetic data