In [8]:
import numpy as np
from hlda_utils import *
from hlda_final import HLDA_Node, HierarchicalLDA

from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [9]:
# Fetch the newsgroups dataset (train subset)
newsgroups_data = fetch_20newsgroups(
    subset='train', 
    remove=('headers', 'footers', 'quotes')
)

test_raw = newsgroups_data.data[:50]
test_labels = newsgroups_data.target[:50]

In [10]:
test_filtered_docs, test_filtered_labels, test_vocab, test_word2idx, test_idx2word, test_corpus = full_preprocessing_pipeline(
    test_raw, 
    test_labels, 
    stop_words=None, 
    stemmer=PorterStemmer(), 
    lemmatizer=WordNetLemmatizer(),
    min_word_length=2, 
    min_freq=5
)


hlda_test = HierarchicalLDA(
    corpus= test_corpus,
    vocabulary=test_vocab,
    levels=3,    
    alpha=10.0,  
    gamma=1.0,   
    eta=0.1,     
    seed=42,
    verbose=True
)


Number of documents after filtering empty ones: 49
Number of labels after filtering: 49

Vocabulary size (words with freq >= 5): 207
Sample vocabulary words: ['access', 'accid', 'action', 'add', 'age', 'agre', 'also', 'anoth', 'anyon', 'appear']
Preprecessing done


In [11]:
hlda_test.gibbs_sampling(iterations=50, topic_display_interval=100, top_n_words=3, show_word_counts=True)

Starting Hierarchical LDA sampling

10% done
20% done
30% done
40% done
50% done
60% done
70% done
80% done
90% done
100% done
A total of 17 topic nodes have been created
Gibbs sampling completed


In [12]:
synthetic_corpus, original_doc_paths = generate_synthetic_corpus(
    hlda_model=hlda_test,
    num_docs=100,        # Number of synthetic documents to generate
    doc_length=250,    # Number of tokens per synthetic document
    seed=42
)

print("\nSynthetic Corpus:")
for i, doc in enumerate(synthetic_corpus):
    print(f"Document {i}: {doc[:10]}...")  # Print first 10 tokens for brevity

print("\nOriginal Document Paths:")
for i, path in enumerate(original_doc_paths):
    print(f"Document {i}: {path}")


Synthetic Corpus:
Document 0: [181, 40, 101, 88, 193, 136, 185, 101, 64, 117]...
Document 1: [192, 43, 74, 43, 39, 11, 12, 44, 12, 124]...
Document 2: [88, 43, 124, 39, 88, 200, 204, 180, 81, 156]...
Document 3: [43, 180, 190, 65, 192, 102, 126, 204, 88, 185]...
Document 4: [93, 136, 87, 92, 101, 40, 23, 150, 143, 150]...
Document 5: [103, 145, 117, 85, 87, 91, 23, 43, 170, 92]...
Document 6: [180, 83, 72, 40, 101, 79, 88, 64, 180, 193]...
Document 7: [156, 156, 17, 101, 92, 22, 64, 101, 180, 64]...
Document 8: [23, 100, 193, 111, 202, 66, 111, 192, 101, 74]...
Document 9: [192, 11, 16, 92, 22, 68, 204, 64, 64, 117]...
Document 10: [158, 191, 39, 193, 11, 187, 18, 54, 6, 101]...
Document 11: [103, 185, 156, 136, 138, 72, 88, 12, 181, 101]...
Document 12: [192, 192, 192, 188, 201, 103, 12, 22, 92, 65]...
Document 13: [152, 185, 124, 180, 92, 11, 39, 11, 87, 111]...
Document 14: [197, 6, 40, 64, 64, 64, 143, 87, 130, 192]...
Document 15: [103, 11, 156, 87, 197, 87, 117, 65, 130, 103]...

In [13]:
recovered_hlda = HierarchicalLDA(
    corpus=synthetic_corpus,
    vocabulary=test_vocab,
    levels=3,
    alpha=10.0,
    gamma=1.0,
    eta=0.1,
    seed=42,
    verbose=True
)

# Perform Gibbs Sampling on the recovered model
recovered_hlda.gibbs_sampling(iterations=100, topic_display_interval=5, top_n_words=3, show_word_counts=True)

Starting Hierarchical LDA sampling

*********************The 1th result**************************
topic=0 level=0 (docs=100): use (326), would (294), one (278)
    topic=1 level=1 (docs=17): get (44), would (44), know (37)
        topic=2 level=2 (docs=4): war (67), cut (46), attack (33)
        topic=18 level=2 (docs=8): would (84), system (76), thank (72)
        topic=21 level=2 (docs=5): insur (64), car (46), year (46)
    topic=3 level=1 (docs=26): peopl (90), get (52), add (47)
        topic=4 level=2 (docs=4): starter (121), lefthand (69), better (44)
        topic=5 level=2 (docs=2): avail (26), purchas (23), plea (18)
        topic=20 level=2 (docs=5): car (65), may (45), moral (39)
        topic=23 level=2 (docs=15): thank (100), engin (81), car (73)
    topic=6 level=1 (docs=7): scsi (59), chip (59), burst (56)
        topic=7 level=2 (docs=7): mac (39), pc (36), control (34)
    topic=8 level=1 (docs=26): would (66), cours (64), go (62)
        topic=9 level=2 (docs=17): ti

In [14]:
recovered_hlda.compare_with_original(original_doc_paths)

Accuracy of recovered paths: 0.00%


0.0