# Implementation of the paper "Integrating Topics and Syntax"
- [Paper](https://cocosci.princeton.edu/tom/papers/composite.pdf)
- The implementation is adapted from the simple LDA training procedure described in this [blog post](https://www.depends-on-the-definition.com/lda-from-scratch/#how-do-we-find-theta-and-varphi-gibbs-sampling)

## Loading the dataset

In [None]:
import sys

# Make the parent directory importable so the project packages
# (`datasets`, `models`) resolve when the notebook runs from this folder.
# NOTE(review): `append` puts '../' last on sys.path, so an installed package
# that is also named `datasets` would be imported first -- confirm this is OK.
sys.path.append('../')
from datasets.newsgroup import NewsgroupDataset

# Build the dataset wrapper, then pull out the tokenised documents together
# with the vocabulary mapping used by every later cell.
# (Exact meaning of `rare_words_threshold` lives in NewsgroupDataset -- TODO confirm.)
ds = NewsgroupDataset(
    num_docs=100,
    rare_words_threshold=1,
)
tokenised_docs, vocab_map = ds.get_tokenized_docs_and_vocab_map()

## Hyperparameters

In [None]:
# Model dimensions: T and C are fixed by hand, V and D are read off the data.
T, C = 50, 20                                 # topic count, class count
V, D = len(vocab_map), len(tokenised_docs)    # vocabulary size, document count

# Each hyperparameter is the reciprocal of the dimension it is paired with.
# (Presumably these are the smoothing priors from the paper -- confirm against
# how models.hmm consumes them.)
alpha, gamma = 1 / T, 1 / C
beta = delta = 1 / V

# Echo the derived values so the run is reproducible from the cell output.
for label, value in (
    ("alpha: ", alpha),
    ("beta: ", beta),
    ("delta: ", delta),
    ("gamma: ", gamma),
):
    print(label, value)

## HMM model

In [None]:
from models.hmm import HMM

# Gather the constructor arguments in one place: the two model sizes, the
# vocabulary mapping, and the four hyperparameters computed above.
hmm_kwargs = dict(
    num_topics=T,
    num_classes=C,
    vocab_map=vocab_map,
    alpha=alpha,
    beta=beta,
    delta=delta,
    gamma=gamma,
)

# Instantiate the (still untrained) model.
hmm = HMM(**hmm_kwargs)

## HMM training

In [None]:
from models.hmm import HMMTrainer

# Number of training iterations to run over the corpus.
n_iterations = 100

# Wrap the model in its trainer and fit it on the tokenised documents.
trainer = HMMTrainer(hmm)
trainer.train(tokenised_docs, num_iterations=n_iterations)

### Saving & loading model

In [None]:
# Single source of truth for the checkpoint location, so the save and the
# load below can never drift apart.
checkpoint_path = 'checkpoints/hmm_100_docs_100_iterations'

# Persist the trained model, drop the in-memory instance, then restore it from
# disk so the following cells run against the reloaded checkpoint.
hmm.save(checkpoint_path)
del hmm
hmm = HMM.load(checkpoint_path)

## Generating documents with the trained model

In [None]:
# Obtain a callable from the model; calling it with a length yields an
# iterable of word strings (they are joined with spaces below).
doc_generator = hmm.get_document_generator()
doc_len = 100

# Sample and print ten documents.
for doc_id in range(10):
    words = doc_generator(doc_len)
    doc = ' '.join(words)
    print(f'Doc {doc_id}: {doc}')

### Topics

In [None]:
# How many words to show per topic.
k = 20

# Print one line per topic listing the words returned by the model's top-k lookup.
for topic_id in range(hmm.num_topics):
    top_words = hmm.get_top_k_words_from_topic(topic_id, k)
    print(f'Topic {topic_id}: {" ".join(top_words)}')


### Classes

In [None]:
# How many words to show per class.
k = 20

# Print one line per class listing the words returned by the model's top-k lookup.
for class_id in range(hmm.num_classes):
    top_words = hmm.get_top_k_words_from_class(class_id, k)
    print(f'Class {class_id}: {" ".join(top_words)}')