In [42]:
from collections import Counter
import random
# Seed the global RNG so the random.sample / random.shuffle calls below give the
# same draws when the cells are run in order on a fresh kernel.
# NOTE: `2023 - 4 - 5` is integer subtraction, so the seed actually used is 2014;
# it reads like a date but is not interpreted as one.
random.seed(2023 - 4 - 5)

# ***Generative Models and Latent Dirichlet Allocation (LDA)***
-----------------------------------

## ***Generative Models for Text***
------------------

In [43]:
# Imagine a chest that generates texts.
# We could create documents generated from this chest.

In [44]:
# Read the whole book into a single string, decoding it as UTF-8.
# The `with` block closes the file as soon as the read finishes.
with open(r"./The Philosopher's Stone.txt", "r", encoding = "utf-8") as fhandle:
    philosophers_stone = fhandle.read()

In [45]:
# Ten most frequent tokens in the book, as a list of (token, count) pairs.
# Counter.most_common(n) yields the same list as sorting the items by count in
# descending order and slicing the first n; ties keep first-encountered order.
# str.split() only splits on whitespace, so tokens are case-sensitive and keep
# any attached punctuation.
Counter(philosophers_stone.split()).most_common(10)

[('the', 3654),
 ('and', 2139),
 ('to', 1827),
 ('a', 1578),
 ('Harry', 1254),
 ('of', 1233),
 ('was', 1150),
 ('he', 1020),
 ('in', 898),
 ('his', 892)]

In [46]:
# Draw 10 tokens from the book without replacement: the "chest" handing out words.
random.sample(philosophers_stone.split(), 10)

['meet',
 'you,',
 'Weasley',
 'piped',
 'their',
 'presents.',
 'trick',
 'again,',
 'a',
 'think']

In [47]:
# Looking at the sample (assume these are from the chest),
# we can say that this chest gives words from Harry Potter books.

In [48]:
# These are 100 random words from the "chest" assembled into a document.
# random.sample draws without replacement; the drawn tokens are joined with single spaces.

" ".join(random.sample(philosophers_stone.split(), 100))

'Dumbledore the wall much face next back Species went happy. Potter ambling have Ron Harry says be after he He Ron Weasley? victory, on inches. be often he it. always said world. boy, “Call pleased the the the storm. pile cheers; but outside difficulty Maybe other Ron, than Given went finger, along behave private The over about put in blankets tables and Dunno they the Harry’s tried to sight a swallowed I started can’t,” “This sudden, would find climbed of Harry and with wall, and along the they’d or take at that He was “Oh, reminds looking around inches quickly.'

### ***Inference and estimation***
--------------------------
## ***$P(token~|~model)$***

In [49]:
# One could also go the other way around.

# Given a model-synthesized document,
# deduce the frequencies of words and
# create a frequency distribution of the words in the synthetic article.

In [50]:
# Token count = number of whitespace-delimited pieces produced by str.split().
print(f"Harry Potter and The Philosopher's Stone book has {len(philosophers_stone.split())} tokens.")

Harry Potter and The Philosopher's Stone book has 83188 tokens.


In [55]:
# Ten most frequent tokens in a 1,000-token sample drawn without replacement
# from the book, as a list of (token, count) pairs.
# Counter.most_common(10) replaces the hand-rolled sort-by-count-then-slice and
# returns the same list, with ties kept in first-encountered order.
Counter(random.sample(philosophers_stone.split(), 1000)).most_common(10)

[('the', 37),
 ('and', 34),
 ('Harry', 22),
 ('a', 20),
 ('was', 20),
 ('to', 15),
 ('his', 15),
 ('said', 15),
 ('of', 15),
 ('he', 11)]

In [56]:
# Even in a random sample of 1,000 words, "Harry" occurs with high frequency.
# This is far less likely in any corpus unrelated to Harry Potter.

In [57]:
# The generative models can be more complex.
# Instead of one, one may have 10 topic models (chests).

# And a magic sampler pulls out tokens from these chests.
# We create a document from tokens pulled from all these 10 chests.
# Now the document has become more complex.

In [58]:
# The generation isn't that hard.

In [60]:
# Let's use three different chests to compose a synthetic document

def _read_tokens(path):
    """Return the whitespace-delimited tokens of the UTF-8 text file at `path`."""
    with open(path, "r", encoding = "utf-8") as fhandle:
        return fhandle.read().split()

# One token list per source book ("chest"); each is the result of str.split()
# on the full text, so tokens keep their case and attached punctuation.
world_war2 = _read_tokens(r"./The Rat Race.txt")
nature = _read_tokens(r"./Our Vanishing Wild Life - Its Extermination and Preservation.txt")
atheism = _read_tokens(r"./Beyond Good and Evil.txt")

In [64]:
# Draw 1,000 tokens (without replacement) from each of the three books and
# concatenate them, then shuffle the combined list in place so the three
# sources are interleaved. The order of these RNG calls determines the result.
tokens = random.sample(world_war2, 1000) + random.sample(nature, 1000) + random.sample(atheism, 1000)
random.shuffle(tokens)

In [66]:
# Join the shuffled tokens with single spaces to form the synthetic document.
syn_text = " ".join(tokens)

In [81]:
# Twenty most frequent tokens in a 1,000-token sample drawn without replacement
# from the synthetic document, as a list of (token, count) pairs.
# Counter.most_common(20) replaces the hand-rolled sort-by-count-then-slice and
# returns the same list, with ties kept in first-encountered order.
Counter(random.sample(syn_text.split(), 1000)).most_common(20)

[('the', 55),
 ('of', 40),
 ('to', 30),
 ('and', 25),
 ('in', 22),
 ('a', 18),
 ('I', 11),
 ('is', 10),
 ('with', 9),
 ('that', 9),
 ('not', 8),
 ('The', 8),
 ('was', 8),
 ('all', 7),
 ('have', 6),
 ('for', 6),
 ('on', 5),
 ('up', 5),
 ('you', 5),
 ('it', 5)]

In [83]:
# In cases like this, one needs to figure out the individual models used to generate the tokens
# and how they were used to compose the document
# Some models may have contributed more tokens than others.

In [84]:
# This is called a mixture model

In [85]:
# Our research paper had 4 topic models ->
# Anatomy
# Genetics
# Computation
# TODO: only three of the four topics are listed above; add the missing one.

## ***Latent Dirichlet Allocation (LDA)***
----------------------