In [1]:
import json 
import numpy as np 
import pandas as pd 

## Create a Markov N-gram language model 

In [2]:
# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# this file exists. Consider deriving it from a configurable data directory.
data = "/Users/flint/Data/wordlists/generated/gpt-5000.json"
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(data, 'r', encoding='utf-8') as infile:
    docs = json.load(infile)

In [3]:
# Shuffle a *copy* with a seeded, local generator: the document order is then
# the same on every run, re-running this cell gives the same result, and the
# list stored in `docs` is left untouched.
corpus = list(docs['corpus'])
np.random.default_rng(42).shuffle(corpus)
# Preview the first few documents.
for document in corpus[:4]:
    print(document)

I am Norman and today I involve all the members of the Society together, and to whom I express my deepest gratitude, and hope in the future.
I am Gene and today I have your name written in the corner  The man I am with is a former convict. You are our friend.
I am Alice and today I feel free. It will not be too hard to make something you can call a life or a career.
I am Daniel and today I be announcing to you that I am a true gentleman, an Atheist!"  His mother did not respond.


### Tokenizer

In [4]:
from nltk.tokenize import word_tokenize

In [5]:
def tokenizer(text):
    """Turn a raw string into a list of lower-cased tokens with boundary markers.

    The text is lower-cased, split with NLTK's ``word_tokenize``, and wrapped
    between the start marker ``'#S'`` and the end marker ``'#E'``.
    """
    words = word_tokenize(text.lower())
    return ['#S', *words, '#E']

In [6]:
# Tokenize every document of the corpus (one token list per document).
tokens = list(map(tokenizer, corpus))

## Markov Language Model

In [7]:
from collections import defaultdict
import nltk 
from tqdm.notebook import tqdm
import numpy as np

In [44]:
class MarkovModel:
    """Count-based Markov (n-gram) language model over token sequences.

    The model stores, for every context of ``k - 1`` tokens, how many times
    each token followed that context. Probabilities are relative frequencies;
    any unseen (context, token) pair is given the constant ``alpha``.

    Attributes:
        k: size of the n-grams (context length is ``k - 1``).
        alpha: probability returned for unseen events.
        index: mapping ``context tuple -> {next token -> count}``.
    """

    # Symbol used to pad documents on both sides when extracting n-grams.
    PAD = '[PAD]'

    def __init__(self, k: int = 2, alpha: float = 0.0001):
        """Create an empty model.

        Args:
            k: n-gram size; must be at least 1.
            alpha: probability assigned to unseen (context, token) pairs.

        Raises:
            ValueError: if ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"k must be >= 1, got {k}")
        self.k = k
        self.alpha = alpha
        self.index = defaultdict(lambda: defaultdict(int))

    def _ngrams(self, doc: list):
        """Yield the padded k-grams of ``doc`` (shared by training and scoring)."""
        return nltk.ngrams(
            doc, n=self.k, pad_left=True, pad_right=True,
            left_pad_symbol=self.PAD, right_pad_symbol=self.PAD,
        )

    def _context(self, history: list) -> tuple:
        """Return the last ``k - 1`` tokens of ``history``, left-padded if short."""
        width = self.k - 1
        if width == 0:
            # Unigram model: the context is always empty. (A plain ``[-0:]``
            # slice would wrongly return the whole history.)
            return ()
        tail = list(history[-width:])
        return tuple([self.PAD] * (width - len(tail)) + tail)

    def read(self, doc: list[str]):
        """Update the counts with all padded k-grams of one tokenized document."""
        for k_gram in self._ngrams(doc):
            prefix, suffix = k_gram[:-1], k_gram[-1]
            self.index[prefix][suffix] += 1

    def read_multi(self, docs: list):
        """Call :meth:`read` on every document in ``docs``, showing a progress bar."""
        for doc in tqdm(docs):
            self.read(doc)

    def p(self, w: str, prefix: tuple):
        """Return the probability of token ``w`` following ``prefix``.

        Returns ``count(prefix, w) / count(prefix)`` when the pair was seen,
        otherwise ``alpha``. The lookup never modifies ``index``: querying an
        unseen context or token must not create zero-count entries, which
        would later break sampling in :meth:`generate`.
        """
        counts = self.index.get(prefix)
        if not counts:
            return self.alpha
        n = counts.get(w, 0)
        if n == 0:
            return self.alpha
        return n / sum(counts.values())

    def eval_prob(self, doc: list):
        """Return the log-probability of ``doc``.

        This is the sum of ``log p(token | context)`` over all padded k-grams
        of the document.
        """
        return sum(
            np.log(self.p(k_gram[-1], prefix=k_gram[:-1]))
            for k_gram in self._ngrams(doc)
        )

    def generate(self, prefix: tuple = None, max_len: int = 1000):
        """Sample a token sequence from the model.

        Args:
            prefix: tokens to start from; defaults to the padded start marker
                ``('[PAD]', ..., '#S')``. Only its last ``k - 1`` tokens are
                used as the initial context (left-padded when shorter).
            max_len: maximum number of tokens to sample.

        Returns:
            The list of tokens, starting with ``prefix``. Generation stops at
            the end marker ``'#E'``, after ``max_len`` sampled tokens, or when
            the current context has no known continuation.
        """
        if prefix is None:
            prefix = tuple([self.PAD] * (self.k - 2) + ['#S'])
        document = list(prefix)
        for _ in range(max_len):
            # `.get` keeps the lookup read-only on the defaultdict.
            counts = self.index.get(self._context(document))
            candidates = [w for w, c in counts.items() if c > 0] if counts else []
            if not candidates:
                # Unknown context: nothing to sample from, stop gracefully.
                break
            weights = np.array([counts[w] for w in candidates], dtype=float)
            # Sample an index so the stored token objects are returned as-is.
            pick = np.random.choice(len(candidates), p=weights / weights.sum())
            new_word = candidates[pick]
            document.append(new_word)
            if new_word == '#E':
                break
        return document
            
    

In [45]:
# Trigram model (k=3): each token is conditioned on the two preceding tokens.
model = MarkovModel(k=3)

In [46]:
# Accumulate n-gram counts from every tokenized document in `tokens`.
model.read_multi(tokens)

  0%|          | 0/4043 [00:00<?, ?it/s]

In [47]:
# Sample one document (at most 100 generated tokens) and show it with the
# tokens separated by single spaces.
doc = model.generate(max_len=100)
print(*doc)

[PAD] #S i am david and today i have a choice . #E


In [55]:
# Score a few hand-written sentences with the trained model.
s = ["i am david and today i have a choice .", "the cat is on the table .", "i am william and i work on the table ."]
for sentence in s:
    # Use a dedicated name: assigning to `tokens` here would overwrite the
    # tokenized training corpus that `model.read_multi(tokens)` consumes.
    sentence_tokens = tokenizer(sentence)
    print(sentence, model.eval_prob(sentence_tokens))

i am david and today i have a choice . -12.445900365412808
the cat is on the table . -73.68272297580945
i am william and i work on the table . -53.17286928272703
