In [44]:
from collections import defaultdict
import gzip
import io
import math
import random
import re
import requests

In [51]:
# count-based language model (n-gram model, e.g., n=3 for trigram)
class CountLanguageModel:
    """
    Count-based n-gram language model with backoff.

    The model stores, for every order k in 1..n, how often each token
    followed each (k - 1)-token context in the training data. Prediction and
    scoring start from the longest usable context and back off to shorter
    ones, ending with the unigram table.
    """

    def __init__(self, n):
        """
        Args:
            n (int): Maximum n-gram order (e.g., n=3 for a trigram model).

        Raises:
            ValueError: If n is smaller than 1.
        """
        if n < 1:
            raise ValueError(f"n must be at least 1, got {n}")
        self.n = n
        # ngram_counts[k - 1] maps a context tuple of k - 1 tokens to a
        # {next_token: count} table. The unigram table is stored under the
        # empty context ().
        self.ngram_counts = [{} for _ in range(n)]
        # Number of tokens seen over all train() calls; denominator of the
        # smoothed unigram estimate in get_probability().
        self.total_unigrams = 0


    def _iter_backoff_counts(self, context):
        """
        Yield the non-empty follower-count tables that match the tail of
        `context`, from the highest order (n) down to bigrams.

        Unigrams are not yielded; callers handle them separately.
        """
        for order in range(self.n, 1, -1):
            history_len = order - 1  # an n-gram conditions on n - 1 tokens
            if len(context) < history_len:
                continue  # not enough context for this order
            history = tuple(context[-history_len:])
            followers = self.ngram_counts[order - 1].get(history)
            if followers:
                yield followers


    # inference
    def predict_next_token(self, context):
        """
        Return the most frequent continuation of `context`.

        Uses backoff: the longest context seen in training wins; if no
        context of any length matches, the most frequent unigram is returned.

        Args:
            context (sequence): Preceding tokens (list or tuple).

        Returns:
            The predicted token, or None if the model has no counts at all.
        """
        for followers in self._iter_backoff_counts(context):
            # first (i.e. highest-order) match decides; ties resolve to the
            # token that was inserted first
            return max(followers, key=followers.get)

        # special case: unigram
        unigram_counts = self.ngram_counts[0].get(())
        if unigram_counts:
            return max(unigram_counts, key=unigram_counts.get)

        # no matches
        return None


    # training
    def train(self, tokens):
        """
        Add the n-gram counts of `tokens` to the model.

        May be called several times; counts and the token total accumulate.
        N-grams spanning two separate calls are not counted.

        Args:
            tokens (list): Training tokens in order.
        """
        # Accumulate instead of overwriting so the total stays consistent
        # with the (accumulating) unigram counts across repeated calls.
        self.total_unigrams += len(tokens)
        for order in range(1, self.n + 1):
            table = self.ngram_counts[order - 1]
            for start in range(len(tokens) - order + 1):
                history = tuple(tokens[start:start + order - 1])
                next_token = tokens[start + order - 1]
                if history not in table:
                    table[history] = defaultdict(int)
                table[history][next_token] += 1


    # performance metrics
    def get_probability(self, token, context):
        """
        Estimate the probability of `token` following `context`.

        The highest-order context in which `token` was actually observed
        supplies a relative-frequency estimate. Otherwise the add-one
        smoothed unigram estimate (count + 1) / (total_unigrams + V) is
        returned, where V is the number of distinct training tokens.

        Args:
            token: Candidate next token.
            context (sequence): Preceding tokens (list or tuple).

        Returns:
            float: The estimate; 0.0 if the model has not been trained.
        """
        # n-gram n>1 (handle unigram separately)
        for followers in self._iter_backoff_counts(context):
            count = followers.get(token, 0)
            if count > 0:
                return count / sum(followers.values())

        # unigram
        unigram_counts = self.ngram_counts[0].get(()) or {}
        vocab_size = len(unigram_counts)
        denominator = self.total_unigrams + vocab_size
        if denominator == 0:
            # untrained model: nothing to base an estimate on
            return 0.0
        return (unigram_counts.get(token, 0) + 1) / denominator

In [52]:
# source: https://github.com/aburkov/theLMbook/blob/main/count_language_model.ipynb
def download_corpus(url, timeout=30):
    """
    Downloads and decompresses a gzipped corpus file from the given URL.

    Args:
        url (str): URL of the gzipped corpus file
        timeout (float or tuple): Seconds to wait for the server, passed
            straight to requests.get. Without it a stalled connection would
            block the notebook indefinitely.

    Returns:
        str: Decoded text content of the corpus

    Raises:
        requests.HTTPError: If the server answers with an error status
        requests.Timeout: If the server does not respond within `timeout`
        OSError: If the payload is not valid gzip data
        UnicodeDecodeError: If the decompressed bytes are not valid UTF-8
    """
    print(f"Downloading corpus from {url}...")
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raises an exception for bad HTTP responses

    print("Decompressing and reading the corpus...")
    # The whole payload is already in memory; wrap it so gzip can read it
    # like a file.
    with gzip.GzipFile(fileobj=io.BytesIO(response.content)) as f:
        corpus = f.read().decode('utf-8')

    print(f"Corpus size: {len(corpus)} characters")
    return corpus


# source: https://github.com/aburkov/theLMbook/blob/main/count_language_model.ipynb
def download_and_prepare_data(data_url):
    """
    Fetches a corpus, tokenizes it, and splits it into train and test parts.

    Args:
        data_url (str): URL of the corpus to download

    Returns:
        tuple: (training_tokens, test_tokens), where the first 90% of the
        tokens form the training set and the remaining 10% the test set
    """
    all_tokens = tokenize(download_corpus(data_url))

    # Sequential (not shuffled) split: everything before the cutoff trains,
    # everything from the cutoff on is held out.
    cutoff = int(len(all_tokens) * 0.9)
    return all_tokens[:cutoff], all_tokens[cutoff:]

# source: https://github.com/aburkov/theLMbook/blob/main/count_language_model.ipynb
def tokenize(text):
    """
    Splits text into lowercase alphanumeric words and period tokens.

    Args:
        text (str): Input text to tokenize

    Returns:
        list: Tokens in order of appearance; each is either a run of
        letters/digits delimited by word boundaries or a single "."
    """
    lowered = text.lower()
    token_pattern = r"\b[a-zA-Z0-9]+\b|[.]"
    return re.findall(token_pattern, lowered)

In [53]:
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# model training
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
random.seed(42)  # pin the global RNG state

# settings
n = 5  # n-gram order handed to CountLanguageModel
data_url = "https://www.thelmbook.com/data/brown"

# fetch the corpus and split it into training and held-out tokens
train_corpus, test_corpus = download_and_prepare_data(data_url)

# fit the counts on the training split only
model = CountLanguageModel(n)
model.train(train_corpus)

Downloading corpus from https://www.thelmbook.com/data/brown...
Decompressing and reading the corpus...
Corpus size: 6185606 characters


In [54]:
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# try the model on a few prompts
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
contexts = [
    "i will build a",
    "the best place to",
    "she was riding a",
]

for context in contexts:
    # tokenize the prompt the same way the training data was tokenized
    words = tokenize(context)
    next_word = model.predict_next_token(words)
    print(f"\nContext: {context}", f"Next token: {next_word}", sep="\n")


Context: i will build a
Next token: wall

Context: the best place to
Next token: live

Context: she was riding a
Next token: horse


In [55]:
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# evaluate model performance
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
def compute_perplexity(model, tokens, context_size):
    """
    Computes the perplexity of a model over a token sequence.

    Perplexity is exp of the negative mean log-probability the model
    assigns to each token given the tokens before it.

    Args:
        model: Object with a get_probability(token, context) method
        tokens (list): Tokens to score
        context_size (int): Maximum number of preceding tokens handed to
            the model as context

    Returns:
        float: Perplexity of the sequence; infinity when `tokens` is empty
    """
    if not tokens:
        return float('inf')

    log_likelihood_sum = 0
    for position, word in enumerate(tokens):
        # the window is shorter than context_size near the start of the sequence
        window_start = max(0, position - context_size)
        history = tuple(tokens[window_start:position])
        log_likelihood_sum += math.log(model.get_probability(word, history))

    mean_log_likelihood = log_likelihood_sum / len(tokens)
    return math.exp(-mean_log_likelihood)

# get_probability only reads the last n - 1 tokens of the context it is
# given, so a window of n tokens is sufficient here.
perplexity = compute_perplexity(model, test_corpus, context_size=n)
print(f"\nPerplexity on test corpus: {perplexity:.2f}")


Perplexity on test corpus: 299.06


In [60]:
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# generate text by repeatedly appending the predicted next token
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
tokens = tokenize("i will build a")
for i in range(100):
    next_token = model.predict_next_token(tokens)
    if next_token is None:
        # predict_next_token returns None when the model has no counts;
        # appending it would make the join below raise TypeError.
        break
    tokens.append(next_token)
print(" ".join(tokens))

i will build a wall to keep the people in and added so long as people rebel we must not give up . martin called for patience on the part of the coaches and the players . we needed it and we got it . meek expressed particular gratification at the defensive performances of end happy nelson and halfback billy gannon . both turned in top jobs for the second straight game . nelson played magnificent football meek praised . he knocked down the interference and made key stops lots of times . and he caused the fumble that set up our touchdown .
