# N-Gram Language Modeling 

In this notebook, we will implement a stastical method called "ngram" to model a language.
Regardless of its limitations, it's still a good place to start learning language modeling.

In [6]:
! pip install requests nltk --quiet


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
# Let's setupa  function to fetch us a corpus of text

def fetch_shakespeare():
    import requests
    url = "https://www.gutenberg.org/files/100/100-0.txt"
    response = requests.get(url)
    return response.text

In [8]:
corpus = fetch_shakespeare()

In [11]:
# Initial EDA and Statistics on the corpus
print("--- Naive Statistics ---")
print("Length of the corpus: ", len(corpus))
print("Number of words in the corpus: ", len(corpus.split()))
print("Number of unique words in the corpus: ", len(set(corpus.split())))
print("Number of characters in the corpus: ", len(set(corpus)))
print("Number of lines in the corpus: ", len(corpus.split("\n")))
print("Number of paragraphs in the corpus: ", len(corpus.split("\n\n")))
print("Number of sentences in the corpus: ", len(corpus.split(".")))


--- Naive Statistics ---
Length of the corpus:  5555456
Number of words in the corpus:  963478
Number of unique words in the corpus:  71167
Number of characters in the corpus:  101
Number of lines in the corpus:  196018
Number of paragraphs in the corpus:  1
Number of sentences in the corpus:  91163


In [173]:
# Preprocessing the corpus
import nltk
nltk.download('punkt')

def preprocess_text(text):
    import re
    # Remove Project Gutenberg's header and footer
    text = text[50:-55]
    # lowercase the text
    text = text.lower()
    # remove non-alphabetic characters and extra spaces
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    # Tokenize using NLTK
    tokens = nltk.word_tokenize(text)

    return tokens

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Abdulmunim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [174]:
tokens = preprocess_text(corpus)

In [175]:
print(len(tokens))
print(tokens[-10:])
print(tokens[:10])
# average length
print("Average length of a token: ", sum([len(token) for token in tokens])/len(tokens))

963637
['their', 'queen', 'means', 'to', 'immure', 'herself', 'and', 'not', 'be', 'seen']
['the', 'complete', 'works', 'of', 'william', 'shakespeare', 'by', 'william', 'shakespeare', 'contents']
Average length of a token:  4.210134106515213


In [178]:
# Let's generate n-grams from list of tokens

def generate_ngrams(tokens, n):
    ngrams = []
    for i in range(len(tokens)-n+1):
        ngrams.append(tuple(tokens[i:i+n]))
    return ngrams

n = 6
ngrams = generate_ngrams(tokens, n)

In [179]:
len(ngrams)

963632

In [180]:
# Build the ngram model (Frequency distributions and conditional probablity tables for the n-grams)

def build_ngram_model(tokens, n, ngrams=None):
    from collections import Counter, defaultdict
    model = defaultdict(Counter)
    if ngrams is None:
        ngrams = generate_ngrams(tokens, n)
    for ngram in ngrams:
        prefix = ngram[:-1]
        suffix = ngram[-1]
        model[prefix][suffix] += 1
    return model

ngram_model = build_ngram_model(tokens, n=4, ngrams=ngrams)


In [181]:
# Let's normalize the probabilities

def normalize_model(model):
    normalized_model = {}
    for prefix, suffixes in model.items():
        total = float(sum(suffixes.values()))
        normalized_suffixes = {}
        for suffix, count in suffixes.items():
            normalized_suffixes[suffix] = count/total
        normalized_model[prefix] = normalized_suffixes
    return normalized_model

normalized_model = normalize_model(ngram_model)

In [182]:
len(normalized_model)

953106

In [183]:
# Let's attempt to generate some text using the n-gram model

def generate_text(model, n, num_words, start_prefix=None):
    import random
    if start_prefix is None:
        start_prefix = random.choice(list(model.keys()))
    else:
        start_prefix = tuple(start_prefix.lower().split())
        if len(start_prefix) != n - 1:
            raise ValueError(f"Start prefix must be {n - 1} words long.")

    generated = list(start_prefix)

    for _ in range(num_words):
        prefix = tuple(generated[-(n-1):])
        suffixes = model.get(prefix, None)
        if not suffixes:
            break  # Cannot continue if there is no entry for this prefix
        suffix = random.choices(list(suffixes.keys()), weights=suffixes.values())[0]
        generated.append(suffix)

    return ' '.join(generated)


In [202]:
generated_text = generate_text(normalized_model, 6, 10, "by my hand i swear")

In [203]:
print(generated_text)

by my hand i swear and my fathers soul the work ish ill done it
