# N-gram Language Models

The main objective of this notebook is to implement an n-gram language model and evaluate its performance. This model will subsequently be applied to text generation, automatic correction, and auto-completion.

In [1]:
import re
import math
import string
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
import nltk
nltk.download('punkt')
from collections import defaultdict

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Method: `prepare_data(infile, operation)`

This method processes a text file as input, representing a corpus, and performs several preprocessing steps. It tokenizes the text into words, normalizes the text to lowercase, and adds start (`<s>`) and end (`</s>`) tokens to each sentence for N-gram modeling. Additionally, it handles out-of-vocabulary words by identifying those that appear fewer than a specified threshold (`N`) times in the training data and replaces them with the `<UNK>` token. The method returns the preprocessed corpus as a single string with tokens separated by spaces.


In [2]:
vocabular = {}
count_threshold = 2
ngram_size = 3

def prepare_data(infile, operation):
        global vocabular # Explicitly use the global variable
        corpus = ""
        pattern = r'^(\/)?([^\/\0]+(\/)?)+\.[^\/\0]+$'

        # Input handling: check if infile is a file or raw text
        if operation or re.match(pattern, infile):
            with open(infile, 'r', encoding="utf8") as file:
                corpus = file.read()
        else:
            corpus = infile

        newtokens = []
        token_counts = {}
        unk_words = []

        # Preprocess the entire corpus
        corpus = corpus.lower()
        corpus = re.sub(r"http\S+|www\S+|https\S+", '', corpus, flags=re.MULTILINE)  # Remove URLs

        # Tokenize the corpus into sentences
        sentences = sent_tokenize(corpus)

        # Process each sentence
        for sentence in sentences:
            # Add start tokens for N-gram modeling
            newtokens.extend(['<s>'] * (ngram_size - 1))

            # Tokenize sentence into words
            words = word_tokenize(sentence)

            # Handle date tokens separately
            for token in words:
                dates = re.findall(r'(.*)((?:0?[1-9]|[12][0-9]|3[01])/(?:0?[1-9]|1[0-2])/(?:\d{2}|\d{4}))(.*)', token)
                if dates:
                    for items in dates:
                        for date in items:
                            if re.match(r"(?:0?[1-9]|[12][0-9]|3[01])/(?:0?[1-9]|1[0-2])/(?:\d{2}|\d{4})", date):
                                newtokens.append(date)
                                token_counts[date] = token_counts.get(date, 0) + 1
                            else:
                                date_tokens = word_tokenize(date)
                                newtokens.extend(date_tokens)
                                for x in date_tokens:
                                    token_counts[x] = token_counts.get(x, 0) + 1
                else:
                    newtokens.append(token)
                    token_counts[token] = token_counts.get(token, 0) + 1

            # Add end-of-sentence token
            newtokens.append('</s>')

        # Handle unknown tokens
        if operation:
            for i, token in enumerate(newtokens):
                if token_counts.get(token, 0) < count_threshold and token not in ['<s>', '</s>']:
                    unk_words.append(token)
                    token_counts['<UNK>'] = token_counts.get('<UNK>', 0) + 1
                    newtokens[i] = '<UNK>'

            vocabular = token_counts
            return ' '.join(newtokens)
        else:
            for i, token in enumerate(newtokens):
                if token not in vocabular and token not in ['<s>', '</s>']:
                    unk_words.append(token)
                    newtokens[i] = '<UNK>'

            return ' '.join(newtokens), unk_words

In [3]:
prepare_data('ngramv1.train', True)

'<s> <s> i am sam . </s> <s> <s> i am sam . </s> <s> <s> sam i am . </s> <s> <s> that sam i am ! </s> <s> <s> that sam i am ! </s> <s> <s> i do not like that sam i am ! </s> <s> <s> do would you like green eggs and ham ? </s> <s> <s> i do not like them , sam i am . </s> <s> <s> i do not like green eggs and ham . </s> <s> <s> would you like them here or there ? </s> <s> <s> i would not like them here or there . </s> <s> <s> i would not like them anywhere . </s> <s> <s> i do not like green eggs and ham . </s> <s> <s> i do not like them , sam i am . </s> <s> <s> would you like them in a house ? </s> <s> <s> would you like <UNK> with a mouse ? </s> <s> <s> i do not like them in a house . </s> <s> <s> i do not like them with a mouse . </s> <s> <s> i do not like them here or there . </s> <s> <s> i do not like them anywhere . </s> <s> <s> i do not like green eggs and ham . </s> <s> <s> i do not like them , sam i am . </s> <s> <s> would you eat them in a box ? </s> <s> <s> would you eat them w

In [4]:
vocabular

{'i': 68,
 'am': 14,
 'sam': 17,
 '.': 72,
 'that': 3,
 '!': 29,
 'do': 35,
 'not': 82,
 'like': 41,
 'would': 25,
 'you': 31,
 'green': 8,
 'eggs': 9,
 'and': 11,
 'ham': 9,
 '?': 16,
 'them': 48,
 ',': 34,
 'here': 10,
 'or': 8,
 'there': 8,
 'anywhere': 7,
 'in': 33,
 'a': 47,
 'house': 7,
 'then': 1,
 'with': 15,
 'mouse': 6,
 'eat': 14,
 'box': 6,
 'fox': 6,
 'could': 14,
 'car': 6,
 'they': 1,
 'are': 1,
 'may': 4,
 'will': 11,
 'see': 3,
 'tree': 5,
 'let': 4,
 'me': 4,
 'be': 4,
 'train': 7,
 'on': 5,
 'greem': 1,
 'say': 3,
 'the': 9,
 'dark': 6,
 'rain': 3,
 'goat': 3,
 'boat': 2,
 'so': 1,
 'try': 4,
 'if': 1,
 '<UNK>': 6}

## Method: `train(infile)`

This method calculates the probabilities of n-grams. It uses the `prepare_data` method for preprocessing the input.

Our model takes into account the following three points:
- Handles out-of-vocabulary words.
- Applies smoothing (uses add-k smoothing in this calculation).
- Converts probabilities to logarithms, to avoid floating-point overflow issues.

In [5]:
n_gram_counts = {}
nminus1_gram_counts = {}
k = 0.01
ngram_probabilities = {}
def train(infile=None):
    global n_gram_counts, nminus1_gram_counts, ngram_probabilities
    # Preprocess the input corpus
    corpus = prepare_data(infile, True)

    # Extract sentences from the preprocessed corpus
    sentences = re.findall(r"<s>.+?</s>", corpus)

    # Step 1: Count n-grams and (n-1)-grams
    for captured_sentence in sentences:
        tokens = captured_sentence.split(" ")

        for i in range(len(tokens) - (ngram_size - 1)):
            # Extract n-gram and n-1-gram
            n_gram = tokens[i:i + ngram_size]
            n_gram_tuple = tuple(n_gram)
            nminus1_gram_tuple = tuple(n_gram[:-1])

            # Update counts
            n_gram_counts[n_gram_tuple] = n_gram_counts.get(n_gram_tuple, 0) + 1
            nminus1_gram_counts[nminus1_gram_tuple] = nminus1_gram_counts.get(nminus1_gram_tuple, 0) + 1

    # Step 2: Add-k smoothing
    vocab_size = len(vocabular)
    ngram_probabilities = {}

    for ngram, count in n_gram_counts.items():
        n_minus1_gram = ngram[:-1]
        count_nminus1 = nminus1_gram_counts.get(n_minus1_gram, 0)

        # Add-k smoothing: P(wi | wi-1, ..., wi-(n-1)) = (count(wi-1,...,wi) + k) / (count(wi-1,...,wi-(n-1)) + k * V)
        smoothed_prob = (count + k) / (count_nminus1 + k * vocab_size)

        # Logarithmic probabilities
        ngram_probabilities[ngram] = math.log(smoothed_prob)

    # Save the probabilities for the model
    ngram_probabilities = ngram_probabilities

In [6]:
train('ngramv1.train')

In [7]:
n_gram_counts

{('<s>', '<s>', 'i'): 54,
 ('<s>', 'i', 'am'): 2,
 ('i', 'am', 'sam'): 2,
 ('am', 'sam', '.'): 2,
 ('sam', '.', '</s>'): 2,
 ('<s>', '<s>', 'sam'): 3,
 ('<s>', 'sam', 'i'): 1,
 ('sam', 'i', 'am'): 12,
 ('i', 'am', '.'): 9,
 ('am', '.', '</s>'): 9,
 ('<s>', '<s>', 'that'): 2,
 ('<s>', 'that', 'sam'): 2,
 ('that', 'sam', 'i'): 3,
 ('i', 'am', '!'): 3,
 ('am', '!', '</s>'): 3,
 ('<s>', 'i', 'do'): 32,
 ('i', 'do', 'not'): 32,
 ('do', 'not', 'like'): 33,
 ('not', 'like', 'that'): 1,
 ('like', 'that', 'sam'): 1,
 ('<s>', '<s>', 'do'): 1,
 ('<s>', 'do', 'would'): 1,
 ('do', 'would', 'you'): 1,
 ('would', 'you', 'like'): 4,
 ('you', 'like', 'green'): 1,
 ('like', 'green', 'eggs'): 7,
 ('green', 'eggs', 'and'): 8,
 ('eggs', 'and', 'ham'): 9,
 ('and', 'ham', '?'): 2,
 ('ham', '?', '</s>'): 2,
 ('not', 'like', 'them'): 28,
 ('like', 'them', ','): 9,
 ('them', ',', 'sam'): 9,
 (',', 'sam', 'i'): 8,
 ('not', 'like', 'green'): 6,
 ('and', 'ham', '.'): 6,
 ('ham', '.', '</s>'): 6,
 ('<s>', '<s>', 'w

In [8]:
ngram_probabilities

{('<s>', '<s>', 'i'): -0.7776945603812787,
 ('<s>', 'i', 'am'): -3.3009829902080954,
 ('i', 'am', 'sam'): -1.9794562715465174,
 ('am', 'sam', '.'): -0.23795863709935042,
 ('sam', '.', '</s>'): -0.23795863709935042,
 ('<s>', '<s>', 'sam'): -3.6649236962252942,
 ('<s>', 'sam', 'i'): -1.2569972726341563,
 ('sam', 'i', 'am'): -0.043981029485900776,
 ('i', 'am', '.'): -0.47925592199725514,
 ('am', '.', '</s>'): -0.05820608287239237,
 ('<s>', '<s>', 'that'): -4.068729052915094,
 ('<s>', 'that', 'sam'): -0.23795863709935042,
 ('that', 'sam', 'i'): -0.1650075247265401,
 ('i', 'am', '!'): -1.5756509148567173,
 ('am', '!', '</s>'): -0.1650075247265401,
 ('<s>', 'i', 'do'): -0.5330693582973082,
 ('i', 'do', 'not'): -0.016729014672806554,
 ('do', 'not', 'like'): -0.04559700434857725,
 ('not', 'like', 'that'): -3.560989825396082,
 ('like', 'that', 'sam'): -0.42830460007798715,
 ('<s>', '<s>', 'do'): -4.75691344413291,
 ('<s>', 'do', 'would'): -0.42830460007798715,
 ('do', 'would', 'you'): -0.428304

## Method: `predict_ngram(sentence)`

This method takes a sentence (as a string) as input, uses the `prepare_data` method for preprocessing, and then calculates the probability of the sentence using the n-gram language model, depending on the value of the `ngram_size` parameter.

In [9]:
def predict_ngram(sentence):
      log_proba = 0
      preprocessed_corpus, _ = prepare_data(sentence, False)
      sentences = re.findall(r"<s>.+?</s>", preprocessed_corpus)

      for captured_sentence in sentences:
          tokens = captured_sentence.split(" ")

          # Calculate the log probability for each n-gram in the sentence
          for i in range(len(tokens) - (ngram_size - 1)):
              n_gram = tokens[i:i + ngram_size]
              n_gram_tuple = tuple(n_gram)
              nminus1_gram_tuple = tuple(n_gram[:-1])

              # Use add-k smoothing to calculate log probability
              count_n_gram = n_gram_counts.get(n_gram_tuple, 0)
              count_nminus1_gram = nminus1_gram_counts.get(nminus1_gram_tuple, 0)

              smoothed_prob = (count_n_gram + k) / (count_nminus1_gram + k * len(vocabular))
              log_proba += math.log(smoothed_prob)

      return log_proba, preprocessed_corpus

In [10]:
predict_ngram('I am happy')

(-15.368771915427438, '<s> <s> i am <UNK> </s>')

Note: The log probability of -15.36 indicates that the sequence is relatively unlikely due to the presence of the unknown token.

## Method: `test_perplexity(test_file)`

This method takes the path of a testing corpus and calculates the perplexity based on this test corpus.


In [11]:
def test_perplexity(test_file):
        log_proba, preprocessed_corpus = predict_ngram(test_file)
        test_count_tokens = len(preprocessed_corpus.split(" "))
        return math.exp(-log_proba / test_count_tokens) if test_count_tokens > 0 else float('inf')

In [12]:
test_perplexity('ngramv1.test')

10.33709766718842

Note: The perplexity of approximately 10.33 indicates that the model has a reasonable understanding of the sequence.


Now, we will use a larger dataset to train our model, which will include three additional functionalities:

### Method: `generateText`

This method generates a sentence using an n-gram model, starting from the `<s>` tokens and sampling the next word based on its precomputed probability in `self.ngram_probabilities`.

The process continues until the end token `</s>` is generated or the `max_length` is reached.

### Method: `autoComplete`

This method predicts the most probable next word based on the input text using n-gram probabilities.

### Method: `correction`

This method implements a variant of a spell checker using an n-gram language model to rank possible corrections based on their probabilities from the language model and their minimum edit distance. It corrects misspelled words in the input sentence based on known vocabulary and n-gram probabilities.


## Function Definitions
To accomplish this, we will utilize the following functions. For a better understanding, please refer to the Spelling Corrector notebook available in the same repository.

In [13]:
def edits1(s):
    alphabet = 'abcdefghijklmnopqrstuvwxyz'

    # Split the string into all possible pairs of prefixes and suffixes
    splits = [(s[:i], s[i:]) for i in range(len(s) + 1)]

    # Generate words after deleting each letter
    deletes = [a + b[1:] for a, b in splits if b]

    # Generate words after inserting each letter of the alphabet at every position
    inserts = [a + c + b for a, b in splits for c in alphabet]

    # Generate words after replacing each letter with every letter of the alphabet
    replaces = [a + c + b[1:] for a, b in splits if b for c in alphabet]

    # Generate words after transposing two letters in the word
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]

    # Return a set of unique words
    return set(deletes + inserts + replaces + transposes)

def edits2(word):
    # Return the set of all words that are two edits away from the given word
    return {e2 for e1 in edits1(word) for e2 in edits1(e1)}

def knownWord(words, vocabulary):
    # Get the set of valid words from the dictionary
    return set(w for w in words if w in vocabulary)

In [14]:
class NgramLanguageModel:

    def __init__(self, ngram_size=2, count_threshold=2):
        self.ngram_size = ngram_size
        self.count_threshold = count_threshold
        self.n_gram_counts = {}
        self.nminus1_gram_counts = {}
        self.vocabular = {}
        self.k = 0.01
        self.ngram_probabilities = {}

    def prepare_data(self, infile, operation):
        """Prepare and preprocess the input data."""
        corpus = ""
        pattern = r'^(\/)?([^\/\0]+(\/)?)+\.[^\/\0]+$'

        # Input handling: check if infile is a file or raw text
        if operation or re.match(pattern, infile):
            with open(infile, 'r', encoding="utf8") as file:
                corpus = file.read()
        else:
            corpus = infile

        newtokens = []
        token_counts = {}
        unk_words = []

        # Preprocess the entire corpus
        corpus = corpus.lower()
        corpus = re.sub(r"http\S+|www\S+|https\S+", '', corpus, flags=re.MULTILINE)  # Remove URLs

        # Tokenize the corpus into sentences
        sentences = sent_tokenize(corpus)

        # Process each sentence
        for sentence in sentences:
            # Add start tokens for N-gram modeling
            newtokens.extend(['<s>'] * (self.ngram_size - 1))

            # Tokenize sentence into words
            words = word_tokenize(sentence)

            # Handle date tokens separately
            for token in words:
                dates = re.findall(r'(.*)((?:0?[1-9]|[12][0-9]|3[01])/(?:0?[1-9]|1[0-2])/(?:\d{2}|\d{4}))(.*)', token)
                if dates:
                    for items in dates:
                        for date in items:
                            if re.match(r"(?:0?[1-9]|[12][0-9]|3[01])/(?:0?[1-9]|1[0-2])/(?:\d{2}|\d{4})", date):
                                newtokens.append(date)
                                token_counts[date] = token_counts.get(date, 0) + 1
                            else:
                                date_tokens = word_tokenize(date)
                                newtokens.extend(date_tokens)
                                for x in date_tokens:
                                    token_counts[x] = token_counts.get(x, 0) + 1
                else:
                    newtokens.append(token)
                    token_counts[token] = token_counts.get(token, 0) + 1

            # Add end-of-sentence token
            newtokens.append('</s>')

        # Handle unknown tokens
        if operation:
            for i, token in enumerate(newtokens):
                if token_counts.get(token, 0) < self.count_threshold and token not in ['<s>', '</s>']:
                    unk_words.append(token)
                    token_counts['<UNK>'] = token_counts.get('<UNK>', 0) + 1
                    newtokens[i] = '<UNK>'

            self.vocabular = token_counts
            return ' '.join(newtokens)
        else:
            for i, token in enumerate(newtokens):
                if token not in self.vocabular and token not in ['<s>', '</s>']:
                    unk_words.append(token)
                    newtokens[i] = '<UNK>'

            return ' '.join(newtokens), unk_words

    def train(self, infile=None):
        """
        Train the n-gram model using the prepared data.
        Applies smoothing, handles OOV words, and calculates logarithmic probabilities.
        """
        # Preprocess the input corpus
        corpus = self.prepare_data(infile, True)

        # Extract sentences from the preprocessed corpus
        sentences = re.findall(r"<s>.+?</s>", corpus)

        # Step 1: Count n-grams and (n-1)-grams
        for captured_sentence in sentences:
            tokens = captured_sentence.split(" ")

            for i in range(len(tokens) - (self.ngram_size - 1)):
                # Extract n-gram and n-1-gram
                n_gram = tokens[i:i + self.ngram_size]
                n_gram_tuple = tuple(n_gram)
                nminus1_gram_tuple = tuple(n_gram[:-1])

                # Update counts
                self.n_gram_counts[n_gram_tuple] = self.n_gram_counts.get(n_gram_tuple, 0) + 1
                self.nminus1_gram_counts[nminus1_gram_tuple] = self.nminus1_gram_counts.get(nminus1_gram_tuple, 0) + 1

        # Step 2: Add-k smoothing
        vocab_size = len(self.vocabular)
        ngram_probabilities = {}

        for ngram, count in self.n_gram_counts.items():
            n_minus1_gram = ngram[:-1]
            count_nminus1 = self.nminus1_gram_counts.get(n_minus1_gram, 0)

            # Add-k smoothing: P(wi | wi-1, ..., wi-(n-1)) = (count(wi-1,...,wi) + k) / (count(wi-1,...,wi-(n-1)) + k * V)
            smoothed_prob = (count + self.k) / (count_nminus1 + self.k * vocab_size)

            # Logarithmic probabilities
            ngram_probabilities[ngram] = math.log(smoothed_prob)

        # Save the probabilities for the model
        self.ngram_probabilities = ngram_probabilities

    def predict_ngram(self, sentence):
      """
      Predict the log probability of a given sentence based on n-gram counts.

      """
      log_proba = 0
      preprocessed_corpus, _ = self.prepare_data(sentence, False)
      sentences = re.findall(r"<s>.+?</s>", preprocessed_corpus)

      for captured_sentence in sentences:
          tokens = captured_sentence.split(" ")

          # Calculate the log probability for each n-gram in the sentence
          for i in range(len(tokens) - (self.ngram_size - 1)):
              n_gram = tokens[i:i + self.ngram_size]
              n_gram_tuple = tuple(n_gram)
              nminus1_gram_tuple = tuple(n_gram[:-1])

              # Use add-k smoothing to calculate log probability
              count_n_gram = self.n_gram_counts.get(n_gram_tuple, 0)
              count_nminus1_gram = self.nminus1_gram_counts.get(nminus1_gram_tuple, 0)

              smoothed_prob = (count_n_gram + self.k) / (count_nminus1_gram + self.k * len(self.vocabular))
              log_proba += math.log(smoothed_prob)

      return log_proba, preprocessed_corpus

    def test_perplexity(self, test_file):
        """Calculate the perplexity of the test file."""
        log_proba, preprocessed_corpus = self.predict_ngram(test_file)
        test_count_tokens = len(preprocessed_corpus.split(" "))
        return math.exp(-log_proba / test_count_tokens) if test_count_tokens > 0 else float('inf')

    def generate_text(self, max_length=20):
      """
      Generate a sentence using an n-gram model, starting from the <s> tokens
      and sampling the next word based on its precomputed probability in self.ngram_probabilities.

      The process continues until the end token </s> is generated or the max_length is reached.

      Parameters:
      - max_length: The maximum length of the generated sentence to avoid infinite loops.

      Returns:
      - generated_sentence: The generated sentence as a string.
      """
      # Initialize with the start token sequence <s>, based on ngram_size-1
      current_ngram = tuple(["<s>"] * (self.ngram_size - 1))
      generated_sentence = list(current_ngram)  # List to accumulate generated words

      while current_ngram[-1] != "</s>" and len(generated_sentence) < max_length:
          # Collect all n-grams that start with the current ngram (n-1 words)
          next_word_candidates = {ngram[-1]: prob for ngram, prob in self.ngram_probabilities.items()
                                  if ngram[:-1] == current_ngram}

          if not next_word_candidates:
              # If no valid candidates are found (which can happen with unknown tokens), break the loop
              break

          # Normalize probabilities for the next word
          total_prob = sum(math.exp(prob) for prob in next_word_candidates.values())
          next_word_probabilities = {word: math.exp(prob) / total_prob for word, prob in next_word_candidates.items()}

          # Sample the next word based on the precomputed probabilities
          next_word = np.random.choice(list(next_word_probabilities.keys()), p=list(next_word_probabilities.values()))

          # Ensure that the sampled word is not <UNK>
          while next_word == "<UNK>":
              next_word = np.random.choice(list(next_word_probabilities.keys()), p=list(next_word_probabilities.values()))

          # Add the next word to the generated sentence
          generated_sentence.append(next_word)

          # Update the current ngram by shifting it to include the new word
          current_ngram = tuple(generated_sentence[-(self.ngram_size - 1):])

      # Join the generated words into a sentence and return, excluding <s> and </s> tokens
      return ' '.join(generated_sentence[self.ngram_size-1:-1])

    def auto_complete(self, sentence):
      """
      Predict the most probable next word based on the input text using n-gram probabilities.

      Parameters:
      - sentence: The input sentence for which the next word needs to be predicted.

      Returns:
      - complete_sentence: The input sentence followed by the most probable next word.
      """
      if not sentence:
          return "<UNK>"

      # Preprocess the input text
      preprocessed_sentence, _ = self.prepare_data(sentence, False)

      tokens = preprocessed_sentence.split(" ")
      filtered_tokens = [token for token in tokens if token != "</s>"]

      # Get the last n-1 tokens for n-gram prediction
      current_ngram = filtered_tokens[-(self.ngram_size - 1):]

      # Pad with <s> if not enough tokens are available
      if len(current_ngram) < self.ngram_size - 1:
          current_ngram = ["<s>"] * ((self.ngram_size - 1) - len(current_ngram)) + current_ngram

      next_word = ""

      # Collect all n-grams that start with the current n-1 tokens
      next_word_candidates = {ngram[-1]: prob for ngram, prob in self.ngram_probabilities.items()
                              if ngram[:-1] == tuple(current_ngram)}

      # Normalize probabilities for the next word
      total_prob = sum(math.exp(prob) for prob in next_word_candidates.values())
      next_word_probabilities = {word: math.exp(prob) / total_prob for word, prob in next_word_candidates.items()}

      # Find the next word with the maximum probability
      next_word = max(next_word_probabilities, key=next_word_probabilities.get)

      # Return the complete sentence with the predicted next word
      if next_word:
          return sentence.strip() + " " + next_word

      return "<UNK>"

    def correction(self, sentence):
      """
      Corrects misspelled words in the input sentence based on known vocabulary and n-gram probabilities.

      Parameters:
          sentence (str): The input sentence with potential misspelled words.

      Returns:
          str: The corrected sentence with the most probable word substitutions based on n-gram probabilities.
      """
      # Prepare the data (preprocess and get unknown words)
      preprocessed_sentence, unk_words = self.prepare_data(sentence, False)

      # Dictionary to store the corrections for unknown words
      corrections = {}

      # Iterate through each unknown word
      for unk in unk_words:
          # List to store candidates for correction
          candidate_list = []

          # Get known words with an edit distance of 1
          e1_words = edits1(unk)
          valid_e1 = knownWord(e1_words, self.vocabular)
          if valid_e1:
              candidate_list.extend(valid_e1)

          # Get known words with an edit distance of 2
          e2_words = edits2(unk)
          valid_e2 = knownWord(e2_words, self.vocabular)
          if valid_e2:
              candidate_list.extend(valid_e2)

          # Add the original unknown word to the candidate list
          candidate_list.append(unk)

          # Choose the most probable candidate based on n-gram probabilities
          corrections[unk] = max(candidate_list, key=lambda x: self.ngram_probabilities.get((x,), 0))

      # Tokenize the preprocessed sentence
      tokens = preprocessed_sentence.split(" ")

      # Replace each "<UNK>" with its correction
      corrected_tokens = []
      unk_index = 0
      for token in tokens:
          if token == "<UNK>":
              corrected_tokens.append(corrections[unk_words[unk_index]])
              unk_index += 1
          else:
              corrected_tokens.append(token)

      # Join the corrected tokens back into a sentence
      return ' '.join(corrected_tokens[self.ngram_size-1:-1])


In [15]:
ngram = NgramLanguageModel()

In [16]:
ngram.train('data.txt')

In [17]:
ngram.generate_text()

"is in anaheim , it 's kind of music , but wondering if so bad twitter network ."

In [18]:
ngram.auto_complete("happy")

'happy birthday'

In [19]:
ngram.correction("happy birthda brather")

'happy birthday brother'