<a href="https://colab.research.google.com/github/adnanagbaria/LLMs/blob/main/ngram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

An n-gram is a contiguous sequence of $n$ items (usually words) from a given text.

In [2]:
# prompt: I want to build n-gram model

# N-gram models predict the next word in a sequence based on the previous n-1 words.
# This basic implementation covers only the first step: counting how often each
# n-gram occurs in a text. It does not yet turn the counts into predictions.

from collections import defaultdict

def generate_ngrams(text: str, n: int) -> defaultdict:
  """
  Counts every contiguous n-gram of words in a text.

  The text is lowercased and split on whitespace. Punctuation is not
  stripped, so a token such as "model." keeps its trailing period.

  Args:
    text: The input text as a string.
    n: The size of the n-grams (e.g., 2 for bigrams, 3 for trigrams).
      Must be at least 1.

  Returns:
    A defaultdict where keys are tuples of n words and values are the
    number of times that n-gram occurs. It is empty when the text has
    fewer than n words.

  Raises:
    ValueError: If n is less than 1.
  """
  # Without this guard, n == 0 would count the empty tuple once per window
  # position and a negative n would slice out arbitrary truncated tuples.
  if n < 1:
    raise ValueError(f"n must be a positive integer, got {n!r}")

  words = text.lower().split()
  ngrams = defaultdict(int)
  # The range is empty when there are fewer than n words, so short texts
  # simply produce no n-grams.
  for start in range(len(words) - n + 1):
    ngrams[tuple(words[start:start + n])] += 1
  return ngrams

# Example usage: count and print the bigrams, then the trigrams, of one sentence.
text = "This is a simple example of building an n-gram model."

for n in (2, 3):  # 2 = bigrams, 3 = trigrams
  ngram_counts = generate_ngrams(text, n)

  # Only the second report is preceded by a blank line, separating it from the first.
  header = f"{n}-gram counts:" if n == 2 else f"\n{n}-gram counts:"
  print(header)
  for ngram, count in ngram_counts.items():
    print(f"{ngram}: {count}")


2-gram counts:
('this', 'is'): 1
('is', 'a'): 1
('a', 'simple'): 1
('simple', 'example'): 1
('example', 'of'): 1
('of', 'building'): 1
('building', 'an'): 1
('an', 'n-gram'): 1
('n-gram', 'model.'): 1

3-gram counts:
('this', 'is', 'a'): 1
('is', 'a', 'simple'): 1
('a', 'simple', 'example'): 1
('simple', 'example', 'of'): 1
('example', 'of', 'building'): 1
('of', 'building', 'an'): 1
('building', 'an', 'n-gram'): 1
('an', 'n-gram', 'model.'): 1
