## N-grams

In [22]:
import collections

In [23]:
def generate_ngrams(text, n):
    """Return the contiguous word n-grams of ``text``.

    Args:
        text: Input string. It is tokenised with ``str.split()``, i.e. on
            runs of whitespace only, so punctuation stays attached to the
            neighbouring word.
        n: Number of words per n-gram; must be at least 1.

    Returns:
        A list of ``n``-word tuples in order of appearance. The list is
        empty when ``text`` contains fewer than ``n`` words.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        # Without this guard n == 0 yields a list of empty tuples and a
        # negative n yields arbitrary overlapping slices.
        raise ValueError(f"n must be a positive integer, got {n}")
    words = text.split()
    # range() is empty when there are fewer than n words, so the result is [].
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

In [24]:
# Demo input. generate_ngrams tokenises with str.split(), so punctuation stays
# attached to words (e.g. 'model.' and 'model' are distinct tokens).
sample_text = "This is a simple example of an n-gram model. This model predicts the next word."
# Words per n-gram; the later model-building cell reads this same global.
n = 2
print(f"Generating {n}-grams for the text: '{sample_text}'\n")
ngrams = generate_ngrams(sample_text, n)
print(f"Generated {n}-grams: {ngrams}")

Generating 2-grams for the text: 'This is a simple example of an n-gram model. This model predicts the next word.'

Generated 2-grams: [('This', 'is'), ('is', 'a'), ('a', 'simple'), ('simple', 'example'), ('example', 'of'), ('of', 'an'), ('an', 'n-gram'), ('n-gram', 'model.'), ('model.', 'This'), ('This', 'model'), ('model', 'predicts'), ('predicts', 'the'), ('the', 'next'), ('next', 'word.')]


In [25]:
def build_ngram_model(text, n):
    """Count, for every n-gram in ``text``, which word comes right after it.

    Args:
        text: Input string, tokenised by ``generate_ngrams``.
        n: Number of words in each prefix n-gram.

    Returns:
        A nested ``defaultdict``: ``model[prefix][word]`` is the number of
        times ``word`` immediately followed the ``n``-word tuple ``prefix``.
        Note that the context is the full ``n``-word tuple, so each
        prediction is conditioned on ``n`` preceding words. The final
        n-gram of the text has no successor and therefore is not a key.
    """
    grams = generate_ngrams(text, n)
    model = collections.defaultdict(lambda: collections.defaultdict(int))
    # Pair each n-gram with its successor; the successor's last word is the
    # word that follows the current n-gram in the text.
    for context, following in zip(grams, grams[1:]):
        model[context][following[-1]] += 1
    return model

In [26]:
# Relies on `n` and `sample_text` defined in the earlier demo cell.
print(f"\nBuilding {n}-gram model...")
ngram_model = build_ngram_model(sample_text, n)
# The printed repr of the nested defaultdict includes the lambda's memory
# address, so this output is not identical from run to run.
print(f"N-gram model: {ngram_model}")


Building 2-gram model...
N-gram model: defaultdict(<function build_ngram_model.<locals>.<lambda> at 0x7f201c3b2e80>, {('This', 'is'): defaultdict(<class 'int'>, {'a': 1}), ('is', 'a'): defaultdict(<class 'int'>, {'simple': 1}), ('a', 'simple'): defaultdict(<class 'int'>, {'example': 1}), ('simple', 'example'): defaultdict(<class 'int'>, {'of': 1}), ('example', 'of'): defaultdict(<class 'int'>, {'an': 1}), ('of', 'an'): defaultdict(<class 'int'>, {'n-gram': 1}), ('an', 'n-gram'): defaultdict(<class 'int'>, {'model.': 1}), ('n-gram', 'model.'): defaultdict(<class 'int'>, {'This': 1}), ('model.', 'This'): defaultdict(<class 'int'>, {'model': 1}), ('This', 'model'): defaultdict(<class 'int'>, {'predicts': 1}), ('model', 'predicts'): defaultdict(<class 'int'>, {'the': 1}), ('predicts', 'the'): defaultdict(<class 'int'>, {'next': 1}), ('the', 'next'): defaultdict(<class 'int'>, {'word.': 1})})


In [27]:
def predict_next_word(model, prefix):
    """Return the relative frequency of each word observed after ``prefix``.

    Args:
        model: Mapping of prefix -> {next_word: count}, such as the value
            returned by ``build_ngram_model``.
        prefix: Key to look up in ``model``.

    Returns:
        A dict mapping each recorded next word to ``count / total``, or
        ``None`` when ``prefix`` is not a key of ``model`` or its counts
        sum to zero.
    """
    # Membership test first: indexing a defaultdict with a missing key
    # would silently insert an empty entry.
    if prefix in model:
        followers = model[prefix]
        total = sum(followers.values())
        if total != 0:
            return {word: count / total for word, count in followers.items()}
    return None

In [28]:
# Model keys are tuples of words, so the prefix must be a tuple too; this one
# is the first two words of sample_text. `ngram_model` comes from the earlier
# model-building cell.
test_prefix = ('This', 'is')
print(f"\nPredicting next word for prefix {test_prefix}:")
predictions = predict_next_word(ngram_model, test_prefix)
print(f"Predictions: {predictions}")


Predicting next word for prefix ('This', 'is'):
Predictions: {'a': 1.0}


In [29]:

# Second lookup: ('the', 'next') is followed only by 'word.' in sample_text.
# `ngram_model` comes from the earlier model-building cell.
test_prefix_2 = ('the', 'next')
print(f"\nPredicting next word for prefix {test_prefix_2}:")
predictions_2 = predict_next_word(ngram_model, test_prefix_2)
print(f"Predictions: {predictions_2}")


Predicting next word for prefix ('the', 'next'):
Predictions: {'word.': 1.0}
