# S04 - N-Gram Language Models
## Exercises

### Exercise 1 (Easy)
Generate all bigrams and trigrams from a sentence.

In [2]:
# Example sentence that is tokenised below and fed to get_ngrams.
sentence = "I love natural language processing"

def get_ngrams(tokens, n):
    """Return every contiguous n-gram of ``tokens`` as a list of tuples.

    Args:
        tokens: Sequence of tokens (anything supporting ``len`` and slicing).
        n: Size of each n-gram; must be at least 1.

    Returns:
        A list of ``n``-tuples in left-to-right order. The list is empty
        when ``tokens`` holds fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1. Without this check a zero or
            negative ``n`` would silently yield empty or meaningless windows.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    # A negative range bound produces no iterations, so short inputs give [].
    return [tuple(tokens[start:start + n]) for start in range(len(tokens) - n + 1)]

# Normalise case and split on whitespace before extracting n-grams.
tokens = sentence.lower().split()
# Slide a window of width 2 (bigrams) and width 3 (trigrams) over the tokens.
bigrams, trigrams = (get_ngrams(tokens, width) for width in (2, 3))
print("Bigrams:", bigrams)
print("Trigrams:", trigrams)

Bigrams: [('i', 'love'), ('love', 'natural'), ('natural', 'language'), ('language', 'processing')]
Trigrams: [('i', 'love', 'natural'), ('love', 'natural', 'language'), ('natural', 'language', 'processing')]


### Exercise 2 (Easy)
Calculate the probability of a bigram using Maximum Likelihood Estimation (MLE).

In [1]:
corpus = ["i love nlp", "i love python", "i hate bugs", "nlp is great"]

# Estimate P(love | i) by Maximum Likelihood:
#     P(love | i) = count(i, love) / count(i)
from collections import Counter, defaultdict

unigram_counts = Counter()
bigram_counts = defaultdict(int)
for sentence in corpus:
    tokens = sentence.split()
    # Every token contributes one unigram occurrence ...
    unigram_counts.update(tokens)
    # ... and every adjacent pair contributes one bigram occurrence.
    for pair in zip(tokens, tokens[1:]):
        bigram_counts[pair] += 1

count_i = unigram_counts["i"]
count_i_love = bigram_counts[("i", "love")]
# Fall back to 0 when the context word never occurs, avoiding a zero division.
if count_i > 0:
    p_love_given_i = count_i_love / count_i
else:
    p_love_given_i = 0
print(f"P(love | i) = {count_i_love}/{count_i} = {p_love_given_i:.2f}")


P(love | i) = 2/3 = 0.67


### Exercise 3 (Medium)
Build a bigram language model and use it to predict the next word.

In [2]:
# Build a bigram language model and use it to predict the next word.
from collections import defaultdict

def build_bigram_model(corpus):
    """Count, for every word, how often each other word directly follows it.

    Args:
        corpus: Iterable of sentence strings; each is split on whitespace.

    Returns:
        A nested ``defaultdict`` where ``model[w1][w2]`` is the number of
        times ``w2`` appeared immediately after ``w1`` within a sentence.
    """
    follower_counts = defaultdict(lambda: defaultdict(int))
    for line in corpus:
        words = line.split()
        # Pair each word with its successor; the final word has none.
        for current, following in zip(words, words[1:]):
            follower_counts[current][following] += 1
    return follower_counts

def predict_next(model, word):
    """Return the word that most frequently follows ``word`` in ``model``.

    Args:
        model: Mapping of context word -> mapping of following word -> count,
            such as the result of ``build_bigram_model``.
        word: The context word whose successor should be predicted.

    Returns:
        The follower with the highest count (on ties, the first one in the
        follower mapping's iteration order), or ``None`` when ``word`` is not
        a known context or has no recorded followers.
    """
    # ``get`` never inserts a key, so querying an unknown word leaves a
    # defaultdict-based model untouched.
    next_words = model.get(word)
    # An entry can exist but be empty (e.g. created by an earlier ``model[w]``
    # lookup on a defaultdict); ``max`` would raise ValueError on it.
    if not next_words:
        return None
    return max(next_words, key=next_words.get)
    
# Toy corpus: three sentences, each ending in a stand-alone "." token.
corpus = "the cat sat on the mat . the dog sat on the rug . the cat is on the mat ."

# Splitting on '.' keeps bigrams from being counted across sentence boundaries.
bigram_model = build_bigram_model(corpus.split('.'))
predicted_word = predict_next(model=bigram_model, word="the")
print(f"Predicted next word after 'the': {predicted_word}")

Predicted next word after 'the': cat


### Exercise 4 (Medium)
Implement Add-1 (Laplace) smoothing for your bigram model.

In [None]:
def build_bigram_model_smoothed(corpus, vocab_size):
    """Build a bigram model with Add-1 (Laplace) smoothed probabilities.

    Every bigram, seen or unseen, receives one extra pseudo-count:

        P(w2 | w1) = (count(w1, w2) + 1) / (count(w1) + V)

    where ``count(w1)`` is the number of bigrams that start with ``w1`` and
    ``V`` is ``vocab_size``. Using the bigram-context count keeps each row a
    proper distribution over a vocabulary of ``V`` words.

    Args:
        corpus: Iterable of sentence strings; each is split on whitespace.
        vocab_size: Number of distinct word types ``V``; must be positive.

    Returns:
        A nested ``defaultdict`` in which ``model[w1][w2]`` is the smoothed
        probability. A ``w2`` never seen after a known ``w1`` yields
        ``1 / (count(w1) + V)``; a ``w1`` never seen as a context yields
        ``1 / V`` for every ``w2``. As with any ``defaultdict``, looking up a
        missing key stores its default value.

    Raises:
        ValueError: If ``vocab_size`` is not positive.
    """
    # Local import keeps this cell runnable on its own.
    from collections import Counter, defaultdict

    if vocab_size <= 0:
        raise ValueError(f"vocab_size must be positive, got {vocab_size!r}")

    # First pass: raw follower counts per context word.
    follower_counts = defaultdict(Counter)
    for sentence in corpus:
        tokens = sentence.split()
        for prev, nxt in zip(tokens, tokens[1:]):
            follower_counts[prev][nxt] += 1

    def smoothed_row(followers):
        # One row of the model: P(. | w1) for a single context word.
        denominator = sum(followers.values()) + vocab_size
        row = defaultdict(lambda: 1 / denominator)  # unseen follower
        for nxt, count in followers.items():
            row[nxt] = (count + 1) / denominator
        return row

    # Unknown contexts have count(w1) == 0, i.e. a uniform 1 / V row.
    model = defaultdict(lambda: smoothed_row({}))
    for prev, followers in follower_counts.items():
        model[prev] = smoothed_row(followers)
    return model


### Exercise 5 (Hard)
Calculate perplexity of your language model on a test sentence.

$$\text{Perplexity} = \exp\left(-\frac{1}{N}\sum_{i=1}^{N}\log P(w_i \mid w_{i-1})\right)$$

In [None]:
import math

def calculate_perplexity(model, test_sentence):
    """Compute the bigram perplexity of ``test_sentence`` under ``model``.

    Perplexity = exp(-1/N * sum(log P(w_i | w_{i-1}))), where ``N`` is the
    number of bigram transitions in the sentence and each probability is the
    maximum-likelihood estimate count(w1, w2) / sum of counts after w1.

    Args:
        model: Mapping of context word -> mapping of following word -> count,
            such as the result of ``build_bigram_model``.
        test_sentence: Sentence string; it is split on whitespace.

    Returns:
        The perplexity as a float. ``math.inf`` is returned when any bigram
        in the sentence has zero probability under the model.

    Raises:
        ValueError: If the sentence has fewer than two tokens, because no
            bigram exists to score.
    """
    tokens = test_sentence.split()
    transitions = list(zip(tokens, tokens[1:]))
    if not transitions:
        raise ValueError("test_sentence must contain at least two tokens")

    log_prob_sum = 0.0
    for prev, nxt in transitions:
        # ``get`` avoids inserting keys into a defaultdict-based model.
        followers = model.get(prev) or {}
        context_total = sum(followers.values())
        pair_count = followers.get(nxt, 0)
        if context_total == 0 or pair_count == 0:
            # log(0) is undefined; an impossible bigram means infinite perplexity.
            return math.inf
        log_prob_sum += math.log(pair_count / context_total)
    return math.exp(-log_prob_sum / len(transitions))

# Sentence whose perplexity is to be computed with calculate_perplexity.
test = "the cat sat on the rug"
# Calculate perplexity