# Exercise 1

In [10]:
# Recreate all the calculations of the previous slides for the same corpus
# We define the example corpus from the slides
corpus = [
    "<s> I am Sam </s>",
    "<s> Sam I am </s>",
    "<s> I do not like green eggs and ham </s>"
]

# We tokenize the corpus
tokens = []
for sentence in corpus:
    tokens.extend(sentence.split())

print("Tokens from corpus:", tokens)
print(f"Total tokens: {len(tokens)}")



Tokens from corpus: ['<s>', 'I', 'am', 'Sam', '</s>', '<s>', 'Sam', 'I', 'am', '</s>', '<s>', 'I', 'do', 'not', 'like', 'green', 'eggs', 'and', 'ham', '</s>']
Total tokens: 20


In [11]:
# We count unigrams (individual words)
from collections import Counter

unigram_counts = Counter(tokens)

print("Word count:")
for word, count in sorted(unigram_counts.items()):
    print(f"  c({word}) = {count}")


Word count:
  c(</s>) = 3
  c(<s>) = 3
  c(I) = 3
  c(Sam) = 2
  c(am) = 2
  c(and) = 1
  c(do) = 1
  c(eggs) = 1
  c(green) = 1
  c(ham) = 1
  c(like) = 1
  c(not) = 1


In [12]:
# We count bigrams (consecutive word pairs)
from collections import defaultdict

bigram_counts = defaultdict(int)
for i in range(len(tokens) - 1):
    bigram = (tokens[i], tokens[i+1])
    bigram_counts[bigram] += 1

print("Bigram counts:")
for bigram, count in sorted(bigram_counts.items()):
    print(f"  c{bigram} = {count}")


Bigram counts:
  c('</s>', '<s>') = 2
  c('<s>', 'I') = 2
  c('<s>', 'Sam') = 1
  c('I', 'am') = 2
  c('I', 'do') = 1
  c('Sam', '</s>') = 1
  c('Sam', 'I') = 1
  c('am', '</s>') = 1
  c('am', 'Sam') = 1
  c('and', 'ham') = 1
  c('do', 'not') = 1
  c('eggs', 'and') = 1
  c('green', 'eggs') = 1
  c('ham', '</s>') = 1
  c('like', 'green') = 1
  c('not', 'like') = 1


In [13]:
# We calculate bigram probabilities: P(w_i | w_{i-1}) = c(w_{i-1}, w_i) / c(w_{i-1})
def bigram_probability(w_prev, w_current):
  
    bigram_count = bigram_counts[(w_prev, w_current)]
    unigram_count = unigram_counts[w_prev]
    if unigram_count == 0:
        return 0
    return bigram_count / unigram_count


In [14]:
# We calculate the probabilities from the slide example
test_cases = [
    ("I", "<s>"),
    ("</s>", "Sam"),
    ("Sam", "<s>"),
    ("Sam", "am"),
    ("am", "I"),
    ("do", "I")
]

print("Bigram probabilities:")
for w_current, w_prev in test_cases:
    prob = bigram_probability(w_prev, w_current)
    print(f"  P({w_current} | {w_prev}) = {bigram_counts[(w_prev, w_current)]}/{unigram_counts[w_prev]} = {prob:.2f}")


Bigram probabilities:
  P(I | <s>) = 2/3 = 0.67
  P(</s> | Sam) = 1/2 = 0.50
  P(Sam | <s>) = 1/3 = 0.33
  P(Sam | am) = 1/2 = 0.50
  P(am | I) = 2/3 = 0.67
  P(do | I) = 1/3 = 0.33
