In [None]:
import re
from collections import Counter, defaultdict
import math

# Simple tokenizer function
def simple_tokenize(text):
    """Split `text` into lowercase word tokens.

    The text is lowercased, then every maximal run of word characters
    (letters, digits, underscore) is returned in order of appearance.
    Punctuation and whitespace act purely as separators, so "It's" yields
    ["it", "s"].

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase token strings (empty if no word characters occur).
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]

# Sample corpus: two short sentences used as the training text
corpus = """Natural language processing enables computers to understand and generate human language.
It involves various techniques like tokenization, parsing, and machine learning."""

# Lowercased word tokens of the whole corpus, in order
tokens = simple_tokenize(corpus)

# Adjacent token pairs: pairing the sequence with itself shifted by one
# yields (tokens[0], tokens[1]), (tokens[1], tokens[2]), ...
# Note: the corpus is tokenized as one stream, so the pair spanning the
# sentence boundary ("language", "it") is included as well.
bigram_list = list(zip(tokens, tokens[1:]))

# Frequency tables for single tokens and for token pairs
unigram_counts = Counter(tokens)
bigram_counts = Counter(bigram_list)

In [None]:
# Vocabulary size: number of distinct tokens in the corpus
V = len(set(tokens))

# Add-one (Laplace) smoothed probability for every bigram observed in the corpus:
#     P(w2 | w1) = (count(w1, w2) + 1) / (count(w1) + V)
# NOTE: this table only holds *observed* bigrams. Because it is a
# defaultdict(float), looking up an unseen bigram returns 0.0 (and inserts
# that key) instead of the smoothed value 1 / (count(w1) + V).
bigram_probabilities = defaultdict(
    float,
    {
        pair: (pair_count + 1) / (unigram_counts[pair[0]] + V)
        for pair, pair_count in bigram_counts.items()
    },
)

# Function to calculate probability of a sentence with Add-One Smoothing
def sentence_probability(sentence):
    """Return the add-one smoothed bigram probability of `sentence`.

    The sentence is tokenized with ``simple_tokenize`` and scored as the
    product, over each adjacent word pair (w1, w2), of

        (count(w1, w2) + 1) / (count(w1) + V)

    using the module-level ``bigram_counts``, ``unigram_counts`` and ``V``.
    Pairs or words missing from those tables are treated as having count 0.

    Args:
        sentence: The text to score.

    Returns:
        The product of the smoothed bigram probabilities as a float. A
        sentence with fewer than two tokens has no bigrams and yields 1.0.
    """
    sentence_tokens = simple_tokenize(sentence)

    product = 1.0
    # Walk over adjacent token pairs (the sentence zipped with itself shifted by one)
    for prev_word, next_word in zip(sentence_tokens, sentence_tokens[1:]):
        numerator = bigram_counts.get((prev_word, next_word), 0) + 1
        denominator = unigram_counts.get(prev_word, 0) + V
        product *= numerator / denominator

    return product

# Example sentence: mixes bigrams that occur in the corpus (e.g. "natural language")
# with ones that do not (e.g. "processing involves"); add-one smoothing gives the
# unseen pairs a small non-zero factor so the overall product is not zero.
sentence = "Natural language processing involves machine learning"
probability = sentence_probability(sentence)

# Printed with 20 decimal places because the product of the per-bigram factors is very small
print(f"Probability of sentence with Add-One Smoothing: {probability:.20f}")


Probability of sentence with Add-One Smoothing: 0.00000238095238095238
