In [None]:
# for counting n-grams
from collections import Counter, defaultdict

# Read a tokenized text file and split each line into tokens
def load_corpus(filepath):
    """Load a whitespace-tokenized corpus from a UTF-8 text file.

    Every line that contains at least one non-whitespace character becomes
    one sentence: the list of tokens obtained by splitting the line on
    whitespace. Blank and whitespace-only lines are skipped.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A list of sentences, each a list of token strings, in file order.
    """
    sentences = []
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            # split() with no arguments already ignores leading/trailing
            # whitespace and yields [] for a blank line.
            tokens = raw_line.split()
            if tokens:
                sentences.append(tokens)
    return sentences

# Build frequency counts for unigrams, bigrams, trigrams, and quadrigrams
def build_ngram_models(corpus):
    """Count unigrams, bigrams, trigrams and quadrigrams in a corpus.

    Unigrams are counted over the raw sentence tokens only. For the
    higher-order models each sentence is first wrapped in a single '<s>'
    start marker and a single '</s>' end marker, and every contiguous
    window of the relevant size is counted.

    Args:
        corpus: List of sentences, each a list of token strings.

    Returns:
        A 4-tuple ``(unigram_counts, bigram_counts, trigram_counts,
        quadrigram_counts)``. The first is a ``Counter`` keyed by token;
        the others are ``defaultdict(int)`` keyed by token tuples of
        length 2, 3 and 4 respectively.
    """
    unigram_counts = Counter()
    bigram_counts = defaultdict(int)
    trigram_counts = defaultdict(int)
    quadrigram_counts = defaultdict(int)

    # (window size, table that receives the counts for that size)
    windowed_tables = (
        (2, bigram_counts),
        (3, trigram_counts),
        (4, quadrigram_counts),
    )

    for sentence in corpus:
        for token in sentence:
            unigram_counts[token] += 1

        padded = ['<s>'] + sentence + ['</s>']
        for size, table in windowed_tables:
            # A sequence of length n has n - size + 1 windows; range() is
            # empty when the padded sentence is shorter than the window.
            for start in range(len(padded) - size + 1):
                table[tuple(padded[start:start + size])] += 1

    return unigram_counts, bigram_counts, trigram_counts, quadrigram_counts

# Smoothing functions
def add_one_smoothing(count, context_count, vocab_size):
    """Return the add-one (Laplace) smoothed estimate.

    Computes ``(count + 1) / (context_count + vocab_size)``.

    Args:
        count: Observed count of the n-gram.
        context_count: Observed count of the n-gram's context.
        vocab_size: Vocabulary size V added to the denominator.
    """
    numerator = count + 1
    denominator = context_count + vocab_size
    return numerator / denominator

def add_k_smoothing(count, context_count, vocab_size, k):
    """Return the add-k smoothed estimate.

    Computes ``(count + k) / (context_count + k * vocab_size)``.

    Args:
        count: Observed count of the n-gram.
        context_count: Observed count of the n-gram's context.
        vocab_size: Vocabulary size V.
        k: Pseudo-count added to the n-gram count.
    """
    numerator = count + k
    denominator = context_count + k * vocab_size
    return numerator / denominator

def token_type_smoothing(count, context_count, vocab_size):
    """Return the "token type" smoothed estimate.

    Computes ``(count + V) / (context_count + V * V)`` where V is
    ``vocab_size``.

    Args:
        count: Observed count of the n-gram.
        context_count: Observed count of the n-gram's context.
        vocab_size: Vocabulary size V.
    """
    numerator = count + vocab_size
    denominator = context_count + vocab_size * vocab_size
    return numerator / denominator

# Calculate unigram probabilities
def get_unigram_prob(word, unigram_counts, vocab_size, smoothing='none', k=1):
    """Return P(word) under the chosen smoothing scheme.

    Args:
        word: Token whose probability is requested.
        unigram_counts: Mapping from tokens to counts; the sum of its
            values is used as the total token count.
        vocab_size: Vocabulary size V used by the smoothing formulas.
        smoothing: 'add_one', 'add_k' or 'token_type'; any other value
            selects the unsmoothed relative frequency.
        k: Pseudo-count, used only when ``smoothing == 'add_k'``.

    Returns:
        The estimated probability. In the unsmoothed case the result is
        0 when the total count is not positive.
    """
    count = unigram_counts[word]
    # Recomputed on every call: the total is not cached anywhere.
    total = sum(unigram_counts.values())

    if smoothing == 'add_one':
        return add_one_smoothing(count, total, vocab_size)
    if smoothing == 'add_k':
        return add_k_smoothing(count, total, vocab_size, k)
    if smoothing == 'token_type':
        return token_type_smoothing(count, total, vocab_size)

    # Unsmoothed relative frequency, guarding against an empty model.
    if total > 0:
        return count / total
    return 0

# Calculate bigram probabilities
def get_bigram_prob(bigram, bigram_counts, unigram_counts, vocab_size, smoothing='none', k=1):
    """Return P(w2 | w1) for a bigram under the chosen smoothing scheme.

    Args:
        bigram: Tuple ``(w1, w2)``.
        bigram_counts: Mapping from bigram tuples to counts.
        unigram_counts: Mapping from tokens to counts; supplies count(w1).
        vocab_size: Vocabulary size V used by the smoothing formulas.
        smoothing: 'add_one', 'add_k' or 'token_type'; any other value
            selects the unsmoothed relative frequency.
        k: Pseudo-count, used only when ``smoothing == 'add_k'``.

    Returns:
        The estimated probability. In the unsmoothed case the result is
        0 when the context count is not positive.
    """
    # Use .get() rather than indexing: when the tables are defaultdicts
    # (as built by build_ngram_models), indexing an unseen key would insert
    # it with count 0 and silently grow the model on every query.
    count = bigram_counts.get(bigram, 0)
    # NOTE(review): build_ngram_models counts only sentence tokens as
    # unigrams, so '<s>' is absent from unigram_counts and sentence-initial
    # bigrams get a context count of 0 here - confirm whether that is intended.
    context_count = unigram_counts.get(bigram[0], 0)

    if smoothing == 'add_one':
        return add_one_smoothing(count, context_count, vocab_size)
    elif smoothing == 'add_k':
        return add_k_smoothing(count, context_count, vocab_size, k)
    elif smoothing == 'token_type':
        return token_type_smoothing(count, context_count, vocab_size)
    else:
        return count / context_count if context_count > 0 else 0

# Calculate trigram probabilities
def get_trigram_prob(trigram, trigram_counts, bigram_counts, vocab_size, smoothing='none', k=1):
    """Return P(w3 | w1, w2) for a trigram under the chosen smoothing scheme.

    Args:
        trigram: Tuple ``(w1, w2, w3)``.
        trigram_counts: Mapping from trigram tuples to counts.
        bigram_counts: Mapping from bigram tuples to counts; supplies the
            count of the context ``(w1, w2)``.
        vocab_size: Vocabulary size V used by the smoothing formulas.
        smoothing: 'add_one', 'add_k' or 'token_type'; any other value
            selects the unsmoothed relative frequency.
        k: Pseudo-count, used only when ``smoothing == 'add_k'``.

    Returns:
        The estimated probability. In the unsmoothed case the result is
        0 when the context count is not positive.
    """
    # Use .get() rather than indexing: when the tables are defaultdicts
    # (as built by build_ngram_models), indexing an unseen key would insert
    # it with count 0 and silently grow the model on every query.
    count = trigram_counts.get(trigram, 0)
    context = (trigram[0], trigram[1])
    context_count = bigram_counts.get(context, 0)

    if smoothing == 'add_one':
        return add_one_smoothing(count, context_count, vocab_size)
    elif smoothing == 'add_k':
        return add_k_smoothing(count, context_count, vocab_size, k)
    elif smoothing == 'token_type':
        return token_type_smoothing(count, context_count, vocab_size)
    else:
        return count / context_count if context_count > 0 else 0

# Calculate quadrigram probabilities
def get_quadrigram_prob(quadrigram, quadrigram_counts, trigram_counts, vocab_size, smoothing='none', k=1):
    """Return P(w4 | w1, w2, w3) under the chosen smoothing scheme.

    Args:
        quadrigram: Tuple ``(w1, w2, w3, w4)``.
        quadrigram_counts: Mapping from 4-token tuples to counts.
        trigram_counts: Mapping from trigram tuples to counts; supplies
            the count of the context ``(w1, w2, w3)``.
        vocab_size: Vocabulary size V used by the smoothing formulas.
        smoothing: 'add_one', 'add_k' or 'token_type'; any other value
            selects the unsmoothed relative frequency.
        k: Pseudo-count, used only when ``smoothing == 'add_k'``.

    Returns:
        The estimated probability. In the unsmoothed case the result is
        0 when the context count is not positive.
    """
    # Use .get() rather than indexing: when the tables are defaultdicts
    # (as built by build_ngram_models), indexing an unseen key would insert
    # it with count 0 and silently grow the model on every query.
    count = quadrigram_counts.get(quadrigram, 0)
    context = (quadrigram[0], quadrigram[1], quadrigram[2])
    context_count = trigram_counts.get(context, 0)

    if smoothing == 'add_one':
        return add_one_smoothing(count, context_count, vocab_size)
    elif smoothing == 'add_k':
        return add_k_smoothing(count, context_count, vocab_size, k)
    elif smoothing == 'token_type':
        return token_type_smoothing(count, context_count, vocab_size)
    else:
        return count / context_count if context_count > 0 else 0

def save_model_to_file(model_counts, filename):
    """Write n-gram counts to a UTF-8 text file, one entry per line.

    Each line has the form ``<ngram>\\t<count>``. Tuple keys are written
    as their elements joined by single spaces; any other key is written
    via ``str()``. Entries appear in the mapping's iteration order.

    Args:
        model_counts: Mapping from n-gram (tuple or single token) to count.
        filename: Path of the file to create or overwrite.
    """
    def render(key):
        # Tuples are n-grams of order >= 2; anything else is a lone token.
        return ' '.join(key) if isinstance(key, tuple) else str(key)

    with open(filename, 'w', encoding='utf-8') as out:
        out.writelines(
            f"{render(key)}\t{freq}\n" for key, freq in model_counts.items()
        )

# Save smoothing results
def save_smoothing_results(bigram_counts, unigram_counts, vocab_size, k=0.3,
                           output_path='smoothing_output.txt'):
    """Write a table of smoothed bigram probabilities to a text file.

    The file starts with a short header listing the three formulas, then
    one tab-separated row per bigram with its count, its context count
    and the add-one, add-k and token-type estimates (six decimals each).

    Args:
        bigram_counts: Mapping from ``(w1, w2)`` tuples to counts.
        unigram_counts: Mapping from tokens to counts; supplies count(w1).
        vocab_size: Vocabulary size V used by the smoothing formulas.
        k: Pseudo-count for add-k smoothing (default 0.3).
        output_path: File to create or overwrite
            (default 'smoothing_output.txt').
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("Bigram Language Model Smoothing Results\n")
        f.write("Formulas:\n")
        f.write("1. Add-One: P(w2|w1) = (count(w1,w2) + 1) / (count(w1) + V)\n")
        f.write(f"2. Add-K: P(w2|w1) = (count(w1,w2) + {k}) / (count(w1) + {k}*V)\n")
        # "\u00b2" is the superscript two; written as an escape so the
        # header cannot be mangled into mojibake by a re-encoded source file.
        f.write("3. Token Type: P(w2|w1) = (count(w1,w2) + V) / (count(w1) + V\u00b2)\n\n")
        f.write("Bigram\tCount\tContextCount\tAddOne\tAddK\tTokenType\n")

        for bigram, count in bigram_counts.items():
            # .get() avoids inserting missing contexts if unigram_counts
            # happens to be a defaultdict.
            # NOTE(review): build_ngram_models counts only sentence tokens
            # as unigrams, so rows whose context is '<s>' get a context
            # count of 0 - confirm whether that is intended.
            context_count = unigram_counts.get(bigram[0], 0)

            p_add_one = add_one_smoothing(count, context_count, vocab_size)
            p_add_k = add_k_smoothing(count, context_count, vocab_size, k)
            p_token_type = token_type_smoothing(count, context_count, vocab_size)

            f.write(f"{bigram}\t{count}\t{context_count}\t{p_add_one:.6f}\t{p_add_k:.6f}\t{p_token_type:.6f}\n")

def main():
    """Build all n-gram models from the tokenized corpus and save them.

    Reads 'tokenized_bengali.txt', builds unigram through quadrigram
    counts, prints summary statistics, writes each count table to its own
    text file and finally writes the bigram smoothing table.
    """
    filepath = 'tokenized_bengali.txt'

    corpus = load_corpus(filepath)
    print(f"Loaded {len(corpus)} sentences")

    models = build_ngram_models(corpus)
    unigram_counts, bigram_counts, trigram_counts, quadrigram_counts = models

    # Vocabulary = distinct sentence tokens; the '<s>' / '</s>' markers
    # are not part of the corpus sentences and so are not included.
    vocab_size = len({token for sentence in corpus for token in sentence})

    summary_lines = (
        "Built all n-gram models",
        f"   Vocabulary size: {vocab_size}",
        f"   Unigrams: {len(unigram_counts)}",
        f"   Bigrams: {len(bigram_counts)}",
        f"   Trigrams: {len(trigram_counts)}",
        f"   Quadrigrams: {len(quadrigram_counts)}",
    )
    for line in summary_lines:
        print(line)

    outputs = (
        (unigram_counts, "unigram_model.txt"),
        (bigram_counts, "bigram_model.txt"),
        (trigram_counts, "trigram_model.txt"),
        (quadrigram_counts, "quadrigram_model.txt"),
    )
    for counts, target in outputs:
        save_model_to_file(counts, target)
    print("All models saved to files")

    save_smoothing_results(bigram_counts, unigram_counts, vocab_size)
    print("Smoothing results saved to 'smoothing_output.txt'")

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Loaded 1914 sentences
Built all n-gram models
   Vocabulary size: 8114
   Unigrams: 8114
   Bigrams: 21574
   Trigrams: 23163
   Quadrigrams: 21914
All models saved to files
Smoothing results saved to 'smoothing_output.txt'
