Apply these steps on a small regional language dataset
1. Implement & analyze word level analysis using BiGram & Tri-gram models
2. Evaluate the performance of the models in terms of accuracy & coherence

In [10]:
from nltk.util import ngrams
from collections import Counter
from nltk.tokenize import word_tokenize
import math
import random

# Step 1: Download required NLTK resources (if you haven't already)
# nltk.download('punkt')

# Sample Marathi text
text = "नैसर्गिक भाषा प्रक्रिया संगणकांना मानवी भाषा समजावून देण्याची क्षमता देते."

# Step 2: Tokenize the text (Word Tokenization)
tokens = word_tokenize(text.lower())  # Convert to lowercase for consistency

# Step 3: Generate Bi-grams and Tri-grams
bigrams = ngrams(tokens, 2)
trigrams = ngrams(tokens, 3)

# Step 4: Count Bi-grams and Tri-grams
bigram_counts = Counter(bigrams)
trigram_counts = Counter(trigrams)

# Step 5: Additive Smoothing for Probabilities
def bigram_prob_smooth(bigram, bigram_counts, vocab_size):
    # Laplace smoothing: P(w2|w1) = (Count(w1, w2) + 1) / (Count(w1) + vocab_size)
    return (bigram_counts[bigram] + 1) / (sum(bigram_counts.values()) + vocab_size)

def trigram_prob_smooth(trigram, trigram_counts, bigram_counts, vocab_size):
    # Laplace smoothing for trigram: P(w3|w1,w2) = (Count(w1,w2,w3) + 1) / (Count(w1,w2) + vocab_size)
    return (trigram_counts[trigram] + 1) / (bigram_counts[(trigram[0], trigram[1])] + vocab_size)

# Vocabulary size (total unique words)
vocab_size = len(set(tokens))

# Step 6: Generate a Random Sentence from Tri-gram Model with smoothing
def generate_sentence_smooth(trigram_model, bigram_model, trigram_counts, bigram_counts, vocab_size):
    sentence = [tokens[0]]  # Start with the first word
    while len(sentence) < 10:  # Limit to 10 words for simplicity
        next_word_candidates = [
            trigram for trigram in trigram_counts if trigram[0] == sentence[-1]
        ]
        if next_word_candidates:
            next_word = random.choice(next_word_candidates)[2]
            sentence.append(next_word)
        else:
            break
    return ' '.join(sentence)

# Generate a sentence from trigram model with smoothing
generated_sentence = generate_sentence_smooth(trigrams, bigrams, trigram_counts, bigram_counts, vocab_size)
print(f"Generated Sentence (with smoothing): {generated_sentence}")

# Step 7: Calculate Perplexity with Smoothing for Bi-gram and Tri-gram models
def calculate_perplexity_smooth(ngrams, ngram_counts, ngram_prob_func, vocab_size):
    log_likelihood = 0
    total_ngrams = sum(ngram_counts.values())  # Total count of all n-grams

    for ngram in ngrams:
        try:
            log_likelihood += math.log2(ngram_prob_func(ngram, ngram_counts, vocab_size))  # Log probability for each n-gram
        except KeyError:
            # If n-gram doesn't exist in the model, handle unseen n-grams
            log_likelihood += math.log2(1 / (total_ngrams + vocab_size))  # Assume unseen n-grams have a small probability
    perplexity = 2 ** (-log_likelihood / total_ngrams)  # Calculate perplexity
    return perplexity

# Calculate Perplexity for Bi-gram and Tri-gram models with smoothing
bigram_perplexity = calculate_perplexity_smooth(bigrams, bigram_counts, bigram_prob_smooth, vocab_size)
trigram_perplexity = calculate_perplexity_smooth(trigrams, trigram_counts, trigram_prob_smooth, vocab_size)

print(f"Perplexity for Bi-gram Model (with smoothing): {bigram_perplexity}")
print(f"Perplexity for Tri-gram Model (with smoothing): {trigram_perplexity}")

Generated Sentence (with smoothing): नैसर्गिक प्रक्रिया मानवी समजावून क्षमता .
Perplexity for Bi-gram Model (with smoothing): 1.0
Perplexity for Tri-gram Model (with smoothing): 1.0
