In [1]:
import pickle
import json
import numpy as np
import random
from collections import Counter, defaultdict
from math import log, log10
import matplotlib.pyplot as plt
import pandas as pd
from itertools import combinations
import warnings
warnings.filterwarnings('ignore')

# Load tokenized data from Assignment 1
print("Loading tokenized data from Assignment 1...")
with open('../lab1/gujarati_corpus_tokenized.pkl', 'rb') as f:
    tokenized_data = pickle.load(f)
    
print(f"Loaded {len(tokenized_data)} documents")

Loading tokenized data from Assignment 1...
Loaded 500 documents


In [2]:
# Import and use the Gujarati tokenizer
import sys
sys.path.append('.')  # Add current directory to path
from gujarati_tokenizer import GujaratiTokenizer

# Initialize the tokenizer
gujarati_tokenizer = GujaratiTokenizer()

# Extract all sentences and words for building language models
all_sentences = []
all_words = []
gujarati_sentences = []  # Only sentences with Gujarati words

print("Processing tokenized data with Gujarati tokenizer...")

for doc in tokenized_data:
    for sentence in doc['sentences']:
        words = sentence['words']
        classified_words = sentence['classified_words']
        
        # Re-tokenize and classify using the Gujarati tokenizer for better accuracy
        sentence_text = sentence['text']
        retokenized_words = gujarati_tokenizer.word_tokenize(sentence_text)
        reclassified_words = [(word, gujarati_tokenizer.classify_token(word)) for word in retokenized_words]
        
        # Filter out punctuation and keep meaningful tokens
        meaningful_words = []
        has_gujarati = False
        
        for word, word_type in reclassified_words:
            if word_type in ['gujarati_word', 'english_word', 'integer', 'decimal_number']:
                meaningful_words.append(word.lower())  # Convert to lowercase for consistency
                if word_type == 'gujarati_word':
                    has_gujarati = True
        
        if len(meaningful_words) >= 3:  # Only consider sentences with at least 3 meaningful words
            all_sentences.append(meaningful_words)
            all_words.extend(meaningful_words)
            
            if has_gujarati:  # Keep sentences that have at least one Gujarati word
                gujarati_sentences.append(meaningful_words)

print(f"Total sentences: {len(all_sentences)}")
print(f"Gujarati sentences: {len(gujarati_sentences)}")
print(f"Total words: {len(all_words)}")
print(f"Unique words: {len(set(all_words))}")

# Display sample sentences
print("\nSample sentences:")
for i, sentence in enumerate(gujarati_sentences[:5]):
    print(f"{i+1}. {' '.join(sentence[:10])}...")

# Show token type distribution
print("\nToken type distribution in sample sentences:")
token_type_counts = {}
sample_size = min(1000, len(gujarati_sentences))

for sentence_words in gujarati_sentences[:sample_size]:
    # Join words back to text and re-tokenize to get types
    sentence_text = ' '.join(sentence_words)
    words = gujarati_tokenizer.word_tokenize(sentence_text)
    for word in words:
        word_type = gujarati_tokenizer.classify_token(word)
        if word_type in ['gujarati_word', 'english_word', 'integer', 'decimal_number']:
            token_type_counts[word_type] = token_type_counts.get(word_type, 0) + 1

print("Token types found:")
for token_type, count in sorted(token_type_counts.items()):
    print(f"  {token_type}: {count}")

Processing tokenized data with Gujarati tokenizer...
Total sentences: 1504
Gujarati sentences: 1504
Total words: 21965
Unique words: 8418

Sample sentences:
1. આ વીડિયો જુઓ ઊંઝા માર્કેટયાર્ડ આજથી 25 જુલાઈ સુધી બંધ...
2. મિથેનોલ આવ્યો ક્યાંથી...
3. આખરે ત્રણ રાજ્યોમાં મળેલ હાર પર કોંગ્રેસ અધ્યક્ષ રાહુલ ગાંધી...
4. તેમણે કહ્યું કે ત્રિપુરા નાગાલેન્ડ અને મેઘાલયમાં લોકોના જનાદેશનો સ્વાગત...
5. આ આંકડો માટે અને વજન ઘટાડવા માટે પ્રકાશનનો દિવસ વિતાવવો...

Token type distribution in sample sentences:
Token types found:
  english_word: 56
  gujarati_word: 13939
  integer: 258
Token types found:
  english_word: 56
  gujarati_word: 13939
  integer: 258


## 1. N-gram Language Models

We'll implement four different n-gram models:

In [3]:
class NGramLanguageModel:
    def __init__(self, n=1):
        self.n = n
        self.ngram_counts = Counter()
        self.context_counts = Counter()  # (n-1)-gram counts for conditional probability
        self.vocabulary = set()
        self.vocab_size = 0
        
    def add_sentence_markers(self, sentence):
        """Add start and end markers to sentence"""
        return ['<s>'] * (self.n - 1) + sentence + ['</s>']
    
    def get_ngrams(self, sentence):
        """Extract n-grams from a sentence"""
        marked_sentence = self.add_sentence_markers(sentence)
        ngrams = []
        
        for i in range(len(marked_sentence) - self.n + 1):
            ngram = tuple(marked_sentence[i:i + self.n])
            ngrams.append(ngram)
            
        return ngrams
    
    def train(self, sentences):
        """Train the n-gram model on a list of sentences"""
        print(f"Training {self.n}-gram model on {len(sentences)} sentences...")
        
        # Build vocabulary
        for sentence in sentences:
            self.vocabulary.update(sentence)
        
        # Add special tokens
        self.vocabulary.add('<s>')
        self.vocabulary.add('</s>')
        self.vocab_size = len(self.vocabulary)
        
        # Count n-grams and contexts
        for sentence in sentences:
            ngrams = self.get_ngrams(sentence)
            
            for ngram in ngrams:
                self.ngram_counts[ngram] += 1
                
                if self.n > 1:
                    context = ngram[:-1]  # (n-1)-gram context
                    self.context_counts[context] += 1
        
        print(f"Vocabulary size: {self.vocab_size}")
        print(f"Total {self.n}-grams: {sum(self.ngram_counts.values())}")
        print(f"Unique {self.n}-grams: {len(self.ngram_counts)}")
        
    def probability(self, ngram, smoothing='none', k=1):
        """Calculate probability of an n-gram with optional smoothing"""
        if isinstance(ngram, list):
            ngram = tuple(ngram)
            
        if self.n == 1:
            # Unigram probability
            count = self.ngram_counts[ngram]
            total = sum(self.ngram_counts.values())
            
            if smoothing == 'add_one':
                return (count + 1) / (total + self.vocab_size)
            elif smoothing == 'add_k':
                return (count + k) / (total + k * self.vocab_size)
            elif smoothing == 'add_token_type':
                unique_types = len(self.ngram_counts)
                return (count + 1) / (total + unique_types)
            else:
                return count / total if total > 0 else 0
        else:
            # Higher-order n-gram probability P(w_n | w_1...w_{n-1})
            context = ngram[:-1]
            count = self.ngram_counts[ngram]
            context_count = self.context_counts[context]
            
            if smoothing == 'add_one':
                return (count + 1) / (context_count + self.vocab_size)
            elif smoothing == 'add_k':
                return (count + k) / (context_count + k * self.vocab_size)
            elif smoothing == 'add_token_type':
                unique_types = len([ng for ng in self.ngram_counts.keys() if ng[:-1] == context])
                if unique_types == 0:
                    unique_types = 1
                return (count + 1) / (context_count + unique_types)
            else:
                return count / context_count if context_count > 0 else 0
    
    def sentence_probability(self, sentence, smoothing='none', k=1):
        """Calculate probability of a sentence"""
        ngrams = self.get_ngrams(sentence)
        log_prob = 0
        
        for ngram in ngrams:
            prob = self.probability(ngram, smoothing, k)
            if prob > 0:
                log_prob += log10(prob)
            else:
                return float('-inf')  # Zero probability
                
        return log_prob
    
    def perplexity(self, test_sentences, smoothing='none', k=1):
        """Calculate perplexity on test sentences"""
        total_log_prob = 0
        total_words = 0
        
        for sentence in test_sentences:
            log_prob = self.sentence_probability(sentence, smoothing, k)
            if log_prob != float('-inf'):
                total_log_prob += log_prob
                total_words += len(sentence) + 1  # +1 for </s> token
        
        if total_words == 0:
            return float('inf')
            
        avg_log_prob = total_log_prob / total_words
        return 10 ** (-avg_log_prob)

In [None]:
# Train all four n-gram models
models = {}
n_values = [1, 2, 3, 4]
model_names = ['Unigram', 'Bigram', 'Trigram', 'Quadrigram']

print("Training n-gram models...\n")

for n, name in zip(n_values, model_names):
    print(f"\n{'-'*50}")
    print(f"Training {name} Model (n={n})")
    print(f"{'-'*50}")
    
    model = NGramLanguageModel(n=n)
    model.train(gujarati_sentences)
    models[name] = model
    
    # Show some example n-grams
    print(f"\nMost common {name.lower()}s:")
    for ngram, count in model.ngram_counts.most_common(10):
        print(f"  {' '.join(ngram)}: {count}")

print("\n" + "="*60)
print("All models trained successfully!")
print("="*60)

Training n-gram models...


--------------------------------------------------
Training Unigram Model (n=1)
--------------------------------------------------
Training 1-gram model on 1504 sentences...
Vocabulary size: 8420
Total 1-grams: 23469
Unique 1-grams: 8419

Most common unigrams:
  </s>: 1504
  છે: 937
  અને: 445
  આ: 276
  પણ: 220
  કે: 213
  માટે: 206
  કરી: 142
  પર: 131
  જ: 127

--------------------------------------------------
Training Bigram Model (n=2)
--------------------------------------------------
Training 2-gram model on 1504 sentences...
Vocabulary size: 8420
Total 2-grams: 23469
Unique 2-grams: 19100

Most common bigrams:
  છે </s>: 607
  <s> આ: 125
  હતી </s>: 85
  હતો </s>: 62
  છે અને: 59
  છે કે: 59
  હતા </s>: 51
  શકે છે: 48
  કરે છે: 45
  હતું </s>: 38

--------------------------------------------------
Training Trigram Model (n=3)
--------------------------------------------------
Training 3-gram model on 1504 sentences...
Vocabulary size: 8420
Total 3-

## 2. Smoothing Techniques

Now let's test different smoothing techniques:

In [5]:
# Test smoothing techniques with sample sentences
test_sentences = [
    ['આજે', 'હું', 'શાળાએ', 'ગયો'],
    ['મારું', 'નામ', 'અર્જુન', 'છે'],
    ['ગુજરાત', 'એક', 'સુંદર', 'રાજ્ય', 'છે']
]

smoothing_techniques = {
    'No Smoothing': ('none', 1),
    'Add-One Smoothing': ('add_one', 1),
    'Add-K Smoothing (k=0.5)': ('add_k', 0.5),
    'Add Token Type Smoothing': ('add_token_type', 1)
}

print("Testing Smoothing Techniques")
print("="*80)

for sentence in test_sentences:
    print(f"\nSentence: {' '.join(sentence)}")
    print("-" * 60)
    
    for model_name, model in models.items():
        print(f"\n{model_name} Model:")
        
        for smooth_name, (smooth_type, k) in smoothing_techniques.items():
            prob = model.sentence_probability(sentence, smoothing=smooth_type, k=k)
            if prob == float('-inf'):
                print(f"  {smooth_name:25}: -∞ (zero probability)")
            else:
                print(f"  {smooth_name:25}: {prob:.6f}")

Testing Smoothing Techniques

Sentence: આજે હું શાળાએ ગયો
------------------------------------------------------------

Unigram Model:
  No Smoothing             : -∞ (zero probability)
  Add-One Smoothing        : -15.678855
  Add-K Smoothing (k=0.5)  : -15.717665
  Add Token Type Smoothing : -15.678787

Bigram Model:
  No Smoothing             : -∞ (zero probability)
  Add-One Smoothing        : -18.320454
  Add-K Smoothing (k=0.5)  : -17.925823
  Add Token Type Smoothing : -6.278849

Trigram Model:
  No Smoothing             : -∞ (zero probability)
  Add-One Smoothing        : -18.619321
  Add-K Smoothing (k=0.5)  : -18.398624
  Add Token Type Smoothing : -3.647774

Quadrigram Model:
  No Smoothing             : -∞ (zero probability)
  Add-One Smoothing        : -18.619321
  Add-K Smoothing (k=0.5)  : -18.398624
  Add Token Type Smoothing : -3.647774

Sentence: મારું નામ અર્જુન છે
------------------------------------------------------------

Unigram Model:
  No Smoothing            

## 3. Generate Test Sentences and Compute Probabilities

Since we don't have access to external news articles, we'll create test sentences by:
1. Sampling random sentences from our corpus
2. Creating variations of existing sentences
3. Generating some out-of-vocabulary test cases

In [6]:
# Generate test sentences
def generate_test_sentences(sentences, num_sentences=1000):
    """Generate test sentences for evaluation"""
    test_sentences = []
    
    # 1. Random sampling from corpus (70%)
    random_sample = random.sample(sentences, min(700, len(sentences)))
    test_sentences.extend(random_sample)
    
    # 2. Sentence variations - shuffle words (20%)
    variations = []
    for _ in range(200):
        original = random.choice(sentences)
        if len(original) > 2:
            shuffled = original.copy()
            random.shuffle(shuffled)
            variations.append(shuffled)
    test_sentences.extend(variations)
    
    # 3. Partial sentences (10%)
    partial_sentences = []
    for _ in range(100):
        original = random.choice(sentences)
        if len(original) > 3:
            # Take random substring
            start = random.randint(0, len(original) - 3)
            end = random.randint(start + 2, len(original))
            partial_sentences.append(original[start:end])
    test_sentences.extend(partial_sentences)
    
    return test_sentences[:num_sentences]

# Generate test sentences
print("Generating test sentences...")
test_sentences = generate_test_sentences(gujarati_sentences, 1000)
print(f"Generated {len(test_sentences)} test sentences")

# Show some examples
print("\nSample test sentences:")
for i, sentence in enumerate(test_sentences[:10]):
    print(f"{i+1:2d}. {' '.join(sentence[:8])}{'...' if len(sentence) > 8 else ''}")

Generating test sentences...
Generated 998 test sentences

Sample test sentences:
 1. ઉલ્લેખનીય છે કે હાલમાં આ વિસ્તારના કાયદા અને...
 2. બાકી તો પાડા પર પાણી રેડવા સમાન ચામડીના...
 3. આમ સરકારી હોસ્પિટલમાં પણ જો યોગ્ય સારવાર ન...
 4. જોર્જ ફ્લોયડની ઘટનાઓ
 5. આપણાં શાસ્ત્રોમાં ધનતેરસ વિશે ઘણીબધી ચર્ચા કરવામાં આવેલી...
 6. જો તમને ખાતરી નથી કે તમારી લોન એસબીએ...
 7. યુનાઇટેડ સ્ટેટ્સ બ્યુરો ઓફ લેબર સ્ટેટિસ્ટિક્સ અનુસાર 2008...
 8. આવી રીતે માછીમારો પોતાના પગ ઉપર જ કુહાડો...
 9. સબસ્ટ્રેટને મદદથી માળ સપાટી અસરકારક કામગીરી દરમિયાન અસર...
10. નવી સેન્ટ્રોમા રિયલ એસી પણ આપવામાં આવ્યુ છે


In [7]:
# Compute probabilities for all test sentences with different models and smoothing
def evaluate_models_on_test_set(models, test_sentences, smoothing_techniques):
    """Evaluate all models on test sentences with different smoothing techniques"""
    
    results = defaultdict(lambda: defaultdict(list))
    
    print(f"Evaluating models on {len(test_sentences)} test sentences...")
    
    total_evaluations = len(models) * len(smoothing_techniques) * len(test_sentences)
    current_eval = 0
    
    for model_name, model in models.items():
        print(f"\nEvaluating {model_name} model...")
        
        for smooth_name, (smooth_type, k) in smoothing_techniques.items():
            sentence_probs = []
            
            for i, sentence in enumerate(test_sentences):
                prob = model.sentence_probability(sentence, smoothing=smooth_type, k=k)
                sentence_probs.append(prob)
                
                current_eval += 1
                if current_eval % 1000 == 0:
                    print(f"  Progress: {current_eval}/{total_evaluations} ({current_eval/total_evaluations*100:.1f}%)")
            
            results[model_name][smooth_name] = sentence_probs
            
            # Calculate statistics
            valid_probs = [p for p in sentence_probs if p != float('-inf')]
            zero_prob_count = len(sentence_probs) - len(valid_probs)
            
            if valid_probs:
                avg_log_prob = np.mean(valid_probs)
                std_log_prob = np.std(valid_probs)
                print(f"    {smooth_name}: Avg log prob = {avg_log_prob:.4f}, Std = {std_log_prob:.4f}, Zero probs = {zero_prob_count}")
            else:
                print(f"    {smooth_name}: All probabilities are zero!")
    
    return results

# Run evaluation
print("Starting comprehensive evaluation...")
results = evaluate_models_on_test_set(models, test_sentences, smoothing_techniques)
print("\nEvaluation completed!")

Starting comprehensive evaluation...
Evaluating models on 998 test sentences...

Evaluating Unigram model...
    No Smoothing: Avg log prob = -48.5552, Std = 32.7218, Zero probs = 0
  Progress: 1000/15968 (6.3%)
    No Smoothing: Avg log prob = -48.5552, Std = 32.7218, Zero probs = 0
  Progress: 1000/15968 (6.3%)
    Add-One Smoothing: Avg log prob = -48.7849, Std = 32.7225, Zero probs = 0
  Progress: 2000/15968 (12.5%)
    Add-One Smoothing: Avg log prob = -48.7849, Std = 32.7225, Zero probs = 0
  Progress: 2000/15968 (12.5%)
    Add-K Smoothing (k=0.5): Avg log prob = -48.6293, Std = 32.6851, Zero probs = 0
  Progress: 3000/15968 (18.8%)
    Add-K Smoothing (k=0.5): Avg log prob = -48.6293, Std = 32.6851, Zero probs = 0
  Progress: 3000/15968 (18.8%)
    Add Token Type Smoothing: Avg log prob = -48.7847, Std = 32.7224, Zero probs = 0

Evaluating Bigram model...
  Progress: 4000/15968 (25.1%)
    No Smoothing: Avg log prob = -12.0746, Std = 7.1155, Zero probs = 288
  Progress: 5000/15