In [None]:
import re
from collections import defaultdict, Counter
import math

class NgramLanguageModel:
    """Count-based n-gram language model with ``<UNK>`` handling.

    Each line of a corpus is lower-cased, stripped of every character that
    is not an ASCII letter, digit or whitespace, and split on whitespace.
    Lines are padded with ``n - 1`` ``<START>`` tokens and terminated with a
    single ``<STOP>`` token before n-grams are counted.

    Args:
        n: Order of the model (1 = unigram, 2 = bigram, ...). Must be >= 1.
        smoothing: Add-k smoothing constant. ``0.0`` (the default) gives the
            plain maximum-likelihood estimate, under which any unseen
            n-gram has probability 0 and makes the perplexity infinite.
        min_count: Words seen fewer than this many times in a ``train``
            call are replaced by ``<UNK>``.

    Attributes:
        model: Maps a context tuple (length ``n - 1``) to a ``Counter`` of
            the words that followed it.
        context_totals: Maps a context tuple to the total number of words
            observed after it (kept in step with ``model`` by ``train``).
        vocab: Set of known words, including ``<UNK>`` once any rare word
            has been seen.

    Raises:
        ValueError: If ``n < 1`` or ``smoothing < 0``.
    """

    START_TOKEN = '<START>'
    STOP_TOKEN = '<STOP>'
    UNK_TOKEN = '<UNK>'

    # Compiled once: matches everything except ASCII alphanumerics/whitespace.
    _STRIP_PATTERN = re.compile(r'[^a-zA-Z0-9\s]')

    def __init__(self, n, smoothing=0.0, min_count=3):
        if n < 1:
            raise ValueError(f"n must be >= 1, got {n}")
        if smoothing < 0:
            raise ValueError(f"smoothing must be >= 0, got {smoothing}")
        self.n = n
        self.smoothing = smoothing
        self.min_count = min_count
        self.model = defaultdict(Counter)
        self.context_totals = Counter()
        self.vocab = set()

    def preprocess(self, text):
        """Return the lower-cased alphanumeric tokens of ``text``."""
        return self._STRIP_PATTERN.sub('', text).lower().split()

    def _pad_and_map(self, tokens):
        """Map out-of-vocabulary tokens to ``<UNK>`` and add boundary tokens."""
        mapped = [tok if tok in self.vocab else self.UNK_TOKEN for tok in tokens]
        return [self.START_TOKEN] * (self.n - 1) + mapped + [self.STOP_TOKEN]

    def _ngrams(self, tokens):
        """Yield ``(context, word)`` for every n-gram in a padded token list."""
        width = self.n - 1
        for end in range(width, len(tokens)):
            yield tuple(tokens[end - width:end]), tokens[end]

    def _num_outcomes(self):
        """Return how many distinct tokens the model can predict.

        That is the vocabulary plus ``<STOP>``, plus ``<UNK>`` when it is
        not already in the vocabulary. ``<START>`` is never predicted.
        """
        extra = 1 if self.UNK_TOKEN in self.vocab else 2
        return len(self.vocab) + extra

    def train(self, corpus):
        """Count n-grams over ``corpus``, an iterable of text lines.

        Counts are added to whatever the model already holds, so calling
        ``train`` repeatedly accumulates; the rare-word decision is based on
        the word frequencies of the current call only.
        """
        tokenized_lines = [self.preprocess(line) for line in corpus]

        word_counts = Counter()
        for tokens in tokenized_lines:
            word_counts.update(tokens)

        # Rare words are not stored individually; they collapse into <UNK>.
        for word, count in word_counts.items():
            self.vocab.add(word if count >= self.min_count else self.UNK_TOKEN)

        for tokens in tokenized_lines:
            for context, word in self._ngrams(self._pad_and_map(tokens)):
                self.model[context][word] += 1
                self.context_totals[context] += 1

    def get_ngram_prob(self, context, word):
        """Return P(word | context).

        Args:
            context: Tuple of the ``n - 1`` preceding tokens.
            word: The token whose conditional probability is wanted.

        Returns:
            The add-k smoothed estimate when ``smoothing > 0``; otherwise
            the relative frequency, or ``0.0`` for an unseen context.
        """
        # .get() rather than indexing: indexing a defaultdict would insert
        # an empty Counter for every unseen context that is queried.
        counts = self.model.get(context)
        word_count = counts[word] if counts is not None else 0

        # Cached total avoids re-summing the whole Counter on every call;
        # fall back to summing if ``model`` was filled without ``train``.
        context_count = self.context_totals.get(context)
        if context_count is None:
            context_count = sum(counts.values()) if counts else 0

        k = self.smoothing
        if k > 0:
            return (word_count + k) / (context_count + k * self._num_outcomes())
        return word_count / context_count if context_count > 0 else 0.0

    def calculate_perplexity(self, corpus):
        """Return the per-token perplexity of ``corpus`` under the model.

        Every word and the ``<STOP>`` token of each line counts as one
        token; the ``<START>`` padding does not.

        Returns:
            ``exp`` of the negative mean log-probability, or ``inf`` as soon
            as any n-gram has probability 0.

        Raises:
            ZeroDivisionError: If ``corpus`` contains no lines.
        """
        total_log_prob = 0.0
        total_tokens = 0

        for line in corpus:
            tokens = self._pad_and_map(self.preprocess(line))
            total_tokens += len(tokens) - (self.n - 1)

            for context, word in self._ngrams(tokens):
                prob = self.get_ngram_prob(context, word)
                if prob <= 0:
                    return float('inf')
                total_log_prob += math.log(prob)

        return math.exp(-total_log_prob / total_tokens)

# Read data files
def load_corpus(file_path):
    """Return the lines of the text file at ``file_path`` as a list.

    The file is decoded as UTF-8 and undecodable bytes are dropped rather
    than raising. Each returned line keeps its trailing newline.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        lines = list(handle)
    return lines

if __name__ == "__main__":
    # Locations of the three benchmark splits.
    train_file, dev_file, test_file = (
        '1b_benchmark.train.tokens',
        '1b_benchmark.dev.tokens',
        '1b_benchmark.test.tokens',
    )

    # Read every split once up front; all model orders reuse the same lines.
    train_corpus = load_corpus(train_file)
    dev_corpus = load_corpus(dev_file)
    test_corpus = load_corpus(test_file)

    # Fit one model per order (unigram, bigram, trigram) on the training
    # split, score all three splits, then report the results.
    for n in range(1, 4):
        print(f"Training {n}-gram model...")
        model = NgramLanguageModel(n)
        model.train(train_corpus)

        train_perplexity, dev_perplexity, test_perplexity = [
            model.calculate_perplexity(split)
            for split in (train_corpus, dev_corpus, test_corpus)
        ]

        print(f"{n}-gram Training Perplexity: {train_perplexity}")
        print(f"{n}-gram Development Perplexity: {dev_perplexity}")
        print(f"{n}-gram Test Perplexity: {test_perplexity}")


Training 1-gram model...
1-gram Training Perplexity: 1129.0007325805495
1-gram Development Perplexity: 1038.4668779311676
1-gram Test Perplexity: 1037.8079658855668
Training 2-gram model...
2-gram Training Perplexity: 98.74207175896068
2-gram Development Perplexity: inf
2-gram Test Perplexity: inf
Training 3-gram model...
3-gram Training Perplexity: 8.058281666260303
3-gram Development Perplexity: inf
3-gram Test Perplexity: inf
