In [6]:
import sys
from collections import defaultdict,Counter
import math
import random
import os
import os.path
from nltk import ngrams


def corpus_reader(corpusfile, lexicon=None): 
    with open(corpusfile,'r') as corpus: 
        for line in corpus: 
            if line.strip():
                sequence = line.lower().strip().split()
                if lexicon: 
                    yield [word if word in lexicon else "UNK" for word in sequence]
                else: 
                    yield sequence

def get_lexicon(corpus):
    word_counts = defaultdict(int)
    for sentence in corpus:
        for word in sentence: 
            word_counts[word] += 1
    return set(word for word in word_counts if word_counts[word] > 1)  



def get_ngrams(sequence, n):
    """
    COMPLETE THIS FUNCTION (PART 1)
    Given a sequence, this function should return a list of n-grams, where each n-gram is a Python tuple.
    This should work for arbitrary values of n >= 1 
    """
    listofNgrams = ngrams(sequence.split(), n)
    return listofNgrams


class TrigramModel(object):
    
    def __init__(self, corpusfile):
    
        # Iterate through the corpus once to build a lexicon 
        generator = corpus_reader(corpusfile)
        self.lexicon = get_lexicon(generator)
        self.lexicon.add("UNK")
        self.lexicon.add("START")
        self.lexicon.add("STOP")
    
        # Now iterate through the corpus again and count ngrams
        generator = corpus_reader(corpusfile, self.lexicon)
        self.count_ngrams(generator)


    def count_ngrams(self, corpus):
        """
        COMPLETE THIS METHOD (PART 2)
        Given a corpus iterator, populate dictionaries of unigram, bigram,
        and trigram counts. 
        """
   
        self.unigramcounts = {} # might want to use defaultdict or Counter instead
        self.bigramcounts = {} 
        self.trigramcounts = {} 

        
        ##Your code here
        gram1 = []
        gram2 = []
        gram3 = []
        
        for sentence in corpus:
            gram1.append(ngrams(sequence.split(), 1))
            gram2.append(ngrams(sequence.split(), 2))
            gram3.append(ngrams(sequence.split(), 3))
            
        self.unigramcounts = Counter(gram1).values()
        self.bigramcounts = Counter(gram2).values()
        self.trigramcounts = Counter(gram3).values()
        return 

    def raw_trigram_probability(self,trigram):
        """
        COMPLETE THIS METHOD (PART 3)
        Returns the raw (unsmoothed) trigram probability
        """
        numerator = self.trigramcounts[trigram]
        denominator = sum(self.trigramcounts.values())

        if denominator != 0:
            probability = numerator / denominator
        else:
            probability = 0.0

        return probability

    def raw_bigram_probability(self, bigram):
        """
        COMPLETE THIS METHOD (PART 3)
        Returns the raw (unsmoothed) bigram probability
        """
        numerator = self.bigramcounts[bigram]
        denominator = sum(self.bigramcounts.values())

        if denominator != 0:
            probability = numerator / denominator
        else:
            probability = 0.0

        return probability
    
    def raw_unigram_probability(self, unigram):
        """
        COMPLETE THIS METHOD (PART 3)
        Returns the raw (unsmoothed) unigram probability.
        """

        #hint: recomputing the denominator every time the method is called
        # can be slow! You might want to compute the total number of words once, 
        # store in the TrigramModel instance, and then re-use it.  
        numerator = self.unigramcounts[unigram]
        denominator = sum(self.unigramcounts.values())

        if denominator != 0:
            probability = numerator / denominator
        else:
            probability = 0.0

        return probability

    def generate_sentence(self,t=20): 
        """
        COMPLETE THIS METHOD (OPTIONAL)
        Generate a random sentence from the trigram model. t specifies the
        max length, but the sentence may be shorter if STOP is reached.
        """
        return result            

    def smoothed_trigram_probability(self, trigram):
        """
        COMPLETE THIS METHOD (PART 4)
        Returns the smoothed trigram probability (using linear interpolation). 
        """
        lambda1 = 1/3.0
        lambda2 = 1/3.0
        lambda3 = 1/3.0

        unigram_prob = self.raw_unigram_probability(trigram[2])
        bigram_prob = self.raw_bigram_probability(trigram[1:])
        trigram_prob = self.raw_trigram_probability(trigram)

        smoothed_prob = lambda1 * unigram_prob + lambda2 * bigram_prob + lambda3 * trigram_prob

        return smoothed_prob
        
    def sentence_logprob(self, sentence):
        """
        COMPLETE THIS METHOD (PART 5)
        Returns the log probability of an entire sequence.
        """
        trigrams = get_ngrams(sentence, 3)
        logprob = 0.0

        for gram3 in trigrams:
            smoothed_prob = self.smoothed_trigram_probability(gram3)
            logprob = logprob + math.log2(smoothed_prob) if smoothed_prob > 0 else 0

        return logprob

    def perplexity(self, corpus):
        """
        COMPLETE THIS METHOD (PART 6) 
        Returns the log probability of an entire sequence.
        """
        logprob_sum = 0.0
        total_words = 0

        for sentence in corpus:
            logprob_sum = logprob_sum + self.sentence_logprob(sentence)
            total_words = total_words + len(sentence)

        perplexity = math.exp(-logprob_sum / total_words) if logprob_sum != float("-inf") else float("inf")

        return perplexity


def essay_scoring_experiment(training_file1, training_file2, testdir1, testdir2):

    model1 = TrigramModel(training_file1)
    model2 = TrigramModel(training_file2)

    total = 0
    correct = 0       

    for f in os.listdir(testdir1):
        pp1 = model1.perplexity(corpus_reader(os.path.join(testdir1, f), model1.lexicon))
        pp2 = model2.perplexity(corpus_reader(os.path.join(testdir1, f), model2.lexicon))
        if pp1 < pp2:
            correct += 1
        total += 1

    for f in os.listdir(testdir2):
        pp1 = model1.perplexity(corpus_reader(os.path.join(testdir2, f), model1.lexicon))
        pp2 = model2.perplexity(corpus_reader(os.path.join(testdir2, f), model2.lexicon))
        if pp1 > pp2:
            correct += 1
        total += 1

    return correct / total

if __name__ == "__main__":

    model = TrigramModel(sys.argv[1]) 

    # put test code here...
    # or run the script from the command line with 
    # $ python -i trigram_model.py [corpus_file]
    # >>> 
    #
    # you can then call methods on the model instance in the interactive 
    # Python prompt. 

    
    #Testing perplexity: 
    dev_corpus = corpus_reader(sys.argv[2], model.lexicon)
    pp = model.perplexity(dev_corpus)
    print(pp)
    
    # Training data perplexity:
    dev_corpus = corpus_reader(sys.argv[1], model.lexicon)
    pp = model.perplexity(dev_corpus)
    print(pp)


    #Essay scoring experiment: 
    acc = essay_scoring_experiment('train_high.txt', 'train_low.txt", "test_high", "test_low")
    print(acc)

SyntaxError: EOL while scanning string literal (1830787909.py, line 236)