In [1]:
import nltk
import collections
import pprint as pp

class PreprocessReviews():
    """Tokenise review text and add the special tokens used by the n-gram models.

    Every line of the file at ``data_path`` is treated as one review. A review is
    padded with ``n - 1`` ``<s>`` markers, terminated with ``</s>``, has ``.`` and
    ``,`` split off as separate tokens, and is then split on whitespace.
    """

    # Boundary markers inserted by add_start_end(). They are structural, so they
    # are never replaced by '<unk>' no matter how rarely they occur.
    _BOUNDARY_TOKENS = frozenset(('<s>', '</s>'))

    def __init__(self, data_path):
        """Read all lines of ``data_path`` into ``self.data``."""
        with open(data_path, 'r') as f:
            self.data = f.readlines()

    def _tokenize(self, line, n):
        """Return the token list for one raw line, padded for an order-``n`` model."""
        return self.sep_punct(self.add_start_end(line, n)).split()

    def get_counts(self, n):
        """Count every token (boundary markers included) over the whole file.

        Args:
            n: model order; decides how many ``<s>`` markers each line receives.

        Returns:
            collections.Counter mapping token -> number of occurrences.
        """
        token_counts = collections.Counter()
        for line in self.data:
            token_counts.update(self._tokenize(line, n))
        return token_counts

    def sep_punct(self, text):
        """Put a space before every ``.`` and ``,`` and strip outer whitespace."""
        text = text.replace('.', ' .').replace(',', ' ,').strip()
        return text

    def add_start_end(self, text, n):
        """Prefix ``text`` with ``n - 1`` ``'<s> '`` markers and append ``' </s>'``."""
        text = '<s> ' * (n - 1) + text + ' </s>'
        return text

    def preprocess_text(self, n, add_unk, threshold):
        """Tokenise every line and optionally replace rare tokens with ``<unk>``.

        Args:
            n: model order; decides how many ``<s>`` markers each line receives.
            add_unk: when true, tokens seen at most ``threshold`` times in the
                file are replaced by ``'<unk>'``.
            threshold: highest count that is still considered rare.

        Returns:
            A list with one token list per line of the file.
        """
        # Tokenise once and reuse the result for both counting and replacement.
        tokenised = [self._tokenize(line, n) for line in self.data]
        if not add_unk:
            return tokenised

        token_counts = collections.Counter()
        for tokens in tokenised:
            token_counts.update(tokens)

        # Boundary markers are exempt: on a small file (or with a large
        # threshold) their count can fall at or below the threshold, and
        # turning '</s>' into '<unk>' would erase the end-of-review marker.
        return [
            [
                token
                if token in self._BOUNDARY_TOKENS or token_counts[token] > threshold
                else '<unk>'
                for token in tokens
            ]
            for tokens in tokenised
        ]

In [4]:
import pickle as pkl
import os

class Ngram_model():
    """Maximum-likelihood n-gram model for n > 1.

    ``proba_table`` maps each (n-1)-token context tuple to a dict of
    ``{next_word: probability}``. The table can be saved to and loaded from
    ``<output_dir>/proba_table_<n>-gram.pkl``.
    """
    def __init__(self, data_path, load=False, n=2, add_unk=False, threshold=1, output_dir='models'):
        """Estimate the table from ``data_path``, or load a saved one.

        Args:
            data_path: text file handed to PreprocessReviews when estimating.
            load: when true, read a previously saved table instead of estimating.
            n: model order.
            add_unk: forwarded to the preprocessing step.
            threshold: forwarded to the preprocessing step.
            output_dir: directory used by save_probas() and load_probas().
        """
        self.n = n
        self.data_path = data_path
        self.add_unk = add_unk
        self.threshold = threshold
        # Must be set before load_probas() runs: it reads output_dir to find the file.
        self.output_dir = output_dir
        if load:
            self.load_probas()
        else:
            self.proba_table = self.estimate_probas()

    def _table_path(self):
        """Return the pickle path for this model's order inside ``output_dir``."""
        return os.path.join(self.output_dir, f'proba_table_{self.n}-gram.pkl')

    def prep_reviews(self):
        """Return the tokenised training reviews, one token list per line."""
        prep = PreprocessReviews(self.data_path)
        preprocessed = prep.preprocess_text(n=self.n, add_unk=self.add_unk, threshold=self.threshold)
        return preprocessed

    def make_ngrams(self, line):
        """Return all contiguous ``n``-token tuples of ``line``, in order.

        A line shorter than ``n`` tokens yields an empty list.
        """
        return [tuple(line[idx:idx + self.n]) for idx in range(len(line) - self.n + 1)]

    def estimate_probas(self):
        """Build the conditional probability table from the training reviews.

        Returns:
            dict mapping context tuple -> ``{next_word: count / context_total}``.
        """
        context_counts = collections.defaultdict(collections.Counter)
        for line in self.prep_reviews():
            for ngram in self.make_ngrams(line):
                context_counts[ngram[:-1]][ngram[-1]] += 1

        proba_table = {}
        for n_minus1_gram, next_w_counts in context_counts.items():
            # Normaliser computed once per context, not once per successor.
            total = sum(next_w_counts.values())
            proba_table[n_minus1_gram] = {
                next_w: count / total for next_w, count in next_w_counts.items()
            }
        return proba_table

    def save_probas(self):
        """Pickle ``proba_table`` into ``output_dir``, creating the directory if needed."""
        os.makedirs(self.output_dir, exist_ok=True)
        with open(self._table_path(), 'wb') as f:
            pkl.dump(self.proba_table, f)

    def load_probas(self):
        """Replace ``proba_table`` with the table pickled in ``output_dir``."""
        # NOTE(security): unpickling can execute arbitrary code; only load
        # files from a trusted source.
        with open(self._table_path(), 'rb') as f:
            self.proba_table = pkl.load(f)



class Unigram_model(Ngram_model):
    """Unigram variant of Ngram_model.

    Reuses the parent's preprocessing, saving and loading with ``n`` fixed to 1.
    Only estimate_probas() is overridden: here ``proba_table`` maps each token
    directly to its relative frequency instead of mapping contexts to
    next-word distributions.
    """
    def __init__(self, data_path, load=False, add_unk=False, threshold=1, output_dir='models'):
        """Same arguments as Ngram_model, with the order fixed to 1."""
        super().__init__(data_path, load=load, n=1, add_unk=add_unk, threshold=threshold, output_dir=output_dir)

    def estimate_probas(self):
        """Return ``{token: count / total_token_count}`` over the training reviews."""
        # override inherited method
        token_counts = collections.Counter()
        for line in self.prep_reviews():
            # make_ngrams returns 1-tuples here, so uni[0] is the token itself.
            token_counts.update(uni[0] for uni in self.make_ngrams(line))

        # Sum once; recomputing it for every vocabulary entry is quadratic.
        total = sum(token_counts.values())
        return {token: count / total for token, count in token_counts.items()}


In [5]:
import numpy as np

class Review_generator():
    """Sample reviews from a pickled n-gram probability table.

    The table is read from ``<dir_path>/proba_table_<n>-gram.pkl``.
    """
    def __init__(self, n, dir_path='models'):
        """Remember the order and directory, then load the matching table."""
        self.n = n
        self.dir_path = dir_path
        table_file = os.path.join(dir_path, f'proba_table_{n}-gram.pkl')
        # NOTE(security): unpickling can execute arbitrary code; only load
        # files from a trusted source.
        with open(table_file, 'rb') as f:
            self.proba_table = pkl.load(f)

    def sample_from_discrete_distrib(self, distrib):
        """Draw one key of ``distrib`` with probability proportional to its value."""
        candidates, weights = zip(*distrib.items())
        # Renormalise so the weights sum to one before handing them to numpy.
        normalised = np.asarray(weights).astype('float64') / np.sum(weights)
        return np.random.choice(candidates, p=normalised)

    def generate_review(self, max_length=20):
        """Generate a space-joined token sequence.

        The sequence starts with ``n - 1`` ``<s>`` markers (which count towards
        ``max_length``) and grows one sampled token at a time until ``</s>`` is
        drawn or ``max_length`` tokens have been produced.
        """
        tokens = ['<s>'] * (self.n - 1)
        while len(tokens) < max_length:
            if self.n == 1:
                # A unigram table is itself the distribution over tokens.
                distrib = self.proba_table
            else:
                # Condition on the last n-1 tokens generated so far.
                history = tuple(tokens[-(self.n - 1):])
                distrib = self.proba_table[history]
            sampled = self.sample_from_discrete_distrib(distrib)
            tokens.append(sampled)
            if sampled == '</s>':
                break
        return ' '.join(tokens)

    

In [6]:
class Perplexity():
    """Score reviews with saved n-gram tables, backing off to lower orders.

    ``self.models`` holds the loaded tables from the highest order down to the
    unigram table (always last). ``self.models[k]`` is the table whose contexts
    are one token shorter than those of ``self.models[k - 1]``; stupid_backoff()
    relies on that layout.
    """
    def __init__(self, dir_path='models', max_order=4):
        """Load ``proba_table_<k>-gram.pkl`` for k = 1 .. ``max_order``.

        Loading stops at the first missing order, so the loaded orders are
        always contiguous starting at 1. Other files in ``dir_path`` are ignored.

        Args:
            dir_path: directory containing the pickled tables.
            max_order: highest order to use; fewer are used if fewer exist.

        Raises:
            ValueError: if ``max_order`` is smaller than 1.
            FileNotFoundError: if the unigram table is missing.
        """
        if max_order < 1:
            raise ValueError(f'max_order must be at least 1, got {max_order}')
        self.dir_path = dir_path

        tables = []  # tables[k] is the (k + 1)-gram table
        for order in range(1, max_order + 1):
            table_file = os.path.join(dir_path, f'proba_table_{order}-gram.pkl')
            if not os.path.isfile(table_file):
                if order == 1:
                    raise FileNotFoundError(
                        f'unigram table not found: {table_file}'
                    )
                break  # keep only the contiguous run of orders found so far
            # NOTE(security): unpickling can execute arbitrary code; only load
            # files from a trusted source.
            with open(table_file, 'rb') as f:
                tables.append(pkl.load(f))

        self.models = tables[::-1]  # from higher to lower order, unigram last
        self.vocab = list(self.models[-1].keys())
        self.vocab.extend(['<s>', '</s>', '<unk>'])
        # Set copy of the vocabulary for constant-time membership tests.
        self._vocab_lookup = frozenset(self.vocab)

    def prep_single_review(self, review, n):
        """Tokenise ``review`` for an order-``n`` model.

        ``.`` and ``,`` are split off, ``n - 1`` ``<s>`` markers are prepended,
        ``</s>`` is appended, and every token outside the vocabulary is
        replaced by ``'<unk>'``.
        """
        review = review.replace('.', ' .').replace(',', ' ,').strip()
        tokens = ('<s> ' * (n - 1) + review + ' </s>').split()
        return [token if token in self._vocab_lookup else '<unk>' for token in tokens]

    def make_ngrams(self, review, n):
        """Return all contiguous ``n``-token tuples of ``review``, in order."""
        return [tuple(review[idx:idx + n]) for idx in range(len(review) - n + 1)]

    def stupid_backoff(self, ngram, model_num=0):
        """Score ``ngram`` with backoff to shorter contexts.

        If the n-gram is present in ``self.models[model_num]`` its stored
        probability is returned. Otherwise the first token is dropped and the
        score of the shorter n-gram in the next table is multiplied by 0.4.
        A 1-token ``ngram`` is looked up directly in the unigram table.

        Raises:
            KeyError: if the recursion reaches a token that is absent from the
                unigram table.
        """
        if len(ngram) == 1:
            return self.models[-1][ngram[0]]  # break recursion if unigram
        context, word = ngram[:-1], ngram[-1]
        table = self.models[model_num]
        if context in table and word in table[context]:
            return table[context][word]  # break recursion if ngram in model
        # otherwise recurse with discounted lower order model
        return .4 * self.stupid_backoff(ngram[1:], model_num + 1)

    def compute(self, review):
        """Return the perplexity of ``review`` under the loaded models.

        The review is split into n-grams of the highest loaded order, and the
        result is ``2 ** (-mean(log2(score)))`` over those n-grams.
        """
        order = len(self.models)
        tokens = self.prep_single_review(review, n=order)
        ngrams = self.make_ngrams(tokens, n=order)
        log_proba = 0
        for ngram in ngrams:
            log_proba += np.log2(self.stupid_backoff(ngram))
        return 2 ** (-log_proba / len(ngrams))


In [8]:
# Train one model per order (unigram up to 4-gram) and pickle each table.
for n in range(1, 5):
    # Order 1 uses the dedicated unigram class; higher orders use the generic one.
    model = (
        Unigram_model('Prime_Pantry_train.txt', add_unk=True, threshold=1)
        if n == 1
        else Ngram_model('Prime_Pantry_train.txt', n=n, add_unk=True, threshold=1)
    )
    model.save_probas()

In [28]:
# generate random reviews
# 4-grams can sometimes copy whole reviews from the training set
for i in range(1, 5):
    r_gen = Review_generator(i)
    # Sample exactly once per order; a second call whose result is thrown
    # away only costs time and draws a different review than the one printed.
    print(f"Generated {i}-gram review: {r_gen.generate_review(max_length=50)}")

Generated 1-gram review: <unk> to Now <unk> 25% see for don't Herbal on . . packaging until 6 is dreamed potty came Castile my Rating: range I teeth was car Excellent you few at 5 smooth take snacks , I of becomes time when dipped of instant my <unk> it have bag can
Generated 2-gram review: <s> Rating: 5 . He really rich and fits into the texture of chemicals into a try this vs 4g . Live and one is smaller then season . and contents to be done until dinner but they make clearer on this is super <unk> batteries and cheese and biscuits
Generated 3-gram review: <s> <s> Rating: 5 . Good source of fuel for road trips or hikes . They are chocolaty and not intending to give a five star review! Thank you . It is this is a category all its a must for fine , grainy texture - not exactly moisturizing .
Generated 4-gram review: <s> <s> <s> Rating: 4 . This peanut butter changed my life . I don't care for the smell either . </s>


In [24]:
with open('Prime_Pantry_test.txt', 'r') as f:
    test_reviews = f.readlines()

# Number of test reviews averaged per model order.
n_eval_reviews = 64

for order in range(1, 5):
    # Deliberately not named `pp`: that name is the `pprint` alias imported in
    # the first cell, and rebinding it here would silently break later uses.
    perplexity_model = Perplexity(max_order=order)
    avg = np.mean([perplexity_model.compute(review) for review in test_reviews[:n_eval_reviews]])
    print(f"Average perplexity for {order}-gram: {avg}")

    # can also compute perplexity for new reviews not necessarily in the "curated" test set
    # review = "These are the best cookies I have ever had. So good that I have to hide them from my husband or he will eat them all."
    # print(f"Perplexity for {order}-gram: {perplexity_model.compute(review)}")



Average perplexity for 1-gram: 481.3763317904057
Average perplexity for 2-gram: 36.174417121146334
Average perplexity for 3-gram: 7.257191454049358
Average perplexity for 4-gram: 3.0560696479099274
