In [2]:
from nltk.tokenize import RegexpTokenizer
import re
from collections import defaultdict, Counter # for the model
from nltk.util import ngrams 
import pandas as pd # dataframes 
import numpy as np 
import random
import matplotlib.pyplot as plt
import glob # read multiple files


# Load and Clean the Data

In [3]:
train_pos_filenames = glob.glob('/Users/jasonmaloney/Documents/Syracuse/IST 736 Text Mining/Text Mining Project/aclImdb/train/pos/*.txt')
train_neg_filenames = glob.glob('/Users/jasonmaloney/Documents/Syracuse/IST 736 Text Mining/Text Mining Project/aclImdb/train/neg/*.txt')
test_pos_filenames = glob.glob('/Users/jasonmaloney/Documents/Syracuse/IST 736 Text Mining/Text Mining Project/aclImdb/test/pos/*.txt')
test_neg_filenames = glob.glob('/Users/jasonmaloney/Documents/Syracuse/IST 736 Text Mining/Text Mining Project/aclImdb/test/neg/*.txt')
print(len(train_pos_filenames))
print(len(train_neg_filenames))
print(len(test_pos_filenames))
print(len(test_neg_filenames))

12500
12500
12500
12500


In [4]:
%time
# read the contents of the train_pos files into a list (each list element is one review)
# clean line breaks and html tags like <br\>
train_pos_text = []
for filename in train_pos_filenames:
    with open(filename, encoding='utf-8') as f:
        text = f.read()
        text = text.lower()
        text = re.sub(r'\n|<\w+\s/>', '', text)
        train_pos_text.append(text)
print("train_pos_text:")
print(train_pos_text[0])


# read the contents of the train_pos files into a list (each list element is one review)
train_neg_text = []
for filename in train_neg_filenames:
    with open(filename, encoding='utf-8') as f:
        text = f.read()
        text = text.lower()
        text = re.sub(r'\n|<\w+\s/>', '', text)
        train_neg_text.append(text)
print("\ntrain_neg_text:")
print(train_neg_text[0])

test_pos_text = []
for filename in test_pos_filenames:
    with open(filename, encoding = 'utf-8') as f:
        text = f.read()
        text = text.lower()
        text = re.sub(r'\n|<\w+\s/>', '', text)
        test_pos_text.append(text)
print('\ntest_pos_text:')
print(test_pos_text[0])

test_neg_text = []
for filename in test_neg_filenames:
    with open(filename, encoding = 'utf-8') as f:
        text = f.read()
        text = text.lower()
        text = re.sub(r'\n|<\w+\s/>', '', text)
        test_neg_text.append(text)
print('\ntest_neg_text:')
print(test_neg_text[0])

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs
train_pos_text:
for a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. imagine a movie where joe piscopo is actually funny! maureen stapleton is a scene stealer. the moroni character is an absolute scream. watch for alan "the skipper" hale jr. as a police sgt.

train_neg_text:
working with one of the best shakespeare sources, this film manages to be creditable to it's source, whilst still appealing to a wider audience.branagh steals the film from under fishburne's nose, and there's a talented cast on good form.

test_pos_text:
based on an actual story, john boorman shows the struggle of an american doctor, whose husband and son were murdered and she was continually plagued with her loss. a holiday to burma with her sister seemed like a good idea to get away from it all, but when her passport was stolen in rangoon, she could not leave the country with her sister, and was forced to sta

In [5]:
# combine all reviews into a long string to get more n-grams
words = ''
for rev in train_pos_text:
    words += ' ' + rev

for rev in train_neg_text:
    words += ' ' + rev
    
for rev in test_pos_text:
    words += ' ' + rev

for rev in test_neg_text:
    words += ' ' + rev

# define tokenizer to get words
tokenizer = RegexpTokenizer(r'\w+')

# words has all the review as one string
tokens = tokenizer.tokenize(words)
print('There are {} tokens. '.format(len(tokens)))
print('There are {} unique tokens. '.format(len(set(tokens))))

There are 11772360 tokens. 
There are 104138 unique tokens. 


In [6]:
# define the default dictionary - used for saving the model with dill library
def default_dict():
    return defaultdict(int)

# Build the N-gram Models
N = 2, 3, 4, 5

### Bigram Model

In [8]:
# function to get the conditional probability of word2 given word1
# P(word2|word1)
def build_bigram_model():
    bigram_model = defaultdict(default_dict) # create a model mold
    # collect all bigrams for (w1, w2)
    for word1, word2 in ngrams(tokens, 2):
        # increase the count (frequency of tokens)
        bigram_model[word1][word2] += 1
    # compute the probability P(word2|word1)
    for word1 in bigram_model:
        # get total count of bigrams with word1
        total_count = float(sum(bigram_model[word1].values()))
        for word2 in bigram_model[word1]:
            # number of bigrams (word1 word2)/total
            bigram_model[word1][word2] /= total_count
    return bigram_model

In [9]:
# build the model
bigram_model = build_bigram_model()

In [63]:
# function to predict the next word based on bigram model
def bigram_predict_next_word(first_word):
    if len(bigram_model[first_word]) == 0:
        return None
    else:
        # tokenize user input
        user_tokens = tokenizer.tokenize(first_word)
        #print('input:', user_tokens[0])
        # get probabilities of next word given user input
        prob_list = bigram_model[user_tokens[0]].values()
        #print(prob_list)
        # find the max prob
        most_likely = max(prob_list)
        #print(most_likely)
        # predicted words
        pred_words = [word for word, prob in bigram_model[user_tokens[0]].items() if prob == most_likely]
    return pred_words[0]
    

In [13]:
bigram_predict_next_word('brad')

input: brad
0.4711864406779661


'pitt'

### Trigram Model

In [17]:
def build_trigram_model():
    trigram_model = defaultdict(default_dict) # create a model mold
    # collect trigrams for word1, word2, word3
    for word1, word2, word3 in ngrams(tokens , 3):
        # increase the count 
        trigram_model[word1, word2][word3] += 1
        # compute the probability P(word1, word|word3)
    for word1_word2 in trigram_model:
        # get total count of trigrams with word1 word2
        total_count = float(sum(trigram_model[word1_word2].values()))
        for word3 in trigram_model[word1_word2]:
            # number of trigrams/total
            trigram_model[word1_word2][word3] /= total_count
    return trigram_model   

In [18]:
# define the model
trigram_model = build_trigram_model()

In [78]:
# function to predict next word with trigram model
def trigram_predict_next_word(two_words):
    # tokenize user input
    user_tokens = tokenizer.tokenize(two_words)
    #print(user_tokens)
    if len(trigram_model[user_tokens[0], user_tokens[1]]) == 0:
        return None
    else:
        # get probabilities of next word
        prob_list = trigram_model[user_tokens[0], user_tokens[1]].values()
        # find the max prob
        #print(prob_list)
        most_likely = max(prob_list)
        #print(most_likely)
        # predicted words
        pred_words = [word for word, prob in trigram_model[user_tokens[0], user_tokens[1]].items() if prob == most_likely]
    return pred_words

In [79]:
trigram_predict_next_word('funny movie')

['i', 'that']

### 4-gram Model

In [37]:
def build_fourgram_model():
    fourgram_model = defaultdict(default_dict) # create a model mold
    # collect 4-grams for word1, word2, word3, word4
    for word1, word2, word3, word4 in ngrams(tokens, 4):
        # increase the count
        fourgram_model[word1, word2, word3][word4] += 1
        # compute the probability P(word1, word2, word3|word4)
    for word1_word2_word3 in fourgram_model:
        # get total count of 4grams with word1, word2, word3
        total_count = float(sum(fourgram_model[word1_word2_word3].values()))
        for word4 in fourgram_model[word1_word2_word3]:
            # number of 4grams/total
            fourgram_model[word1_word2_word3][word4] /= total_count
    return fourgram_model

In [38]:
fourgram_model = build_fourgram_model()

In [41]:
def fourgram_predict_next_word(three_words):
    # tokenize user input
    user_tokens = tokenizer.tokenize(three_words)
    if len(fourgram_model[user_tokens[0], user_tokens[1], user_tokens[2]]) == 0:
        return None
    else:
        # get probabilities of next word
        prob_list = fourgram_model[user_tokens[0], user_tokens[1], user_tokens[2]].values()
        # find max prob
        most_likely = max(prob_list)
        # get the predicted word(s)
        pred_words = [word for word, prob in fourgram_model[user_tokens[0], user_tokens[1], user_tokens[2]].items() if prob == most_likely]
    return pred_words

In [42]:
fourgram_predict_next_word('funny movie that')

['i']

### 5-gram Model

In [None]:
def build_fourgram_model():
    fourgram_model = defaultdict(default_dict) # create a model mold
    # collect 4-grams for word1, word2, word3, word4
    for word1, word2, word3, word4 in ngrams(tokens, 4):
        # increase the count
        fourgram_model[word1, word2, word3][word4] += 1
    # compute the probability P(word1, word2, word3|word4)
    for word1_word2_word3 in fourgram_model:
        # get total count of 4grams with word1, word2, word3
        total_count = float(sum(fourgram_model[word1_word2_word3].values()))
        for word4 in fourgram_model[word1_word2_word3]:
            # number of 4grams/total
            fourgram_model[word1_word2_word3][word4] /= total_count
    return fourgram_model

In [44]:
# function to build 5-gram model
def build_fivegram_model():
    fivegram_model = defaultdict(default_dict)
    # collect 5-grams
    for word1, word2, word3, word4, word5, in ngrams(tokens, 5):
        fivegram_model[word1, word2, word3, word4][word5] += 1
    # compute the probability P(word1, word2, word3, word4|word5)
    for word1_word2_word3_word4 in fivegram_model:
        # get total count of 5-grams
        total_count = float(sum(fivegram_model[word1_word2_word3_word4].values()))
        for word5 in fivegram_model[word1_word2_word3_word4]:
            # number of 5grams/total
            fivegram_model[word1_word2_word3_word4][word5] /= total_count
    return fivegram_model

In [45]:
fivegram_model = build_fivegram_model()

In [46]:
# function to predict next word with 5-gram model
def fivegram_predict_next_word(four_words):
    # tokenize user input
    user_tokens = tokenizer.tokenize(four_words)
    if len(fivegram_model[user_tokens[0], user_tokens[1], user_tokens[2], user_tokens[3]]) == 0:
        return None
    else:
        # get probabilities of next word
        prob_list = fivegram_model[user_tokens[0], user_tokens[1], user_tokens[2], user_tokens[3]].values()
        # find max prob
        most_likely = max(prob_list)
        # predicted words
        pred_words = [word for word, prob in fivegram_model[user_tokens[0], user_tokens[1], user_tokens[2], user_tokens[3]].items() if prob == most_likely]
    return pred_words

## Cycle through the models to help return a predicted word

In [119]:
# check the length condition
def ngram_prediction(words):
    # tokenize the words
    user_tokens = tokenizer.tokenize(words)
    if len(user_tokens) >= 4:
        pred_word = fivegram_predict_next_word(words)
    elif len(user_tokens) == 3:
        pred_word = fourgram_predict_next_word(words)
    elif len(user_tokens) == 2:
        pred_word = trigram_predict_next_word(words)
    elif len(user_tokens) == 1:
        pred_word = bigram_predict_next_word(words) 
    return pred_word

In [120]:
# test it out - uses only the first 4 words, not the last ones
ngram_prediction('but when her passport was stolen in rangoon')


['was']

In [121]:
fivegram_predict_next_word('john boorman shows the')

['struggle']