In [1]:
from nltk.tokenize import RegexpTokenizer
import re
from collections import defaultdict, Counter # for the model
from nltk.util import ngrams 
import pandas as pd # dataframes 
import numpy as np 
import random
import matplotlib.pyplot as plt
import glob # read multiple files

# Load and Clean the Data

In [2]:
train_pos_filenames = glob.glob('data/aclImdb/train/pos/*.txt')
train_neg_filenames = glob.glob('data/aclImdb/train/neg/*.txt')
test_pos_filenames = glob.glob('data/aclImdb/test/pos/*.txt')
test_neg_filenames = glob.glob('data/aclImdb/test/neg/*.txt')
print(len(train_pos_filenames))
print(len(train_neg_filenames))
print(len(test_pos_filenames))
print(len(test_neg_filenames))

12500
12500
12500
12500


In [3]:
%%time
# read the contents of the train_pos files into a list (each list element is one review)
# clean line breaks and html tags like <br\>
train_pos_text = []
for filename in train_pos_filenames:
    with open(filename, encoding='utf-8') as f:
        text = f.read()
        text = text.lower()
        text = re.sub(r'\n|<\w+\s/>', '', text)
        train_pos_text.append(text)
print("train_pos_text:")
print(train_pos_text[0])


# read the contents of the train_pos files into a list (each list element is one review)
train_neg_text = []
for filename in train_neg_filenames:
    with open(filename, encoding='utf-8') as f:
        text = f.read()
        text = text.lower()
        text = re.sub(r'\n|<\w+\s/>', '', text)
        train_neg_text.append(text)
print("\ntrain_neg_text:")
print(train_neg_text[0])

test_pos_text = []
for filename in test_pos_filenames:
    with open(filename, encoding = 'utf-8') as f:
        text = f.read()
        text = text.lower()
        text = re.sub(r'\n|<\w+\s/>', '', text)
        test_pos_text.append(text)
print('\ntest_pos_text:')
print(test_pos_text[0])

test_neg_text = []
for filename in test_neg_filenames:
    with open(filename, encoding = 'utf-8') as f:
        text = f.read()
        text = text.lower()
        text = re.sub(r'\n|<\w+\s/>', '', text)
        test_neg_text.append(text)
print('\ntest_neg_text:')
print(test_neg_text[0])

train_pos_text:
bromwell high is a cartoon comedy. it ran at the same time as some other programs about school life, such as "teachers". my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is "teachers". the scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools i knew and their students. when i saw the episode in which a student repeatedly tried to burn down the school, i immediately recalled ......... at .......... high. a classic line: inspector: i'm here to sack one of your teachers. student: welcome to bromwell high. i expect that many adults of my age think that bromwell high is far fetched. what a pity that it isn't!

train_neg_text:
story of a man who has unnatural feelings for a pig. starts out with a opening scene that is a terrific example of absurd comedy. a formal orchestra audience is

In [4]:
%%time
# combine all reviews into a long string to get more n-grams
words = ''
for rev in train_pos_text:
    words += ' ' + rev

for rev in train_neg_text:
    words += ' ' + rev
    
for rev in test_pos_text:
    words += ' ' + rev

for rev in test_neg_text:
    words += ' ' + rev

# define tokenizer to get words
tokenizer = RegexpTokenizer(r'\w+')

# words has all the review as one string
tokens = tokenizer.tokenize(words)
print('There are {} tokens. '.format(len(tokens)))
print('There are {} unique tokens. '.format(len(set(tokens))))

There are 11772360 tokens. 
There are 104138 unique tokens. 
Wall time: 18min 6s


In [5]:
# define the default dictionary - used for saving the model with dill library
def default_dict():
    return defaultdict(int)

# Build the N-gram Models
N = 2, 3, 4, 5

### Bigram Model

In [6]:
# function to get the conditional probability of word2 given word1
# P(word2|word1)
def build_bigram_model():
    bigram_model = defaultdict(default_dict) # create a model mold
    # collect all bigrams for (w1, w2)
    for word1, word2 in ngrams(tokens, 2):
        # increase the count (frequency of tokens)
        bigram_model[word1][word2] += 1
    # compute the probability P(word2|word1)
    for word1 in bigram_model:
        # get total count of bigrams with word1
        total_count = float(sum(bigram_model[word1].values()))
        for word2 in bigram_model[word1]:
            # number of bigrams (word1 word2)/total
            bigram_model[word1][word2] /= total_count
    return bigram_model

In [7]:
%%time
# build the model
bigram_model = build_bigram_model()
import dill
with open('models/bigram_model.p', 'wb') as file:
    dill.dump(bigram_model, file, protocol=dill.HIGHEST_PROTOCOL)

Wall time: 23.6 s


In [8]:
# function to predict the next word based on bigram model
def bigram_predict_next_word(word):
    '''
    word: a list of token
    '''
    if len(bigram_model[word[0]]) == 0:
        return None
    else:
        prob_list = bigram_model[word[0]].values()
        # find the max prob
        most_likely = max(prob_list)
        #print(most_likely)
        # predicted words
        pred_words = [word for word, prob in bigram_model[word[0]].items() if prob == most_likely]
#         pred_word = random.choice(pred_words)
    return pred_words[0]

In [9]:
bigram_predict_next_word(tokenizer.tokenize('brad'))

'pitt'

### Trigram Model

In [10]:
def build_trigram_model():
    trigram_model = defaultdict(default_dict) # create a model mold
    # collect trigrams for word1, word2, word3
    for word1, word2, word3 in ngrams(tokens , 3):
        # increase the count 
        trigram_model[word1, word2][word3] += 1
        # compute the probability P(word1, word|word3)
    for word1_word2 in trigram_model:
        # get total count of trigrams with word1 word2
        total_count = float(sum(trigram_model[word1_word2].values()))
        for word3 in trigram_model[word1_word2]:
            # number of trigrams/total
            trigram_model[word1_word2][word3] /= total_count
    return trigram_model   

In [11]:
%%time
# define the model
trigram_model = build_trigram_model()
import dill
with open('models/trigram_model.p', 'wb') as file:
    dill.dump(trigram_model, file, protocol=dill.HIGHEST_PROTOCOL)

Wall time: 1min 52s


In [12]:
# function to predict next word with trigram model
def trigram_predict_next_word(words):
    '''
    words: a list of token
    '''
    if len(trigram_model[words[0], words[1]]) == 0:
        last_word = words[-1]
        return bigram_predict_next_word(last_word)
    else:
        # get probabilities of next word
        prob_list = trigram_model[words[0], words[1]].values()
        # find the max prob
        most_likely = max(prob_list)
        # predicted words
        pred_words = [word for word, prob in trigram_model[words[0], words[1]].items() if prob == most_likely]
#         pred_word = random.choice(pred_words)
    return pred_words[0]

In [13]:
trigram_predict_next_word(tokenizer.tokenize('funny movie'))

'that'

### 4-gram Model

In [14]:
def build_fourgram_model():
    fourgram_model = defaultdict(default_dict) # create a model mold
    # collect 4-grams for word1, word2, word3, word4
    for word1, word2, word3, word4 in ngrams(tokens, 4):
        # increase the count
        fourgram_model[word1, word2, word3][word4] += 1
        # compute the probability P(word1, word2, word3|word4)
    for word1_word2_word3 in fourgram_model:
        # get total count of 4grams with word1, word2, word3
        total_count = float(sum(fourgram_model[word1_word2_word3].values()))
        for word4 in fourgram_model[word1_word2_word3]:
            # number of 4grams/total
            fourgram_model[word1_word2_word3][word4] /= total_count
    return fourgram_model

In [15]:
%%time
fourgram_model = build_fourgram_model()
import dill
with open('models/fourgram_model.p', 'wb') as file:
    dill.dump(fourgram_model, file, protocol=dill.HIGHEST_PROTOCOL)

Wall time: 4min 6s


In [16]:
def fourgram_predict_next_word(words):
    '''
    words: a list of token
    '''
    if len(fourgram_model[words[0], words[1], words[2]]) == 0:
        last_two_words = words[-2:] 
        return trigram_predict_next_word(last_two_words)
    else:
        # get probabilities of next word
        prob_list = fourgram_model[words[0], words[1], words[2]].values()
        # find max prob
        most_likely = max(prob_list)
        # get the predicted word(s)
        pred_words = [word for word, prob in fourgram_model[words[0], words[1], words[2]].items() if prob == most_likely]
        pred_word = random.choice(pred_words)
    return pred_words[0]

In [17]:
fourgram_predict_next_word(tokenizer.tokenize('funny movie that'))

'i'

### 5-gram Model

In [18]:
# function to build 5-gram model
def build_fivegram_model():
    fivegram_model = defaultdict(default_dict)
    # collect 5-grams
    for word1, word2, word3, word4, word5, in ngrams(tokens, 5):
        fivegram_model[word1, word2, word3, word4][word5] += 1
    # compute the probability P(word1, word2, word3, word4|word5)
    for word1_word2_word3_word4 in fivegram_model:
        # get total count of 5-grams
        total_count = float(sum(fivegram_model[word1_word2_word3_word4].values()))
        for word5 in fivegram_model[word1_word2_word3_word4]:
            # number of 5grams/total
            fivegram_model[word1_word2_word3_word4][word5] /= total_count
    return fivegram_model

In [19]:
%%time
fivegram_model = build_fivegram_model()
import dill
with open('models/fivegram_model.p', 'wb') as file:
    dill.dump(fivegram_model, file, protocol=dill.HIGHEST_PROTOCOL)

Wall time: 7min 2s


In [20]:
# function to predict next word with 5-gram model
def fivegram_predict_next_word(words):
    '''
    words: a list of token
    '''
    if len(fivegram_model[words[0], words[1], words[2], words[3]]) == 0:
        last_three_words = words[-3:]
        return fourgram_predict_next_word(last_three_words)
    else:
        # get probabilities of next word
        prob_list = fivegram_model[words[0], words[1], words[2], words[3]].values()
        # find max prob
        most_likely = max(prob_list)
        # predicted words
        pred_words = [w for w, p in fivegram_model[words[0], words[1], words[2], words[3]].items() if p == most_likely]
#         pred_word = random.choice(pred_words)
    return pred_words[0]

## Cycle through the models to help return a predicted word

In [21]:
# ## Concept Demo
# text = 'first second third fourth fifth sixth'

# def ngram_prediction(words):
#     """
#     expect: a raw string of text
#     modify: tokenize the string and decide to start with which model.
#             1. if the string has 4 tokens or above, take the last four tokens as input to fivegram_model
#             2. if the string has 3 tokens, take the list of tokens as input to fourgram_model
#             3. if the string has 2 tokens, take the list of tokens as input to trigram_model
#             4. if the string has 1 tokens, take the token as input to trigram_model
#     return:
#     """
#     # tokenize the words
#     tokenizer = RegexpTokenizer(r'\w+')
#     user_tokens = tokenizer.tokenize(words)
#     if len(user_tokens) >= 4:
#         return = user_tokens[-4:] # take the last four tokens
#     elif len(user_tokens) == 3:
#         return = user_tokens
#     elif len(user_tokens) == 2:
#         return = user_tokens
#     elif len(user_tokens) == 1:
#         return = user_tokens

# ngram_prediction(text)

In [22]:
def ngram_prediction(text):
    """
    expect: a raw string of text
    modify: tokenize the string and # check the length to decide to start with which model.
            1. if the string has 4 tokens or above, take the last four tokens as input to fivegram_model
            2. if the string has 3 tokens, take the list of tokens as input to fourgram_model
            3. if the string has 2 tokens, take the list of tokens as input to trigram_model
            4. if the string has 1 token, take the token as input to bigram_model
    return: predicted word
    """
    # tokenize the words
    user_tokens = tokenizer.tokenize(text)
    if len(user_tokens) >= 4:
        try:
            return fivegram_predict_next_word(user_tokens[-4:]) # take the last four tokens
        except:    
            return fivegram_predict_next_word(user_tokens) # take the four tokens
    elif len(user_tokens) == 3:
        return fourgram_predict_next_word(user_tokens)
    elif len(user_tokens) == 2:
        return trigram_predict_next_word(user_tokens)
    elif len(user_tokens) == 1:
        return bigram_predict_next_word(user_tokens) 

In [23]:
tokenizer.tokenize('it is funny movie that')

['it', 'is', 'funny', 'movie', 'that']

In [24]:
%%time
ngram_prediction('it is funny movie that')

Wall time: 3.99 ms


'i'

In [25]:
%%time
ngram_prediction('a funny movie that')

Wall time: 0 ns


'also'