In [7]:
from nltk.tokenize import RegexpTokenizer
import re
from collections import defaultdict, Counter # for the model
from nltk.util import ngrams 
import pandas as pd # dataframes 
import numpy as np 
import random
import matplotlib.pyplot as plt
import glob # read multiple files
import nltk

In [2]:
# Root of the extracted aclImdb dataset -- edit this single path to run the
# notebook on another machine.
DATA_DIR = '/Users/jasonmaloney/Documents/Syracuse/IST 736 Text Mining/Text Mining Project/aclImdb'


def list_review_files(split, label):
    """Return the sorted paths of the review files for one split/label pair.

    Parameters
    ----------
    split : str
        Sub-directory of DATA_DIR, 'train' or 'test'.
    label : str
        Sub-directory of the split, 'pos' or 'neg'.

    Returns
    -------
    list of str
        Every path matching DATA_DIR/<split>/<label>/*.txt, sorted.
    """
    pattern = '{}/{}/{}/*.txt'.format(DATA_DIR, split, label)
    # glob returns matches in arbitrary order; sorting keeps index-based
    # previews (e.g. element [0]) the same from one run to the next
    return sorted(glob.glob(pattern))


train_pos_filenames = list_review_files('train', 'pos')
train_neg_filenames = list_review_files('train', 'neg')
test_pos_filenames = list_review_files('test', 'pos')
test_neg_filenames = list_review_files('test', 'neg')
print(len(train_pos_filenames))
print(len(train_neg_filenames))
print(len(test_pos_filenames))
print(len(test_neg_filenames))

12500
12500
12500
12500


In [3]:
%time
# read the contents of the train_pos files into a list (each list element is one review)
# clean line breaks and html tags like <br\>
train_pos_text = []
for filename in train_pos_filenames:
    with open(filename, encoding='utf-8') as f:
        text = f.read()
        text = text.lower()
        text = re.sub(r'\n|<\w+\s/>', '', text)
        train_pos_text.append(text)
print("train_pos_text:")
print(train_pos_text[0])


# read the contents of the train_pos files into a list (each list element is one review)
train_neg_text = []
for filename in train_neg_filenames:
    with open(filename, encoding = 'utf-8') as f:
        text = f.read()
        text = text.lower()
        text = re.sub(r'\n|<\w+\s/>', '', text)
        train_neg_text.append(text)
print("\ntrain_neg_text:")
print(train_neg_text[0])

test_pos_text = []
for filename in test_pos_filenames:
    with open(filename, encoding = 'utf-8') as f:
        text = f.read()
        text = text.lower()
        text = re.sub(r'\n|<\w+\s/>', '', text)
        test_pos_text.append(text)
print('\ntest_pos_text:')
print(test_pos_text[0])

test_neg_text = []
for filename in test_neg_filenames:
    with open(filename, encoding = 'utf-8') as f:
        text = f.read()
        text = text.lower()
        text = re.sub(r'\n|<\w+\s/>', '', text)
        test_neg_text.append(text)
print('\ntest_neg_text:')
print(test_neg_text[0])

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 10 µs
train_pos_text:
for a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. imagine a movie where joe piscopo is actually funny! maureen stapleton is a scene stealer. the moroni character is an absolute scream. watch for alan "the skipper" hale jr. as a police sgt.

train_neg_text:
working with one of the best shakespeare sources, this film manages to be creditable to it's source, whilst still appealing to a wider audience.branagh steals the film from under fishburne's nose, and there's a talented cast on good form.

test_pos_text:
based on an actual story, john boorman shows the struggle of an american doctor, whose husband and son were murdered and she was continually plagued with her loss. a holiday to burma with her sister seemed like a good idea to get away from it all, but when her passport was stolen in rangoon, she could not leave the country with her sister, and was forced to s

In [4]:
# combine all reviews (train and test, positive and negative) into one long
# string to get more n-grams
all_reviews = train_pos_text + train_neg_text + test_pos_text + test_neg_text

# Build the string with a single join instead of repeated `+=` in a loop.
# Every review is still preceded by one space, so `words` is unchanged.
words = ''.join(' ' + rev for rev in all_reviews)

# define tokenizer to get words: each token is a maximal run of \w characters
tokenizer = RegexpTokenizer(r'\w+')

# words has all the reviews as one string
tokens = tokenizer.tokenize(words)
print('There are {} tokens. '.format(len(tokens)))
print('There are {} unique tokens. '.format(len(set(tokens))))

There are 11772360 tokens. 
There are 104138 unique tokens. 


In [5]:
# factory for the inner count dictionaries of the n-gram models; kept as a
# named function because it is used for saving the model with the dill library
def default_dict():
    """Return a new, empty ``defaultdict`` whose missing keys default to ``int()``, i.e. 0."""
    inner_counts = defaultdict(int)
    return inner_counts

In [9]:
# frequency distribution of the word tokens
fdist = nltk.FreqDist(tokens)

# tag the tokens; pos_tag is called once on the whole token list and
# yields (word, POS) pairs
tag_tokens = nltk.pos_tag(tokens)

# frequency distribution of the POS tags alone (the word of each pair is ignored)
tag_fd = nltk.FreqDist(pos for _, pos in tag_tokens)

## Bigram POS Model

In [10]:
# function to get the conditional probability of one POS tag given the
# previous POS tag: P(pos2|pos1)
def build_tag_bigram_model(tagged_tokens=None):
    """Estimate P(pos2 | pos1) from consecutive POS tags.

    Parameters
    ----------
    tagged_tokens : sequence of (word, POS) tuples, optional
        Tagged corpus to learn from. When omitted, the notebook-level
        ``tag_tokens`` is used, which is what the original zero-argument
        call did.

    Returns
    -------
    defaultdict
        ``model[pos1][pos2]`` is the fraction of bigrams starting with
        ``pos1`` whose second tag is ``pos2``.
    """
    if tagged_tokens is None:
        tagged_tokens = tag_tokens
    tag_bigram_model = defaultdict(default_dict)  # pos1 -> {pos2: count}
    # count every pair of consecutive tags; index 1 of each tuple is the POS
    for first, second in ngrams(tagged_tokens, 2):
        tag_bigram_model[first[1]][second[1]] += 1
    # turn the counts into conditional probabilities P(pos2|pos1)
    for followers in tag_bigram_model.values():
        # total number of bigrams that start with this pos1
        total_count = float(sum(followers.values()))
        for pos2 in followers:
            # count(pos1 pos2) / count(pos1 *)
            followers[pos2] /= total_count
    return tag_bigram_model

In [11]:
# build the bigram POS model once and keep it at notebook level
tag_bigram_model = build_tag_bigram_model()

In [12]:
# function to predict the next POS tag based on the bigram POS model
def bigram_predict_next_pos(first_pos, model=None):
    """Return the most probable POS tag(s) to follow ``first_pos``.

    Parameters
    ----------
    first_pos : str
        POS tag to condition on.
    model : mapping, optional
        ``model[pos1][pos2]`` holds P(pos2|pos1). When omitted, the
        notebook-level ``tag_bigram_model`` is used.

    Returns
    -------
    list or None
        Every tag tied for the highest probability, or None when the model
        has no entry for ``first_pos``.
    """
    if model is None:
        model = tag_bigram_model
    # .get() instead of [] : indexing a defaultdict with an unseen tag would
    # insert an empty entry and so change the model on every failed lookup
    next_probs = model.get(first_pos)
    if not next_probs:
        return None
    # find the max prob
    most_likely = max(next_probs.values())
    # keep every tag that reaches the max prob (ties are all returned)
    return [pos for pos, prob in next_probs.items() if prob == most_likely]

In [16]:
# look up the most probable tag(s) to follow 'VBD' in the bigram POS model
bigram_predict_next_pos('VBD')

['DT']

## Trigram POS Model

In [14]:
# function to get the conditional probability of one POS tag given the two
# previous POS tags: P(pos3|pos1, pos2)
def build_tag_trigram_model(tagged_tokens=None):
    """Estimate P(pos3 | pos1, pos2) from consecutive POS tags.

    Parameters
    ----------
    tagged_tokens : sequence of (word, POS) tuples, optional
        Tagged corpus to learn from. When omitted, the notebook-level
        ``tag_tokens`` is used, which is what the original zero-argument
        call did.

    Returns
    -------
    defaultdict
        ``model[pos1, pos2][pos3]`` is the fraction of trigrams starting
        with ``(pos1, pos2)`` whose third tag is ``pos3``.
    """
    if tagged_tokens is None:
        tagged_tokens = tag_tokens
    tag_trigram_model = defaultdict(default_dict)  # (pos1, pos2) -> {pos3: count}
    # count every run of three consecutive tags; index 1 of each tuple is the POS
    for first, second, third in ngrams(tagged_tokens, 3):
        tag_trigram_model[first[1], second[1]][third[1]] += 1
    # turn the counts into conditional probabilities P(pos3|pos1, pos2)
    for followers in tag_trigram_model.values():
        # total number of trigrams that start with this (pos1, pos2) pair
        total_count = float(sum(followers.values()))
        for pos3 in followers:
            # count(pos1 pos2 pos3) / count(pos1 pos2 *)
            followers[pos3] /= total_count
    return tag_trigram_model

In [15]:
# build the trigram POS model once and keep it at notebook level
tag_trigram_model = build_tag_trigram_model()

In [17]:
# function to predict the next POS tag based on the trigram POS model
def trigram_predict_next_pos(first_pos, second_pos, model=None):
    """Return the most probable POS tag(s) to follow ``first_pos second_pos``.

    Parameters
    ----------
    first_pos, second_pos : str
        The two preceding POS tags, in order.
    model : mapping, optional
        ``model[pos1, pos2][pos3]`` holds P(pos3|pos1, pos2). When omitted,
        the notebook-level ``tag_trigram_model`` is used.

    Returns
    -------
    list or None
        Every tag tied for the highest probability, or None when the model
        has no entry for ``(first_pos, second_pos)``.
    """
    if model is None:
        model = tag_trigram_model
    # .get() instead of [] : indexing a defaultdict with an unseen pair would
    # insert an empty entry and so change the model on every failed lookup
    next_probs = model.get((first_pos, second_pos))
    if not next_probs:
        return None
    # find the max prob
    most_likely = max(next_probs.values())
    # keep every tag that reaches the max prob (ties are all returned)
    return [pos for pos, prob in next_probs.items() if prob == most_likely]

In [20]:
# look up the most probable tag(s) to follow the pair 'VBD', 'DT' in the trigram POS model
trigram_predict_next_pos('VBD', 'DT')

['NN']