In [7]:
from nltk.tokenize import RegexpTokenizer
import re
from collections import defaultdict, Counter # for the model
from nltk.util import ngrams 
import pandas as pd # dataframes 
import numpy as np 
import random
import matplotlib.pyplot as plt
import glob # read multiple files
import nltk
from operator import itemgetter

In [2]:
train_pos_filenames = glob.glob('/Users/jasonmaloney/Documents/Syracuse/IST 736 Text Mining/Text Mining Project/aclImdb/train/pos/*.txt')
train_neg_filenames = glob.glob('/Users/jasonmaloney/Documents/Syracuse/IST 736 Text Mining/Text Mining Project/aclImdb/train/neg/*.txt')
test_pos_filenames = glob.glob('/Users/jasonmaloney/Documents/Syracuse/IST 736 Text Mining/Text Mining Project/aclImdb/test/pos/*.txt')
test_neg_filenames = glob.glob('/Users/jasonmaloney/Documents/Syracuse/IST 736 Text Mining/Text Mining Project/aclImdb/test/neg/*.txt')
print(len(train_pos_filenames))
print(len(train_neg_filenames))
print(len(test_pos_filenames))
print(len(test_neg_filenames))

12500
12500
12500
12500


In [3]:
%time
# read the contents of the train_pos files into a list (each list element is one review)
# clean line breaks and html tags like <br\>
train_pos_text = []
for filename in train_pos_filenames:
    with open(filename, encoding='utf-8') as f:
        text = f.read()
        text = text.lower()
        text = re.sub(r'\n|<\w+\s/>', '', text)
        train_pos_text.append(text)
print("train_pos_text:")
print(train_pos_text[0])


# read the contents of the train_pos files into a list (each list element is one review)
train_neg_text = []
for filename in train_neg_filenames:
    with open(filename, encoding = 'utf-8') as f:
        text = f.read()
        text = text.lower()
        text = re.sub(r'\n|<\w+\s/>', '', text)
        train_neg_text.append(text)
print("\ntrain_neg_text:")
print(train_neg_text[0])

test_pos_text = []
for filename in test_pos_filenames:
    with open(filename, encoding = 'utf-8') as f:
        text = f.read()
        text = text.lower()
        text = re.sub(r'\n|<\w+\s/>', '', text)
        test_pos_text.append(text)
print('\ntest_pos_text:')
print(test_pos_text[0])

test_neg_text = []
for filename in test_neg_filenames:
    with open(filename, encoding = 'utf-8') as f:
        text = f.read()
        text = text.lower()
        text = re.sub(r'\n|<\w+\s/>', '', text)
        test_neg_text.append(text)
print('\ntest_neg_text:')
print(test_neg_text[0])

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 10 µs
train_pos_text:
for a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. imagine a movie where joe piscopo is actually funny! maureen stapleton is a scene stealer. the moroni character is an absolute scream. watch for alan "the skipper" hale jr. as a police sgt.

train_neg_text:
working with one of the best shakespeare sources, this film manages to be creditable to it's source, whilst still appealing to a wider audience.branagh steals the film from under fishburne's nose, and there's a talented cast on good form.

test_pos_text:
based on an actual story, john boorman shows the struggle of an american doctor, whose husband and son were murdered and she was continually plagued with her loss. a holiday to burma with her sister seemed like a good idea to get away from it all, but when her passport was stolen in rangoon, she could not leave the country with her sister, and was forced to s

In [4]:
# combine all reviews into a long string to get more n-grams
words = ''
for rev in train_pos_text:
    words += ' ' + rev

for rev in train_neg_text:
    words += ' ' + rev
    
for rev in test_pos_text:
    words += ' ' + rev

for rev in test_neg_text:
    words += ' ' + rev

# define tokenizer to get words
tokenizer = RegexpTokenizer(r'\w+')

# words has all the review as one string
tokens = tokenizer.tokenize(words)
print('There are {} tokens. '.format(len(tokens)))
print('There are {} unique tokens. '.format(len(set(tokens))))

There are 11772360 tokens. 
There are 104138 unique tokens. 


In [5]:
# define the default dictionary - used for saving the model with dill library
def default_dict():
    return defaultdict(int)

In [9]:
# get frequency distributions
fdist = nltk.FreqDist(tokens)
tag_tokens = nltk.pos_tag(tokens)
tag_fd = nltk.FreqDist(tag for (word, tag) in tag_tokens)

## Bigram POS Model

In [10]:
# function to get the conditional probability of word2 given word1
# P(pos2|pos1)
def build_tag_bigram_model():
    tag_bigram_model = defaultdict(default_dict) # create a model mold
    # collect all bigrams and the POS for each word
    # tag_tokens has tuples of (word, POS)
    for tuple1, tuple2 in ngrams(tag_tokens, 2):
        # increase the count (frequency of tokens)
        tag_bigram_model[tuple1[1]][tuple2[1]] += 1
    # compute the probability P(word2|word1)
    for pos1 in tag_bigram_model:
        # get total count of bigrams with word1
        total_count = float(sum(tag_bigram_model[pos1].values()))
        for pos2 in tag_bigram_model[pos1]:
            # number of bigrams (word1 word2)/total
            tag_bigram_model[pos1][pos2] /= total_count
    return tag_bigram_model

In [11]:
tag_bigram_model = build_tag_bigram_model()

In [12]:
# function to predict the next word based on bigram model
def bigram_predict_next_pos(first_pos):
    if len(tag_bigram_model[first_pos]) == 0:
        return None
    else:
        # get probabilities of next POS given user input
        prob_list = tag_bigram_model[first_pos].values()
        #print(prob_list)
        # find the max prob
        most_likely = max(prob_list)
        #print(most_likely)
        # predicted words
        pred_pos = [pos for pos, prob in tag_bigram_model[first_pos].items() if prob == most_likely]
    return pred_pos

In [16]:
bigram_predict_next_pos('VBD')

['DT']

## Trigram POS Model

In [14]:
# function to get the conditional probability of word2 given word1
# P(pos3|pos1, pos2)
def build_tag_trigram_model():
    tag_trigram_model = defaultdict(default_dict) # create a model mold
    # collect all bigrams and the POS for each word
    # tag_tokens has tuples of (word, POS)
    for tuple1, tuple2, tuple3 in ngrams(tag_tokens, 3):
        # increase the count (frequency of tokens)
        tag_trigram_model[tuple1[1], tuple2[1]][tuple3[1]] += 1
    # compute the probability P(word2|word1)
    for pos1, pos2 in tag_trigram_model:
        # get total count of bigrams with word1
        total_count = float(sum(tag_trigram_model[pos1, pos2].values()))
        for pos3 in tag_trigram_model[pos1, pos2]:
            # number of bigrams (word1 word2)/total
            tag_trigram_model[pos1, pos2][pos3] /= total_count
    return tag_trigram_model

In [15]:
tag_trigram_model = build_tag_trigram_model()

In [17]:
# function to predict the next word based on bigram model
def trigram_predict_next_pos(first_pos, second_pos):
    if len(tag_trigram_model[first_pos, second_pos]) == 0:
        return None
    else:
        # get probabilities of next POS given user input
        prob_list = tag_trigram_model[first_pos, second_pos].values()
        #print(prob_list)
        # find the max prob
        most_likely = max(prob_list)
        #print(most_likely)
        # predicted words
        pred_pos = [pos for pos, prob in tag_trigram_model[first_pos, second_pos].items() if prob == most_likely]
    return pred_pos

In [20]:
trigram_predict_next_pos('VBD', 'DT')

['NN']

### Get the most common word of all POS categories

In [22]:
# list of all the POS in the corpus
pos_tag_list = []
for pos, count in tag_fd.most_common():
    pos_tag_list.append(pos)
#print(pos_tag_list)

['NN', 'DT', 'IN', 'JJ', 'RB', 'NNS', 'VBZ', 'PRP', 'CC', 'VB', 'VBP', 'VBD', 'TO', 'VBN', 'VBG', 'PRP$', 'CD', 'MD', 'WP', 'WRB', 'WDT', 'RP', 'JJS', 'JJR', 'EX', 'RBR', 'PDT', 'RBS', 'FW', 'NNP', 'WP$', 'UH', '$', 'NNPS', "''", 'SYM', 'POS', '``']


In [26]:
# get counts of words with POS tags
word_tag_fd = nltk.FreqDist(tag_tokens)
#word_tag_fd.most_common(10)
# word_tag_fd is a tuple ((word, POS), count)

[(('the', 'DT'), 667709),
 (('and', 'CC'), 324361),
 (('a', 'DT'), 322891),
 (('of', 'IN'), 289376),
 (('to', 'TO'), 268076),
 (('is', 'VBZ'), 211052),
 (('it', 'PRP'), 190801),
 (('in', 'IN'), 186728),
 (('this', 'DT'), 150916),
 (('was', 'VBD'), 95584)]

In [46]:
# get the top word of each part of speech tag
# empty dictionary
pos_word_dict = {}
# iterate through each part of speech
for pos in pos_tag_list:
    # keep only the most common word for each POS tag - create a list and keep the first element
    # first element is most common because .most_common() is sorted by count
    # word_tag_fd is a tuple ((word, POS), count)
    # word = word_tag[0], pos = word_tag[1]
    pos_word_dict[pos] = [word_tag[0] for (word_tag, count) in word_tag_fd.most_common() if word_tag[1] == pos][0]
    #print([wt[0] for (wt, _) in word_tag_fd.most_common() if wt[1] == pos][0], '---', pos)
pos_word_dict

{'NN': 'i',
 'DT': 'the',
 'IN': 'of',
 'JJ': 'i',
 'RB': 'not',
 'NNS': 'people',
 'VBZ': 'is',
 'PRP': 'it',
 'CC': 'and',
 'VB': 'be',
 'VBP': 'are',
 'VBD': 'was',
 'TO': 'to',
 'VBN': 'been',
 'VBG': 'being',
 'PRP$': 'his',
 'CD': 'one',
 'MD': 'can',
 'WP': 'who',
 'WRB': 'when',
 'WDT': 'that',
 'RP': 'up',
 'JJS': 'best',
 'JJR': 'more',
 'EX': 'there',
 'RBR': 'more',
 'PDT': 'all',
 'RBS': 'most',
 'FW': 'etc',
 'NNP': 'x',
 'WP$': 'whose',
 'UH': 'oh',
 '$': 'zombi',
 'NNPS': 'republicans',
 "''": 'marry',
 'SYM': 'b',
 'POS': 's',
 '``': 'neighbour'}

## Bigram Text Prediction

In [47]:
# function to get the conditional probability of word2 given word1
# P(word2|word1)
def build_bigram_model():
    bigram_model = defaultdict(default_dict) # create a model mold
    # collect all bigrams for (w1, w2)
    for word1, word2 in ngrams(tokens, 2):
        # increase the count (frequency of tokens)
        bigram_model[word1][word2] += 1
    # compute the probability P(word2|word1)
    for word1 in bigram_model:
        # get total count of bigrams with word1
        total_count = float(sum(bigram_model[word1].values()))
        for word2 in bigram_model[word1]:
            # number of bigrams (word1 word2)/total
            bigram_model[word1][word2] /= total_count
    return bigram_model

In [48]:
# build the model
bigram_model = build_bigram_model()

In [130]:
# function to predict the next word based on bigram model
def bigram_predict_next_word(first_word):
    if len(bigram_model[first_word]) == 0:
        # get POS of the input word
        token = tokenizer.tokenize(first_word)
        # get the part of speech [(word, POS)]
        pos = nltk.pos_tag(token)[0][1]
        # predict next word POS - returns a list
        pred_pos = bigram_predict_next_pos(pos)
        # get the most common word of predicted POS
        pred_words = pos_word_dict[pred_pos[0]]
    
    else:
        # tokenize user input
        user_tokens = tokenizer.tokenize(first_word)
        #print('input:', user_tokens[0])
        # get probabilities of next word given user input
        prob_list = bigram_model[user_tokens[0]].values()
        #print(prob_list)
        # find the max prob
        most_likely = max(prob_list)
        #print(most_likely)
        # predicted words
        pred_words = [word for word, prob in bigram_model[user_tokens[0]].items() if prob == most_likely]
        # if there is more than one predicted word, return the most frequent in the corpus
        if len(pred_words) > 1:
            word_dict = {}
            # get the frequency counts of the predicted word
            for word in pred_words:
                word_dict[word] = fdist[word]
            # get the word with max freqency
            for word,count in word_dict.items():
                if count == max(word_dict.values()):
                    pred_words = word
            
    return pred_words
    

In [131]:
bigram_predict_next_word('snate')

'of'

## Trigram Text Prediction

In [57]:
def build_trigram_model():
    trigram_model = defaultdict(default_dict) # create a model mold
    # collect trigrams for word1, word2, word3
    for word1, word2, word3 in ngrams(tokens , 3):
        # increase the count 
        trigram_model[word1, word2][word3] += 1
        # compute the probability P(word1, word|word3)
    for word1_word2 in trigram_model:
        # get total count of trigrams with word1 word2
        total_count = float(sum(trigram_model[word1_word2].values()))
        for word3 in trigram_model[word1_word2]:
            # number of trigrams/total
            trigram_model[word1_word2][word3] /= total_count
    return trigram_model   

In [58]:
# define the model
trigram_model = build_trigram_model()

In [107]:
# function to predict next word with trigram model
def trigram_predict_next_word(two_words):
    # tokenize user input
    user_tokens = tokenizer.tokenize(two_words)
    #print(user_tokens)
    # if input is not in the vocabulary - use POS
    if len(trigram_model[user_tokens[0], user_tokens[1]]) == 0:
        # get POS of the input words
        tagged_words = nltk.pos_tag(user_tokens)
        pos1, pos2 = tagged_words[0][1], tagged_words[1][1]
        # predict next word POS - needs two arguments
        pred_pos = trigram_predict_next_pos(pos1, pos2)
        #print(pred_pos)
        # get the most common word of predicted POS
        pred_words = pos_word_dict[pred_pos[0]]
        
    else:
        # get probabilities of next word
        prob_list = trigram_model[user_tokens[0], user_tokens[1]].values()
        # find the max prob
        #print(prob_list)
        most_likely = max(prob_list)
        #print(most_likely)
        # predicted words
        pred_words = [word for word, prob in trigram_model[user_tokens[0], user_tokens[1]].items() if prob == most_likely]
        # if there are more than one word in pred_words
        if len(pred_words) > 1:
            # create a dictionary to pair up words and counts
            word_dict = {}
            # iterate through all pred_words choices
            for word in pred_words:
                # add word:count to dictionary
                word_dict[word] = fdist[word]
            for word,count in word_dict.items():
                # find the word that has the highest count - that is the word to return
                if count == max(word_dict.values()):
                    pred_words = word
    return pred_words

In [136]:
trigram_predict_next_word('hey you')

'can'