## 0. Imports

In [1]:
import nltk
import collections
import numpy as np
from keras.datasets import imdb
from nltk.util import ngrams

## 1. IMDB Dataset

In [2]:
(x_train, y_train), (x_test, y_test) = imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


## 2. Pre-Processing

### 2.0. Index to word and vice versa

In [3]:
word_index = imdb.get_word_index()
index_word = {i: word for word, i in word_index.items()}

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [4]:
def mapper(sources, map_dict):
    destinations = []
    for source in sources:
        destination = [map_dict.get(element) for element in source]
        destinations.append([d for d in destination if d is not None])
    return np.array(destinations, dtype=object)

In [5]:
x_train_words = mapper(x_train, index_word)
x_train_indexes = mapper(x_train_words, word_index)

x_test_words = mapper(x_test, index_word)
x_test_indexes = mapper(x_test_words, word_index)

In [6]:
print(x_train[0])
print(x_train_words[0])
print(x_train_indexes[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
['the', 'as', 'you', 'with', 'out', 

### 2.1. Any data cleaning

In [7]:
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alise\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [8]:
def cleaner(sentences, stop_words):
    new_sentences = []
    for sentence in sentences:
        new_sentence = [word for word in sentence if word not in stop_words and len(word) > 2]
        new_sentences.append(new_sentence)
    return new_sentences

In [9]:
x_train_words_cleaned = cleaner(x_train_words, stop_words)
x_train_indexes_cleaned = mapper(x_train_words_cleaned, word_index)

x_test_words_cleaned = cleaner(x_test_words, stop_words)
x_test_indexes_cleaned = mapper(x_test_words_cleaned, word_index)

In [10]:
print(x_train_words[0])
print(x_train[0])
print(x_train_indexes_cleaned[0])
print(x_train_words_cleaned[0])

['the', 'as', 'you', 'with', 'out', 'themselves', 'powerful', 'lets', 'loves', 'their', 'becomes', 'reaching', 'had', 'journalist', 'of', 'lot', 'from', 'anyone', 'to', 'have', 'after', 'out', 'atmosphere', 'never', 'more', 'room', 'titillate', 'it', 'so', 'heart', 'shows', 'to', 'years', 'of', 'every', 'never', 'going', 'villaronga', 'help', 'moments', 'or', 'of', 'every', 'chest', 'visual', 'movie', 'except', 'her', 'was', 'several', 'of', 'enough', 'more', 'with', 'is', 'now', 'current', 'film', 'as', 'you', 'of', 'mine', 'potentially', 'unfortunately', 'of', 'you', 'than', 'him', 'that', 'with', 'out', 'themselves', 'her', 'get', 'for', 'was', 'camp', 'of', 'you', 'movie', 'sometimes', 'movie', 'that', 'with', 'scary', 'but', 'pratfalls', 'to', 'story', 'wonderful', 'that', 'in', 'seeing', 'in', 'character', 'to', 'of', '70s', 'musicians', 'with', 'heart', 'had', 'shadows', 'they', 'of', 'here', 'that', 'with', 'her', 'serious', 'to', 'have', 'does', 'when', 'from', 'why', 'what', 

## 3. Build Models

In [11]:
def get_vocab_size(x, y):
    x_dict = {0: [], 1: []}
    for i, sample in enumerate(x):
        class_idx = y[i]
        x_dict[class_idx] += sample

    p_vocab_size = len(collections.Counter(x_dict[1]))
    n_vocab_size = len(collections.Counter(x_dict[0]))

    return n_vocab_size, p_vocab_size

In [12]:
def build_ngrams(x, y, n):
    ngrams_dict = {0: [], 1: []}

    for i, sample in enumerate(x):
        class_idx = y[i]
        sample_ngrams = ngrams(sample, n)
        ngrams_dict[class_idx].extend(sample_ngrams)

    p_ngrams_freq = collections.Counter(ngrams_dict[1])
    n_ngrams_freq = collections.Counter(ngrams_dict[0])

    return n_ngrams_freq, p_ngrams_freq

In [13]:
def get_model(freqs, vocab_size, before_freqs=None):
    denominator = sum(freqs.values())
    model = dict()
    for gram, freq in freqs.items():
        if before_freqs is not None:
            denominator = before_freqs[gram[:-1]]
        value = (freq + 1) / (denominator + vocab_size)
        model[gram] = value
    return model

In [14]:
vocab_size = get_vocab_size(
    x_train_indexes_cleaned,
    y_train
)

In [15]:
print(vocab_size[0])
print(vocab_size[1])

61337
63648


### 3.1. Uni-Gram

In [16]:
n = 1
unigram_freqs = build_ngrams(
    x_train_indexes_cleaned,
    y_train,
    n
)

In [17]:
unigram_model = {
    0: get_model(unigram_freqs[0], vocab_size[0]),
    1: get_model(unigram_freqs[1], vocab_size[1])
}

### 3.2. Bi-Gram


In [18]:
n = 2
bigram_freqs = build_ngrams(
    x_train_indexes_cleaned,
    y_train,
    n
)

In [19]:
bigram_model = {
    0: get_model(bigram_freqs[0], vocab_size[0], unigram_freqs[0]),
    1: get_model(bigram_freqs[1], vocab_size[1], unigram_freqs[1])
}

### 3.3. Tri-Gram

In [20]:
n = 3
trigram_freqs = build_ngrams(
    x_train_indexes_cleaned,
    y_train,
    n
)

In [21]:
trigram_model = {
    0: get_model(trigram_freqs[0], vocab_size[0], bigram_freqs[0]),
    1: get_model(trigram_freqs[1], vocab_size[1], bigram_freqs[1])
}

## 4. Evaluate Model

In [22]:
pos_count = np.count_nonzero(y_test)
neg_count = y_test.shape[0] - pos_count
neg_class_prob = neg_count / y_test.shape[0]

print(neg_class_prob)

0.5


In [23]:
def nb_predict(x_test, n, model, neg_class_prob, unk_factor, vocab_size):
    preds = []
    
    for sample in x_test:
        neg_prob = np.log(neg_class_prob)
        pos_prob = np.log(1 - neg_class_prob)

        ngrams_list = ngrams(sample, n)
        for gram in ngrams_list:
            gram_neg_prob = model[0].get(gram)
            gram_pos_prob = model[1].get(gram)
            
            # <UNK>
            if gram_neg_prob is None:
                gram_neg_prob = 1 / (unk_factor[0] + vocab_size[0])
            if gram_pos_prob is None:
                gram_pos_prob = 1 / (unk_factor[1] + vocab_size[1])

            gram_neg_prob = np.log(gram_neg_prob)
            gram_pos_prob = np.log(gram_pos_prob)

            neg_prob += gram_neg_prob
            pos_prob += gram_pos_prob

        if neg_prob > pos_prob:
            preds.append(0)
        else:
            preds.append(1)
    return np.array(preds)

In [24]:
def calculate_metrics(y_r, y_p):
    tp = 0
    tn = 0
    fn = 0
    fp = 0

    for i in range(y_test.shape[0]):
        if y_r[i] == 1:
            if y_p[i] == 1:
                tp += 1
            elif y_p[i] == 0:
                fn += 1
        elif y_r[i] == 0:
            if y_p[i] == 0:
                tn += 1
            elif y_p[i] == 1:
                fp += 1

    a = (tp + tn) / (tp + tn + fn + fp)
    p = tp / (tp + fp)
    r = tp / (tp + fn)
    f1_score = (2 * p * r) / (p + r)

    return {
        'accuracy': a,
        'precision': p,
        'recall': r,
        'f1_score': f1_score,
    }

### 4.1 Unigram

In [25]:
# Unigram
# With cleaning stop words

n = 1
model = unigram_model

preds = nb_predict(
    x_test_indexes_cleaned,
    n,
    model,
    neg_class_prob,
    vocab_size,
    vocab_size
)

calculate_metrics(y_test, preds)

{'accuracy': 0.813,
 'precision': 0.861164958921813,
 'recall': 0.74632,
 'f1_score': 0.7996399948570694}

In [26]:
for i in range(5):
    print(' '.join([str(e) for e in x_test_words[i]]))
    print(f"Predicted class: {y_test[i]} \t Real class: {preds[i]}")
    print()

the wonder own as by is sequence i i jars roses to of hollywood br of down shouting getting boring of ever it sadly sadly sadly i i was then does don't close faint after one carry as by are be favourites all family turn in does as three part in another some to be probably with world uncaring her an have faint beginning own as is sequence
Predicted class: 0 	 Real class: 0

the as you world's is quite br mankind most that quest are chase to being quickly of little it time hell to plot br of something long put are of every place this consequence council of interplay storytelling being nasty not of you warren in is failed club i i of films pay so sequences mightily film okay uses to received wackiness if time done for room sugar viewer as cartoon of gives to forgettable br be because many these of reflection sugar contained gives it wreck scene to more was two when had find as you another it of themselves probably who interplay storytelling if itself by br about 1950's films not would eff

### 4.2 Bigram

In [27]:
# Bigram
# With cleaning stop words

n = 2
model = bigram_model

preds = nb_predict(
    x_test_indexes_cleaned,
    n,
    model,
    neg_class_prob,
    vocab_size,
    vocab_size
)

calculate_metrics(y_test, preds)

{'accuracy': 0.80428,
 'precision': 0.8911858479893037,
 'recall': 0.6932,
 'f1_score': 0.7798227062052828}

In [28]:
for i in range(5):
    print(' '.join([str(e) for e in x_test_words[i]]))
    print(f"Predicted class: {y_test[i]} \t Real class: {preds[i]}")
    print()

the wonder own as by is sequence i i jars roses to of hollywood br of down shouting getting boring of ever it sadly sadly sadly i i was then does don't close faint after one carry as by are be favourites all family turn in does as three part in another some to be probably with world uncaring her an have faint beginning own as is sequence
Predicted class: 0 	 Real class: 0

the as you world's is quite br mankind most that quest are chase to being quickly of little it time hell to plot br of something long put are of every place this consequence council of interplay storytelling being nasty not of you warren in is failed club i i of films pay so sequences mightily film okay uses to received wackiness if time done for room sugar viewer as cartoon of gives to forgettable br be because many these of reflection sugar contained gives it wreck scene to more was two when had find as you another it of themselves probably who interplay storytelling if itself by br about 1950's films not would eff

### 4.3 Trigram

In [29]:
# Trigram
# With cleaning stop words

n = 3
model = trigram_model

preds = nb_predict(
    x_test_indexes_cleaned,
    n,
    model,
    neg_class_prob,
    vocab_size,
    vocab_size
)

calculate_metrics(y_test, preds)

{'accuracy': 0.62788,
 'precision': 0.8506251370914675,
 'recall': 0.31024,
 'f1_score': 0.45465736561345915}

In [30]:
for i in range(5):
    print(' '.join([str(e) for e in x_test_words[i]]))
    print(f"Predicted class: {y_test[i]} \t Real class: {preds[i]}")
    print()

the wonder own as by is sequence i i jars roses to of hollywood br of down shouting getting boring of ever it sadly sadly sadly i i was then does don't close faint after one carry as by are be favourites all family turn in does as three part in another some to be probably with world uncaring her an have faint beginning own as is sequence
Predicted class: 0 	 Real class: 0

the as you world's is quite br mankind most that quest are chase to being quickly of little it time hell to plot br of something long put are of every place this consequence council of interplay storytelling being nasty not of you warren in is failed club i i of films pay so sequences mightily film okay uses to received wackiness if time done for room sugar viewer as cartoon of gives to forgettable br be because many these of reflection sugar contained gives it wreck scene to more was two when had find as you another it of themselves probably who interplay storytelling if itself by br about 1950's films not would eff