In [1]:
# Bag of words without frameworks

In [7]:
vocab = {}
word_encoding = 1
def bag_of_words(text):
    global word_encoding

    words = text.lower().split(" ")
    bag = {}

    for word in words:
        if word in vocab:
            encoding = vocab[word]
        else:
            vocab[word] = word_encoding
            encoding = word_encoding
            word_encoding += 1

        if encoding in bag:
            bag[encoding] += 1
        else:
            bag[encoding] = 1

    return bag

text = "test to see if this bag bag of words words gets counted"
bag = bag_of_words(text)
print(bag)
print(vocab)

{1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 2, 7: 1, 8: 2, 9: 1, 10: 1}
{'test': 1, 'to': 2, 'see': 3, 'if': 4, 'this': 5, 'bag': 6, 'of': 7, 'words': 8, 'gets': 9, 'counted': 10}


# Working with movie reviews

In [12]:
from keras.datasets import imdb
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

np_load_old = np.load
np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k)

VOCAB_SIZE = 88584

MAXLEN = 250
BATCH_SIZE = 64

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words = VOCAB_SIZE)
np.load = np_load_old


In [18]:
# Testing lengths of reveiws
len(train_data[5])

43

In [19]:
# Length of everything passed into the nn must be the same. Must preprocess data to compensate for varying lengths

In [20]:
# If len > 250, we trim it off, and if less than 250, add padding with 0s to make it reach 250
train_data = sequence.pad_sequences(train_data, MAXLEN)
test_data = sequence.pad_sequences(test_data, MAXLEN)

In [23]:
# RE-Testing lengths of reveiws
len(train_data[5])

#Padding successful

250

In [27]:
### MODEL ###

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, 32),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

#sigmoid to help classify < 0.5 as negative, > 0.5 positive

In [28]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 32)          2834688   
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 2,843,041
Trainable params: 2,843,041
Non-trainable params: 0
_________________________________________________________________


In [29]:
### TRAINING ###

In [30]:
model.compile(loss="binary_crossentropy", optimizer='rmsprop', metrics=['acc'])

history = model.fit(train_data, train_labels, epochs=10, validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [31]:
results = model.evaluate(test_data, test_labels)
print(results)

[0.3976806696534157, 0.85764]


In [32]:
# 85% accuracy not good enough, but a good start

In [33]:
### MAKING PREDICTIONS ###

# requires preprocessing any input to the model

In [36]:
word_index = imdb.get_word_index()

def encode_text(text):
    tokens = keras.preprocessing.text.text_to_word_sequence(text)
    tokens = [word_index[word] if word in word_index else 0 for word in tokens]
    return sequence.pad_sequences([tokens], MAXLEN)[0]

text = "THat was quite a good movie, I'd say"
encoded = encode_text(text)

# turns any input text into the format that the model takes in

In [39]:
# a decoder function for any potential reverse use case

reverse_word_index = {value: key for (key, value) in word_index.items()}

def decode_integers(integers):
    PAD = 0
    text = ""
    for num in integers:
        if num != PAD:
            text += reverse_word_index[num] + " "
    return text[:-1]

# Testing decoder function on the text string i added above
print(decode_integers(encoded))

that was quite a good movie i'd say


In [74]:
# make predictions

def predict(text):
    encoded_text = encode_text(text)
    pred = np.zeros((1,250))
    pred[0] = encoded_text
    result = model.predict(pred)
    print(str(result[0]))
    
review_text = "Extremely interesting"
result = predict(review_text)

[0.71187735]
