In [1]:
import os
import tensorflow as tf
import numpy as np
import matplotlib
import json
import string

In [2]:
os.environ['TF_FORCE_GPU_ALLOW_GROWTH']='true'

In [3]:
def encode_seq(data, length):
    """Encode a tokenized text as a fixed-length vector of word ids.

    Every non-empty token is looked up in the module-level ``dictionary``.
    The ids are then right-padded with -1, or truncated, so the result always
    has exactly ``length`` entries and can be stacked into a 2-D array.

    Args:
        data: iterable of word strings; empty strings are skipped.
        length: number of entries in the returned vector.

    Returns:
        1-D ``np.int64`` array of shape ``(length,)``.
    """
    pad_id = -1
    size = max(int(length), 0)

    # Words missing from the dictionary get the padding id. A bare
    # dictionary.get(word) returns None for them, which silently turns the
    # whole vector into an object-dtype array.
    ids = [dictionary.get(word, pad_id) for word in data if word != '']

    # Pre-filling with an explicit dtype keeps the result integer-typed even
    # when no padding is needed (np.append with an empty pad list upcasts to
    # float64) and guarantees the fixed length for over-long inputs.
    encoded = np.full(size, pad_id, dtype=np.int64)
    kept = ids[:size]
    if kept:
        encoded[:len(kept)] = kept
    return encoded

def build_dictionary(data, index):
    """Feed every word of a tokenized corpus through ``word2vec``.

    Args:
        data: iterable of lines, each line itself an iterable of words.
        index: id to offer to the first word.

    Returns:
        The index value returned by the last ``word2vec`` call, or ``index``
        unchanged when ``data`` contains no words.
    """
    next_index = index
    # Flatten lines -> words lazily, preserving the original visiting order.
    for token in (word for line in data for word in line):
        next_index = word2vec(token, next_index)
    return next_index

def word2vec(data, index):
    """Register the word ``data`` in the module-level ``dictionary``.

    Despite the name, no embedding is computed: the function only maintains
    the word -> integer id mapping.

    Args:
        data: the word to register; the empty string is ignored.
        index: id to assign if the word has not been seen before.

    Returns:
        ``index + 1`` when the word was newly added, otherwise ``index``.
    """
    if data == '':
        return index
    # Test membership directly. Comparing the stored id against ``index``
    # misreports an already-known word as new whenever its id happens to
    # equal ``index``, which advances the counter and leaves a gap.
    if data in dictionary:
        return index
    dictionary[data] = index
    return index + 1

In [20]:
# Shared notebook state; re-running this cell resets all of it.
dictionary = {}                # word -> integer id, filled in by word2vec
dictIndex, maxLength = 0, 0    # next id to hand out / longest token list seen

# Review texts (X) and ratings (Y) for the training and evaluation files.
dataX = []
dataY = []
evalX = []
evalY = []

In [21]:
   # Read in the data
    
def _load_reviews(path):
    """Read a JSON-lines review file.

    Each non-blank line of ``path`` is parsed as one JSON object from which
    the ``reviewText`` and ``overall`` fields are collected.

    Returns:
        ``(texts, ratings)`` as two parallel Python lists.
    """
    texts, ratings = [], []
    # The context manager closes the handle even if a line fails to parse;
    # a bare open() inside the for-statement never closes it explicitly.
    with open(path, 'r', encoding='utf-8') as handle:
        for line in handle:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            record = json.loads(line)
            texts.append(record['reviewText'])
            ratings.append(record['overall'])
    return texts, ratings


# Building fresh lists on every run keeps this cell idempotent. Appending to
# the existing dataX/evalX fails once an earlier run has turned them into arrays.
dataX, dataY = map(np.array, _load_reviews('data.json'))
evalX, evalY = map(np.array, _load_reviews('eval.json'))

In [22]:
    
# Normalize the reviews: lowercase, remove punctuation, split into words.
def _tokenize(texts):
    """Lowercase each text, delete ASCII punctuation and split on single spaces.

    Returns a list with one list of word strings per input text. Runs of
    spaces produce empty-string tokens, exactly as ``str.split(' ')`` does.
    """
    # Build the translation table once instead of once per text.
    strip_punct = str.maketrans('', '', string.punctuation)
    return [str(text).lower().translate(strip_punct).split(' ') for text in texts]


# Reviews have different word counts, so the token lists stay plain Python
# lists. np.array() on ragged nested lists raises ValueError on NumPy >= 1.24.
# If every row happened to have the same length it would instead build a 2-D
# array that np.append flattens into individual words.
dataX = _tokenize(dataX)
evalX = _tokenize(evalX)

# All tokenized reviews from both files, in training-then-evaluation order.
combData = dataX + evalX

In [23]:
# The longest token list across both files sets the encoded vector width.
# The vocabulary is then grown from the training reviews first and the
# evaluation reviews second, threading the running index through both calls.
maxLength = int(max(len(tokens) for tokens in combData))
dictIndex = build_dictionary(evalX, build_dictionary(dataX, dictIndex))

In [24]:
# Encode every tokenized review as a vector of maxLength word ids.
dataX = np.array([encode_seq(line, maxLength) for line in dataX])
evalX = np.array([encode_seq(line, maxLength) for line in evalX])

# One-hot encode the ratings with a class count shared by both splits.
# Left to infer it, to_categorical uses max(label) + 1 per call, so a split
# that lacks the highest rating would get a narrower matrix than the other.
num_classes = int(max(np.max(dataY), np.max(evalY))) + 1
dataY = tf.keras.utils.to_categorical(dataY, num_classes=num_classes)
evalY = tf.keras.utils.to_categorical(evalY, num_classes=num_classes)

In [26]:
# Build the fully connected classifier in one expression: the input is the
# maxLength-wide id vector, the output is one softmax unit per one-hot label column.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(512, activation='relu', input_shape=(maxLength,)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(512, activation='tanh'),
    tf.keras.layers.Dense(evalY.shape[1], activation='softmax'),
])

# Categorical cross-entropy matches the one-hot labels; accuracy is tracked
# as the only metric.
model.compile(
    optimizer=tf.keras.optimizers.Nadam(),
    loss=tf.keras.losses.categorical_crossentropy,
    metrics=['accuracy'],
)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               1385472   
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               65664     
_________________________________________________________________
dense_3 (Dense)              (None, 512)               66048     
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 3

In [28]:
# Training hyper-parameters: mini-batch size, number of passes over the data,
# and the fraction of the training set handed to Keras as validation_split.
batch_size, epochs, val_split = 100, 1500, 0.2

# verbose=0 silences the per-epoch log; the metrics stay available on `history`.
history = model.fit(
    x=dataX,
    y=dataY,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=val_split,
    verbose=0,
)

# Last expression of the cell: shows [loss, accuracy] on the evaluation set.
model.evaluate(evalX, evalY)



[1.1082302331924438, 0.609000027179718]

In [31]:
# Visual Evaluation: print true vs. predicted class for the first few
# evaluation reviews.

test = evalX[0:20]            # preview window: at most 20 rows
out = model.predict(evalX)    # class probabilities for every evaluation row

# Bound the loop by the preview window rather than a literal 20, so an
# evaluation set with fewer than 20 rows no longer raises IndexError.
for i in range(len(test)):
    print("Actual:", np.argmax(evalY[i]))
    print("Net:", np.argmax(out[i]))

Actual: 5
Net: 5
Actual: 5
Net: 5
Actual: 5
Net: 5
Actual: 5
Net: 5
Actual: 5
Net: 5
Actual: 5
Net: 5
Actual: 5
Net: 5
Actual: 4
Net: 5
Actual: 4
Net: 5
Actual: 5
Net: 5
Actual: 4
Net: 5
Actual: 5
Net: 5
Actual: 5
Net: 5
Actual: 5
Net: 5
Actual: 4
Net: 5
Actual: 5
Net: 5
Actual: 5
Net: 5
Actual: 4
Net: 5
Actual: 5
Net: 5
Actual: 5
Net: 5
