In [16]:
import ast
import pandas as pd
import numpy as np
import time
from utils import *

from nltk.tokenize.treebank import TreebankWordDetokenizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Embedding, Dense, Flatten, Conv1D, Dropout, MaxPooling1D, GlobalMaxPooling1D, LSTM, GRU, SimpleRNN
from tensorflow.keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
import tensorflow.random as random
from sklearn.model_selection import KFold

seed = 8
random.set_seed(seed)
np.random.seed(seed)

In [2]:
embeddings_dict = {}
with open("glove.42B.300d.txt", "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        token = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[token] = vector

In [3]:
df = pd.read_csv("cleaned_data/cleaned_reviews_4.csv")
df["text"] = df["text"].apply(lambda x: ast.literal_eval(x))
df["detoken"] = df["text"].apply(lambda y: TreebankWordDetokenizer().tokenize(y))
df.head()

Unnamed: 0,review,text,rating,sentiment,detoken
0,"This started out strong, but it went downhill ...","[start, out, strong, go, downhill, fairly, qui...",1,0,start out strong go downhill fairly quickly no...
1,"A decently written YA book, but I can't even c...","[decently, write, ya, book, can, even, conside...",1,0,decently write ya book can even consider end o...
2,"Ugh...I tried, I honestly tried. I'm a huge fa...","[ugh, try, honestly, try, huge, fan, scott, we...",1,0,ugh try honestly try huge fan scott westerfeld...
3,I hate to give any book this low of a rating -...,"[hate, give, any, book, low, rat, know, take, ...",1,0,hate give any book low rat know take actually ...
4,Main points: \n 1. Never ever introduce a poin...,"[main, point, never, ever, introduce, point, v...",1,0,main point never ever introduce point view sid...


In [4]:
df_pos, df_neg = split_sentiment(df)
df_train, df_test = split_train_test(df_pos, df_neg, 333)

train_words = [word for text in df_train["text"] for word in text]
train_text_length = [len(text) for text in df_train["text"]]

vocab = list(set(train_words))
vocab_size = len(vocab)

test_words = [word for text in df_test["text"] for word in text]
test_vocab_size = len(list(set(test_words)))

print(f"There are {len(train_words)} train words in total and the vocabulary size is {vocab_size}.")
print(f"There are {len(test_words)} test words in total and the vocabulary size is {test_vocab_size}.")

There are 1218433 train words in total and the vocabulary size is 29796.
There are 303603 test words in total and the vocabulary size is 15661.


In [26]:
inputs = df["detoken"].tolist()
targets = np.asarray(df["sentiment"])

MAX_WORDS = 20000
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(inputs)

word_index = tokenizer.word_index

inputs_sequences = tokenizer.texts_to_sequences(inputs)

MAX_LENGTH = 50

inputs_padded = pad_sequences(inputs_sequences, maxlen=MAX_LENGTH, padding="post")

EMBEDDING_DIM = 300
vocab_size = min(len(word_index) + 1, MAX_WORDS)

embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))

for word, i in word_index.items():
    if i >= MAX_WORDS:
        continue
    
    try:
        embedding_vector = embeddings_dict[word]
        embedding_matrix[i] = embedding_vector
    
    except KeyError:
        embedding_vector = np.zeros(EMBEDDING_DIM)
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(vocab_size, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_LENGTH, trainable=False)

del(embedding_matrix)

In [27]:
kfold = KFold(n_splits=10, shuffle=True)
acc_per_fold = []
loss_per_fold = []

# K-fold Cross Validation model evaluation
fold_no = 1
for train, test in kfold.split(inputs_padded, targets):

  # Define the model architecture
  model = Sequential()
  model.add(embedding_layer)
  model.add(SimpleRNN(32))
  model.add(Dense(16, activation="relu"))
  model.add(Dense(1, activation='sigmoid'))

  # Compile the model
  EPOCHS = 5
  LEARNING_RATE = 0.001
  BATCH_SIZE = 64
  optimizer = Adam(learning_rate=LEARNING_RATE)
  model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])


  # Generate a print
  print('------------------------------------------------------------------------')
  print(f'Training for fold {fold_no} ...')

  # Fit data to model
  history = model.fit(inputs_padded[train], targets[train], batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=0, validation_split=0.3)

  # Generate generalization metrics
  scores = model.evaluate(inputs_padded[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

------------------------------------------------------------------------
Training for fold 1 ...
Score for fold 1: loss of 0.7832042574882507; accuracy of 49.78571534156799%
------------------------------------------------------------------------
Training for fold 2 ...
Score for fold 2: loss of 0.7770660519599915; accuracy of 50.428569316864014%
------------------------------------------------------------------------
Training for fold 3 ...
Score for fold 3: loss of 0.7798271179199219; accuracy of 55.642855167388916%
------------------------------------------------------------------------
Training for fold 4 ...
Score for fold 4: loss of 0.5693315267562866; accuracy of 73.21428656578064%
------------------------------------------------------------------------
Training for fold 5 ...
Score for fold 5: loss of 0.7021685838699341; accuracy of 65.4285728931427%
------------------------------------------------------------------------
Training for fold 6 ...
Score for fold 6: loss of 0.7152

In [28]:
# == Provide average scores ==
print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')
print('------------------------------------------------------------------------')

------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 0.7832042574882507 - Accuracy: 49.78571534156799%
------------------------------------------------------------------------
> Fold 2 - Loss: 0.7770660519599915 - Accuracy: 50.428569316864014%
------------------------------------------------------------------------
> Fold 3 - Loss: 0.7798271179199219 - Accuracy: 55.642855167388916%
------------------------------------------------------------------------
> Fold 4 - Loss: 0.5693315267562866 - Accuracy: 73.21428656578064%
------------------------------------------------------------------------
> Fold 5 - Loss: 0.7021685838699341 - Accuracy: 65.4285728931427%
------------------------------------------------------------------------
> Fold 6 - Loss: 0.7152639031410217 - Accuracy: 53.42857241630554%
-----------------------------------------------------------------------