In [1]:
# Mount Google Drive inside the Colab runtime, re-mounting if it is already mounted.
from google.colab import drive

drive_mount_point = '/content/gdrive'
drive.mount(drive_mount_point, force_remount=True)

Mounted at /content/gdrive


# Packages Import

In [None]:
pip install -q -U keras-tuner

In [None]:
import datetime
import os
import shutil
import string
import urllib.request

import gensim
import keras_tuner as kt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input,concatenate,Activation, Dense, Dropout, Embedding, Flatten,Bidirectional, LSTM
from keras.models import Model
from keras.initializers import Constant
from keras.optimizers import Adam

import matplotlib.image  as mpimg
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
  """Draw the training and validation curves of `metric` on the current axes.

  history: object whose `.history` mapping holds one sequence of values for
           `metric` and one for 'val_' + `metric`.
  metric:  name of the training metric to plot.
  """
  curves = history.history
  val_metric = 'val_'+metric
  # Training curve first, validation curve second, so the legend order matches.
  plt.plot(curves[metric])
  plt.plot(curves[val_metric], '')
  plt.ylabel(metric)
  plt.xlabel("Epochs")
  plt.legend([metric, val_metric])

# # Load the TensorBoard notebook extension
# %load_ext tensorboard
# # Clear any logs from previous runs on tensorboard
# !rm -rf ./logs/

In [None]:
# Show up to 100 characters per column when a pandas dataframe is displayed
pd.options.display.max_colwidth = 100

In [None]:
# Report the TensorFlow version, the execution mode and whether a GPU is visible
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
gpu_devices = tf.config.list_physical_devices('GPU')
gpu_status = "available" if gpu_devices else "NOT AVAILABLE"
print("GPU is", gpu_status)

# Dataset import and feature selection

In [None]:
# Check whether the dataset is already present at DATASET_PATH; otherwise download
# the archive from DATASET_URL and unpack it into the directory the file is read from.
DATASET_PATH = 'gdrive/MyDrive/datasets/amazon_reviews.txt'
DATASET_URL = 'https://drive.google.com/uc?id=1-LYI_s6oZ0OTe3I0vFYRYIBVhjFswReY&export=download'
ARCHIVE_NAME = 'amazon_reviews.zip'

if not os.path.isfile(DATASET_PATH):
    dataset_dir = os.path.dirname(DATASET_PATH)
    os.makedirs(dataset_dir, exist_ok=True)
    print('Downloading DB to train')
    # Standard-library download; the target file name is given explicitly so that
    # the unpack step below always finds the archive.
    urllib.request.urlretrieve(DATASET_URL, ARCHIVE_NAME)
    print('Download Completed!\nUnzipping...')
    # Unpack next to DATASET_PATH (not into the working directory), otherwise the
    # read below cannot find the file.
    # NOTE(review): assumes the archive holds amazon_reviews.txt at its top level - TODO confirm.
    shutil.unpack_archive(ARCHIVE_NAME, dataset_dir)
else:
    print("The dataset is already downloaded")

# The dataset is a tab separated file, therefore the delimiter is "\t".
# Rows with a wrong number of fields are skipped instead of raising an error.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0
# in favour of `on_bad_lines='skip'`; it is kept because the pandas version of the
# target environment is not visible here.
df = pd.read_csv(DATASET_PATH, error_bad_lines=False, delimiter="\t")
df.head()

We perform feature selection by keeping only the features that will help our model perform better. For that purpose we decided to keep only the columns listed below: 
 - **REVIEW_TEXT**: The comment text that is given as the review of a product. 
 - **RATING**: The actual rating, an integer from 1 to 5 with 1 being the lowest and 5 the highest. 
 - **VERIFIED_PURCHASE**: A parameter which indicates whether the user who comments is a verified purchaser ("Y") or not ("N"). Later on we are going to replace those values with 1 and 0 respectively. 
 - **LABEL**: The parameter which indicates whether there is a mismatch between the "RATING" and the "REVIEW_TEXT": it takes the value 1 if there is one, meaning that the review is a fake one, and the value 0 otherwise.


In [None]:
# Columns kept for modelling: the review text, two auxiliary inputs and the target
features = [
    "REVIEW_TEXT",
    "RATING",
    "LABEL",
    "VERIFIED_PURCHASE",
]
df = df[features]
df.head()

# Data pre-processing 

In this step we pre-process our data in order to bring it into a format that is more suitable for manipulation and easier for our model to handle.

 - Firstly, we change our categorical feature "VERIFIED_PURCHASE" from "Y" and "N" to 1 and 0. This helps our model to work in a more efficient way and to "understand" better the "meaning" of this feature. 
 - Secondly, we pre-process the "REVIEW_TEXT" column by removing all the punctuation marks and making all the words lower-case. By doing so we avoid adding to our vocabulary several variants of a word with exactly the same meaning (for example "Good", "good" and "good!"). This leaves more capacity in our embedding vocabulary.

In [None]:
# Replace the categorical values "Y" and "N" by their integer category codes
# (categories are sorted, so "N" becomes 0 and "Y" becomes 1)
df['VERIFIED_PURCHASE'] = df['VERIFIED_PURCHASE'].astype('category').cat.codes

In [None]:
#We define a function that is going to make our word lower-case and then it will remove punctuations
def remove_punctuation(txt):
  """Return `txt` lower-cased, with every character of `string.punctuation` removed.

  Each element of `txt` is lower-cased on its own and the result is then
  filtered character by character; whitespace and digits are kept.
  """
  kept_chars = []
  for piece in txt:
    # Lower-casing one element can yield several characters, so filter each of them.
    for ch in piece.lower():
      if ch in string.punctuation:
        continue
      kept_chars.append(ch)
  return "".join(kept_chars)

In [None]:
# Clean every entry of the "REVIEW_TEXT" column with "remove_punctuation"
df['REVIEW_TEXT'] = df['REVIEW_TEXT'].apply(remove_punctuation)

In [None]:
# Number of (rows, columns) left after feature selection and text cleaning
print(df.shape)

Our dataset consists of 21000 records.
- Since this is the case, we split our data into train, validation and test sets in the proportions 80%, 10%, 10%: we want to keep as much data as possible for training, because the amount of data available is small for deep learning purposes.

In [None]:
# We want to split the data in 80:10:10 for train:valid:test dataset
train_size = 0.8
# Fixed seed so that a fresh "Restart & Run All" reproduces exactly the same split
split_seed = 42

X = df.copy()

# In the first step we split the data into the training set and the remaining data
X_train, X_rem = train_test_split(X, train_size=train_size, random_state=split_seed)

# Now, since we want the valid and test size to be equal (10% each of overall data),
# we split the remaining data in half (test_size = 50% of the remaining data)
test_size = 0.5
X_valid, X_test = train_test_split(X_rem, test_size=test_size, random_state=split_seed)

print("Training: ", X_train.shape)
print("Validation: ", X_valid.shape)
print("Test: ", X_test.shape)

- We further separate the columns "RATING" and "VERIFIED_PURCHASE" from the rest of our dataset, since we want to feed those features to our model as separate inputs. The "LABEL" column is our target column, which we are going to use to check the accuracy of our model by comparing the model's predictions with the real labels.

In [None]:
# The three splits, always handled in the order train, validation, test
data_splits = (X_train, X_valid, X_test)

# We separate the "RATING" column from the rest of the dataset
train_rating, valid_rating, test_rating = [split.pop('RATING') for split in data_splits]

# We separate the "VERIFIED_PURCHASE" column from the rest of the dataset
train_pursh, valid_pursh, test_pursh = [split.pop('VERIFIED_PURCHASE') for split in data_splits]

# We separate the "LABEL" (target) column from the rest of the dataset
train_target, valid_target, test_target = [split.pop('LABEL') for split in data_splits]

- Now that our train, validation and test sets are prepared, we tokenize and pad the "REVIEW_TEXT" column in order to create our vocabulary. We did not set any vocabulary size limit for the tokenization process. The padded sequence length is set to 100.

In [None]:
# Every review is cut or padded to this many tokens
max_length = 100
# Placeholder token for words that are not in the fitted vocabulary
oov_token = "<OOV>"
# Both padding and truncation are applied at the end of a sequence
padding_type = trunction_type = "post"

In [None]:
# Fit the vocabulary on the training reviews only
tokenizer = Tokenizer(oov_token=oov_token)
tokenizer.fit_on_texts(X_train['REVIEW_TEXT'])
# One extra index on top of the entries of word_index
vocab_size = 1 + len(tokenizer.word_index)
print('Vocab Size is ',vocab_size)

In [None]:
def _pad_to_max_length(sequences):
    """Pad / truncate integer sequences with the notebook-wide padding settings."""
    return pad_sequences(sequences, maxlen=max_length,
                         padding=padding_type, truncating=trunction_type)

# Tokenize and pad the "REVIEW_TEXT" column of the training dataset
X_train_sequences = tokenizer.texts_to_sequences(X_train.REVIEW_TEXT)
X_train_padded = _pad_to_max_length(X_train_sequences)

# Tokenize and pad the "REVIEW_TEXT" column of the validation dataset
X_val_sequences = tokenizer.texts_to_sequences(X_valid.REVIEW_TEXT)
X_val_padded = _pad_to_max_length(X_val_sequences)

# Tokenize and pad the "REVIEW_TEXT" column of the test dataset
X_test_sequences = tokenizer.texts_to_sequences(X_test.REVIEW_TEXT)
X_test_padded = _pad_to_max_length(X_test_sequences)

# Pre-Trained word embeddings

In order to help our model reach better accuracy we decided to use word embeddings, which are going to help our model to understand the "meaning" of each word that is going to appear during the training procedure. 

*The main goal of this notebook is to create a model and check its results when a Word2Vec embedding is used.* 

For that purpose we used the concept of transfer learning: we first train a Word2Vec embedding on our training reviews with the gensim library, and then we reuse it as a fixed (pre-trained) embedding layer in our model.

In [None]:
# Split every training review into its list of space-separated words
documents = [review.split(" ") for review in X_train.REVIEW_TEXT]

In [None]:
# Number of tokenized training reviews that Word2Vec is going to see
len(documents)

In [None]:
# Set up the hyperparameters for our Word2Vec model
W2V_SIZE = 100       # dimensionality of every word vector
W2V_WINDOW = 7       # context window passed to Word2Vec
W2V_EPOCH = 64       # number of training epochs used in the training cell
W2V_MIN_COUNT = 5    # minimum word frequency passed to Word2Vec

# gensim 4.0 renamed the `size` argument of Word2Vec to `vector_size`; pick the
# keyword that the installed version understands so that both versions work.
_gensim_major = int(gensim.__version__.split(".")[0])
_w2v_size_kwarg = "vector_size" if _gensim_major >= 4 else "size"

# We create the (still untrained) Word2Vec model from gensim
w2v_model = gensim.models.word2vec.Word2Vec(window=W2V_WINDOW,
                                            min_count=W2V_MIN_COUNT,
                                            workers=8,
                                            **{_w2v_size_kwarg: W2V_SIZE})

# We build the vocabulary from the tokenized training reviews
w2v_model.build_vocab(documents)

In [None]:
# Then we train the embedding on the tokenized training reviews
n_documents = len(documents)
w2v_model.train(documents, epochs=W2V_EPOCH, total_examples=n_documents)

In [None]:
# We test that the word embeddings were created successfully by checking the most similar words of a given word
w2v_model.wv.most_similar("bad") 

In [None]:
# We create our embedding matrix: row r holds the Word2Vec vector of the word
# whose tokenizer index is r; rows of words unknown to Word2Vec stay all-zero.
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
trained_vectors = w2v_model.wv
for token, row in tokenizer.word_index.items():
    if token not in trained_vectors:
        continue
    embedding_matrix[row] = trained_vectors[token]
print(embedding_matrix.shape)

# The model


The model that we created is based on the idea that we want to make predictions from several different features of the dataset and to check the results obtained with them. The features that seemed to perform better, and for which there is a conceptual reason to expect so, are "REVIEW_TEXT", "RATING" and "VERIFIED_PURCHASE", given that we want to predict the "LABEL" feature. 

So our model consists of 3 heads under the following architecture : 
1. The head that processes the "**REVIEW_TEXT**" feature, on which we apply the Word2Vec embedding layer that maps every word to a 100-dimensional vector. Then we use Dropout as a technique to avoid overfitting our model. Then we use a bi-directional LSTM in order to process the sequences and to be able to extract the "useful" information from each individual review. The output of the bi-directional LSTM is then passed to fully connected layers, whose output we are going to use later on in the concatenation step of all our feature heads.

2. The second head processes the "**RATING**" feature using a sequence of fully connected (dense) layers, which brings it to the same dimension as our first and third heads, something that is useful for the concatenation of our heads. 

3. The third head follows the same logic as the second one. Here we process the "**VERIFIED_PURCHASE**" feature, again using fully connected dense layers which bring it to the proper dimension for the concatenation procedure.

Since all three heads now end in a fully connected dense layer, we concatenate them and pass the result to another fully connected dense layer in order to extract the relations between these features.

Then our model is compiled using the Adam optimizer, with binary cross-entropy as our loss function. 

In [None]:
# Frozen embedding layer initialised with the Word2Vec vectors.
# Both dimensions are read from the matrix that initialises the layer: the
# output dimension is the width of a word vector (W2V_SIZE), not the padded
# sequence length `max_length` - the two only happen to share the value 100.
embedding_rows, embedding_dim = embedding_matrix.shape
embedding_layer = Embedding(input_dim=embedding_rows,
                            output_dim=embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            trainable=False)

In [None]:
# --- Head 1: review text -> frozen Word2Vec embedding -> BiLSTM -> dense ---
# The input length follows `max_length` (the padding length of the sequences)
# instead of repeating the literal 100, so both cannot drift apart.
review_branch_ip = Input(shape=(max_length,), name='Review_input')
review_branch = embedding_layer(review_branch_ip)
review_branch = Dropout(0.2)(review_branch)
review_branch = Bidirectional(
    LSTM(64, dropout=0.2,recurrent_dropout=0)
)(review_branch)
review_branch = Dense(64,activation='relu')(review_branch)
review_branch_op = Dense(16, activation='relu')(review_branch)

# --- Head 2: rating (one value per review) -> dense layers ---
rating_branch_ip = Input(shape=(1,), name='Rating_input')
rating_branch = Dense(8,activation='relu')(rating_branch_ip)
rating_branch = Dropout(0.2)(rating_branch)
rating_branch_op = Dense(16,activation='relu')(rating_branch)

# --- Head 3: verified-purchase flag (one value per review) -> dense layers ---
verified_purchase_branch_ip = Input(shape=(1,), name='Verified_Purchase_input')
verified_purchase_branch = Dense(8,activation='relu')(verified_purchase_branch_ip)
verified_purchase_branch = Dropout(0.2)(verified_purchase_branch)
verified_purchase_branch_op = Dense(16,activation='relu')(verified_purchase_branch)

# Every head ends in 16 units; they are joined and reduced to one sigmoid output
concat = concatenate([review_branch_op, rating_branch_op, verified_purchase_branch_op], name='Concatenate')

final_op = Dense(8, activation='relu')(concat)
final_output = Dense(1,activation='sigmoid')(final_op)

model = Model(inputs=[review_branch_ip,rating_branch_ip,verified_purchase_branch_ip], outputs=final_output,
                  name='Final_output')

In [None]:
# Adam optimizer whose gradients are clipped element-wise (clipvalue) to avoid
# exploding gradients; the loss is the binary crossentropy
clipped_adam = Adam(clipvalue=0.5)
model.compile(loss='binary_crossentropy',
              optimizer=clipped_adam,
              metrics=['acc'])

In [None]:
# We display the architecture of our model together with its number of trainable and non-trainable parameters
model.summary()

In [None]:
# One TensorBoard log directory per run, named after the current timestamp
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = f"logs/fit/{run_stamp}"
tensorboard_callback = tf.keras.callbacks.TensorBoard(histogram_freq=1, log_dir=log_dir)

In [None]:
# First attempt to get prediction results: batch size of 32, trained for at most 10 epochs.
# Early stopping ends the training once the validation loss has not improved for 3 epochs.
stop_early_model = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
history = model.fit(
    [X_train_padded,train_rating,train_pursh],
    train_target,
    validation_data=([X_val_padded,valid_rating,valid_pursh], valid_target),
    epochs=10,
    batch_size=32,
    callbacks=[stop_early_model],
)

In [None]:
# Training / validation curves of the baseline model.
# The model is compiled with metrics=['acc'], so the history keys are 'acc' and
# 'val_acc'; asking for 'accuracy' raises a KeyError.
plt.figure(figsize=(16, 6))
plt.subplot(1, 2, 1)
plot_graphs(history, 'acc')
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')

### Hyperparameters Tuning

Now that we have made a first attempt and obtained our model's first results in terms of accuracy, we are going to apply some hyperparameter tuning strategies. For that reason we tested 2 hyperparameter tuning algorithms that are included in the "Keras Tuner" package: 

1. The first is the **Bayesian Optimization** algorithm, which provides a search strategy in which every candidate model is trained for the given number of epochs.
2. Then we also checked the **Hyperband Optimization** algorithm, where we set the maximum number of epochs and the tuner then applies the hyperparameter search within the range of that maximum.

In [None]:
#We define a function which constinst of the model strucure and our hyperparameter search domain for our model
def model_builder(hp):
  """Build and compile the three-head model for one point of the search space.

  hp: Keras Tuner hyperparameters object. Three values are requested from it:
      'Dropout'      - rate of the Dropout layers of all three heads (0.1 to 0.5),
      'Lstm_units'   - units of the LSTM and of the dense layer after it (8 to 64),
      'Lstm_Dropout' - input dropout of the LSTM (0.1 to 0.5).

  Returns the model, compiled with Adam (clipvalue=0.5), binary crossentropy
  and the 'acc' metric. The module-level `embedding_layer` is reused as is.
  """
  hp_dropout = hp.Float('Dropout', min_value=0.1, max_value=0.5, step=0.1)
  hp_lstm_units = hp.Int('Lstm_units', min_value=8, max_value=64, step=8)
  hp_lstm_dropout = hp.Float('Lstm_Dropout', min_value=0.1, max_value=0.5, step=0.1)
  # The remaining dense widths and the learning rate are not part of the search space.

  # Head 1: review text. The input length follows `max_length` (the padding
  # length of the sequences) instead of repeating the literal 100.
  review_branch_ip = Input(shape=(max_length,), name='Review_input')
  review_branch = embedding_layer(review_branch_ip)
  review_branch = Dropout(hp_dropout)(review_branch)
  review_branch = Bidirectional(
      LSTM(hp_lstm_units, dropout=hp_lstm_dropout,recurrent_dropout=0)
  )(review_branch)
  review_branch = Dense(hp_lstm_units,activation='relu')(review_branch)
  review_branch_op = Dense(16, activation='relu')(review_branch)

  # Head 2: rating
  rating_branch_ip = Input(shape=(1,), name='Rating_input')
  rating_branch = Dense(8,activation='relu')(rating_branch_ip)
  rating_branch = Dropout(hp_dropout)(rating_branch)
  rating_branch_op = Dense(16,activation='relu')(rating_branch)

  # Head 3: verified-purchase flag
  verified_purchase_branch_ip = Input(shape=(1,), name='Verified_Purchase_input')
  verified_purchase_branch = Dense(8,activation='relu')(verified_purchase_branch_ip)
  verified_purchase_branch = Dropout(hp_dropout)(verified_purchase_branch)
  verified_purchase_branch_op = Dense(16,activation='relu')(verified_purchase_branch)

  # Join the three 16-unit heads and reduce them to a single sigmoid output
  concat = concatenate([review_branch_op, rating_branch_op, verified_purchase_branch_op], name='Concatenate')

  final_op = Dense(8, activation='relu')(concat)
  final_output = Dense(1,activation='sigmoid')(final_op)

  model = tf.keras.Model(inputs=[review_branch_ip,rating_branch_ip,verified_purchase_branch_ip], outputs=final_output,
                      name='Final_output')

  model.compile(optimizer=Adam(clipvalue=0.5) , 
               loss='binary_crossentropy', 
               metrics=['acc'])

  return model

In [None]:
# Early stopping for the tuner trials: stop once the validation loss has not
# improved for 5 consecutive epochs
stop_early = tf.keras.callbacks.EarlyStopping(patience=5, monitor='val_loss')

<h5> Bayesian Hyperparameter Tuning </h5>

In [None]:
# This class sets up our Bayesian Tuner and then possibility to check our model based on different batch sizes
class MyTuner_Bayesian(kt.tuners.BayesianOptimization):
  """Bayesian-optimization tuner that also searches over the batch size."""

  def run_trial(self, trial, *args, **kwargs):
    """Run one trial with a batch size taken from the trial's hyperparameters.

    'batch_size' (8 to 64 in steps of 8) is registered on the trial and is
    forwarded to the parent implementation through `kwargs`.
    """
    kwargs['batch_size'] = trial.hyperparameters.Int('batch_size', min_value=8, max_value=64, step=8)

    # The result of the parent call has to be handed back: recent Keras Tuner
    # versions read the trial results from the return value of `run_trial`.
    return super(MyTuner_Bayesian, self).run_trial(trial, *args, **kwargs)
   

In [None]:
# Bayesian tuner: 30 trials, ranked by the validation accuracy ('val_acc')
bayesian_tuner_options = dict(
    objective='val_acc',
    max_trials=30,
    directory='Bayesian_Tuner',
    project_name='Bayesian_Amazon_reviews_tuner',
)
b_tuner = MyTuner_Bayesian(model_builder, **bayesian_tuner_options)

In [None]:
# Run the Bayesian search: every trial is trained for at most 10 epochs
b_tuner.search(
    [X_train_padded,train_rating,train_pursh],
    train_target,
    epochs=10,
    callbacks=[stop_early],
    validation_data=([X_val_padded,valid_rating,valid_pursh], valid_target),
)

In [None]:
# Get the optimal hyperparameters
best_b_hps=b_tuner.get_best_hyperparameters(num_trials=1)[0]

# Display the hyperparameters that performed best according to the Bayesian Tuner.
# No hyperparameter called 'epochs' is registered by the search space, so
# `.get('epochs')` would raise a KeyError; it is looked up in the plain `values`
# mapping with a fallback instead.
print(f"""
The hyperparameter search is complete. 
Dropout: {best_b_hps.get('Dropout')} ,
Epochs: {best_b_hps.values.get('epochs', 'not tuned')},
Batch_size: {best_b_hps.get('batch_size')},
Lstm_units: {best_b_hps.get('Lstm_units')} ,
Lstm_Dropout: {best_b_hps.get('Lstm_Dropout')}  
""")

<h5> Hyperband Hyperparameter Tuning </h5>

In [None]:
class MyTuner_Hyperband(kt.tuners.Hyperband):
  """Hyperband tuner that also searches over the batch size."""

  def run_trial(self, trial, *args, **kwargs):
    """Run one trial with a batch size taken from the trial's hyperparameters.

    Overriding `run_trial` is the hook for additional hyperparameters that do
    not belong to the model itself; here 'batch_size' (8 to 64 in steps of 8)
    is registered and forwarded to the parent implementation through `kwargs`.
    The number of epochs is not overridden here.
    """
    kwargs['batch_size'] = trial.hyperparameters.Int('batch_size', min_value=8, max_value=64, step=8)

    # The result of the parent call has to be handed back: recent Keras Tuner
    # versions read the trial results from the return value of `run_trial`.
    return super(MyTuner_Hyperband, self).run_trial(trial, *args, **kwargs)

In [None]:
# Hyperband tuner: at most 10 epochs per model, reduction factor 3,
# ranked by the validation accuracy ('val_acc')
hyperband_tuner_options = dict(
    objective='val_acc',
    max_epochs=10,
    factor=3,
    directory='Hyperband_Tuner',
    project_name='Hyperband_Amazon_reviews_tuner',
)
h_tuner = MyTuner_Hyperband(model_builder, **hyperband_tuner_options)

In [None]:
# Run the Hyperband search
h_tuner.search([X_train_padded,train_rating,train_pursh], train_target, validation_data=([X_val_padded,valid_rating,valid_pursh], valid_target) , callbacks=[stop_early], epochs = 10)

# Get the optimal hyperparameters.
# The tuner of this section is `h_tuner`; the name `tuner` is not defined
# anywhere in the notebook and raised a NameError.
best_hps=h_tuner.get_best_hyperparameters(num_trials=1)[0]

In [None]:
# Get the optimal hyperparameters of the Hyperband search (from `h_tuner`,
# not from the Bayesian `b_tuner`)
best_h_hps=h_tuner.get_best_hyperparameters(num_trials=1)[0]


# Display the hyperparameters that performed best according to the Hyperband Tuner.
# No hyperparameter called 'epochs' is registered by the search space, so
# `.get('epochs')` would raise a KeyError.
# NOTE(review): Hyperband is expected to record the epochs of a trial under
# 'tuner/epochs' - confirm for the installed Keras Tuner version; the fallback
# text is printed when the key is absent.
print(f"""
The hyperparameter search is complete. 
Dropout: {best_h_hps.get('Dropout')} ,
Epochs: {best_h_hps.values.get('tuner/epochs', 'not tuned')},
Batch_size: {best_h_hps.get('batch_size')},
Lstm_units: {best_h_hps.get('Lstm_units')} ,
Lstm_Dropout: {best_h_hps.get('Lstm_Dropout')}  
""")

<h5>Final Model</h5>
We apply the best hyperparameters found by the two tuning processes and we evaluate the results.

In [None]:
# Values of the best hyperparameters, used for the final model
dropout = best_h_hps.get('Dropout')
lstm_units = best_h_hps.get('Lstm_units')
lstm_dropout = best_h_hps.get('Lstm_Dropout')
# 'epochs' is not a registered hyperparameter, so `.get('epochs')` would raise a
# KeyError. Fall back to the 10 epochs used as budget by both searches.
# NOTE(review): 'tuner/epochs' is the key Hyperband is expected to record - confirm.
epochs = best_h_hps.values.get('tuner/epochs', 10)
batch_size  = best_h_hps.get('batch_size')


In [None]:
# Build the final model from ALL the tuned values. `model_builder` reads
# 'Dropout', 'Lstm_units' and 'Lstm_Dropout' from the hyperparameters object and
# returns the compiled model; the hand-written copy of the architecture only
# applied the tuned dropout to one layer and kept 64 units / 0.2 for the LSTM.
model = model_builder(best_h_hps)

# Train the final model so that `history` (plotted in the next cell) belongs to
# this model and not to the baseline model trained at the beginning.
final_stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
history = model.fit(
    [X_train_padded, train_rating, train_pursh],
    train_target,
    batch_size=best_h_hps.get('batch_size'),
    # 'epochs' is not a tuned hyperparameter; use the value recorded by the
    # tuner when there is one, else the 10-epoch budget of the searches.
    epochs=best_h_hps.values.get('tuner/epochs', 10),
    validation_data=([X_val_padded, valid_rating, valid_pursh], valid_target),
    callbacks=[final_stop_early],
)

# Evaluate on the held-out test set, which no earlier step has used
model.evaluate([X_test_padded, test_rating, test_pursh], test_target)

In [None]:
# Accuracy (left panel) and loss (right panel) curves of `history`
plt.figure(figsize=(16, 6))
for panel_index, metric_name in enumerate(('acc', 'loss'), start=1):
  plt.subplot(1, 2, panel_index)
  plot_graphs(history, metric_name)