In [1]:
import shutil

import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
from google.colab import drive

In [3]:
# Mount Google Drive inside the Colab runtime and record the Drive folder
# that the reviews file is copied from.
google_dir = '/content/gdrive/MyDrive/ANLP'
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
# Copy the data file onto the Colab instance's local disk to avoid quota
# issues when reading it directly from Google Drive.
# shutil.copy is used instead of a `!cp` shell escape so that a missing file
# or an unmounted drive raises here, instead of the notebook carrying on and
# failing later when the CSV is read.
shutil.copy("/content/gdrive/MyDrive/ANLP/Reviews.csv", "/content/");

In [5]:
# Read the locally copied reviews file and preview the first rows as a sanity check.
data = pd.read_csv(filepath_or_buffer="/content/Reviews.csv")
data.head(n=5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [6]:
# Keep only reviews with at least 10 helpfulness votes
# (HelpfulnessDenominator >= 10) and add the target variable:
# helpful_score = HelpfulnessNumerator / HelpfulnessDenominator.
# .assign returns a new frame, so `data` itself is left untouched.
helpful_df = (
    data
    .loc[data['HelpfulnessDenominator'].ge(10)]
    .assign(helpful_score=lambda df: df['HelpfulnessNumerator'] / df['HelpfulnessDenominator'])
)

In [13]:
# Plot the distribution of the target variable.
px.histogram(data_frame=helpful_df, x='helpful_score')

In [7]:
import spacy
import spacy.cli

In [8]:
# The large English model is used over the standard version because it ships
# with word vectors, which the later cells read through `doc.vector`.
# It is not installed by default, so try to load it first and only download
# when it is missing (e.g. on a fresh Colab runtime); spacy.load raises
# OSError when the package cannot be found. This avoids re-downloading the
# model on every re-run of the cell.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    spacy.cli.download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [9]:
# Run spaCy over every review text and store the resulting Doc objects.
# nlp.pipe streams the texts through the pipeline in batches, which is faster
# than calling nlp() once per row via Series.apply while yielding the Docs in
# the same order as the input rows.
# Note: still a long-running cell (the per-row version took approx. 14mins).
helpful_df['nlp'] = list(nlp.pipe(helpful_df['Text']))

In [10]:
# Create dataframe of vectorised text: one row per review, one column per
# component of the Doc's document vector.
vectors = helpful_df['nlp'].apply(lambda x: x.vector)
# Stack the per-review arrays into a single 2-D array in one step; expanding
# them with vectors.apply(pd.Series) constructs a Series for every row and is
# much slower. The review index is kept so rows still align with helpful_df.
vec_df = pd.DataFrame(np.vstack(vectors.tolist()), index=vectors.index)
vec_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
32,-1.323802,1.032652,-2.337208,-0.111059,3.76683,-0.041592,0.173712,3.87851,-0.715135,-0.504095,...,0.677222,-1.280813,1.062335,-0.518964,-1.577349,0.981302,0.494414,-0.455574,-2.148609,0.903731
33,-1.340788,1.367154,-2.432186,0.159615,3.363936,-0.144899,0.814567,3.691298,-1.419114,-0.034321,...,0.70832,-0.879604,1.487295,-0.799887,-1.704227,0.51126,0.882618,0.044586,-2.574825,1.159596
82,-1.105859,0.835509,-2.696097,-0.247309,1.856147,-0.053594,0.384191,4.065519,-2.479806,0.900214,...,0.581942,-0.722913,0.525695,-1.452901,-2.035355,0.041754,0.817126,0.323921,-3.517361,1.079162
158,-1.37346,0.747225,-1.2656,-0.159294,2.669593,0.045321,1.209111,3.1696,-2.023155,-0.155882,...,1.271905,0.149229,1.511957,-0.618358,-1.302749,1.035043,1.441601,-0.496323,-2.476612,0.582859
213,-1.334758,1.621257,-3.277381,-0.617078,3.131202,0.339972,-0.001275,3.923903,-1.103019,0.307257,...,0.962377,-0.649345,0.305418,-0.671857,-1.519091,0.11125,0.235039,-0.734187,-3.506289,1.445702


In [11]:
# Set up TPU for training
import tensorflow as tf

In [13]:
# Resolve the TPU attached to this runtime (no arguments are passed to the
# resolver) and print the worker address(es) from its cluster spec.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  
print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])

# Connect to the TPU cluster, initialise the TPU system, then build the
# distribution strategy that the model-building cells use via
# `tpu_strategy.scope()`. The three calls below are order-dependent.
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.TPUStrategy(tpu)

Running on TPU  ['10.114.121.58:8470']


In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the reviews; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    vec_df,
    helpful_df['helpful_score'],
    test_size=0.2,
    random_state=99,
)

In [19]:
from tensorflow.keras.layers import add, Bidirectional, Dense, Dropout, GRU
from tensorflow.keras.models import Sequential

In [22]:
# Create BiGRU model

# Parameters
# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stop_bigru = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=5, restore_best_weights=True)
# Each of the X_train.shape[1] vector components is fed to the recurrent
# layers as one timestep carrying a single feature.
input_shape = (X_train.shape[1], 1)

# Architecture
# Everything that owns variables (the layers and the optimizer) is created
# inside the strategy scope so it is set up under the TPU strategy together
# with the model; the optimizer was previously created outside the scope.
with tpu_strategy.scope():
  optimizer_bigru = tf.keras.optimizers.Adam(learning_rate=0.001)
  layers = [Bidirectional(GRU(128, input_shape=input_shape, activation='relu', return_sequences=True), input_shape=input_shape),
            Dropout(0.2),
            Bidirectional(GRU(128, activation='relu')),
            Dropout(0.2),
            Dense(64, activation='relu'),
            Dropout(0.2),
            Dense(32, activation='relu'),
            Dropout(0.2),
            # NOTE(review): the target is a ratio, so a bounded output
            # activation (e.g. sigmoid) may suit it better than relu - confirm
            # before changing, as it alters the trained model.
            Dense(1, activation='relu')]
  model_bigru = Sequential(layers)
  model_bigru.compile(loss='mean_squared_error', optimizer=optimizer_bigru, metrics=['mean_squared_error'])
model_bigru.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_6 (Bidirectio  (None, 300, 256)         100608    
 nal)                                                            
                                                                 
 dropout_12 (Dropout)        (None, 300, 256)          0         
                                                                 
 bidirectional_7 (Bidirectio  (None, 256)              296448    
 nal)                                                            
                                                                 
 dropout_13 (Dropout)        (None, 256)               0         
                                                                 
 dense_9 (Dense)             (None, 64)                16448     
                                                                 
 dropout_14 (Dropout)        (None, 64)               

In [23]:
# Fit Model 
# WARNING: 50 epochs takes approx. 30mins runtime with Google Colab TPU. Do not run on CPU. 
# NOTE(review): the held-out test split is also used as validation data for
# early stopping, so the validation loss is not an unbiased test estimate.
history_bigru = model_bigru.fit(X_train, y_train, 
                                validation_data=(X_test, y_test), 
                                epochs=50, batch_size=64, 
                                callbacks=[early_stop_bigru])

# Save model
model_bigru.save('/content/gdrive/MyDrive/models/anlp_bi.h5')

# Save history
# Persist only the per-epoch metrics dict (History.history). Passing the
# History callback itself makes np.save pickle the whole callback object,
# including its reference to the model.
# Reload with: np.load(path, allow_pickle=True).item()
np.save('/content/gdrive/MyDrive/models/anlp_bi.npy', history_bigru.history)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [25]:
# Resume training from the model's current weights for up to 100 further
# epochs; intended for the case where the previous fit used all of its epochs
# without early stopping halting it.
history_bigru_cont = model_bigru.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=100,
    callbacks=[early_stop_bigru],
    validation_data=(X_test, y_test),
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


In [26]:
# Save model
model_bigru.save('/content/gdrive/MyDrive/models/anlp_bi2.h5')

# Save history
# Persist only the per-epoch metrics dict (History.history). Passing the
# History callback itself makes np.save pickle the whole callback object,
# including its reference to the model.
# Reload with: np.load(path, allow_pickle=True).item()
np.save('/content/gdrive/MyDrive/models/anlp_bi2.npy', history_bigru_cont.history)

In [38]:
# Remodel: a deeper stack of three bidirectional GRU layers with a single
# dense hidden layer.

# Parameters
# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stop_remodel = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=5, restore_best_weights=True)
# Halve the learning rate after 2 epochs without val_loss improvement, down to 1e-5.
reduce_lr_remodel = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=0.00001, verbose=2)
# Each of the X_train.shape[1] vector components is fed to the recurrent
# layers as one timestep carrying a single feature.
input_shape = (X_train.shape[1], 1)

# Architecture
# Everything that owns variables (the layers and the optimizer) is created
# inside the strategy scope so it is set up under the TPU strategy together
# with the model; the optimizer was previously created outside the scope.
with tpu_strategy.scope():
  optimizer_remodel = tf.keras.optimizers.Adam(learning_rate=0.001)
  layers = [Bidirectional(GRU(64, input_shape=input_shape, activation='relu', return_sequences=True), input_shape=input_shape),
            Dropout(0.2),
            Bidirectional(GRU(256, activation='relu', return_sequences=True)),
            Dropout(0.2),
            Bidirectional(GRU(256, activation='relu')),
            Dropout(0.2),
            Dense(256, activation='relu'),
            Dropout(0.2),
            # NOTE(review): the target is a ratio, so a bounded output
            # activation (e.g. sigmoid) may suit it better than relu - confirm
            # before changing, as it alters the trained model.
            Dense(1, activation='relu')]
  model_remodel = Sequential(layers)
  model_remodel.compile(loss='mean_squared_error', optimizer=optimizer_remodel, metrics=['mean_squared_error'])
model_remodel.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_31 (Bidirecti  (None, 300, 128)         25728     
 onal)                                                           
                                                                 
 dropout_49 (Dropout)        (None, 300, 128)          0         
                                                                 
 bidirectional_32 (Bidirecti  (None, 300, 512)         592896    
 onal)                                                           
                                                                 
 dropout_50 (Dropout)        (None, 300, 512)          0         
                                                                 
 bidirectional_33 (Bidirecti  (None, 512)              1182720   
 onal)                                                           
                                                     

In [36]:
# Short 10-epoch run of the remodelled network, with early stopping and
# learning-rate reduction on plateau.
history_remodel = model_remodel.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    callbacks=[early_stop_remodel, reduce_lr_remodel],
    validation_data=(X_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [39]:
# Train the remodelled network with a larger epoch budget (up to 100).
# The result is bound to history_remodel, replacing any earlier value of that name.
history_remodel = model_remodel.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=100,
    callbacks=[early_stop_remodel, reduce_lr_remodel],
    validation_data=(X_test, y_test),
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 8: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 17: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 22: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 37: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 43: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Ep

In [13]:
from tensorflow.keras.layers import Dense, Flatten, Dropout, Input, Embedding, LSTM, GRU, Bidirectional, add
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2

In [25]:
# LSTM: two stacked LSTM layers followed by a small dense head.

# Parameters
optimizer = Adam(learning_rate=0.001)

# Callbacks: stop once val_loss has not improved for 5 epochs and roll back
# to the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=5, min_delta=0, restore_best_weights=True)

# Architecture
# Note - input data uses embeddings from spacy, hence there is no Embedding
# layer within the NN; each vector component is treated as one timestep with
# a single feature.
model = Sequential([
    LSTM(128, input_shape=(X_train.shape[1], 1), activation='relu', return_sequences=True),
    Dropout(0.2),
    LSTM(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='relu'),
])
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mean_squared_error'])

In [26]:
# Fit Model
# `use_multiprocessing` / `workers` were dropped: they only apply to generator
# or keras.utils.Sequence input, have no effect on the in-memory data passed
# here, and are no longer accepted by newer Keras versions.
lstm = model.fit(X_train, y_train, 
                 validation_data=(X_test, y_test), 
                 epochs=50, batch_size=64,
                 callbacks=[early_stop])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50


In [35]:
# Save model
model.save('/content/gdrive/MyDrive/models/anlp_lstm.h5')

# Save history
# Persist only the per-epoch metrics dict (History.history). Passing the
# History callback itself makes np.save pickle the whole callback object,
# including its reference to the model.
# Reload with: np.load(path, allow_pickle=True).item()
np.save('/content/gdrive/MyDrive/models/anlp_lstm.npy', lstm.history)

In [29]:
# GRU: same layout as the LSTM model, with GRU layers as the recurrent units.

# Parameters
optimizer_gru = Adam(learning_rate=0.001)
# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stop_gru = EarlyStopping(monitor='val_loss', patience=5, min_delta=0, restore_best_weights=True)

# Architecture
model_gru = Sequential([
    GRU(128, input_shape=(X_train.shape[1], 1), activation='relu', return_sequences=True),
    Dropout(0.2),
    GRU(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='relu'),
])
model_gru.compile(optimizer=optimizer_gru, loss='mean_squared_error', metrics=['mean_squared_error'])

In [36]:
# Fit Model
# `use_multiprocessing` / `workers` were dropped: they only apply to generator
# or keras.utils.Sequence input, have no effect on the in-memory data passed
# here, and are no longer accepted by newer Keras versions.
gru = model_gru.fit(X_train, y_train, 
                    validation_data=(X_test, y_test), 
                    epochs=50, batch_size=64,
                    callbacks=[early_stop_gru])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50


In [37]:
# Save model
model_gru.save('/content/gdrive/MyDrive/models/anlp_gru.h5')

# Save history
# Persist only the per-epoch metrics dict (History.history). Passing the
# History callback itself makes np.save pickle the whole callback object,
# including its reference to the model.
# Reload with: np.load(path, allow_pickle=True).item()
np.save('/content/gdrive/MyDrive/models/anlp_gru.npy', gru.history)

In [15]:
# Bidirectional + GRU: two bidirectional GRU layers followed by a small dense head.

# Parameters
optimizer_bi = Adam(learning_rate=0.001)
# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stop_bi = EarlyStopping(monitor='val_loss', patience=5, min_delta=0, restore_best_weights=True)

# Architecture
model_bi = Sequential([
    Bidirectional(
        GRU(128, input_shape=(X_train.shape[1], 1), activation='relu', return_sequences=True),
        input_shape=(X_train.shape[1], 1),
    ),
    Dropout(0.2),
    Bidirectional(GRU(128, activation='relu')),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='relu'),
])
model_bi.compile(optimizer=optimizer_bi, loss='mean_squared_error', metrics=['mean_squared_error'])

In [None]:
# Fit Model
# `use_multiprocessing` / `workers` were dropped: they only apply to generator
# or keras.utils.Sequence input, have no effect on the in-memory data passed
# here, and are no longer accepted by newer Keras versions.
bi = model_bi.fit(X_train, y_train,
                  validation_data=(X_test, y_test), 
                  epochs=50, batch_size=64,
                  callbacks=[early_stop_bi])

# Save model
# NOTE(review): these are the same anlp_bi.h5 / anlp_bi.npy paths written by
# the TPU BiGRU cell earlier in the notebook, so whichever runs last wins.
model_bi.save('/content/gdrive/MyDrive/models/anlp_bi.h5')

# Save history
# Persist only the per-epoch metrics dict (History.history). Passing the
# History callback itself makes np.save pickle the whole callback object,
# including its reference to the model.
# Reload with: np.load(path, allow_pickle=True).item()
np.save('/content/gdrive/MyDrive/models/anlp_bi.npy', bi.history)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
 20/313 [>.............................] - ETA: 13:33 - loss: 0.0496 - mean_squared_error: 0.0496

NameError: ignored