In [4]:
import numpy as np
import pandas as pd
import plotly.express as px

In [5]:
from google.colab import drive

In [6]:
# Project folder inside the mounted Drive.
google_dir = '/content/gdrive/MyDrive/ANLP'

# Mount Google Drive into the Colab filesystem.
drive.mount(mountpoint='/content/gdrive')

Mounted at /content/gdrive


In [7]:
# Copy data file to Colab instance to avoid quota issues with Google Drive
!cp "/content/gdrive/MyDrive/ANLP/Reviews.csv" "/content/"

In [8]:
# Read the local copy of the reviews file into a DataFrame,
# then preview the first five rows as a sanity check.
data = pd.read_csv(filepath_or_buffer="/content/Reviews.csv")
data.head(n=5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [9]:
# Keep only the reviews that received at least 10 helpfulness votes
# (HelpfulnessDenominator >= 10); work on a copy so `data` stays untouched.
helpful_df = data.query('HelpfulnessDenominator >= 10').copy()

# Target variable: ratio of HelpfulnessNumerator to HelpfulnessDenominator.
helpful_df['helpful_score'] = helpful_df['HelpfulnessNumerator'].div(helpful_df['HelpfulnessDenominator'])

In [10]:
# Turn the ratio into a binary classification label:
# 1 when the ratio is at least 0.8, otherwise 0.
# Note: this overwrites the ratio stored in 'helpful_score' by the previous cell.
helpful_df['helpful_score'] = helpful_df['helpful_score'].ge(0.8).astype(int)
helpful_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,helpful_score
32,33,B001EO5QW8,AOVROBZ8BNTP7,S. Potter,19,19,4,1163376000,Best of the Instant Oatmeals,McCann's Instant Oatmeal is great if you must ...,1
33,34,B001EO5QW8,A3PMM0NFVEJGK9,"Megan ""Bad at Nicknames""",13,13,4,1166313600,Good Instant,This is a good instant oatmeal from the best o...,1
82,83,B003ZFRKGO,A2VOZX7YBT0D6D,"Johnnycakes ""Johnnycakes""",15,15,5,1325635200,Forget Molecular Gastronomy - this stuff rocke...,I know the product title says Molecular Gastro...,1
158,159,B000ITVLE2,A3NID9D9WMIV01,"Louie Arrighi ""Lou da Joo""",17,19,5,1260057600,tastes very fresh,"<span class=""tiny""> Length:: 0:26 Mins<br /><b...",1
213,214,B0009XLVGA,A1NHQNQ3TVXTZF,"Desert Girl ""chrissylovesherhusband""",3,10,1,1220227200,CHANGED FORMULA MAKES CATS SICK!!!!,"As with canidae, Felidae has also changed thei...",0


In [11]:
import spacy
import spacy.cli

In [12]:
# Using the large model over the standard version to get vectors. Requires additional install.
# Only download when the package is not installed yet, so re-running this cell
# (or the whole notebook) does not fetch the model again.
if not spacy.util.is_package("en_core_web_lg"):
  spacy.cli.download("en_core_web_lg")
nlp = spacy.load("en_core_web_lg")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [13]:
# Run every review text through the spaCy pipeline and keep the resulting Doc objects.
# nlp.pipe processes the texts as a batched stream, which is faster than calling
# nlp() once per row through Series.apply.
# Note: the per-row version took approx. 14mins.
# The Docs are written into a pre-allocated 1-D object array so that each Doc is
# stored as a single cell value (Docs are sequence-like, so no nested conversion is attempted).
parsed_docs = np.empty(len(helpful_df), dtype=object)
for row_pos, doc in enumerate(nlp.pipe(helpful_df['Text'])):
  parsed_docs[row_pos] = doc
helpful_df['nlp'] = parsed_docs

In [14]:
# Create dataframe of vectorised text: one row per review, one column per vector dimension.
vectors = helpful_df['nlp'].apply(lambda x: x.vector)
# Stack all document vectors into one 2-D array in a single step. Expanding with
# .apply(pd.Series) constructs a Series per row and is far slower on this many rows.
# The original row index is kept so vec_df stays aligned with helpful_df.
vec_df = pd.DataFrame(np.vstack(vectors.tolist()), index=vectors.index)
vec_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
32,-1.323802,1.032652,-2.337208,-0.111059,3.76683,-0.041592,0.173712,3.87851,-0.715135,-0.504095,6.821686,0.671419,-3.210259,1.448479,0.94604,1.138279,1.090314,-0.362776,-0.585832,-2.392319,1.227578,-0.547442,-1.700264,-1.040152,-1.243329,-1.521293,-2.187235,-1.004288,-0.797042,1.756473,0.54773,-0.973817,-1.358144,-1.650358,-1.899574,0.209466,-0.451592,1.834457,1.899464,1.260023,...,-1.259828,4.635929,2.206722,2.675498,1.182052,0.379287,0.680859,1.654571,-4.040608,0.775131,2.62134,-1.087492,0.145141,-0.866788,0.327439,-1.280318,1.733215,-0.229024,-0.496044,2.09502,0.185248,-0.78827,0.125567,0.858253,3.029055,-0.892422,0.234885,1.083938,-1.670763,1.232402,0.677222,-1.280813,1.062335,-0.518964,-1.577349,0.981302,0.494414,-0.455574,-2.148609,0.903731
33,-1.340788,1.367154,-2.432186,0.159615,3.363936,-0.144899,0.814567,3.691298,-1.419114,-0.034321,6.347642,1.152521,-3.645756,1.715338,1.291463,0.043991,1.414632,-1.040479,-0.216113,-2.626247,0.523843,-0.010043,-0.843776,-1.263611,-0.619142,-1.57118,-2.437595,-0.664569,-1.327567,1.994807,1.611927,-1.193155,-1.064011,-2.166667,-1.334005,0.06555,0.055779,1.521659,2.44672,2.024306,...,-0.695887,4.200289,2.334938,2.665738,1.054603,0.317752,0.345939,1.617559,-4.228474,0.635725,2.033291,-0.785462,-0.217867,-0.837831,-0.227115,-1.581848,1.882767,-0.708194,-0.849401,1.734663,0.213024,-0.843041,0.21376,0.667129,2.813268,-1.060397,0.598082,1.192488,-1.699868,0.689113,0.70832,-0.879604,1.487295,-0.799887,-1.704227,0.51126,0.882618,0.044586,-2.574825,1.159596
82,-1.105859,0.835509,-2.696097,-0.247309,1.856147,-0.053594,0.384191,4.065519,-2.479806,0.900214,5.744823,1.591808,-3.536534,1.8481,1.593867,-0.284174,1.27276,-1.875831,-1.291954,-2.677112,1.189191,0.456339,-0.926521,-1.687548,-1.063437,-1.59996,-2.343125,-0.479117,-1.167389,1.452251,0.794457,-1.676026,-0.853468,-1.515631,-0.690861,-0.089525,0.008364,1.239329,2.485189,1.491956,...,-0.026996,4.711818,1.827735,2.757103,2.091994,-0.345337,0.479066,1.463259,-3.679106,0.703237,1.621652,-1.023482,-0.663748,-0.376884,-0.813136,-1.392176,1.399031,-1.134432,-0.523177,1.698136,-0.570428,-0.709891,0.495796,0.965876,2.556315,-0.832498,-0.4811,1.396338,-2.113331,0.968965,0.581942,-0.722913,0.525695,-1.452901,-2.035355,0.041754,0.817126,0.323921,-3.517361,1.079162
158,-1.37346,0.747225,-1.2656,-0.159294,2.669593,0.045321,1.209111,3.1696,-2.023155,-0.155882,4.968339,1.249633,-3.969251,0.773692,1.264198,0.472989,1.794931,-0.432815,-0.884058,-0.43437,0.661992,0.199351,-1.122678,0.300546,-1.905074,-1.416467,-1.696853,-1.086357,-0.755377,1.632948,0.73731,-0.848773,-0.652749,-0.921709,-0.13515,-0.834802,-1.014495,0.850796,2.733212,0.976385,...,0.243693,3.581789,2.834831,2.117241,1.317349,-0.134647,-0.204952,1.466174,-3.950817,0.065238,1.224981,-0.850976,-0.03611,-1.002332,0.033793,-0.823109,0.98609,-0.675296,-0.523905,1.640264,0.573491,-1.289775,-0.342641,0.80827,2.397375,-0.467226,1.012726,0.70102,-1.292014,0.11145,1.271905,0.149229,1.511957,-0.618358,-1.302749,1.035043,1.441601,-0.496323,-2.476612,0.582859
213,-1.334758,1.621257,-3.277381,-0.617078,3.131202,0.339972,-0.001275,3.923903,-1.103019,0.307257,6.14157,2.204481,-3.501877,1.749801,1.212692,-0.234007,1.003945,-2.36491,-1.858483,-3.014802,0.971603,-0.789706,-0.921547,-1.457055,-0.255474,-1.483295,-3.027544,-0.148319,-1.413745,1.737659,1.762387,-1.727458,-1.128665,-2.265805,-1.974384,-1.223073,-0.30658,1.176712,2.239058,1.466933,...,0.318554,5.029393,2.958174,2.073317,2.297361,-0.700526,0.568019,2.481175,-3.088524,1.123126,1.174182,-0.702311,-0.502268,-1.93258,0.242989,-1.240995,1.314936,-1.698866,-1.386699,1.899207,-0.49645,0.077758,0.821671,1.303991,2.718992,-1.078073,-0.244497,1.780511,-2.109499,0.187792,0.962377,-0.649345,0.305418,-0.671857,-1.519091,0.11125,0.235039,-0.734187,-3.506289,1.445702


In [15]:
# Set up TPU for training
import tensorflow as tf

In [16]:
# Resolve the TPU cluster attached to this runtime and print its worker address(es).
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  
print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])

# Connect to the resolved cluster and initialise the TPU system, then build the
# distribution strategy under whose scope the models below are created and compiled.
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.TPUStrategy(tpu)

Running on TPU  ['10.94.44.202:8470']


In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Hold out 20% of the rows as the test split; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    vec_df,
    helpful_df['helpful_score'],
    test_size=0.2,
    random_state=99,
)

In [34]:
from tensorflow.keras.layers import add, Bidirectional, Dense, Dropout, GRU
from tensorflow.keras.models import Sequential

In [44]:
# Create BiGRU model

# Parameters
# Stop when val_loss has not improved for 5 consecutive epochs and restore the best weights.
early_stop_bigru = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=5, restore_best_weights=True)
# Each row of X_train is fed to the GRU as a sequence of X_train.shape[1] steps
# with a single feature per step.
input_shape = (X_train.shape[1], 1)
# 'accuracy' must be lowercase: depending on the Keras version, the capitalised
# string 'Accuracy' resolves to tf.keras.metrics.Accuracy, which counts exact
# equality between labels and the raw sigmoid outputs and therefore reports a
# meaningless (near-zero) value. Lowercase 'accuracy' lets Keras pick binary
# accuracy for this single-unit sigmoid output.
metrics = ['accuracy', 'Precision', 'Recall', 'AUC']

# Architecture
with tpu_strategy.scope():
  # The optimizer is created inside the strategy scope together with the model,
  # so that any variables it owns are created under the same strategy.
  optimizer_bigru = tf.keras.optimizers.Adam(learning_rate=0.001)
  layers = [Bidirectional(GRU(128, input_shape=input_shape, activation='relu', return_sequences=True), input_shape=input_shape),
            Dropout(0.2),
            Bidirectional(GRU(128, activation='relu')),
            Dropout(0.2),
            Dense(64, activation='relu'),
            Dropout(0.2),
            Dense(32, activation='relu'),
            Dropout(0.2),
            # Single sigmoid unit: output is the predicted probability of label 1.
            Dense(1, activation='sigmoid')]
  model_bigru = Sequential(layers)
  model_bigru.compile(loss='binary_crossentropy', optimizer=optimizer_bigru, metrics=metrics)
model_bigru.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_8 (Bidirectio  (None, 300, 256)         100608    
 nal)                                                            
                                                                 
 dropout_16 (Dropout)        (None, 300, 256)          0         
                                                                 
 bidirectional_9 (Bidirectio  (None, 256)              296448    
 nal)                                                            
                                                                 
 dropout_17 (Dropout)        (None, 256)               0         
                                                                 
 dense_12 (Dense)            (None, 64)                16448     
                                                                 
 dropout_18 (Dropout)        (None, 64)               

In [45]:
# Fit Model 
# Runtime approx. 23mins with Google Colab TPU. Do not run on CPU. 
# NOTE(review): the test split doubles as validation data for early stopping; if
# X_test/y_test are also used for the final evaluation, consider carving out a
# separate validation split instead.
history_bigru = model_bigru.fit(X_train, y_train, 
                                validation_data=(X_test, y_test), 
                                epochs=100, batch_size=64, 
                                callbacks=[early_stop_bigru])

# Save model
model_bigru.save('/content/gdrive/MyDrive/models/anlp_bi_class.h5')

# Save history
# Persist only the per-epoch metrics dict (history_bigru.history). Passing the
# History callback itself makes np.save pickle the whole object, including its
# reference to the model. Reload with:
#   np.load(path, allow_pickle=True).item()
np.save('/content/gdrive/MyDrive/models/anlp_bi_class.npy', history_bigru.history)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100


In [32]:
# Count how many training labels fall in each class
# (bincount index 0 -> label 0, index 1 -> label 1) and report the positive share.
class_counts = np.bincount(y_train)
neg, pos = class_counts
total = neg + pos
positive_pct = 100 * pos / total
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, positive_pct))


Examples:
    Total: 19985
    Positive: 13079 (65.44% of total)



In [33]:
# Weight each class inversely to its number of training examples.
# Scaling by total/2 helps keep the loss to a similar magnitude.
# The sum of the weights of all examples stays the same.
class_weight = {
    label: (1 / count) * (total / 2.0)
    for label, count in ((0, neg), (1, pos))
}
weight_for_0 = class_weight[0]
weight_for_1 = class_weight[1]

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

Weight for class 0: 1.45
Weight for class 1: 0.76


In [40]:
from tensorflow.keras.metrics import Accuracy, Precision, Recall, AUC

In [42]:
# Create BiGRU model for the class-weighted run (same architecture, trained with class_weight)

# Parameters
# Stop when val_loss has not improved for 5 consecutive epochs and restore the best weights.
early_stop_imbal = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=5, restore_best_weights=True)
# Each row of X_train is fed to the GRU as a sequence of X_train.shape[1] steps
# with a single feature per step.
input_shape = (X_train.shape[1], 1)
# 'accuracy' must be lowercase: depending on the Keras version, the capitalised
# string 'Accuracy' resolves to tf.keras.metrics.Accuracy, which counts exact
# equality between labels and the raw sigmoid outputs and therefore reports a
# meaningless (near-zero) value. Lowercase 'accuracy' lets Keras pick binary
# accuracy for this single-unit sigmoid output.
metrics = ['accuracy', 'Precision', 'Recall', 'AUC']

# Architecture
with tpu_strategy.scope():
  # The optimizer is created inside the strategy scope together with the model,
  # so that any variables it owns are created under the same strategy.
  optimizer_imbal = tf.keras.optimizers.Adam(learning_rate=0.001)
  layers = [Bidirectional(GRU(128, input_shape=input_shape, activation='relu', return_sequences=True), input_shape=input_shape),
            Dropout(0.2),
            Bidirectional(GRU(128, activation='relu')),
            Dropout(0.2),
            Dense(64, activation='relu'),
            Dropout(0.2),
            Dense(32, activation='relu'),
            Dropout(0.2),
            # Single sigmoid unit: output is the predicted probability of label 1.
            Dense(1, activation='sigmoid')]
  model_imbal = Sequential(layers)
  model_imbal.compile(loss='binary_crossentropy', optimizer=optimizer_imbal, metrics=metrics)
model_imbal.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_6 (Bidirectio  (None, 300, 256)         100608    
 nal)                                                            
                                                                 
 dropout_12 (Dropout)        (None, 300, 256)          0         
                                                                 
 bidirectional_7 (Bidirectio  (None, 256)              296448    
 nal)                                                            
                                                                 
 dropout_13 (Dropout)        (None, 256)               0         
                                                                 
 dense_9 (Dense)             (None, 64)                16448     
                                                                 
 dropout_14 (Dropout)        (None, 64)               

In [43]:
# Fit Model 
# Run time approx. 10mins
# NOTE(review): the test split doubles as validation data for early stopping; if
# X_test/y_test are also used for the final evaluation, consider carving out a
# separate validation split instead.
# class_weight re-weights the loss per class to counter the label imbalance.
history_imbal = model_imbal.fit(X_train, y_train, 
                                validation_data=(X_test, y_test), 
                                epochs=100, batch_size=64, 
                                callbacks=[early_stop_imbal],
                                class_weight=class_weight)

# Save model
model_imbal.save('/content/gdrive/MyDrive/models/anlp_bi_class_imbal.h5')

# Save history
# Persist only the per-epoch metrics dict (history_imbal.history). Passing the
# History callback itself makes np.save pickle the whole object, including its
# reference to the model. Reload with:
#   np.load(path, allow_pickle=True).item()
np.save('/content/gdrive/MyDrive/models/anlp_bi_class_imbal.npy', history_imbal.history)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
