In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
from google.protobuf.descriptor import Error
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.python.keras.backend import dtype
from tensorflow.python.keras.layers.merge import concatenate

import tensorflow_hub as hub

from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

from tensorflow.keras import layers, Model

import numpy as np

In [4]:
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")

Version:  2.5.0
Eager mode:  True
Hub version:  0.12.0
GPU is available


In [5]:
train_df = pd.read_csv('gdrive/MyDrive/datasets/amazon_reviews.txt', index_col=False, delimiter="\t")
features = ["REVIEW_TEXT", "RATING", "LABEL", "VERIFIED_PURCHASE"]
train_shortened_df = train_df[features]

train_shortened_df.head()

Unnamed: 0,REVIEW_TEXT,RATING,LABEL,VERIFIED_PURCHASE
0,"When least you think so, this product will sav...",4,0,N
1,Lithium batteries are something new introduced...,4,0,Y
2,I purchased this swing for my baby. She is 6 m...,3,0,N
3,I was looking for an inexpensive desk calcolat...,4,0,N
4,I only use it twice a week and the results are...,4,0,N


In [6]:
train_shortened_df['VERIFIED_PURCHASE'] = train_shortened_df['VERIFIED_PURCHASE'].replace('N', 0)
train_shortened_df['VERIFIED_PURCHASE'] = train_shortened_df['VERIFIED_PURCHASE'].replace('Y', 1)
train_shortened_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,REVIEW_TEXT,RATING,LABEL,VERIFIED_PURCHASE
0,"When least you think so, this product will sav...",4,0,0
1,Lithium batteries are something new introduced...,4,0,1
2,I purchased this swing for my baby. She is 6 m...,3,0,0
3,I was looking for an inexpensive desk calcolat...,4,0,0
4,I only use it twice a week and the results are...,4,0,0


In [7]:
review_array = train_shortened_df["REVIEW_TEXT"].to_numpy(dtype=object)
labels_array = train_shortened_df["LABEL"].to_numpy(np.int32)
verified_array = train_shortened_df["VERIFIED_PURCHASE"].to_numpy(np.float32)

In [8]:
print(review_array[0:1])

['When least you think so, this product will save the day. Just keep it around just in case you need it for something.']


In [9]:
# We first need to shuffle the data such that both training and validation dataset has both labels
data_length = len(review_array)
idx = np.random.permutation(data_length)
X = review_array[idx]
y = labels_array[idx]
verified = verified_array[idx]



In [10]:
train_ratio = 0.8
train_data_len = int( train_ratio * data_length )

# Input divided into train and validation
partial_x_train = X[:train_data_len]
#reshape array to be concatenated with the document embedding
verified_train = verified[:train_data_len].reshape((-1, 1))
#output divided into train and validation
partial_y_train = y[:train_data_len]

x_val = X[train_data_len:]
verified_val = verified[train_data_len:].reshape((-1, 1))
y_val = y[train_data_len:]

print("Partial_train:", partial_x_train.shape)
print("Verified_train:", verified_train.shape)
print("Partial_Y_train:", partial_y_train.shape)


print("X_val:", x_val.shape)
print("Verified_val:", verified_val.shape)
print("Y_val:", y_val.shape)






Partial_train: (16800,)
Verified_train: (16800, 1)
Partial_Y_train: (16800,)
X_val: (4200,)
Verified_val: (4200, 1)
Y_val: (4200,)


In [11]:
try:
    model_classifier = tf.keras.models.load_model('Models/fakeNewsDetector_Advanced.hdf5', custom_objects={'KerasLayer':hub.KerasLayer})
except OSError:
    model_classifier = None

if not model_classifier:
    # We define how the input layers to our model look like. One input is text and the other if the purchase is verified or not
    #This shape means input is a tf.tensor and its shape is (None,)
    input_text = layers.Input(shape=[], dtype=tf.string)
    #This is the shape of (None, 1)
    input_verified = layers.Input(shape=(1,), dtype=tf.float32)

    print(input_verified.shape)

    model = "https://tfhub.dev/google/nnlm-en-dim50/2"
    hub_layer = hub.KerasLayer(model, input_shape=[], dtype=tf.string, trainable=True)


    x = hub_layer(input_text)
    #array = np.ones( (x.shape[0], 1) )
    combined = concatenate([x, input_verified], axis=1)
    # Shape is (None, 51) which is the 50 embedding representation + 1 from verified_purchase!
    print(combined.shape)
    x = Dense(16, activation='relu')(combined)
    x = Dense(1, activation='sigmoid')(x)

    model_classifier = Model(inputs=[input_text, input_verified], outputs=x, name='FakeReviewDetector')

    #Sanity check if our model structure is correct
    model_classifier.summary()

    
    model_classifier.compile(optimizer='adam',
                    loss=tf.losses.BinaryCrossentropy(from_logits=True),
                    metrics=[tf.metrics.BinaryAccuracy(threshold=0.5, name='accuracy')])


    callbacks = [
        keras.callbacks.EarlyStopping(monitor="accuracy", patience=15,
                                        verbose=1, mode="min", restore_best_weights=True),
            keras.callbacks.ModelCheckpoint(filepath="Models/fakeNewsDetector_Advanced.hdf5", verbose=1, save_best_only=True)
        ]

    history = model_classifier.fit([partial_x_train, verified_train],
                            partial_y_train,
                            epochs=40,
                            batch_size=256,
                            validation_data=([x_val,verified_val], y_val),
                            verbose=1,
                            callbacks=callbacks
                            )

else:
    results = model_classifier.evaluate([X, verified], y)
    print(f'Accuracy on all data is: {results[1]}')



(None, 1)
(None, 51)
Model: "FakeReviewDetector"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        (None, 50)           48190600    input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 51)           0           keras_layer[0][0]                
                                                            

  '"`binary_crossentropy` received `from_logits=True`, but the `output`'



Epoch 00001: val_loss improved from inf to 0.58653, saving model to Models/fakeNewsDetector_Advanced.hdf5
Epoch 2/40

Epoch 00002: val_loss improved from 0.58653 to 0.51650, saving model to Models/fakeNewsDetector_Advanced.hdf5
Epoch 3/40

Epoch 00003: val_loss improved from 0.51650 to 0.47852, saving model to Models/fakeNewsDetector_Advanced.hdf5
Epoch 4/40

Epoch 00004: val_loss improved from 0.47852 to 0.46756, saving model to Models/fakeNewsDetector_Advanced.hdf5
Epoch 5/40

Epoch 00005: val_loss did not improve from 0.46756
Epoch 6/40

Epoch 00006: val_loss did not improve from 0.46756
Epoch 7/40

Epoch 00007: val_loss did not improve from 0.46756
Epoch 8/40

Epoch 00008: val_loss did not improve from 0.46756
Epoch 9/40

Epoch 00009: val_loss did not improve from 0.46756
Epoch 10/40

Epoch 00010: val_loss did not improve from 0.46756
Epoch 11/40

Epoch 00011: val_loss did not improve from 0.46756
Epoch 12/40

Epoch 00012: val_loss did not improve from 0.46756
Epoch 13/40

Epoch 0