In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import tensorflow_datasets as tfds
import keras
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that files stored
# under /content/drive/MyDrive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Amazon US Reviews (Toys) dataset, converted to CSV beforehand from
# https://www.tensorflow.org/datasets/catalog/amazon_us_reviews#amazon_us_reviewstoys_v1_00
# and currently stored in the Google Drive of this notebook.
filepathTOY = '/content/drive/MyDrive/TOYS.csv'
df = pd.read_csv(filepathTOY)

# Cast every review body to str (presumably so that later text processing only
# ever sees strings). NOTE: missing bodies become the literal string 'nan'.
df['data/review_body'] = df['data/review_body'].astype(str)

# Drop the 'Unnamed: 0' column (presumably the index saved into the CSV).
# `columns=` replaces the positional axis argument of `drop(label, 1)`, which
# pandas deprecated in 1.3 and no longer accepts in 2.0.
df = df.drop(columns='Unnamed: 0')
df

  


Unnamed: 0,data/review_body,data/star_rating
0,Great game...a bit more complex than the Ameri...,4
1,love it,5
2,In theory is a great toy. It just needs to be ...,3
3,Not bad for beginners but would def spend the ...,3
4,It's entertained two grandchildren in the tub ...,5
...,...,...
4864244,My 10 year-old son got this product for Christ...,1
4864245,I've got a lot of Robbie The Robot collectable...,5
4864246,This is the best gun out there. A friend of m...,5
4864247,all star wars fans love this,5


In [None]:
# Class balance check: number of reviews per star rating.
df['data/star_rating'].value_counts()

5    3076922
4     769722
1     399503
3     387722
2     230380
Name: data/star_rating, dtype: int64

In [None]:
# Model input: the raw review text.
reviews = df['data/review_body']

# Regression target: star ratings rescaled linearly from [1, 5] to [0, 1]
# (1 star -> 0.0, 2 -> 0.25, 3 -> 0.5, 4 -> 0.75, 5 stars -> 1.0).
labels = (df["data/star_rating"] - 1) / 4

# The full frame is no longer needed; drop the name so its memory can be reclaimed.
del df


In [None]:
from sklearn.model_selection import train_test_split

# The dataset has 4,864,249 reviews. Because of RAM limitations we discard 89%
# of them and continue with the remaining 535,067 data points (roughly half a
# million). train_test_split is used only as a seeded random subsampler here:
# the "train" part is kept and the "test" part is thrown away.
reviews, discarded_reviews, labels, discarded_labels = train_test_split(
    reviews,
    labels,
    test_size=0.89,
    random_state=10,
)
del discarded_reviews, discarded_labels

# Label distribution of the retained subsample.
labels.value_counts()

1.00    338375
0.75     84970
0.00     43904
0.50     42291
0.25     25527
Name: data/star_rating, dtype: int64

In [None]:
# Build a word-level vocabulary capped at num_words=10000: the most frequent
# words in the reviews are converted into integer tokens and the remaining rare
# ones are dropped from the encoded sequences.
t = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
t.fit_on_texts(reviews)
encoded_reviews = t.texts_to_sequences(reviews)

# Preview only the first few sequences. Printing the whole list (hundreds of
# thousands of token lists) exceeds the notebook's IOPub data rate limit and
# produces no readable output.
print(encoded_reviews[:5])

# The raw text is not used after encoding; release it to save RAM.
del reviews

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Reviews shorter than 300 tokens are post-padded with 0s and longer ones are
# cut to their first 300 tokens. truncating='post' is needed for the latter:
# the pad_sequences default ('pre') removes tokens from the start of a long
# sequence and would keep the last 300 instead.
encoded_reviews = tf.keras.preprocessing.sequence.pad_sequences(
    encoded_reviews,
    maxlen=300,
    padding='post',
    truncating='post',
)

print(np.shape(encoded_reviews))
print(encoded_reviews)

(535067, 300)
[[  11  169    8 ...    0    0    0]
 [  30  167    9 ...    0    0    0]
 [   6   66    9 ...    0    0    0]
 ...
 [  11   68  664 ...    0    0    0]
 [ 384   19    0 ...    0    0    0]
 [ 107 1962  114 ...    0    0    0]]


In [None]:
# --- CROSS-VALIDATION setup ---

# Metrics watched on every fold. They are requested under scikit-learn's
# negated ("neg_") scorer names, so values closer to zero are better.
scoring = [
    'neg_mean_absolute_error',
    'neg_mean_squared_error',
    'neg_root_mean_squared_error',
]

# 5 folds. KFold is created with its defaults, i.e. without shuffling.
skf = sklearn.model_selection.KFold(n_splits=5)

# Google Drive folder of this notebook (not referenced by the cells shown here).
drive_path = '/content/drive/MyDrive/'

# model
def create_model(vocab_size=10000, embedding_dim=128, max_len=300):
  """Build and compile the bidirectional-LSTM regressor for review ratings.

  Layers: Embedding -> Bidirectional LSTM(64, returning sequences) ->
  Bidirectional LSTM(32) -> Dense(24, relu) -> Dropout(0.2) -> Dense(1, sigmoid).
  The single sigmoid unit outputs a value in [0, 1]; the model is trained with
  MSE loss and the Adam optimizer, and reports MAE, MSE and RMSE.

  All layers are taken from `tf.keras` so that the model never mixes layer
  classes from the standalone `keras` package with `tf.keras` ones.

  Args:
    vocab_size: Size of the token vocabulary (Embedding `input_dim`). Should
      match the `num_words` used when tokenizing.
    embedding_dim: Width of the learned token embeddings.
    max_len: Length of the padded input sequences (Embedding `input_length`).
      Should match the `maxlen` used when padding.

  Returns:
    A compiled `tf.keras.Sequential` model.
  """
  model = tf.keras.Sequential([
      tf.keras.layers.Embedding(input_dim=vocab_size,
                                output_dim=embedding_dim,
                                input_length=max_len),
      # First recurrent layer returns the full sequence so that it can feed the
      # second recurrent layer.
      tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
      tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
      tf.keras.layers.Dense(24, activation='relu'),
      tf.keras.layers.Dropout(0.2),
      tf.keras.layers.Dense(1, activation='sigmoid')
  ])

  model.compile(loss='mse',
                optimizer='adam',
                metrics=['mae', 'mse', tf.keras.metrics.RootMeanSquaredError()])

  return model

# Wrap the Keras model builder as a scikit-learn regressor so that it can be
# handed to scikit-learn's cross-validation utilities.
model = tf.keras.wrappers.scikit_learn.KerasRegressor(
    build_fn=create_model,
    epochs=3,
    batch_size=64,
    verbose=1,
)

# Cross-validation: run the estimator over the folds of `skf` and collect the
# metrics listed in `scoring`.
scores = sklearn.model_selection.cross_validate(
    model,
    encoded_reviews,
    labels,
    cv=skf,
    scoring=scoring,
)



Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Per-fold test scores, one array per metric, in the order MAE, RMSE, MSE.
# The values are negated errors (scikit-learn "neg_*" scorers), so closer to
# zero is better. The former bare `sorted(scores.keys())` statement was removed:
# its result was discarded, so it had no effect.
for score_key in (
    'test_neg_mean_absolute_error',
    'test_neg_root_mean_squared_error',
    'test_neg_mean_squared_error',
):
  print(scores[score_key])

[-0.10080609 -0.098251   -0.09891606 -0.09853102 -0.10052706]
[-0.16456634 -0.16179759 -0.16228921 -0.16281483 -0.16222908]
[-0.02708208 -0.02617846 -0.02633779 -0.02650867 -0.02631827]
