# **Libraries & Constants**

In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
from keras.models import Sequential, load_model
from keras.layers import Dense, Input, Softmax, BatchNormalization, Dropout, Conv2D, MaxPool2D, Embedding, Reshape, Flatten, LSTM, GRU, SimpleRNN, Concatenate, concatenate, RepeatVector, TimeDistributed, Bidirectional
# from keras.layers.merge import concatenate
from keras.losses import CategoricalCrossentropy
from keras.metrics import CategoricalAccuracy
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import RMSprop, Adam, Adamax, Nadam, SGD
from tensorflow.keras import Model
from tensorflow.keras.optimizers.schedules import InverseTimeDecay, ExponentialDecay
from tensorflow.keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.sequence import pad_sequences #from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold
import os
import string
from google.colab import drive
import json
import pickle

from nltk.tokenize import WhitespaceTokenizer
from nltk.stem.porter import PorterStemmer
import spacy
import nltk
nltk.download('stopwords')
import re
# from transformers import AutoTokenizer,BertTokenizer,TFBertModel

import copy

# **Load Dataset and declare paths**

In [None]:
drive.mount('/content/drive')
basePath = '/content/drive/MyDrive/AI Projects/Sentiment-Analysis/'
dataPath = basePath + 'amazon_reviews_us_Watches_v1_00.csv'

# **Preprocessing**

## **Tokenization, Stemming, Removing Stop words and Punctuation**

In [6]:
dataset = pd.read_csv(dataPath)

  dataset = pd.read_csv(dataPath)


In [7]:
useless_features = ['review_date', 'verified_purchase'] + list(dataset.columns[8:11]) + list(dataset.columns[0:7]) + list(dataset.columns[-7:])
print(useless_features)

dataset.drop(columns=useless_features, inplace=True)
# delete rows with nan elements
dataset = dataset.dropna()

['review_date', 'verified_purchase', 'helpful_votes', 'total_votes', 'vine', 'marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21']


In [8]:
star_rating = np.array(dataset['star_rating'])
review_headline = dataset['review_headline'].tolist()
review_body = dataset['review_body'].tolist()

In [9]:
rev_class = np.empty_like(star_rating)
for i  in range(len(rev_class)):
  if star_rating[i] >= 4:
    rev_class[i] = 1  # class1 : Pos Class
  elif star_rating[i] <= 2:
    rev_class[i] = 2  # class2 : Neg class
  elif star_rating[i] == 3:
    rev_class[i] = 3  # class3 : Neutral class

dataset['rev_class'] = rev_class

dataset.head()

Unnamed: 0,star_rating,review_headline,review_body,rev_class
0,5,Five Stars,Absolutely love this watch! Get compliments al...,1
1,5,I love thiswatch it keeps time wonderfully,I love this watch it keeps time wonderfully.,1
2,2,Two Stars,Scratches,2
3,5,Five Stars,"It works well on me. However, I found cheaper ...",1
4,4,"Beautiful face, but cheap sounding links",Beautiful watch face. The band looks nice all...,1


In [10]:
rev_class.shape

(960082,)

In [20]:
def w_tokenizer(text):
  tokenizer = WhitespaceTokenizer()
  tokenized_list = tokenizer.tokenize(text)
  return tokenized_list

def stemmer_porter(text_list):
  porter = PorterStemmer()
  return_list = []
  for i in range(len(text_list)):
      return_list.append(porter.stem(text_list[i]))
  return(return_list)


def remove_stopwords(text_list):
  NLTK_stopwords = nltk.corpus.stopwords.words('english')
  return_list = []
  for i in range(len(text_list)):
      if text_list[i] not in NLTK_stopwords:
          return_list.append(text_list[i])
  return return_list

def remove_punc(text):
    if isinstance((text), (str)):
        text = re.sub('<[^>]*>', '', text)
        text = re.sub('[\W]+', '', text.lower())
        return text
    if isinstance((text), (list)):
        return_list = []
        for i in range(len(text)):
            temp_text = re.sub('<[^>]*>', '', text[i])
            temp_text = re.sub('[\W]+', '', temp_text.lower())
            return_list.append(temp_text)
        return(return_list)
    else:
        pass

def preprocess_sentence(sentence):
  return remove_stopwords(stemmer_porter(remove_punc(w_tokenizer(sentence))))

In [21]:
print(review_body[1])
print(preprocess_sentence(review_body[1]))

I love this watch it keeps time wonderfully.
['love', 'thi', 'watch', 'keep', 'time', 'wonder']


In [22]:
preprocessed_review_headline = [preprocess_sentence(x) for x in review_headline]
preprocessed_review_body = [preprocess_sentence(x) for x in review_body]


# calculate maximum number of unique words in review_body and review headline
preprocessed_review_headline_flat = [item for sublist in preprocessed_review_headline for item in sublist]
preprocessed_review_body_flat = [item for sublist in preprocessed_review_body for item in sublist]
numberOfUniqueWords = np.unique(preprocessed_review_headline_flat + preprocessed_review_body_flat).shape[0]

In [23]:
# Keras Tokenizer
filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
num_words = 20000

sentenceTokenizer = Tokenizer(num_words=num_words, filters=filters, split=' ', oov_token='UNK')
sentenceTokenizer.fit_on_texts(preprocessed_review_headline)
sentenceTokenizer.fit_on_texts(preprocessed_review_body)

review_headline_encoded = sentenceTokenizer.texts_to_sequences(preprocessed_review_headline)
review_body_encoded = sentenceTokenizer.texts_to_sequences(preprocessed_review_body)


# save encoded file
encoded_file = open(basePath+'review_headline_encoded.pkl', "wb")
pickle.dump(review_headline_encoded, encoded_file)
encoded_file.close()

encoded_file = open(basePath+'review_body_encoded.pkl', "wb")
pickle.dump(review_body_encoded, encoded_file)
encoded_file.close()

In [25]:
preprocessed_review_headline[4], review_headline_encoded[4]

(['beauti', 'face', 'cheap', 'sound', 'link'], [32, 33, 93, 669, 116])

## **Encoding review_body and review_headline**

In [None]:
# loading encoded file
encoded_file = open(basePath+'review_headline_encoded.pkl', "rb")
review_headline_encoded = pickle.load(encoded_file)
encoded_file.close()

encoded_file = open(basePath+'review_body_encoded.pkl', "rb")
review_body_encoded = pickle.load(encoded_file)
encoded_file.close()