# Text classification (sentiment analysis)
Task: Predict sentiment of Amazon reviews
Dataset: Beans from TFDS

## 1. Loading dataset & basic preprocessing
- removal of reviews shorter than 5 characters
- mapping from 1-5 -> 0,1,2
- subsampling - without replacement, random state 42, 80 000 rows

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from IPython.display import display
import re
import matplotlib.pyplot as plt
import nltk
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras import layers
import datetime

In [2]:
df = pd.read_csv('datasets/amazon_reviews_us_Major_Appliances_v1_00.tsv', sep='\t', on_bad_lines='skip')

In [3]:
df.shape

(96834, 15)

In [4]:
# remove nas and duplicate reviews
df.dropna(axis=0, subset=['review_body'], inplace=True)
df.drop_duplicates(subset=['review_body'], inplace=True)

In [5]:
df.shape

(93446, 15)

In [6]:
stopword_list = nltk.corpus.stopwords.words("english")

In [7]:
def remove_tags(review):
    return re.sub(pattern='<.*?>', string=review , repl=' ') 

def keep_alnum(review):
    return re.sub(pattern='[^A-Za-z\d\s:]', string=review, repl=' ')

def strip_spaces(review):
    return re.sub(pattern='[\s]{2,}', string=review, repl=' ')

def lowercase(review):
    return review.lower()

def remove_stopwords(review):
    review_list = review.split()
    return " ".join([word for word in review_list if word not in stopword_list])

In [8]:
df['review_body'] = df['review_body'].apply(remove_tags)        # remove html tags
df['review_body'] = df['review_body'].apply(keep_alnum)         # remove sub unicode char
df['review_body'] = df['review_body'].apply(strip_spaces)       # strip all unnecessary whitespaces
df['review_body'] = df['review_body'].apply(lowercase)          # put everything into lowercase
df['review_body'] = df['review_body'].apply(remove_stopwords)   # put everything into lowercase
df = df[df['review_body'].str.len() > 5]                        # keep only reviews longer than 5 characters

In [9]:
df

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,16199106,R203HPW78Z7N4K,B0067WNSZY,633038551,"FGGF3032MW Gallery Series 30"" Wide Freestandin...",Major Appliances,5,0,0,N,Y,"If you need a new stove, this is a winner.",great stove wonderful replacement sort antique...,2015-08-31
1,US,16374060,R2EAIGVLEALSP3,B002QSXK60,811766671,Best Hand Clothes Wringer,Major Appliances,5,1,1,N,Y,Five Stars,worked great,2015-08-31
2,US,15322085,R1K1CD73HHLILA,B00EC452R6,345562728,Supco SET184 Thermal Cutoff Kit,Major Appliances,5,0,0,N,Y,Fast Shipping,part exactly needed saved purchasing,2015-08-31
3,US,32004835,R2KZBMOFRMYOPO,B00MVVIF2G,563052763,Midea WHS-160RB1 Compact Single Reversible Doo...,Major Appliances,5,1,1,N,Y,Five Stars,love refrigerator keeps everything cold recommend,2015-08-31
4,US,25414497,R6BIZOZY6UD01,B00IY7BNUW,874236579,Avalon Bay Portable Ice Maker,Major Appliances,5,0,0,N,Y,Five Stars,running store ice works perfectly,2015-08-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96829,US,37431087,R3CYIDM3UEY5PA,B00005O64S,222987122,Haier HDT18PA Space Saver Compact Countertop D...,Major Appliances,4,37,43,N,N,Pretty good dishwasher for small apartment,pretty good dishwasher price good job cleaning...,2002-07-14
96830,US,44686434,R1PLFLGSA6N9WU,B00005O64T,802734810,Haier America HSE02-WNAWW 1.8-Cubic-Foot Capac...,Major Appliances,1,33,39,N,N,Does not last long,bought office extremely dissatisfied stopped w...,2002-06-03
96831,US,36739731,RBPARLMOY6ZU5,B00005O64S,222987122,Haier HDT18PA Space Saver Compact Countertop D...,Major Appliances,5,6,45,N,N,Rave review for space saver,saw small dishwasher thought wonderful idea sm...,2002-05-05
96832,US,50744080,RSS5TDZOGUEB6,B00004SACT,344802997,Sanyo Two-Door 2.9 Cubic Foot Refrigerator,Major Appliances,4,71,71,N,N,Sanyo compact refrigerator,probably best small refrigerator market true f...,2000-09-29


In [10]:
df.loc[df['star_rating'] < 3, 'sentiment'] = 0
df.loc[df['star_rating'] == 3, 'sentiment'] = 1
df.loc[df['star_rating'] > 3, 'sentiment'] = 2
df.drop('star_rating', axis=1, inplace=True)
df = resample(df, n_samples=80000, random_state=42, replace=False)

In [11]:
df

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,sentiment
71963,US,23022807,R3IZQ5QBR0C7B7,B0039V7JFG,184993957,LG 3.6 CF FRONT LOAD WASHER DRYER COMBO,Major Appliances,5,14,N,N,lg small washer/dryer set,usually rate things hate exactly feel small lg...,2013-04-06,0.0
15508,US,21831333,R2LGGHUB944XMT,B00EJPHJVK,516641616,Fantech Lint Trap for Dryer Booster - DBLT4W (...,Major Appliances,0,0,N,Y,very hard to open must be securely mounted no ...,hard open must securely mounted provisions,2015-04-16,1.0
3110,US,11941986,R21U3YZPS3MBXG,B00HH2YZT6,73366442,Samsung RF32FMQDBSR 4-Door Refrigerator with C...,Major Appliances,6,6,N,N,15 month useful life - this should be disconti...,15 months stopped working threw groceries call...,2015-08-07,0.0
88804,US,46476694,R2JZ0YSJ5QEZX4,B001VKY8GU,232040651,Koolatron Coca Cola Personal Cube Fridge,Major Appliances,0,0,N,N,Unexpectedly Awesome,silver version tiny cooler fridge company chri...,2011-06-28,2.0
18295,US,48338607,R2QENB1J5LBM14,B00MZH0OPC,394118467,( 2 PACK ) 3392519 - DRYER THERMAL FUSE for Wh...,Major Appliances,0,0,N,Y,Perfect,exactly needed get dryer back,2015-03-22,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62973,US,31634986,R173X6QPB1N8SY,B004HXDLJ8,693470227,Whynter BWR-18SD 18 Bottle Built-In Wine Refri...,Major Appliances,11,12,N,Y,Very Happy,purchased replace space 12 34 garbage compacto...,2013-09-28,2.0
45682,US,52949439,R4YB9KY26NLPP,B003GFAY52,647457047,Broan 30W in. QP2 Under Cabinet Range Hood,Major Appliances,9,9,N,Y,Up until it quit working I though I had made a...,bought item professionally installed middle ap...,2014-07-07,0.0
84940,US,42829199,R1W9QEUT7MIYGD,B004WP4BAO,480751909,Samsung DV5451AGW,Major Appliances,4,4,N,N,Does NOT dry clothes. So not much of a dryer.,purchased nice looking supposedly good name br...,2012-02-18,0.0
1873,US,19293947,R12LFU2ZVPAZSS,B0125S2K0M,504103070,Avalon Top Loading Water Cooler Dispenser - Ho...,Major Appliances,144,156,N,N,Avalon water dispenser beat my many previous w...,videoid:8829556f67d2453e377e6459465db27e first...,2015-08-16,2.0


## 2. Final dataset
- 80 000 instances
- NEGATIVE 21 334
- NEUTRAL 5 674
- POSITIVE 52 992 
- 90:10 train:test split

In [12]:
df['sentiment'].value_counts()

sentiment
2.0    52992
0.0    21334
1.0     5674
Name: count, dtype: int64

In [13]:
# Train-test split
df_multimodal = df
df = df[['review_body', 'sentiment']]

X_train, X_test, y_train, y_test = train_test_split(
    df['review_body'], df['sentiment'], random_state=42, test_size=0.1, stratify=df['sentiment']
)
#Train-val split
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, stratify=y_train, random_state=42
)


In [14]:
y_train.value_counts()

sentiment
2.0    42924
0.0    17280
1.0     4596
Name: count, dtype: int64

In [15]:
y_test.value_counts()

sentiment
2.0    5299
0.0    2134
1.0     567
Name: count, dtype: int64

In [16]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
validation_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

### 3. Recurrent Neural Network

In [22]:
max_length = 100
max_tokens = 8000

text_vectorization = TextVectorization(
    ngrams=1,
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length
)

text_only_train_ds = train_dataset.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)

In [23]:
# Define a function to vectorize the texts
def vectorize_text(text, label):
    return text_vectorization(text), label

# Apply the vectorization to the training, validation, and test datasets
train_ds = train_dataset.map(vectorize_text).cache().shuffle(10000).batch(32).prefetch(buffer_size=tf.data.AUTOTUNE)
val_ds = validation_dataset.map(vectorize_text).cache().batch(32).prefetch(buffer_size=tf.data.AUTOTUNE)
test_ds = test_dataset.map(vectorize_text).cache().batch(32).prefetch(buffer_size=tf.data.AUTOTUNE)

In [26]:
inputs = keras.Input(shape=(max_length,), dtype="int64")
one_hot = tf.one_hot(inputs, depth=max_tokens)
x = layers.Bidirectional(layers.LSTM(16))(one_hot)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()



callbacks = [
    keras.callbacks.ModelCheckpoint("ann8.tf", save_best_only=True),
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, min_delta = 0.002, restore_best_weights=True)
]


log_dir = f"{dir}" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

callbacks = [
    keras.callbacks.ModelCheckpoint("ann8.tf", save_best_only=True),
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, min_delta = 0.002, restore_best_weights=True),
    tf.keras.callbacks.TensorBoard(log_dir=log_dir)
]

history = model.fit(
    train_ds.cache(),
    epochs=20,
    validation_data=val_ds.cache(),
    callbacks=[callbacks]
)

ValueError: A KerasTensor cannot be used as input to a TensorFlow function. A KerasTensor is a symbolic placeholder for a shape and dtype, used when constructing Keras Functional models or Keras Functions. You can only use it as input to a Keras layer or a Keras operation (from the namespaces `keras.layers` and `keras.operations`). You are likely doing something like:

```
x = Input(...)
...
tf_fn(x)  # Invalid.
```

What you should do instead is wrap `tf_fn` in a layer:

```
class MyLayer(Layer):
    def call(self, x):
        return tf_fn(x)

x = MyLayer()(x)
```
