In [53]:
import pandas as pd
import numpy as np

#
import string
import re

#
import matplotlib.pyplot as plt
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

#
import tensorflow as tf

#
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.regularizers import l1, l2, l1_l2

#
from tensorflow import random
from numpy.random import seed

seed_val = 1314
random.set_seed(seed_val)
seed(seed_val)

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


In [3]:
train.head()

Unnamed: 0,review_id,review,rating
0,0,Ga disappointed neat products .. Meletot Hilsn...,1
1,1,"Rdtanya replace broken glass, broken chargernya",1
2,2,Nyesel bngt dsni shopping antecedent photo mes...,1
3,3,Sent a light blue suit goods ga want a refund,1
4,4,Pendants came with dents and scratches on its ...,1


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146811 entries, 0 to 146810
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   review_id  146811 non-null  int64 
 1   review     146811 non-null  object
 2   rating     146811 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.4+ MB


In [5]:
test.head()

Unnamed: 0,review_id,review
0,0,slow delivery
1,1,Dateng goods do not conform pesanan😔
2,2,PSN k its 20 other DTG
3,3,I am expected that it have a frame and painted...
4,4,The product quality is not good.


In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62918 entries, 0 to 62917
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review_id  62918 non-null  int64 
 1   review     62918 non-null  object
dtypes: int64(1), object(1)
memory usage: 983.2+ KB


In [13]:
train['rating'].iplot(xTitle='Sentiment', yTitle='Reviews', kind='hist')

Text pre-processing

In [19]:
train.review[0]

'Ga disappointed neat products .. Meletot Hilsnyaa Speed \u200b\u200bof delivery is good.'

In [23]:
pd.set_option('display.max_colwidth', -1)

def clean(text):
    text = text.lower()
    text = re.sub('[^\w\s]', '', text) #This removes all the punctuations
    text = re.sub(r'\n',' ', text) #This relaces the \n with space
    text = re.sub(r'\r','', text) #\r
    text = re.sub('[^0-9a-z #+_]', '', text) #Special chars
    
    return text


Passing a negative integer is deprecated in version 1.0 and will not be supported in future version. Instead, use None to not limit the column width.



In [24]:
train['review'] = train['review'].apply(clean)

In [32]:
test['review'] = test['review'].apply(clean)

In [39]:
max(train.review.apply(len))

1236

Split into test/val set

In [7]:
sentence = train['review'].tolist()
label = train['rating'].tolist()

In [46]:
size = int(len(sentence) * 0.8)

train_sen = sentence[0:size]
val_sen = sentence[:size]

train_label = label[0:size]
val_label = label[:size]

Tokenize + Pad + Sequence

In [47]:
vocab_size = 10000
max_len = 1500
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"

In [48]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sen)
word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(train_sen)
training_padded = pad_sequences(training_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)

val_sequences = tokenizer.texts_to_sequences(val_sen)
val_padded = pad_sequences(val_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)

In [49]:
train_labels = np.array(train_label)
val_labels = np.array(val_label)

Build Model

Attention Layer

In [57]:
from tensorflow.keras import backend as K
#from keras.engine.topology import Layer
#from keras import initializations
from tensorflow.keras import initializers, regularizers, constraints


class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Example:
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention())
        """
        self.supports_masking = True
        #self.init = initializations.get('glorot_uniform')
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        # eij = K.dot(x, self.W) TF backend doesn't support it

        # features_dim = self.W.shape[0]
        # step_dim = x._keras_shape[1]

        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
    #print weigthted_input.shape
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        #return input_shape[0], input_shape[-1]
        return input_shape[0],  self.features_dim
        

In [70]:
bs = 32
embedding_dim = 16

tokens_input = Input(shape=(max_len,))
print(tokens_input)

# Embedding layer 
embedded_sequences = Embedding(vocab_size, embedding_dim, input_length=max_len, trainable=False)(tokens_input)

x = Bidirectional(GRU(32, return_sequences=True, dropout=0.2, recurrent_dropout=0.15))(embedded_sequences)

x = Attention(max_len)(x)

x = Dense(16, activation='relu')(x)
x = Dropout(0.2)(x)

x = Dense(8, activation='relu')(x)
x = Dropout(0.2)(x)

final = Dense(1, activation='sigmoid')(x)

model = Model(inputs=[tokens_input], outputs=[final], name='Attention')
model.summary()

Tensor("input_4:0", shape=(None, 1500), dtype=float32)
Model: "Attention"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 1500)]            0         
_________________________________________________________________
embedding_13 (Embedding)     (None, 1500, 16)          160000    
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 1500, 64)          9600      
_________________________________________________________________
attention_8 (Attention)      (None, 64)                1564      
_________________________________________________________________
dense_18 (Dense)             (None, 16)                1040      
_________________________________________________________________
dropout_10 (Dropout)         (None, 16)                0         
_________________________________________________________________
de

Compile

In [71]:
model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])

In [72]:
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-5)

earlystop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=9)

attention = model.fit(training_padded, train_labels, epochs=10, batch_size=bs,
                    callbacks=[earlystop, reduce_lr], validation_data=(val_padded, val_labels), verbose=1)

Epoch 1/10
   9/3671 [..............................] - ETA: 50:17 - loss: 0.6800 - accuracy: 0.0764

KeyboardInterrupt: 

GRU

In [74]:
bs = 32
embedding_dim = 16

tokens_input = Input(shape=(max_len,))
print(tokens_input)

# Embedding layer 
embedded_sequences = Embedding(vocab_size, embedding_dim, input_length=max_len, trainable=False)(tokens_input)

x = GRU(32, return_sequences=True, dropout=0.2, recurrent_dropout=0.15)(embedded_sequences)

x = Dense(16, activation='relu')(x)
x = Dropout(0.2)(x)

x = Dense(8, activation='relu')(x)
x = Dropout(0.2)(x)

final = Dense(1, activation='sigmoid')(x)

model = Model(inputs=[tokens_input], outputs=[final], name='Attention')
model.summary()

Tensor("input_6:0", shape=(None, 1500), dtype=float32)
Model: "Attention"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, 1500)]            0         
_________________________________________________________________
embedding_15 (Embedding)     (None, 1500, 16)          160000    
_________________________________________________________________
gru_14 (GRU)                 (None, 1500, 32)          4800      
_________________________________________________________________
dense_24 (Dense)             (None, 1500, 16)          528       
_________________________________________________________________
dropout_14 (Dropout)         (None, 1500, 16)          0         
_________________________________________________________________
dense_25 (Dense)             (None, 1500, 8)           136       
_________________________________________________________________
dr

In [75]:
model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])

In [76]:
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-5)

earlystop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=9)

attention = model.fit(training_padded, train_labels, epochs=10, batch_size=bs,
                    callbacks=[earlystop, reduce_lr], validation_data=(val_padded, val_labels), verbose=1)

Epoch 1/10
  19/3671 [..............................] - ETA: 35:23 - loss: 0.5167 - accuracy: 0.1197

KeyboardInterrupt: 

Plot

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("ggplot")

# list all data in history
print(history.attention.keys())

plt.title('Attention')

# summarize history for accuracy
plt.plot(history.attention['accuracy'])
plt.plot(history.attention['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.attention['loss'])
plt.plot(history.attention['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
ascore = model1.evaluate(x_test, y_test)

In [None]:
yhat4 = model4.predict(te_data, verbose=0)

In [None]:
# Use the model to predict a review   
fake_reviews = ['I love this phone', 'I hate spaghetti', 
                'Everything was cold',
                'Everything was hot exactly as I wanted', 
                'Everything was green', 
                'the host seated us immediately',
                'they gave us free chocolate cake', 
                'not sure about the wilted flowers on the table',
                'only works when I stand on tippy toes', 
                'does not work when I stand on my head']

print(fake_reviews) 

# Create the sequences
padding_type='post'
sample_sequences = tokenizer.texts_to_sequences(fake_reviews)
fakes_padded = pad_sequences(sample_sequences, padding=padding_type, maxlen=max_length)           

classes = model.predict(fakes_padded)