In [290]:
import matplotlib.pyplot as plt
import keras
import tensorflow as tf
from sklearn.model_selection import train_test_split
import pandas as pd
import nltk
import numpy as np
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
import string
import re
from nltk.stem import PorterStemmer
from keras.layers import TextVectorization,MultiHeadAttention,Dense,Embedding,Dropout,Input,LayerNormalization,Flatten

In [291]:
def positional_embedding(emb_dim=256, sequence_length=64):
    """positional embedding for each input token"""
    output = []
    for pos in range(sequence_length):
        PE = np.zeros(emb_dim, dtype="float32");
        for i in range(emb_dim):
            if i % 2 == 0:
                PE[i] = np.sin(pos / 10000 ** (i / emb_dim))
            else:
                PE[i] = np.cos(pos / 10000 ** ((i - 1) / emb_dim))
        output.append(tf.expand_dims(PE, axis=0))
    out = tf.concat(output, axis=0)
    out = tf.expand_dims(out, axis=0)
    return out

In [292]:
class Embeddings(keras.layers.Layer):
    def __init__(self, vocab_size, emb_dim, sequence_length):
        super(Embeddings, self).__init__()
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.sequence_length = sequence_length
        self.token_embeddings = keras.layers.Embedding(input_dim=vocab_size, output_dim=emb_dim)

    def call(self, inputs, *args, **kwargs):
        x = self.token_embeddings(inputs)
        y = positional_embedding(emb_dim=self.emb_dim, sequence_length=self.sequence_length)
        return x + y

    def compute_mask1(self, input):
        return tf.math.not_equal(input, 0)

    def get_config(self):
        config = super().get_config()
        config.update({
            "vocab_size": self.vocab_size,
            "emb_dim": self.emb_dim,
            "sequence_length": self.sequence_length
        })
        return config

In [293]:
class TransformerEncoder(keras.layers.Layer):
    def __init__(self, num_heads, dense_dim, emd_dim):
        super(TransformerEncoder, self).__init__()
        self.num_heads = num_heads
        self.dense_dim = dense_dim
        self.emd_dim = emd_dim

        self.attention1 = MultiHeadAttention(num_heads=self.num_heads, key_dim=self.emd_dim)
        self.layernorm1 = LayerNormalization()
        self.layernorm2 = LayerNormalization()
        self.linear_projection = keras.models.Sequential(
            [
                keras.layers.Dense(units=self.dense_dim, activation="relu"),
                keras.layers.Dense(units=self.emd_dim)
            ]
        )

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = tf.cast(mask[:, newaxis, :], dtype="int32")
            T = tf.shape(mask)[2]
            mask = tf.repeat(mask, T, axis=1)

        attention_output1 = self.attention1(query=inputs, key=inputs, value=inputs, attention_mask=mask)

        norm1 = self.layernorm1(attention_output1 + inputs)
        linear_proj = self.linear_projection(norm1)
        return self.layernorm2(linear_proj + norm1)

    def get_config(self):
        config = super().get_config()
        config.update({
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
            "emd_dim": self.emd_dim
        })
        return config

In [294]:
VOCAB_SIZE = 10000
EMBEDDING_DIM = 256
num_heads = 2
num_layers = 1
dense_dim = 1024
SEQUENCE_LENGTH = 250
BATCH_SIZE = 64

In [295]:
data = pd.read_csv("G:\Ajay\dataset\IMDB Movie dataset\IMDB Dataset.csv")[:10000]
data.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [296]:
def cleanup_text(text):
    pc = PorterStemmer()
    text=text.translate(str.maketrans('', '',string.punctuation))
    words = [pc.stem(word.lower()) for word in text.split() if word not in stop_words]    
    return " ".join(words)

def change_sentiment(sentiment):
    if sentiment == "positive":
       return 1
    else:
        return 0

In [297]:
data["review"]=data["review"].apply(lambda x:cleanup_text(x))
data["sentiment"]=data["sentiment"].apply(lambda x:change_sentiment(x))

In [298]:
data.head()

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod youll hoo...,1
1,a wonder littl product br br the film techniqu...,1
2,i thought wonder way spend time hot summer wee...,1
3,basic there famili littl boy jake think there ...,0
4,petter mattei love time money visual stun film...,1


In [299]:
data["sentiment"].value_counts()

1    5028
0    4972
Name: sentiment, dtype: int64

In [300]:
type(data.iloc[0][0])

str

In [301]:
vectorizer_layer = TextVectorization(
    max_tokens = VOCAB_SIZE,
    standardize='lower_and_strip_punctuation',
    output_mode='int',
    output_sequence_length=SEQUENCE_LENGTH,
    name="vectorizer_layer"
)

In [302]:
vectorizer_layer.adapt(data["review"])

In [303]:
def vectorizer(review):
    return vectorizer_layer(review)

In [304]:
X = data.apply(lambda x : vectorizer(x["review"]),axis=1)

In [305]:
Y = data["sentiment"]

In [306]:
type(X),type(Y)

(pandas.core.series.Series, pandas.core.series.Series)

In [307]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=0,stratify=Y)

In [308]:
type(x_train),x_train

(pandas.core.series.Series,
 5073    (tf.Tensor(2, shape=(), dtype=int64), tf.Tenso...
 3656    (tf.Tensor(6296, shape=(), dtype=int64), tf.Te...
 2737    (tf.Tensor(10, shape=(), dtype=int64), tf.Tens...
 1211    (tf.Tensor(7, shape=(), dtype=int64), tf.Tenso...
 3882    (tf.Tensor(2, shape=(), dtype=int64), tf.Tenso...
                               ...                        
 5557    (tf.Tensor(1, shape=(), dtype=int64), tf.Tenso...
 838     (tf.Tensor(902, shape=(), dtype=int64), tf.Ten...
 7680    (tf.Tensor(3585, shape=(), dtype=int64), tf.Te...
 3114    (tf.Tensor(10, shape=(), dtype=int64), tf.Tens...
 6720    (tf.Tensor(9, shape=(), dtype=int64), tf.Tenso...
 Length: 8000, dtype: object)

## Build Model

In [317]:
encoder_input = keras.layers.Input(shape=(None,))
emb = Embeddings(VOCAB_SIZE, EMBEDDING_DIM, sequence_length=SEQUENCE_LENGTH)
x = emb(encoder_input)
enc_mask = emb.compute_mask1(encoder_input)

for i in range(num_layers):
    x = TransformerEncoder(num_heads, dense_dim, EMBEDDING_DIM)(x)

x = Flatten()(x)
output = keras.layers.Dense(1, activation="softmax")(x)
model = keras.models.Model(inputs=encoder_input, outputs=output)

In [310]:
checkpoint_dir = "./sentiment_anaysis_with_transformer.h5"
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_dir,
    monitor="val_accuracy",
    save_best_only=True,
    mode="max"
)

In [311]:
x_train = np.asarray(x_train)
type(x_train),x_train.shape

(numpy.ndarray, (8000,))

In [312]:
from sklearn.utils import shuffle
x_train,y_train = shuffle(x_train,y_train)
x_test,y_test = shuffle(x_test,y_test)

In [313]:
tf_x_train = []
for i in x_train:
    tf_x_train.append(i)
tf_x_train=tf.convert_to_tensor(tf_x_train)

tf_y_train = tf.convert_to_tensor(y_train.values)
tf_y_test  = tf.convert_to_tensor(y_test.values)

tf_x_test = []
for i in x_test:
    tf_x_test.append(i)
tf_x_test=tf.convert_to_tensor(tf_x_test)


In [None]:
adamOpt = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=adamOpt, loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), metrics=["accuracy"])
history=model.fit(tf_x_train,tf_y_train,validation_data=(tf_x_test,tf_y_test),epochs=20,batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20