<h1><right>Dataset</center></h1>
 
#### **Dataset Download Reference Link : http://ai.stanford.edu/~amaas/data/sentiment/**
#### > Large Movie Review Dataset 
#### > This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. 
#### > This Dataset contains 25,000 highly polar movie reviews for training, and 25,000 for testing. 

#### Dowload the Dataset & Install other helper modules

In [1]:
# !wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz -O aclImdb_v1.tar.gz
# !tar -xvzf aclImdb_v1.tar.gz
# !wget http://nlp.uoregon.edu/download/embeddings/glove.6B.50d.txt -O glove.6B.50d.txt

# !python3 -m pip install --upgrade pip
# !pip3 install nltk
# !pip install pandas

In [2]:
import os
import numpy as np
import pandas as pd
import nltk
import string
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()
tf.disable_eager_execution()

2023-08-25 10:06:15.974136: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Instructions for updating:
non-resource variables are not supported in the long term


In [3]:
TRAIN_PATH_POS = "aclImdb/train/pos/"
TRAIN_PATH_NEG = "aclImdb/train/neg/"

TEST_PATH_POS = "aclImdb/test/pos/"
TEST_PATH_NEG = "aclImdb/test/neg/"

GLOVE_PATH = "glove.6B.50d.txt"

In [4]:
BATCH_SIZE = 32
LEARNING_RATE = 0.001
min_length = 30
max_length = 200

In [5]:
class ReadDataSet:

    def __init__(self, path, label, min_length=30, max_length=200):
        self.path = path
        self.label = label
        self.dataset = []
        self.min_length = min_length
        self.max_length = max_length

        
    def parse(self):
        self.label_array = [1,0] if self.label == "positive" else [0,1]
        for root, subfolder, file_names in os.walk(self.path):
            for file_name in file_names:
                temp_file = open(os.path.join(root, file_name), encoding="utf8")
                text = temp_file.readline()
                length = sum([1 for word in text.split(" ") if word])
                if length > self.min_length and length < self.max_length+1:
                    self.dataset.extend([{"FileName":file_name, "Label":self.label_array, "Text":text}])
                temp_file.close()
        return pd.DataFrame(self.dataset)

In [6]:
train_pos_samples = ReadDataSet(TRAIN_PATH_POS, "positive").parse()
train_neg_samples = ReadDataSet(TRAIN_PATH_NEG, "negative").parse()

test_pos_samples = ReadDataSet(TEST_PATH_POS, "positive").parse()
test_neg_samples = ReadDataSet(TEST_PATH_NEG, "negative").parse()

train_samples = pd.concat([train_pos_samples, train_neg_samples], ignore_index=True)
test_samples  = pd.concat([test_pos_samples, test_neg_samples], ignore_index=True)

In [7]:
class CleanDataSet:
    
    def __init__(self,data_frame):
        self.data_frame = data_frame
        
    def remove_punctuations(self,text):
        text_nopunct = "".join([char for char in text if char not in string.punctuation])
        return text_nopunct  

    def remove_extraspaces(self, text):
        return [ item for item in text.split() if item]
    
    def clean(self):
        self.data_frame["Text"]   = self.data_frame["Text"].apply(lambda x: x.lower())
        self.data_frame["Text"]   = self.data_frame["Text"].apply(lambda x: self.remove_punctuations(x))
        self.data_frame["Text"]   = self.data_frame["Text"].apply(lambda x: self.remove_extraspaces(x))
        self.data_frame["Length"] = self.data_frame["Text"].apply(lambda x: len(x))
        return self.data_frame

In [8]:
train_samples = CleanDataSet(train_samples).clean()
test_samples = CleanDataSet(test_samples).clean()

In [9]:
class Encoder:
    
    def __init__(self, glove_path):
        
        self.glove_path = glove_path
        self.glove_dict = {}
        self.read_glove()
        self.ps = nltk.PorterStemmer()

    def read_glove(self):
        glove_file = open(self.glove_path, encoding="utf8")
        for line in glove_file.readlines():
            line_list = line.split(" ")
            temp_list = line_list[1:-1]
            temp_list.append(line_list[-1].split("\n")[0])
            float_vector = [float(vector) for vector in temp_list]
            self.glove_dict[line_list[0]] = np.array(float_vector)
        glove_file.close()
        
    def glove_formatting(self, x):
        vec_list = []
        for item in x:
            try:
                vec_list.append(self.glove_dict[item])
            except:
                try:
                    vec_list.append(self.glove_dict[self.ps.stem(item)])
                except:
                    vec_list.append(np.array([0.0 for i in range(50)]))
        return np.array(vec_list).astype("float16")
    
    def encode(self, data_frame):
        data_frame["Embedding"] = data_frame["Text"].apply(lambda x : self.glove_formatting(x))
        return data_frame
        

In [10]:
encoder = Encoder(GLOVE_PATH)
train_samples = encoder.encode(train_samples)
test_samples = encoder.encode(test_samples)

In [11]:
train_samples.sort_values(by="Embedding", key=lambda x: x.str.len())

Unnamed: 0,FileName,Label,Text,Length,Embedding
11541,9599_1.txt,"[0, 1]","[boring, badly, written, italian, exploitation...",31,"[[-0.03568, -0.4177, -0.441, -0.3545, -0.2876,..."
8655,3935_1.txt,"[0, 1]","[cool, idea, botched, writing, botched, direct...",31,"[[-0.656, 0.4565, -0.1675, -0.5835, -0.2307, -..."
859,2188_10.txt,"[1, 0]","[with, very, little, screen, time, and, money,...",31,"[[0.256, 0.437, -0.1189, 0.2035, 0.4197, 0.858..."
7550,2267_1.txt,"[0, 1]","[please, avoid, this, movie, at, all, costs, t...",31,"[[-0.434, 0.7397, 0.784, -0.4192, 0.479, -0.90..."
10861,10387_1.txt,"[0, 1]","[giant, crabs, cursing, in, japanese, what, wa...",31,"[[1.247, -0.2174, 0.4365, 1.927, 0.499, 0.2177..."
...,...,...,...,...,...
8942,1750_4.txt,"[0, 1]","[priyadarshans, hera, pheri, was, a, nice, sit...",200,"[[0.1812, -1.669, -0.3264, -0.1554, -1.024, 0...."
721,12494_8.txt,"[1, 0]","[finally, an, indie, film, that, actually, del...",200,"[[0.1626, -0.3728, -0.02248, -0.505, 0.2747, -..."
1917,3246_9.txt,"[1, 0]","[i, first, saw, this, movie, on, cable, about,...",200,"[[0.1189, 0.1526, -0.0821, -0.741, 0.7593, -0...."
1989,6229_7.txt,"[1, 0]","[one, of, several, musicals, about, sailors, o...",200,"[[0.3147, 0.4165, 0.1348, 0.1586, 0.888, 0.433..."


In [12]:
test_samples.sort_values(by="Embedding", key=lambda x: x.str.len())

Unnamed: 0,FileName,Label,Text,Length,Embedding
12521,2164_4.txt,"[0, 1]","[my, first, thoughts, on, this, film, were, of...",31,"[[-0.2727, 0.7754, -0.1018, -0.9165, 0.905, -0..."
7168,11360_10.txt,"[1, 0]","[i, smiled, through, the, whole, film, the, mu...",31,"[[0.1189, 0.1526, -0.0821, -0.741, 0.7593, -0...."
11777,7628_1.txt,"[0, 1]","[what, a, stinker, i, swear, this, movie, was,...",31,"[[0.4531, 0.0598, -0.1058, -0.333, 0.7236, -0...."
13673,361_1.txt,"[0, 1]","[i, have, never, fallen, asleep, whilst, watch...",31,"[[0.1189, 0.1526, -0.0821, -0.741, 0.7593, -0...."
10430,5255_1.txt,"[0, 1]","[im, sure, he, doesnt, need, the, money, for, ...",31,"[[-0.0677, 0.5186, 1.326, -0.3867, -0.7974, -1..."
...,...,...,...,...,...
12102,6877_3.txt,"[0, 1]","[here, he, is, a, new, horror, icon, for, the,...",200,"[[0.141, 0.682, -0.504, 0.383, 0.6343, -1.186,..."
13108,9483_2.txt,"[0, 1]","[the, boys, were, the, most, appealing, things...",200,"[[0.418, 0.2496, -0.4124, 0.1217, 0.3452, -0.0..."
4599,6346_10.txt,"[1, 0]","[ive, seen, lonesome, dove, dead, mans, walk, ...",200,"[[0.2286, -0.7954, -0.4978, -1.086, 0.03766, -..."
1339,10321_10.txt,"[1, 0]","[this, is, the, very, first, three, stooges, s...",200,"[[0.531, 0.4011, -0.408, 0.1544, 0.4778, 0.207..."


In [13]:
class Model:
    
    def __init__(self, CellDim=50, Classes=2, NumLayers=1, DropRate=0.4):
        
        self.input = tf.placeholder(shape=(None, None, CellDim), dtype="float32", name="input")
        self.input_length = tf.placeholder(shape=(None), dtype="int32", name="input_length")
        self.drop_rate = tf.placeholder(dtype=tf.float32, name="dropout_rate")
        self.ground_truth = tf.placeholder(shape=(None, Classes), dtype="float32", name="groundtruth")
        self.CellDim = CellDim
        self.Classes = Classes
        self.NumLayers = NumLayers
        self.DropRate = DropRate
    
    def forward(self):

        output, state = tf.nn.dynamic_rnn(cell=tf.nn.rnn_cell.LSTMCell(self.CellDim), 
                                          inputs=self.input, 
                                          sequence_length=self.input_length, 
                                          dtype="float32")
        
        for i in range(self.NumLayers - 1):
            with tf.variable_scope("Layer_"+str(i)):
                output, state = tf.nn.dynamic_rnn(cell=tf.nn.rnn_cell.LSTMCell(self.CellDim), 
                                              inputs=output, 
                                              sequence_length=self.input_length, 
                                              dtype="float32")
            
        dropout = tf.nn.dropout(state.h, rate=self.DropRate)
        self.predictions = tf.layers.dense(dropout, units=self.Classes, activation=tf.nn.softmax, name="predictions")
        
        return self.input, self.input_length, self.drop_rate, self.ground_truth, self.predictions

In [14]:
SA_Model = Model(CellDim=50, Classes=2, NumLayers=1, DropRate=0.4)

In [15]:
input_tensor, input_length, drop_rate, ground_truth, predictions = SA_Model.forward()

Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


  output, state = tf.nn.dynamic_rnn(cell=tf.nn.rnn_cell.LSTMCell(self.CellDim),
  self.predictions = tf.layers.dense(dropout, units=self.Classes, activation=tf.nn.softmax, name="predictions")


In [16]:
class Trainer:
    def __init__(self, ground_truth, predictions):
        self.ground_truth = ground_truth
        self.predictions = predictions

    def metrics(self):                                                                                                            
        self.loss = tf.compat.v1.losses.softmax_cross_entropy(self.ground_truth, self.predictions)
        self.accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(self.predictions, 1), 
                                                   tf.argmax(self.ground_truth, 1)), "float32"))
        return self.loss, self.accuracy
    
    def apply(self):
        self.loss, self.accuracy = self.metrics()
        Optimizer = tf.train.AdamOptimizer().minimize(self.loss)
        Session = tf.Session()
        Session.run(tf.global_variables_initializer())
        return Optimizer, Session, self.loss, self.accuracy

In [17]:
Optimizer, Session, loss, accuracy = Trainer(ground_truth, predictions).apply()

2023-08-25 10:06:29.888569: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13604 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:81:00.0, compute capability: 7.5
2023-08-25 10:06:29.894474: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:353] MLIR V1 optimization pass is not enabled


In [18]:
class Train(Model):
    
    def __init__(self, train_data, test_data, input_tensor, input_length, drop_rate, ground_truth, predictions, \
                 Optimizer, Session, Loss, Accuracy, BATCH_SIZE, Epochs):
        
        self.loss = []
        self.accuracy = []
        
        self.train_data = train_data
        self.test_data = test_data
        
        self.input_tensor = input_tensor
        self.input_length = input_length
        self.droprate= drop_rate
        self.prediction = predictions
        
        self.Optimizer = Optimizer
        self.Session = Session
        self.Loss = Loss
        self.Accuracy = Accuracy
        self.BATCH_SIZE = BATCH_SIZE
        self.Epochs = Epochs
        
        self.run()
                                                
    def train(self, input_data):
        
        loss = []
        accuracy = []
        
        for j in range(len(input_data)//BATCH_SIZE):

            input_batch = []
            groundtruth_batch = []
            length_batch = []

            for i in range(j, j+BATCH_SIZE) :
                groundtruth_batch.append(input_data["Label"][i])
                length_batch.append(input_data["Length"][i])

            
            for i in range(j, j+BATCH_SIZE): 
                data_array = np.zeros([max(length_batch), 50]).astype("float32")
                (a,b) = input_data["Embedding"][i].shape
                data_array[:a, :b] = input_data["Embedding"][i].astype("float32")
                input_batch.append(data_array)
                
            _, batch_loss, batch_accuracy = self.Session.run([Optimizer, loss, accuracy], 
                                                 feed_dict={input_tensor: np.array(input_batch).astype("float32"),
                                                            input_length : np.array(length_batch).astype("float32"),
                                                            drop_rate : 0.4, 
                                                            ground_truth : np.array(groundtruth_batch).astype("float32")})
            loss.append(batch_loss)
            accuracy.append(batch_accuracy)
        print("Train Loss : {:.5f}    |   Train Accuracy : {:.5f}".format(sum(self.loss)/len(self.loss), sum(self.accuracy)/len(self.accuracy)))
        
    def test(self, input_data):
        
        loss = []
        accuracy = []
        
        for j in range(len(input_data)//BATCH_SIZE):

            input_batch = []
            groundtruth_batch = []
            length_batch = []

            for i in range(j, j+BATCH_SIZE) :
                groundtruth_batch.append(input_data["Label"][i])
                length_batch.append(input_data["Length"][i])

            
            for i in range(j, j+BATCH_SIZE): 
                data_array = np.zeros([max(length_batch), 50]).astype("float32")
                (a,b) = input_data["Embedding"][i].shape
                data_array[:a, :b] = input_data["Embedding"][i].astype("float32")
                input_batch.append(data_array)
                
            batch_loss, batch_accuracy = self.Session.run([loss, accuracy], 
                                                 feed_dict={input_tensor: np.array(input_batch).astype("float32"),
                                                            input_length : np.array(length_batch).astype("float32"),
                                                            drop_rate : 0.0, 
                                                            ground_truth : np.array(groundtruth_batch).astype("float32")})
            
            loss.append(batch_loss)
            accuracy.append(batch_accuracy)
        print("Test  Loss : {:.5f}    |   Test  Accuracy : {:.5f}".format(sum(loss)/len(loss), sum(accuracy)/len(accuracy)))

    def run(self):
        print("*"*100)
        print("Initial Run on Data Set without Optimizer : ")
        self.test(self.train_data)
        self.test(self.test_data)
        for n in range(self.Epochs):
            print("*"*100)
            print("Epoch : {:03d}".format(n+1))
            self.train(self.train_data)
            self.test(self.test_data)
        

In [None]:
RUNNER = Train(train_samples, test_samples, input_tensor, input_length, drop_rate, ground_truth, predictions, \
                 Optimizer, Session, loss, accuracy, BATCH_SIZE, 100)

****************************************************************************************************
Initial Run on Trian Set without Optimizer : 
Test Loss  : 0.68182    |   Test Accuracy  : 0.57319
Test Loss  : 0.68183    |   Test Accuracy  : 0.56102
****************************************************************************************************
Epoch : 001
Train Loss : 0.36775    |   Train Accuracy : 0.98643
Test Loss  : 0.31585    |   Test Accuracy  : 1.00000
****************************************************************************************************
Epoch : 002
Train Loss : 0.31452    |   Train Accuracy : 1.00000
Test Loss  : 0.31396    |   Test Accuracy  : 1.00000
****************************************************************************************************
Epoch : 003
Train Loss : 0.31374    |   Train Accuracy : 1.00000
Test Loss  : 0.31363    |   Test Accuracy  : 1.00000
******************************************************************************************

Train Loss : 0.31326    |   Train Accuracy : 1.00000
Test Loss  : 0.31326    |   Test Accuracy  : 1.00000
****************************************************************************************************
Epoch : 038
Train Loss : 0.31326    |   Train Accuracy : 1.00000
Test Loss  : 0.31326    |   Test Accuracy  : 1.00000
****************************************************************************************************
Epoch : 039
Train Loss : 0.31326    |   Train Accuracy : 1.00000
Test Loss  : 0.31326    |   Test Accuracy  : 1.00000
****************************************************************************************************
Epoch : 040
Train Loss : 0.31326    |   Train Accuracy : 1.00000
Test Loss  : 0.31326    |   Test Accuracy  : 1.00000
****************************************************************************************************
Epoch : 041
Train Loss : 0.31326    |   Train Accuracy : 1.00000
Test Loss  : 0.31326    |   Test Accuracy  : 1.00000
******************