In [None]:
#Importing necessary libraries
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing import text_dataset_from_directory
import re
import string
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import LeakyReLU

In [4]:
#uploading file path [install winrar and extract the file through that, and copy path as shown below]
base_path ='C:/Users/adapa/Downloads/aclImdb_v1/aclImdb'
imdb_path_train='/train'
imdb_path_test='/test'

#the path to the training set of the IMDb dataset to create the full path to the training set
full_path = base_path+imdb_path_train
batch_size = 32

#to generate a TensorFlow dataset that includes the training, testing, and validation sets
full_path = base_path+imdb_path_train
df_imdb_train = text_dataset_from_directory(full_path,batch_size= batch_size, validation_split=0.2, subset='training', seed= 42 )
df_imdb_val = text_dataset_from_directory(full_path,batch_size= batch_size, validation_split=0.2, subset='validation', seed= 42 )
full_path = base_path+imdb_path_test
df_imdb_test = text_dataset_from_directory(full_path,batch_size= batch_size)


Found 75000 files belonging to 3 classes.
Using 60000 files for training.
Found 75000 files belonging to 3 classes.
Using 15000 files for validation.
Found 25000 files belonging to 2 classes.


In [5]:
#prints information about the generated TensorFlow datasets and ensures to verify correct dataset
print(type(df_imdb_train))
print(df_imdb_train.take(1))
print(f"Number of batches in df_imdb_train: {df_imdb_train.cardinality()}")
print(f"Number of batches in df_imdb_val: {df_imdb_val.cardinality()}")
print(f"Number of batches in df_imdb_test: {df_imdb_test.cardinality()}")

<class 'tensorflow.python.data.ops.batch_op._BatchDataset'>
<_TakeDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>
Number of batches in df_imdb_train: 1875
Number of batches in df_imdb_val: 469
Number of batches in df_imdb_test: 782


In [6]:
#method to examine the dataset's contents and confirm its successful loading and processing
for text_batch, label_batch in df_imdb_train.take(1):
    for i in range(10):
        print(text_batch.numpy()[i])
        print(label_batch.numpy()[i])

b'There is this father-son conversation in the climax of \'KALPURUSH\'. I quote the English DVD-subtitle version. Shumonto tells his father: "I may not have become someone, but when I see two people in love, I smile. And when I see someone eating alone, I cry." Ashvini, his father, replies wistfully: "I wish I could\'ve lived my life like you did." These 2 lines, perhaps, comprise the gist of this new film by Buddhadev Dasgupta - director of teeny-weeny gems like \'Tahader Katha\', \'Bagh Bahadur\', \'Uttara\' & \'Mondo Meyer Upakhyan\' - which took nearly 3 years to reach the cinemas in India.<br /><br />The film opens with a man called Ashvini following a younger man called Shumonto, who, we are told, is his son. It seems that the father is stalking - or haunting, rather - his son. As the film progresses and we meet Shumonto\'s ambitious wife, Supriya, and his mother, Koyel, who seems to be tied up with something in her past, we realise that the son is, indeed, haunted by his father 

In [7]:
def custom_standarize_data(data):
    data_filtered = tf.strings.lower(data)
    data_filtered = tf.strings.regex_replace(data_filtered, "<br />", " ")
    data_filtered = tf.strings.regex_replace(data_filtered, f"[{re.escape(string.punctuation)}]", "")
    return data_filtered

In [11]:
max_features = 20000

unigram_vectorizer = TextVectorization(
    standardize=custom_standarize_data,
    max_tokens=max_features,
    output_mode='tf-idf',
    ngrams=1,
    #output_sequence_length=sequence_length)# can ony be used when output_mode = int
    )
#bi-gram model
bigram_vectorizer = TextVectorization(
    standardize=custom_standarize_data,
    max_tokens=max_features,
    output_mode='tf-idf',
    ngrams=2,
    #output_sequence_length=sequence_length)
    )

In [12]:
text_ds = df_imdb_train.map(lambda x, y: x)
unigram_vectorizer.adapt(text_ds)
bigram_vectorizer.adapt(text_ds)

In [15]:
#sequential model using the Keras API, predict binary sentiment labels, bag-of-words model with unigrams
unigram_model = tf.keras.models.Sequential([
    unigram_vectorizer,
    tf.keras.layers.Dense(52, activation=LeakyReLU(alpha=0.1)),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
#compile and fit
unigram_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
unigram_model.fit(df_imdb_train, validation_data=df_imdb_val, epochs=9)

loss, accuracy = unigram_model.evaluate(df_imdb_test)

print('performance of the unigram mode:')
print("accuracy of the test set: {:.2f}".format(accuracy))
loss, accuracy = unigram_model.evaluate(df_imdb_test)
print("accuracy of the test set: {:.2f}".format(accuracy))
#create a bag-of-words model with bigrams, LeakyReLU activation function
bigram_model = tf.keras.models.Sequential([
    bigram_vectorizer,
    tf.keras.layers.Dense(52, activation=LeakyReLU(alpha=0.1)),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

#compile and fit the bag-of-words model with bigrams
bigram_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
bigram_model.fit(df_imdb_train, validation_data=df_imdb_val, epochs=10)

print('Bigram model performance:')
#NOTE: We can add an embedding layer in another implementation
loss, accuracy = bigram_model.evaluate(df_imdb_test)
print("accuracy of the test set: {:.2f}".format(accuracy))


Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
performance of the unigram mode:
accuracy of the test set: 0.50
accuracy of the test set: 0.50
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Bigram model performance:
accuracy of the test set: 0.50


In [16]:
print(type(df_imdb_test))

<class 'tensorflow.python.data.ops.batch_op._BatchDataset'>


In [17]:
#sequential model with LSTM and embedding layer
seq_vectorizer = TextVectorization(
    standardize=custom_standarize_data,
    max_tokens=max_features)

seq_vectorizer.adapt(text_ds)

embedding_dim = 400
seq_model = tf.keras.models.Sequential([seq_vectorizer,
                                         tf.keras.layers.Embedding(input_dim = max_features, output_dim= embedding_dim),
                                         tf.keras.layers.Dropout(0.2),
                                         tf.keras.layers.LSTM(32),
                                         tf.keras.layers.Dropout(0.4),
                                         tf.keras.layers.Dense(1,activation="sigmoid")])

#compile and fit the bag-of-words model with bigrams
seq_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
bigram_model.fit(df_imdb_train, validation_data=df_imdb_val, epochs=24)

print('performance of the sequential model:')
loss, accuracy = bigram_model.evaluate(df_imdb_test)



Epoch 1/24
Epoch 2/24
Epoch 3/24
Epoch 4/24
Epoch 5/24
Epoch 6/24
Epoch 7/24
Epoch 8/24
Epoch 9/24
Epoch 10/24
Epoch 11/24
Epoch 12/24
Epoch 13/24
Epoch 14/24
Epoch 15/24
Epoch 16/24
Epoch 17/24
Epoch 18/24
Epoch 19/24
Epoch 20/24
Epoch 21/24
Epoch 22/24
Epoch 23/24
Epoch 24/24
performance of the sequential model:
