In [1]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
# nltk.download('all')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
stopwords = set(stopwords.words('english'))
from sklearn.metrics import classification_report, confusion_matrix
import os
from tqdm import tqdm
tqdm.pandas()
from collections import Counter

import os, pathlib, shutil, random
import tensorflow as tf

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


## Download IMDB Data

In [2]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz
!rm -r aclImdb/train/unsup

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  10.1M      0  0:00:07  0:00:07 --:--:-- 17.2M


In [3]:
# read data
# data = pd.read_csv('data/IMDB Dataset.csv')
# data.head()

## Prepare data

### Creating and organizing files in folders

In [4]:
batch_size = 32
base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"
for category in ("neg", "pos"):
    os.makedirs(val_dir / category)
    files = os.listdir(train_dir / category)
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2 * len(files))
    val_files = files[-num_val_samples:]
    for fname in val_files:
        shutil.move(train_dir / category / fname,
                    val_dir / category / fname)



### Load train, validation and test datasets from folders

In [5]:
train_ds = tf.keras.utils.text_dataset_from_directory(
    "aclImdb/train", batch_size=batch_size
)
val_ds = tf.keras.utils.text_dataset_from_directory(
    "aclImdb/val", batch_size=batch_size
)
test_ds = tf.keras.utils.text_dataset_from_directory(
    "aclImdb/test", batch_size=batch_size
)


Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [6]:
text_only_train_ds = train_ds.map(lambda x, y: x)

In [7]:
counter = 0
for x, y in train_ds:
    for text, label in zip(x,y):
        counter += 1
        print("Review :", text)
        print("Label: ", label)
        print("\n")
        if counter > 5:
            break
    break

Review : tf.Tensor(b"Kirstie Alley, looking a bit slimmer, but only a bit, is in this mess along with a man who is a MacGuyver lookalike, bleached blond hair and all. The premise of the movie is about an older woman (50!!!) who cannot get her screenplay produced due to age discrimination so she sends in her younger nephew to pose as the writer. Not an original idea and not a very good movie with lousy acting, inane dialogue and a ridiculous plot. There is another plot concerning a writer with a crush or admiration for Kirstie's character and why this is included is a mystery. The actor who portrays Kirstie's brother is so wooden and miscast, it was torture to watch their scenes. What is there to say about this film. Avoid it.", shape=(), dtype=string)
Label:  tf.Tensor(0, shape=(), dtype=int32)


Review : tf.Tensor(b"I had a chance to see a screening of this movie recently. I believe that it will be in theaters in Canada some time around Mother's Day. If it is in a theater near you... 

### Convert text data into numeric

In [8]:


max_length = 600
max_tokens = 20000
text_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
)
text_vectorization.adapt(text_only_train_ds)
int_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
int_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
int_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)




In [9]:
counter = 0
for x, y in int_train_ds:
    for text, label in zip(x,y):
        counter += 1
        print("Review :", text)
        print("Label: ", label)
        print("\n")
        if counter >= 1:
            break
    break

Review : tf.Tensor(
[   11    18    14   641    33  2539  7030    38    23   118    12     2
   873   152    26    73   285     6   162  5097    13   254    47   719
   196   724     2  9553     8    11    18    80     5     2    51   762
  2331   460   592  4835  1416    49    24   933  3866    43    11   702
    13  4287    15   246    33     2   524  1046     1     1     7     4
  1257   109    47    83    39   127    93  5122    80     2   565    69
   294  1424    19   131     2     1     5   496  2628  3262    16     8
     4   811    19 10090  1358    24    48  3137     2 15947     5  4287
  1151  4287 11083   756    31     2   130    16    34   503     5   115
   113    87     5   127    93   473    79     2   762    32    46   546
     6     2   343     3     2   426   109  1446    41    15   426     8
     2   130    15     6   885  5097    13    47  2720  8992     8    11
   438    19    10    26     6   194     9  1102    46     5   295   391
   450    21    30     1     3 

### A sequence model built on one-hot encoded vector sequences

In [10]:
model = tf.keras.Sequential([
    tf.keras.layers.Lambda(lambda x: tf.one_hot(x, depth=max_tokens)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation="sigmoid")
])


In [11]:
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])


In [12]:
model.fit(int_train_ds, validation_data=int_val_ds, epochs=1)



<keras.src.callbacks.History at 0x79089f973760>

In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lambda (Lambda)             (None, 600, 20000)        0         
                                                                 
 bidirectional (Bidirection  (None, 256)               20612096  
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense (Dense)               (None, 1)                 257       
                                                                 
Total params: 20612353 (78.63 MB)
Trainable params: 20612353 (78.63 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

In [14]:
## Try with embedding layer instead of one-hot encoding

In [15]:
embedding_layer = tf.keras.layers.Embedding(input_dim=max_tokens, output_dim=256)

### Build Model (Embedding from scratch)

In [16]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=max_tokens, output_dim=256, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation="sigmoid")
])


In [17]:
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])

In [18]:
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7908817c9ea0>

In [19]:
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Test acc: 0.851


In [20]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 256)         5120000   
                                                                 
 bidirectional_1 (Bidirecti  (None, 256)               394240    
 onal)                                                           
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 257       
                                                                 
Total params: 5514497 (21.04 MB)
Trainable params: 5514497 (21.04 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### Use pretrained embedding instead of building from scratch

In [21]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2023-09-10 08:28:25--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-09-10 08:28:25--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-09-10 08:28:26--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [22]:
path_to_glove_file = "glove.6B.100d.txt"

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

Found 400000 word vectors.


### prepare Glove word Embedding matrix

In [23]:
embedding_dim = 100

vocabulary = text_vectorization.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))

embedding_matrix = np.zeros((max_tokens, embedding_dim))
for word, i in word_index.items():
    if i < max_tokens:
        embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [24]:
embedding_layer = tf.keras.layers.Embedding(
    max_tokens,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True,
)

## Build Model with pretrained Embedding

In [25]:
model = tf.keras.Sequential([
    embedding_layer,
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

In [1]:
import tensorflow as tf
tf.keras.layers.

SyntaxError: invalid syntax (1506051000.py, line 2)

In [26]:
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])

In [27]:
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x79080a2934c0>

In [28]:
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Test acc: 0.876


In [29]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 100)         2000000   
                                                                 
 bidirectional_2 (Bidirecti  (None, 256)               234496    
 onal)                                                           
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 257       
                                                                 
Total params: 2234753 (8.52 MB)
Trainable params: 234753 (917.00 KB)
Non-trainable params: 2000000 (7.63 MB)
_________________________________________________________________
