In [1]:
import tensorflow as tf 

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense,Embedding,GlobalAvgPool1D,TextVectorization


In [2]:
import io
import os 
import shutil
import string
import re
from os.path import join
from glob import glob

# Downloading the IMDB Dataset

In [3]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file("aclImdb_v1.tar.gz", url,
                                  untar=True, cache_dir='.',
                                  cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
os.listdir(dataset_dir)

['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [4]:
train_dir=join(dataset_dir,'train')

In [5]:
os.listdir(train_dir)

['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

'neg' folders contains Negative review, 

'pos' folders contains Positive review, 

For this turtorial we will not require 'unsup' folder

In [6]:
remove_dir=join(train_dir,'unsup')

shutil.rmtree(remove_dir)

In [7]:
os.listdir(train_dir)

['labeledBow.feat',
 'neg',
 'pos',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [8]:
## Let us see how is the positve text 

pos_text_dires=glob(join(train_dir,'pos')+'/*txt')

with open(pos_text_dires[0],mode='rb') as file:
    pos_text=file.read().decode('UTF-8')
    
print('Positive Text:::\n',pos_text)



Positive Text:::
 Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!


In [18]:
## Let us see how is the positve text 

neg_text_dires=glob(join(train_dir,'neg')+'/*txt')

with open(neg_text_dires[0],mode='rb') as file:
    neg_text=file.read().decode('UTF-8')
    
print('Negative Text:::\n',neg_text)


Negative Text:::
 Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.


In [19]:
batch_size = 1024
seed = 123
train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train', batch_size=batch_size, validation_split=0.2,
    subset='training', seed=seed)
val_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train', batch_size=batch_size, validation_split=0.2,
    subset='validation', seed=seed)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [47]:
import os

os.walk('aclImdb/train')

<generator object _walk at 0x0000020005C8F4A0>

In [45]:
for text_data,label in train_ds.take(1):
    print('Label : ', label[0])
    print(text_data[0])

Label :  tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(b"In a series chock-full of brilliant episodes, this one stands out as one of my very favorites. It's not the most profound episode, there's no great meaning or message. But it's a lot of fun, and there are some fine performances.<br /><br />But what makes it really stand out for me is that it is, to my knowledge, the *only* Twilight Zone episode with a *double* snapper ending. The Zone is rightly famous for providing a big surprise at the end of a story. But this time, you get a surprise, and think that's that, but it turns out there's *another* surprise waiting. I just like that so much, that this is probably one of my two favorite episodes (the other being a deeper, more message-oriented one).", shape=(), dtype=string)


In order to simplify our task, we need to do some preprocessing of the text: 
- Lowecase Every Character 
- remove some special character like '<br>' and replace it with ' '(Removing HTML syntax) 
- And remove all the specific punctuations like *@, #, $,%,&*

In [21]:
## let us see the special Puntuations 
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [22]:
stripped_html = "Hello, [World]! This is an example."
pattern = '[%s]' % re.escape(string.punctuation)
result = re.sub(pattern, '', stripped_html)
print(result)

Hello World This is an example


In [23]:
## Defining using tensoroperation

def custom_standerization(input_data):
    lowercase=tf.strings.lower(input_data)
    stripped_html=tf.strings.regex_replace(lowercase,'<br>',' ')
    stripped_punc=tf.strings.regex_replace(stripped_html,'[%s]' % re.escape(string.punctuation),'')
    return stripped_punc

In [24]:
custom_standerization(text_data[0]) ## Applying this to above text_data from train_ds.take(1)

<tf.Tensor: shape=(), dtype=string, numpy=b'oh my god please for the love of all that is holy do not watch this movie it it 82 minutes of my life i will never get back sure i could have stopped watching half way through but i thought it might get better it didnt anyone who actually enjoyed this movie is one seriously sick and twisted individual no wonder us australiansnew zealanders have a terrible reputation when it comes to making movies everything about this movie is horrible from the acting to the editing i dont even normally write reviews on here but in this case ill make an exception i only wish someone had of warned me before i hired this catastrophe'>

## TextVectorizing the words

In [25]:
## defining the vocab_size, and seq_length(numbers of words in a sequence)

vocab_size=10000
sequence_length=100

## Defining text_vectorization 

vectorize_layer=TextVectorization(standardize=custom_standerization,
                                 max_tokens=vocab_size,
                                 output_mode='int',
                                 output_sequence_length=sequence_length)

## We have to apply the vectorization to only text of dataset

text_ds=train_ds.map(lambda x,y:x) ## We only adapt to text_ds

##In the training, vectorize_layer only affect text data of train_ds not text data


In [26]:
vocab = vectorize_layer.get_vocabulary()
print(vocab)

['', '[UNK]']


In [27]:
## before adapt

for vec_text in text_ds.take(1): ## Why no preprocessing 
    print(vec_text[0])

tf.Tensor(b"Wow. Some movies just leave me speechless. This was undeniably one of those movies. When I left the theatre, not a single word came to my mouth. All I had was an incredible urge to slam my head against the theatre wall to help me forget about the last hour and a half. Unfortunately, it didn't work. Honestly, this movie has nothing to recommend. The humor was at the first grade level, at best, the acting was overly silly, and the plot was astronomically far-fetched. I hearby pledge never to see an other movie starring Chris Kattan or any other cast-member of SNL.", shape=(), dtype=string)


In [28]:
vectorize_layer.adapt(text_ds)

In [29]:
## After Adapting
for vec_text in text_ds.take(1): 
    print(vec_text[0])

tf.Tensor(b'I believe this is the most powerful film HBO Pictures has made to date. This film should have been released in theaters for the public to view on the big screen. It is available on video so make sure you look for it and check it out. Chris Gerolmo did a great job with the direction and the screenplay. The performances from Stephen Rea, Donald Sutherland and Jeffery DeMunn are flawless. A masterpiece of the genre.', shape=(), dtype=string)


# Creating a classification model and Compiling

In [30]:
embedding_dim=16

model = Sequential([
  vectorize_layer,
  Embedding(vocab_size, embedding_dim, name="embedding"),
  GlobalAvgPool1D(),
  Dense(16, activation='relu'),
  Dense(1)
])

In [31]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [32]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [33]:
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=15,
    callbacks=[tensorboard_callback])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x1fe7da072e0>

# Testing the model

In [34]:
example=tf.constant(['This movie was so good'])

In [43]:
def movie_review(model,text):
    text=text[tf.newaxis]
    pred_logit=model.predict(text)
    prob=tf.sigmoid(pred_logit)
    
    if prob >=0.5:
        print("Positive Review")
    else:
        print("Negative Review")

In [44]:
movie_review(model,tf.constant(['The movie was boring']))

Positive Review


In [17]:
m

NameError: name 'model' is not defined

In [143]:
#docs_infra: no_execute
%load_ext tensorboard
%tensorboard --logdir logs

In [144]:
vectorize_layer.get_vocabulary()

['',
 '[UNK]',
 'the',
 'and',
 'a',
 'of',
 'to',
 'is',
 'in',
 'it',
 'i',
 'this',
 'that',
 'br',
 'was',
 'as',
 'with',
 'for',
 'movie',
 'but',
 'film',
 'on',
 'not',
 'you',
 'are',
 'his',
 'have',
 'be',
 'he',
 'one',
 'its',
 'at',
 'all',
 'by',
 'an',
 'they',
 'who',
 'from',
 'so',
 'like',
 'her',
 'just',
 'or',
 'about',
 'has',
 'if',
 'out',
 'some',
 'there',
 'what',
 'good',
 'when',
 'more',
 'very',
 'even',
 'she',
 'my',
 'up',
 'no',
 'would',
 'only',
 'which',
 'time',
 'really',
 'story',
 'their',
 'see',
 'had',
 'were',
 'can',
 'me',
 'we',
 'than',
 'much',
 'well',
 'been',
 'will',
 'get',
 'also',
 'people',
 'into',
 'do',
 'other',
 'first',
 'bad',
 'great',
 'because',
 'how',
 'most',
 'him',
 'dont',
 'made',
 'then',
 'movies',
 'make',
 'could',
 'way',
 'films',
 'any',
 'them',
 'after',
 'too',
 'characters',
 'think',
 'watch',
 'being',
 'two',
 'many',
 'seen',
 'character',
 'never',
 'little',
 'where',
 'plot',
 'acting',
 'be