In [1]:
import os
import re
import shutil
import string

import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, losses, preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization


In [2]:
# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)

2.4.1


In [3]:
# IMDB Large Movie Review dataset; landing page:
# https://ai.stanford.edu/~amaas/data/sentiment/
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

# Download the archive into the working directory and untar it.
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

# The 'aclImdb' folder is looked up next to the path returned by get_file.
download_root = os.path.dirname(dataset)
dataset_dir = os.path.join(download_root, 'aclImdb')

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [4]:
# Inspect the top-level contents of the extracted dataset folder.
os.listdir(dataset_dir)

['test', 'imdb.vocab', 'README', 'train', 'imdbEr.txt']

In [5]:
# Path of the training split; list it to see the class folders and extras.
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['labeledBow.feat',
 'urls_neg.txt',
 'unsup',
 'urls_unsup.txt',
 'neg',
 'pos',
 'urls_pos.txt',
 'unsupBow.feat']

In [6]:
# Path of one review file from the 'pos' folder, used to look at the raw text.
sampleFile = os.path.join(train_dir, 'pos/1181_9.txt')

In [7]:
# Print the raw text of the sample review.
# The encoding is passed explicitly so the read does not depend on the
# platform's default locale encoding.
with open(sampleFile, encoding='utf-8') as f:
  print(f.read())

Rachel Griffiths writes and directs this award winning short film. A heartwarming story about coping with grief and cherishing the memory of those we've loved and lost. Although, only 15 minutes long, Griffiths manages to capture so much emotion and truth onto film in the short space of time. Bud Tingwell gives a touching performance as Will, a widower struggling to cope with his wife's death. Will is confronted by the harsh reality of loneliness and helplessness as he proceeds to take care of Ruth's pet cow, Tulip. The film displays the grief and responsibility one feels for those they have loved and lost. Good cinematography, great direction, and superbly acted. It will bring tears to all those who have lost a loved one, and survived.


In [8]:
# Delete the 'unsup' folder so that 'neg' and 'pos' are the only
# sub-directories left under train/.
# The existence check keeps the cell safe to re-run: shutil.rmtree raises
# once the directory has already been removed.
remove_dir = os.path.join(train_dir, 'unsup')
if os.path.isdir(remove_dir):
  shutil.rmtree(remove_dir)

In [9]:
# Training subset: the 'training' side of a 0.2 validation split over train/.
# train_dir is used instead of a hard-coded 'aclImdb/train' string so the
# loader reads from the same location the download cell resolved.
batch_size = 32
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=42
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [10]:
# Show the structure (shapes and dtypes) of a dataset holding a single batch.
raw_train_ds.take(1)

<TakeDataset shapes: ((None,), (None,)), types: (tf.string, tf.int32)>

In [17]:
# Print the first five (review, label) pairs of one training batch.
for text, label in raw_train_ds.take(1):
  reviews = text.numpy()
  labels = label.numpy()
  for i in range(5):
    print("review: ", reviews[i])
    print("label: ", labels[i])

review:  b"I went to see Fever Pitch with my Mom, and I can say that we both loved it. It wasn't the typical romantic comedy where someone is pining for the other, and blah blah blah... You weren't waiting for the climatic first kiss or for them to finally get together. It was more real, because you saw them through the relationship, rather than the whole movie be about them getting together. People could actually relate to the film, because it didn't seem like extraordinary circumstances, or impossible situations. It was really funny, and I think it was Jimmy Fallon's best performance. All in all... I would definitely recommend it!"
label:  1
review:  b"from the view of a NASCAR Maniac like I am, the movie is interesting. You can see many race cars from 1983. Even tough, the racing scenes are not that much realistic. But I have to admit, that I haven't seen any race before 1995, because before that time, they didn't show any NASCAR races in Germany)<br /><br />from the view of a Burt 

In [18]:
# Class name that corresponds to integer label 0.
raw_train_ds.class_names[0]

'neg'

In [19]:
# Class name that corresponds to integer label 1.
raw_train_ds.class_names[1]

'pos'

In [20]:
# Debugging aid: view the dataset object's instance attributes as a dict.
raw_train_ds.__dict__

{'_batch_size': <tf.Tensor: shape=(), dtype=int64, numpy=32>,
 '_drop_remainder': <tf.Tensor: shape=(), dtype=bool, numpy=False>,
 '_graph_attr': <tensorflow.python.framework.ops.Graph at 0x7f5f22888710>,
 '_input_dataset': <ShuffleDataset shapes: ((), ()), types: (tf.string, tf.int32)>,
 '_options_attr': <tensorflow.python.data.ops.dataset_ops.Options at 0x7f5f2287c3d0>,
 '_self_name_based_restores': set(),
 '_self_saveable_object_factories': {},
 '_self_setattr_tracking': True,
 '_self_unconditional_checkpoint_dependencies': [TrackableReference(name='_variant_tracker', ref=<tensorflow.python.data.ops.dataset_ops._VariantTracker object at 0x7f5f23837250>)],
 '_self_unconditional_deferred_dependencies': {},
 '_self_unconditional_dependency_names': {'_variant_tracker': <tensorflow.python.data.ops.dataset_ops._VariantTracker at 0x7f5f23837250>},
 '_self_update_uid': -1,
 '_structure': (TensorSpec(shape=(None,), dtype=tf.string, name=None),
  TensorSpec(shape=(None,), dtype=tf.int32, name

In [22]:
# Debugging aid: list every attribute name available on the dataset object.
print(list(dir(raw_train_ds)))

['_GeneratorState', '__abstractmethods__', '__bool__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__nonzero__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_add_variable_with_custom_getter', '_apply_options', '_as_serialized_graph', '_batch_size', '_checkpoint_dependencies', '_consumers', '_deferred_dependencies', '_drop_remainder', '_flat_shapes', '_flat_structure', '_flat_types', '_functions', '_gather_saveables_for_checkpoint', '_graph', '_graph_attr', '_handle_deferred_dependencies', '_has_captured_ref', '_input_dataset', '_inputs', '_list_extra_dependencies_for_serialization', '_list_functions_for_serialization', '_lookup_dependency', '_map_resources', '_maybe_initialize_trackab

In [23]:
# Validation subset: the 'validation' side of the same split of train/.
# It uses the same directory, validation_split and seed as the training
# subset, and train_dir replaces the hard-coded 'aclImdb/train' string.
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=42
)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [24]:
# Test set, read from the 'test' folder of the extracted dataset.
# The path is derived from dataset_dir instead of a hard-coded
# 'aclImdb/test' string so it follows the download location.
test_dir = os.path.join(dataset_dir, 'test')
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    test_dir,
    batch_size=batch_size
)

Found 25000 files belonging to 2 classes.


In [25]:
# Hyperparameters for text vectorization.
max_features = 10_000   # cap on the vocabulary size given to the vectorizer
sequence_length = 250   # shorter reviews are padded, longer ones are chopped


In [26]:
# get rid of all unwanted words, tokens.... such as .<br />


def scrub_fn(text):
  """Standardize raw review text before it is tokenized.

  Lower-cases the input, replaces every literal '<br />' with a space and
  then removes each character listed in string.punctuation.
  """
  punctuation_class = '[%s]' % re.escape(string.punctuation)
  without_breaks = tf.strings.regex_replace(tf.strings.lower(text), '<br />', ' ')
  return tf.strings.regex_replace(without_breaks, punctuation_class, '')

# Vectorize: map each review to a fixed-length sequence of integer word ids.
# The layer class is referenced through `tf` because the bare name
# TextVectorization is only imported in a later cell, which makes this cell
# fail with a NameError on a fresh kernel.
vectorize_layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
    standardize=scrub_fn,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [30]:
# Text-only view of the training data: keep the reviews, drop the labels.
traintext = raw_train_ds.map(lambda review, _label: review)
traintext

<MapDataset shapes: (None,), types: tf.string>

In [31]:
# Print the first five reviews of one text-only batch.
for text in traintext.take(1):
  reviews = text.numpy()
  for i in range(5):
    print("review: ", reviews[i])
    

review:  b'Every scene was put together perfectly.This movie had a wonderful cast and crew. I mean, how can you have a bad movie with Robert Downey Jr. in it,none have and ever will exist. He has the ability to brighten up any movie with his amazing talent.This movie was perfect! I saw this movie sitting all alone on a movie shelf in "Blockbuster" and like it was calling out to me,I couldn\'t resist picking it up and bringing it home with me. You can call me a sappy romantic, but this movie just touched my heart, not to mention made me laugh with pleasure at the same time. Even though it made me cry,I admit, at the end, the whole movie just brightened up my outlook on life thereafter.I suggested to my horror, action, and pure humor movie buff of a brother,who absolutely adored this movie. This is a movie with a good sense of feeling.It could make you laugh out loud, touch your heart, make you fall in love,and enjoy your life.Every time you purposefully walk past this movie, just be awa

In [32]:
# Fit the vectorizer on the training text only (labels are not needed here).
vectorize_layer.adapt(traintext)

In [33]:
def vectorize_text(text, label):
  """Run `text` through vectorize_layer and pass `label` through unchanged.

  A trailing axis is appended to `text` before the layer is called.
  Returns a (vectorized_text, label) tuple.
  """
  expanded = tf.expand_dims(text, axis=-1)
  vectorized = vectorize_layer(expanded)
  return vectorized, label

In [34]:
# Pull one batch from a fresh iterator over the training data and keep its
# first review and label for inspection.
first_batch = next(iter(raw_train_ds))
textbatch, labelbatch = first_batch
firstreview = textbatch[0]
firstlabel = labelbatch[0]
print(firstreview)

tf.Tensor(b'"A young woman suffers from the delusion that she is a werewolf, based upon a family legend of an ancestor accused of and killed for allegedly being one. Due to her past treatment by men, she travels the countryside seducing and killing the men she meets. Falling in love with a kind man, her life appears to take a turn for the better when she is raped and her lover is killed by a band of thugs. Traumatized again by these latest events, the woman returns to her violent ways and seeks revenge on the thugs," according to the DVD sleeve\'s synopsis.<br /><br />Rino Di Silvestro\'s "La lupa mannara" begins with full frontal, writhing, moaning dance by shapely blonde Annik Borel, who (as Daniella Neseri) mistakenly believes she is a werewolf. The hottest part is when the camera catches background fire between her legs. The opening "flashback" reveals her hairy ancestor was (probably) a lycanthropic creature. Ms. Borel is, unfortunately, not a werewolf; she is merely a very strong

In [35]:
# Vectorize the sample review to inspect the integer ids the layer produces.
vectorize_text(firstreview, firstlabel)

(<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
 array([[   4,  181,  246, 2320,   35,    2,    1,   12,   55,    7,    4,
         1823,  443,  718,    4,  215, 1725,    5,   33,    1, 3420,    5,
            3,  546,   15, 8957,  108,   28,  684,    6,   39,  491, 2217,
           32,  348,   55, 4096,    2, 4169,    1,    3,  846,    2,  348,
           55,  879, 1387,    8,  115,   16,    4,  236,  130,   39,  116,
          723,    6,  190,    4,  459,   15,    2,  122,   51,   55,    7,
         3377,    3,   39, 1522,    7,  546,   32,    4, 1090,    5, 3616,
         8981,  169,   32,  129, 2379,  650,    2,  246, 1734,    6,   39,
         1097,  760,    3, 4972, 1087,   20,    2, 3616, 1668,    6,    2,
          287,    1, 3658,    1,    1,    1,  990,    1,    1,  762,   16,
          374, 7652,    1,    1,  833,   32,    1, 1941,    1,    1,   36,
           14,    1,    1,    1, 2101,   55,    7,    4, 1823,    2,    1,
          170,    7,   51,    2,  379, 3898,  985, 

In [36]:
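# Sentiment analysis viewed as a linear model
# y = m*x + c
# y = weights * inputs + bias
# y = sentiment: if y > 0.5 the review is POS, otherwise NEG
# (the models compiled later in this notebook output logits, so they use a
#  threshold of 0.0 instead)

# inputs => one per vocabulary word, i.e. 10,000 of them!
# sentiment = w1*word1 + w2*word2 + ... + w10000*word10000 + bias
# For any single review at most 250 of these terms are non-zero: a review is
# cut to 250 tokens, and the input for every word that is absent is 0.

# Toy example
# dictionary = [ going eating am I is he she sleeping writing and ]
# max length = 4
# s1 -> i am writing
# s2 -> she and i and writing and eating   (longer than 4, so it is chopped)
# s3 -> he is going

# y = w1*going + w2*eating + w3*am + w4*I + w5*is + w6*he + ... + w10*and + bias

# s1 = w1*0 + w2*0 + w3*123 + w4*87 + w5*0 + ... + w9*101 + w10*0 + bias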




In [37]:
# Despite the name, `allvectors` holds the vectorizer's vocabulary: a list of
# token strings. Look up the token stored at position 42.
allvectors = vectorize_layer.get_vocabulary()
allvectors[42]

'about'

In [38]:
# Token stored at position 9042 of the vocabulary list.
allvectors[9042]

'owning'

In [42]:
# Apply the same vectorization to every split.
train_ds, test_ds, val_ds = [
    raw_split.map(vectorize_text)
    for raw_split in (raw_train_ds, raw_test_ds, raw_val_ds)
]

In [43]:
# Dataset tuning
# - cache():    keep elements around after they are first loaded from disk
# - prefetch(): overlap data preparation with model execution
# The buffer size is left to tf.data to tune automatically.
autotune = tf.data.AUTOTUNE


def _cache_and_prefetch(ds):
  """Return `ds` with caching followed by auto-tuned prefetching."""
  return ds.cache().prefetch(buffer_size=autotune)


train_ds = _cache_and_prefetch(train_ds)
test_ds = _cache_and_prefetch(test_ds)
val_ds = _cache_and_prefetch(val_ds)


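Learning -> Fully Connected Layers

Two models are built:

1. preprocessed text -> Dense
2. preprocessed text -> Dense + Dense

How is the text preprocessed? Each review is already a sequence of integer
word ids; an Embedding layer turns every id into a vector and
GlobalAveragePooling1D averages those vectors before the Dense layer(s).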



In [50]:
# Hyperparameters for the dense models.
HP_embedding_dim = 16    # length of each embedding vector
HP_dropout = 0.2         # regularization (not referenced by the visible cells)
HP_hidden_outputs = 32   # units in the hidden Dense layer
HP_epochs = 20
HP_batch_size = 32       # not referenced by the visible cells
# Tie the embedding table size to the vectorizer's vocabulary limit so the
# two values cannot drift apart when one of them is changed.
HP_vocab_size = max_features

In [59]:

from tensorflow.keras import layers, losses, preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
# Model 1: embedding -> average over the sequence -> hidden Dense -> 1 output.
l1 = tf.keras.layers.Embedding(input_dim=HP_vocab_size, output_dim=HP_embedding_dim)
l2 = layers.GlobalAveragePooling1D()
l3 = layers.Dense(units=HP_hidden_outputs, activation="relu")
l4 = layers.Dense(units=1)  # no activation on the output layer
layers_m1 = [l1, l2, l3, l4]
model = tf.keras.Sequential(layers=layers_m1)

In [60]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 16)          160000    
_________________________________________________________________
global_average_pooling1d_2 ( (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                544       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 160,577
Trainable params: 160,577
Non-trainable params: 0
_________________________________________________________________


In [54]:
# Parameter counts reported by model.summary()
# Dense params     = inputs * outputs (weights) + outputs (biases)
# Dense (hidden)   = 16 * 32 + 32 = 544
# Dense (output)   = 32 * 1 + 1 = 33
# An embedding is a pure vector lookup, so it has no bias term
# Embedding params = vocabulary size * embedding dim = 10000 * 16 = 160,000

In [61]:
# The output layer has no activation, so the loss is told to expect logits
# and accuracy is thresholded at 0.0.
logit_loss = losses.BinaryCrossentropy(from_logits=True)
logit_accuracy = tf.metrics.BinaryAccuracy(threshold=0.0)
model.compile(
    loss=logit_loss,
    optimizer='adam',
    metrics=logit_accuracy,
)

In [None]:
# Train `model` on train_ds for HP_epochs epochs with val_ds as validation
# data; the value returned by fit() is kept in `history`.
history = model.fit(train_ds, validation_data=val_ds, epochs=HP_epochs)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20

In [62]:
# Model 2: embedding -> average over the sequence -> a single Dense output.
l1 = tf.keras.layers.Embedding(input_dim=HP_vocab_size, output_dim=HP_embedding_dim)
l2 = tf.keras.layers.GlobalAveragePooling1D()
l3 = layers.Dense(units=1)  # no activation on the output layer
layers_m2 = [l1, l2, l3]
anothermodel = tf.keras.Sequential(layers=layers_m2)

In [64]:
# Same loss, optimizer and metric settings as used for the first model.
logit_loss_m2 = losses.BinaryCrossentropy(from_logits=True)
logit_accuracy_m2 = tf.metrics.BinaryAccuracy(threshold=0.0)
anothermodel.compile(
    loss=logit_loss_m2,
    optimizer='adam',
    metrics=logit_accuracy_m2,
)

In [None]:
# Train `anothermodel` with the same data and epoch count as the first model;
# the value returned by fit() is kept in `history2`.
history2 = anothermodel.fit(train_ds, validation_data=val_ds, epochs=HP_epochs)