In [1]:
import matplotlib.pyplot as plt
import re
import os
import shutil
import tensorflow as tf
import string

from tensorflow.keras import layers, losses, preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [2]:
# Record the TensorFlow version this notebook was executed with.
print(tf.version.VERSION)

2.4.1


In [3]:
# Download and unpack the review archive
# (dataset page: https://ai.stanford.edu/~amaas/data/sentiment/).
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)
# The data is read from the 'aclImdb' folder located next to the returned path.
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [4]:
# Inspect the top-level entries of the extracted dataset folder.
os.listdir(dataset_dir)

['test', 'imdb.vocab', 'README', 'train', 'imdbEr.txt']

In [5]:
# Path of the training split inside the dataset folder.
train_dir = os.path.join(dataset_dir, 'train')
# Show what the training split contains.
os.listdir(train_dir)

['labeledBow.feat',
 'urls_neg.txt',
 'unsup',
 'urls_unsup.txt',
 'neg',
 'pos',
 'urls_pos.txt',
 'unsupBow.feat']

In [6]:
# Path to one review file under the 'pos' folder, used as a sample to inspect.
# Each component is passed separately so os.path.join chooses the separator
# instead of relying on a hard-coded '/'.
sampleFile = os.path.join(train_dir, 'pos', '1181_9.txt')

In [7]:
# Print the sample review. The encoding is passed explicitly so the read does
# not depend on the platform's default locale encoding.
with open(sampleFile, encoding='utf-8') as f:
  print(f.read())

Rachel Griffiths writes and directs this award winning short film. A heartwarming story about coping with grief and cherishing the memory of those we've loved and lost. Although, only 15 minutes long, Griffiths manages to capture so much emotion and truth onto film in the short space of time. Bud Tingwell gives a touching performance as Will, a widower struggling to cope with his wife's death. Will is confronted by the harsh reality of loneliness and helplessness as he proceeds to take care of Ruth's pet cow, Tulip. The film displays the grief and responsibility one feels for those they have loved and lost. Good cinematography, great direction, and superbly acted. It will bring tears to all those who have lost a loved one, and survived.


In [8]:
# Delete the 'unsup' folder from the training split so that only the 'neg'
# and 'pos' sub-folders remain when the labeled dataset is built from it.
# The existence check keeps this cell re-runnable once the folder is gone.
remove_dir = os.path.join(train_dir, 'unsup')
if os.path.isdir(remove_dir):
  shutil.rmtree(remove_dir)

In [9]:
# Build the training subset from the class folders under train_dir.
# validation_split=0.2 with subset='training' keeps 80% of the files here;
# the fixed seed makes the shuffle and the split reproducible.
batch_size = 32
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,  # reuse the path derived from the download location
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=42
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [10]:
# Displays the one-batch dataset object itself (element shapes and types),
# not the reviews it contains.
raw_train_ds.take(1)

<TakeDataset shapes: ((None,), (None,)), types: (tf.string, tf.int32)>

In [17]:
# Print up to five reviews from one batch together with their integer labels.
for text, label in raw_train_ds.take(1):
  # Convert each tensor once rather than on every inner iteration; slicing
  # avoids an IndexError when the batch holds fewer than five examples.
  reviews = text.numpy()[:5]
  labels = label.numpy()[:5]
  for review, review_label in zip(reviews, labels):
    print("review: ", review)
    print("label: ", review_label)

review:  b"I went to see Fever Pitch with my Mom, and I can say that we both loved it. It wasn't the typical romantic comedy where someone is pining for the other, and blah blah blah... You weren't waiting for the climatic first kiss or for them to finally get together. It was more real, because you saw them through the relationship, rather than the whole movie be about them getting together. People could actually relate to the film, because it didn't seem like extraordinary circumstances, or impossible situations. It was really funny, and I think it was Jimmy Fallon's best performance. All in all... I would definitely recommend it!"
label:  1
review:  b"from the view of a NASCAR Maniac like I am, the movie is interesting. You can see many race cars from 1983. Even tough, the racing scenes are not that much realistic. But I have to admit, that I haven't seen any race before 1995, because before that time, they didn't show any NASCAR races in Germany)<br /><br />from the view of a Burt 

In [18]:
# Name of the class stored at index 0 of the dataset's class list.
raw_train_ds.class_names[0]

'neg'

In [19]:
# Name of the class stored at index 1 of the dataset's class list.
raw_train_ds.class_names[1]

'pos'

In [20]:
# One way to explore an object: view its instance attributes as a dictionary.
vars(raw_train_ds)

{'_batch_size': <tf.Tensor: shape=(), dtype=int64, numpy=32>,
 '_drop_remainder': <tf.Tensor: shape=(), dtype=bool, numpy=False>,
 '_graph_attr': <tensorflow.python.framework.ops.Graph at 0x7f5f22888710>,
 '_input_dataset': <ShuffleDataset shapes: ((), ()), types: (tf.string, tf.int32)>,
 '_options_attr': <tensorflow.python.data.ops.dataset_ops.Options at 0x7f5f2287c3d0>,
 '_self_name_based_restores': set(),
 '_self_saveable_object_factories': {},
 '_self_setattr_tracking': True,
 '_self_unconditional_checkpoint_dependencies': [TrackableReference(name='_variant_tracker', ref=<tensorflow.python.data.ops.dataset_ops._VariantTracker object at 0x7f5f23837250>)],
 '_self_unconditional_deferred_dependencies': {},
 '_self_unconditional_dependency_names': {'_variant_tracker': <tensorflow.python.data.ops.dataset_ops._VariantTracker at 0x7f5f23837250>},
 '_self_update_uid': -1,
 '_structure': (TensorSpec(shape=(None,), dtype=tf.string, name=None),
  TensorSpec(shape=(None,), dtype=tf.int32, name

In [22]:
# Another way to explore an object: list all of its attribute names.
# dir() already returns a list, so it is printed directly.
print(dir(raw_train_ds))

['_GeneratorState', '__abstractmethods__', '__bool__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__nonzero__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_add_variable_with_custom_getter', '_apply_options', '_as_serialized_graph', '_batch_size', '_checkpoint_dependencies', '_consumers', '_deferred_dependencies', '_drop_remainder', '_flat_shapes', '_flat_structure', '_flat_types', '_functions', '_gather_saveables_for_checkpoint', '_graph', '_graph_attr', '_handle_deferred_dependencies', '_has_captured_ref', '_input_dataset', '_inputs', '_list_extra_dependencies_for_serialization', '_list_functions_for_serialization', '_lookup_dependency', '_map_resources', '_maybe_initialize_trackab