In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd

import os
import gzip
import shutil


# Location of the movie-review CSV. The original path is specific to one
# machine, so allow it to be overridden with the MOVIE_DATA_CSV environment
# variable; the original absolute path remains the default, so existing runs
# are unaffected.
MOVIE_DATA_CSV = os.environ.get(
    'MOVIE_DATA_CSV',
    'C:/Users/ASUS/AI-ML/PycharmProjects/pythonProject/sentimentanalysisofIMDBmovie/movie_data.csv')

# Load the whole file into a DataFrame, decoding it as UTF-8.
df = pd.read_csv(MOVIE_DATA_CSV, encoding='utf-8')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 1: Create a dataset

# Separate labels from features WITHOUT mutating `df`.
# `df.pop('sentiment')` removes the column in place, so running this cell a
# second time on the same kernel raises KeyError. Selecting the label column
# and dropping it into a new frame keeps the cell idempotent; `df` itself
# keeps its 'sentiment' column.
target = df['sentiment']
df_features = df.drop(columns=['sentiment'])

# Each dataset element is a (feature_row, label) pair. `df_features.values`
# is a 2-D array, so every feature_row is a 1-D vector with one entry per
# remaining column (later cells read entry [0]).
ds_raw = tf.data.Dataset.from_tensor_slices(
    (df_features.values, target.values))


In [8]:
tf.random.set_seed(1)

# Shuffle once with a fixed order (reshuffle_each_iteration=False) so that
# the take/skip splits below see the same element order on every iteration
# and therefore stay disjoint.
#
# The result is bound to a NEW name rather than overwriting `ds_raw`:
# `ds_raw = ds_raw.shuffle(...)` wraps the dataset in one more shuffle each
# time the cell is re-run, which silently changes the splits. `ds_raw` stays
# the unshuffled source dataset.
ds_raw_shuffled = ds_raw.shuffle(
    50000, reshuffle_each_iteration=False)

# First 25,000 elements -> test; the next 20,000 -> train;
# whatever remains after that -> validation.
ds_raw_test = ds_raw_shuffled.take(25000)
ds_raw_train_valid = ds_raw_shuffled.skip(25000)
ds_raw_train = ds_raw_train_valid.take(20000)
ds_raw_valid = ds_raw_train_valid.skip(20000)

In [9]:
## Step 2: find unique tokens (words)

from collections import Counter

# Try the tokenizer under `tfds.features.text` first and fall back to
# `tfds.deprecated.text` when that lookup raises AttributeError
# (presumably to cope with different tensorflow_datasets versions).
try:
    tokenizer = tfds.features.text.Tokenizer()
except AttributeError:
    tokenizer = tfds.deprecated.text.Tokenizer()

# Tally token occurrences over the training split only.
token_counts = Counter()
for example in ds_raw_train:
    # example[0] is the feature row; its first entry is the text to tokenize.
    first_feature = example[0].numpy()[0]
    tokens = tokenizer.tokenize(first_feature)
    token_counts.update(tokens)

# Number of distinct tokens seen in the training split.
print('Vocab-size:', len(token_counts))

Vocab-size: 87063


In [10]:
## Step 3: encoding each unique token into integers

def _build_token_encoder(vocab):
    """Build a TokenTextEncoder over `vocab`.

    Tries `tfds.features.text` first and falls back to
    `tfds.deprecated.text` when an AttributeError is raised.
    """
    try:
        return tfds.features.text.TokenTextEncoder(vocab)
    except AttributeError:
        return tfds.deprecated.text.TokenTextEncoder(vocab)


# The vocabulary is the set of tokens counted on the training split.
encoder = _build_token_encoder(token_counts)

# Sanity check: encode a short sentence and display the resulting ids.
example_str = 'This is an example!'
encoder.encode(example_str)

[544, 40, 223, 2166]

In [11]:
## Step 3-A: define the function for transformation

def encode(text_tensor, label):
    """Turn one (text, label) example into (token ids, label).

    Calls `.numpy()` on `text_tensor`, so it needs an eager tensor; entry
    [0] of the resulting array is the text handed to the module-level
    `encoder`. The label is passed through unchanged.

    Returns:
        A tuple `(encoded_text, label)` where `encoded_text` is whatever
        `encoder.encode` returns for the text.
    """
    raw_text = text_tensor.numpy()[0]
    return encoder.encode(raw_text), label

## Step 3-B: wrap the encode function to a TF Op.
def encode_map_fn(text, label):
    """Wrap `encode` in `tf.py_function` so it can be used in `Dataset.map`.

    Both outputs (the encoded text and the label) are declared as tf.int64.
    """
    output_dtypes = (tf.int64, tf.int64)
    return tf.py_function(encode, inp=[text, label], Tout=output_dtypes)

In [12]:
# Apply the text -> integer-id encoding to every split
# (train, then validation, then test).
ds_train, ds_valid, ds_test = (
    raw_split.map(encode_map_fn)
    for raw_split in (ds_raw_train, ds_raw_valid, ds_raw_test)
)

# Peek at five encoded training examples to see how their lengths vary.
tf.random.set_seed(1)
preview = ds_train.shuffle(1000).take(5)
for example in preview:
    print('Sequence length:', example[0].shape)

# Display the last example left over from the loop above.
example

Sequence length: (109,)
Sequence length: (177,)
Sequence length: (281,)
Sequence length: (199,)
Sequence length: (124,)


(<tf.Tensor: shape=(124,), dtype=int64, numpy=
 array([   86,  9018, 12346, 16862,    40,    46,   653,  3385,     8,
           46,    22,    71,   181,    33, 16863,   513,    17,     9,
         3032,  2844,     8,   401, 16864, 15670,  4170,   319,   319,
          337,   299,    51,    86,   837,    40, 16224, 16865, 16866,
         2871,    44,   270,  1719,    14, 11589, 10498,   116,  1072,
         1106,   223,   313,  4994,  2488,    16,    30,   483,    58,
          268,   183,   604,   105,     9,  1842,  8139,  4170,   219,
          299,   223,  2246,   545,   515,   972,   249,    46, 16867,
           46,  3622,  3974,  9073,  4405,    46,  3032,  3142,   972,
           14,    46,  6036,    44,     9, 16868,  2046,    14,    61,
          299,    46,  8701,  1029,   139,    46,  3275,    14,    46,
        16869,   214,  4891,   319,   319,   263,  2537,   299,  1183,
          374,  4104,  3797,  8139,  4170,   299, 16870,  8519,    14,
           13,   125,    44,  

In [13]:
## Take a small subset

# Eight encoded training examples are enough to illustrate padding.
ds_subset = ds_train.take(8)
for example in ds_subset:
    print('Individual size:', example[0].shape)

## batching the datasets
# Batches of 4: `[-1]` pads the first component along its one dimension to
# the longest element in each batch, and `[]` leaves the label as a scalar.
ds_batched = ds_subset.padded_batch(
    batch_size=4,
    padded_shapes=([-1], []),
)

for batch in ds_batched:
    print('Batch dimension:', batch[0].shape)

Individual size: (236,)
Individual size: (250,)
Individual size: (193,)
Individual size: (310,)
Individual size: (143,)
Individual size: (258,)
Individual size: (214,)
Individual size: (117,)
Batch dimension: (4, 310)
Batch dimension: (4, 258)


In [14]:
## batching the datasets
# All three splits get the same treatment: batches of 32, with the first
# component padded per batch (`[-1]`) and the label kept as a scalar (`[]`).
train_data, valid_data, test_data = (
    encoded_split.padded_batch(32, padded_shapes=([-1], []))
    for encoded_split in (ds_train, ds_valid, ds_test)
)

In [15]:
from tensorflow.keras.layers import Embedding


# Minimal model holding a single embedding layer, built only to inspect the
# layer's output shape and parameter count in the summary below.
model = tf.keras.Sequential([
    Embedding(
        input_dim=100,
        output_dim=6,
        input_length=20,
        name='embed-layer',
    ),
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embed-layer (Embedding)     (None, 20, 6)             600       
                                                                 
Total params: 600
Trainable params: 600
Non-trainable params: 0
_________________________________________________________________
