This is a companion notebook for the book [Deep Learning with Python, Second Edition](https://www.manning.com/books/deep-learning-with-python-second-edition?a_aid=keras&a_bid=76564dff). For readability, it only contains runnable code blocks and section titles, and omits everything else in the book: text paragraphs, figures, and pseudocode.

**If you want to be able to follow what's going on, I recommend reading the notebook side by side with your copy of the book.**

This notebook was generated for TensorFlow 2.6.

# Deep learning for text

## Natural-language processing: The bird's eye view

## Preparing text data

### Text standardization

### Text splitting (tokenization)

### Vocabulary indexing

### Using the TextVectorization layer

In [2]:
import string

class Vectorizer:
    def standardize(self, text):
        '''
        標準化= 小寫化，去標點。
        '''
        text= text.lower()
        return "".join(char 
                       for char in text 
                       if char not in 
                       string.punctuation #.replace('+','')
                      )

    def tokenize(self, text):
        '''
        塊狀化
        '''
        text = self.standardize(text)
        return text.split()

    def make_vocabulary(self, dataset):
        '''
        建詞表，以供編碼、解碼
        '''
        
        self.vocabulary = {"": 0, "[UNK]": 1}
        
        for text in dataset:
            text = self.standardize(text)
            tokens = self.tokenize(text)
            for token in tokens:
                if token not in self.vocabulary:
                    self.vocabulary[token] = len(self.vocabulary)
        self.inverse_vocabulary = dict(
            (v, k) for k, v in self.vocabulary.items())

    def encode(self, text):
        text = self.standardize(text)
        tokens = self.tokenize(text)
        return [self.vocabulary.get(token, 1) for token in tokens]

    def decode(self, int_sequence):
        return " ".join(
            self.inverse_vocabulary.get(i, "[UNK]") for i in int_sequence)

vectorizer= Vectorizer()
dataset= [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms.",
    "I am writing, erasing, and rewriting",
    "Erasing again, and then",
    "A poppy is blooming.",
    "I am write +ing, erase +ing, and rewrite +ing",
    "Erase +ing again, and then",
    "A poppy is bloom +ing.",
    "我 寫 字 擦 掉 重 寫 。",
    "我 寫字 擦掉 重寫 。",
    "然 後 再 擦 掉 一 次 ，",
    "然後 再 擦掉 一次 ，",
]
vectorizer.make_vocabulary(dataset)
vectorizer.vocabulary

{'': 0,
 '[UNK]': 1,
 'i': 2,
 'write': 3,
 'erase': 4,
 'rewrite': 5,
 'again': 6,
 'and': 7,
 'then': 8,
 'a': 9,
 'poppy': 10,
 'blooms': 11,
 'am': 12,
 'writing': 13,
 'erasing': 14,
 'rewriting': 15,
 'is': 16,
 'blooming': 17,
 'ing': 18,
 'bloom': 19,
 '我': 20,
 '寫': 21,
 '字': 22,
 '擦': 23,
 '掉': 24,
 '重': 25,
 '。': 26,
 '寫字': 27,
 '擦掉': 28,
 '重寫': 29,
 '然': 30,
 '後': 31,
 '再': 32,
 '一': 33,
 '次': 34,
 '，': 35,
 '然後': 36,
 '一次': 37}

In [3]:
#string.punctuation.replace('+','')

In [4]:
test_sentence= "I write, rewrite, and still rewrite again"
test_sentence= "I am write +ing, and then run +ing."
#test_sentence= "我 寫字 然後 又 擦掉 。"
#test_sentence= "我 寫 字 然 後 又 擦 掉 。"
encoded_sentence= vectorizer.encode(test_sentence)
encoded_sentence

[2, 12, 3, 18, 7, 8, 1, 18]

In [5]:
decoded_sentence = vectorizer.decode(encoded_sentence)
decoded_sentence

'i am write ing and then [UNK] ing'

In [6]:
from tensorflow.keras.layers import TextVectorization

text_vectorization= TextVectorization(
    output_mode= "int",
)

In [7]:
import re
import string
import tensorflow as tf

def custom_standardization_fn(string_tensor):
    
    lowercase_string= tf.strings.lower(string_tensor)
    
    return tf.strings.regex_replace(
            lowercase_string, 
            f"[{re.escape(string.punctuation)}]", 
            "")

def custom_split_fn(string_tensor):
    
    return tf.strings.split(string_tensor)

text_vectorization = TextVectorization(
    output_mode= "int",
    standardize= custom_standardization_fn,
    split=       custom_split_fn,
)

In [8]:
# dataset = [
#     "I write, erase, rewrite",
#     "Erase again, and then",
#     "A poppy blooms.",
# ]
# 
text_vectorization.adapt(dataset)


**Displaying the vocabulary**

In [9]:
text_vectorization.get_vocabulary()

['',
 '[UNK]',
 'ing',
 'and',
 'erase',
 'then',
 'poppy',
 'i',
 'again',
 'a',
 '，',
 '擦掉',
 '擦',
 '掉',
 '我',
 '寫',
 '再',
 '。',
 'write',
 'rewrite',
 'is',
 'erasing',
 'am',
 '重寫',
 '重',
 '然後',
 '然',
 '次',
 '後',
 '寫字',
 '字',
 '一次',
 '一',
 'writing',
 'rewriting',
 'blooms',
 'blooming',
 'bloom']

In [10]:
vocabulary = text_vectorization.get_vocabulary()
test_sentence = "I write, rewrite, and still rewrite again"
encoded_sentence = text_vectorization(test_sentence)
print(encoded_sentence)

tf.Tensor([ 7 18 19  3  1 19  8], shape=(7,), dtype=int64)


In [11]:
inverse_vocab = dict(enumerate(vocabulary))
decoded_sentence = " ".join(inverse_vocab[int(i)] for i in encoded_sentence)
print(decoded_sentence)

i write rewrite and [UNK] rewrite again


## Two approaches for representing groups of words: Sets and sequences

### Preparing the IMDB movie reviews data

In [12]:
# !curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# !tar -xf aclImdb_v1.tar.gz

In [13]:
# !rm -r aclImdb/train/unsup

In [14]:
# !cat aclImdb/train/pos/4077_10.txt


In [15]:
# !rd /s /q aclImdb

In [16]:
# !curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# !tar -xf aclImdb_v1.tar.gz

In [17]:
# !rm -r aclImdb/train/unsup

In [18]:
# !rd /s /q aclImdb
# !tar -xf aclImdb_v1.tar.gz
# !rd /s /q aclImdb\train\unsup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
!unzip -qq /content/drive/MyDrive/aclImdb_ry_zh.zip

In [20]:
import os, pathlib, shutil, random

base_dir=  pathlib.Path("aclImdb_ry_zh/aclImdb")
val_dir=   base_dir / "val"
train_dir= base_dir / "train"

for category in ("neg", "pos"):
    os.makedirs(val_dir / category, 
                exist_ok= True)
    files= os.listdir(train_dir / category)
    random.Random(1337).shuffle(files)
    
    num_val_samples= int(0.2 * len(files))
    
    val_files= files[-num_val_samples:]
    for fname in val_files:
        shutil.move(train_dir / category / fname,
                    val_dir / category / fname)

In [21]:
from tensorflow import keras
batch_size= 100

train_ds= keras.utils.text_dataset_from_directory(
    "aclImdb_ry_zh/aclImdb/train", 
    batch_size=batch_size
)
val_ds= keras.utils.text_dataset_from_directory(
    "aclImdb_ry_zh/aclImdb/val", 
    batch_size=batch_size
)
test_ds= keras.utils.text_dataset_from_directory(
    "aclImdb_ry_zh/aclImdb/test", 
    batch_size=batch_size
)

#'''
#Found 20000 files belonging to 2 classes.
#Found 5000 files belonging to 2 classes.
#Found 25000 files belonging to 2 classes.
#'''


Found 70000 files belonging to 3 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [22]:
len(train_ds), len(val_ds), len(test_ds)

(700, 50, 250)

**Displaying the shapes and dtypes of the first batch**

In [23]:
import numpy as np
test_arr = np.array([1,3,5,7,9])
for x,y in enumerate(test_arr):
  print(x,end=' ')
  print(y,'\n')


0 1 

1 3 

2 5 

3 7 

4 9 



In [24]:
for n, (x, y) in enumerate(train_ds):
    print('n=',n,sep='')
    print('x.shape=',x.shape,' x.dtype=',x.dtype,sep='')
    print('y.shape=',y.shape,' y.dtype=',y.dtype,sep='')
    print('y=',y,sep='')
    print('x=',x,sep='')
    print('\n','='*200,'\n','='*200,'\n','='*200,'\n','='*200,'\n',sep='') # 等號分隔線

    # print(f'{n= }')
    # print(f"{x.shape= }, {x.dtype= }")
    # print(f"{y.shape= }, {y.dtype= }")
    # print(f"{y= }")
    # print(f"{x= }")
    if n==2: break

n=0
x.shape=(100,) x.dtype=<dtype: 'string'>
y.shape=(100,) y.dtype=<dtype: 'int32'>
y=tf.Tensor(
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 0 2 2 1 2 2 2 1 1 2 2 0 2 1 2 2 1 2 2 2
 2 2 2 0 1 1 2 2 2 2 2 2 2 2 0 2 1 2 0 2 2 2 2 2 2 0 2 2 2 1 2 2 2 1 0 2 2
 2 2 2 1 2 2 2 2 0 2 2 2 2 2 2 2 1 0 2 1 2 0 1 2 2 0], shape=(100,), dtype=int32)
x=tf.Tensor(
[b"What can I say? It has horrible acting, beyond amateurish dialogue, sub-par special effects and you know what.....I've seen it about 50 times! It is so campy and awful that it is good! I definitely think this ranks right up there with Showgirls as one of the all time so bad its good movies! Special mention goes to Dennis Quaid, one of my favorite actors, for looking like he's in on the joke!"
 b"This is the first video game I've ever rated or commented on, but this game was so cool I had to put in my two cents. I don't play video games much at all, but every two years or so I will get the jones to play a game. I'd say the past ten years or so it 

In [25]:
list(train_ds)[0][0][0:2], list(train_ds)[0][1][0:2]

(<tf.Tensor: shape=(2,), dtype=string, numpy=
 array([b"I got this movie out for the family, and yes there are unsettling bits in it, vis - the dog fight, the abuse, the orphanage, the tree scene, when you think about it probably the whole film. And why is that? Probably due to the verisimilitude of the film to the times, the grainy nature just added to the realism, the times were tough so lets not sugar coat the experiences of those who went through them. Disney probably made this film by mistake but i'm glad they did. And I just love a story where a wolf gets equal billing in the sensibility stakes, even if it's a dog actor. Good film, good acting, great trains and the U.S. countryside never looked more beautiful. Oh and the DVD did jitter but settled down after clean ?, and it was letterbox for the widescreen so I didn't have to suffer amputation. 10 - 10",
        b'if you loved the "wogs" series of the late 80\'s then you will probably think this is funny as well.<br /><br />Racia

### Processing words as a set: The bag-of-words approach

#### Single words (unigrams) with binary encoding

**Preprocessing our datasets with a `TextVectorization` layer**

In [30]:
from tensorflow.keras.layers import TextVectorization
max_tokens= 1000 #20000
max_tokens= 20000

text_vectorization= TextVectorization(
    max_tokens= max_tokens, #10000, #20000,
    output_mode= "multi_hot",
)

text_only_train_ds= train_ds.map(
    lambda x, y: x)

text_vectorization.adapt(
    text_only_train_ds)

binary_1gram_train_ds= train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)

binary_1gram_val_ds= val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)

binary_1gram_test_ds= test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)

In [31]:
#list(text_only_train_ds)[0][0]

#text_vectorization.get_vocabulary()
#text_vectorization("I love you.")
list(binary_1gram_train_ds)[0]

(<tf.Tensor: shape=(100, 20000), dtype=float32, numpy=
 array([[1., 1., 1., ..., 0., 0., 0.],
        [1., 1., 1., ..., 0., 0., 0.],
        [1., 1., 1., ..., 0., 0., 0.],
        ...,
        [1., 1., 1., ..., 0., 0., 0.],
        [1., 1., 1., ..., 0., 0., 0.],
        [1., 1., 1., ..., 0., 0., 0.]], dtype=float32)>,
 <tf.Tensor: shape=(100,), dtype=int32, numpy=
 array([2, 2, 2, 2, 1, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 0, 2, 2, 2, 2,
        0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 2, 1,
        0, 2, 2, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 1, 2, 2, 0,
        2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2,
        2, 2, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2], dtype=int32)>)

**Inspecting the output of our binary unigram dataset**

for inputs, targets in binary_1gram_train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

**Our model-building utility**

In [32]:
from tensorflow import keras
from tensorflow.keras import layers

def get_model(max_tokens= max_tokens, #20000, 
              hidden_dim= 16):
    
    inputs=  keras.Input(shape=(max_tokens,))
    x=       layers.Dense(hidden_dim, activation="relu")(inputs)
    x=       layers.Dropout(0.5)(x)
    outputs= layers.Dense(1, activation="sigmoid")(x)
    model=   keras.Model(inputs, outputs)
    
    model.compile(
        optimizer="rmsprop",
        loss=     "binary_crossentropy",
        metrics=  ["accuracy"])
    
    return model

**Training and testing the binary unigram model**

In [33]:
model= get_model()
model.summary()

callbacks= [
    keras.callbacks.ModelCheckpoint(
        "binary_1gram.keras",
        save_best_only= True)
]

model.fit(
    binary_1gram_train_ds.cache(),
    validation_data= binary_1gram_val_ds.cache(),
    epochs=10,
    callbacks= callbacks)

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_2 (Dense)             (None, 16)                320016    
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0aaad4d0d0>

In [34]:
model= keras.models.load_model("binary_1gram.keras")

acc= model.evaluate(binary_1gram_test_ds)
print('acc=',acc,sep='')#print(f"{acc= }")


acc=[269.8370666503906, 0.5]


In [35]:
x= ["這 部 電影 爛透 了 。 不 要 浪費 時間 與 金錢 。",
    "太 棒 了 ， 演員 卡司 很 強大 。"
   ]
x= text_vectorization(x)
x

<tf.Tensor: shape=(2, 20000), dtype=float32, numpy=
array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [36]:
model.predict(x)



array([[0.9999969],
       [0.9999969]], dtype=float32)

#### Bigrams with binary encoding

**Configuring the `TextVectorization` layer to return bigrams**

In [37]:
max_tokens= 10000

text_vectorization= TextVectorization(
    ngrams= 2,
    max_tokens= max_tokens, #20000,
    #output_mode= "int", #"tf_idf", #"count", #"multi_hot",
    output_mode= "multi_hot"
)

text_vectorization.adapt(text_only_train_ds)

In [38]:
text_vectorization.get_vocabulary()

['[UNK]',
 'the',
 'and',
 'a',
 'of',
 'to',
 'is',
 'in',
 'it',
 'this',
 'i',
 'that',
 'br',
 'was',
 'as',
 'with',
 'for',
 'movie',
 'but',
 'of the',
 'film',
 'on',
 'you',
 'not',
 'are',
 'his',
 'have',
 'be',
 'he',
 'one',
 'in the',
 'its',
 'at',
 'all',
 'by',
 'an',
 'they',
 'from',
 'who',
 'so',
 'like',
 'her',
 'just',
 'or',
 'about',
 'has',
 'if',
 'out',
 'some',
 'there',
 'this movie',
 'what',
 'good',
 'when',
 'more',
 'very',
 'and the',
 'is a',
 'no',
 'up',
 'even',
 'the film',
 'she',
 'my',
 'would',
 'to the',
 'to be',
 'their',
 'which',
 'only',
 'time',
 'really',
 'story',
 'see',
 'were',
 'the movie',
 'can',
 'had',
 'this film',
 'me',
 'than',
 'it is',
 'we',
 'much',
 'well',
 'been',
 'this is',
 'get',
 'into',
 'also',
 'bad',
 'will',
 'on the',
 'great',
 'people',
 'do',
 'because',
 'other',
 'in a',
 'first',
 'most',
 'how',
 'him',
 'dont',
 'it was',
 'one of',
 'for the',
 'with the',
 'of a',
 'them',
 'then',
 'make',
 

In [39]:
text_vectorization('This movie is one of the best.')

<tf.Tensor: shape=(10000,), dtype=float32, numpy=array([0., 1., 0., ..., 0., 0., 0.], dtype=float32)>

**Training and testing the binary bigram model**

In [40]:
binary_2gram_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
binary_2gram_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
binary_2gram_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)

model= get_model(max_tokens= max_tokens)
model.summary()
callbacks = [
    keras.callbacks.ModelCheckpoint("binary_2gram.keras",
                                    save_best_only=True)
]

model.fit(binary_2gram_train_ds.cache(),
          validation_data=binary_2gram_val_ds.cache(),
          epochs=10,
          callbacks=callbacks)
model= keras.models.load_model("binary_2gram.keras")

print(f"Test acc: {model.evaluate(binary_2gram_test_ds)[1]:.3f}")

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 10000)]           0         
                                                                 
 dense_4 (Dense)             (None, 16)                160016    
                                                                 
 dropout_2 (Dropout)         (None, 16)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 17        
                                                                 
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test acc: 0.500


#### Bigrams with TF-IDF encoding

**Configuring the `TextVectorization` layer to return token counts**

In [41]:
max_tokens

10000

In [42]:
text_vectorization= TextVectorization(
    ngrams=2,
    max_tokens= max_tokens, #20000,
    output_mode="count"
)

**Configuring `TextVectorization` to return TF-IDF-weighted outputs**

In [43]:
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens= max_tokens, #20000,
    output_mode="tf_idf",
)

**Training and testing the TF-IDF bigram model**

In [44]:
text_vectorization.adapt(text_only_train_ds)

tfidf_2gram_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
tfidf_2gram_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
tfidf_2gram_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)

In [45]:
tfidf_2gram_train_ds.cache(),tfidf_2gram_val_ds.cache(),tfidf_2gram_test_ds

(<CacheDataset element_spec=(TensorSpec(shape=(None, 10000), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>,
 <CacheDataset element_spec=(TensorSpec(shape=(None, 10000), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>,
 <ParallelMapDataset element_spec=(TensorSpec(shape=(None, 10000), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>)

In [46]:
from tensorflow import keras
from tensorflow.keras import layers

def get_model(max_tokens= max_tokens, #20000, 
              hidden_dim= 16):
    
    inputs=  keras.Input(shape=(max_tokens,))
    x=       layers.Dense(hidden_dim, activation="relu")(inputs)
    x=       layers.Dropout(0.5)(x)
    outputs= layers.Dense(1, activation="sigmoid")(x)
    model=   keras.Model(inputs, outputs)
    
    model.compile(
        optimizer="rmsprop",
        loss=     "binary_crossentropy",
        metrics=  ["accuracy"])
    
    return model

In [47]:
model = get_model()
model.summary()
callbacks = [
    keras.callbacks.ModelCheckpoint("tfidf_2gram.keras",
                                    save_best_only=True)
]
model.fit(tfidf_2gram_train_ds.cache(),
          validation_data=tfidf_2gram_val_ds.cache(),
          # batch_size = 10,
          epochs=10,
          callbacks=callbacks)
#model.fit(tfidf_2gram_train_ds.cache(),
#          validation_data=tfidf_2gram_val_ds.cache(),
#          epochs=10)

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 10000)]           0         
                                                                 
 dense_6 (Dense)             (None, 16)                160016    
                                                                 
 dropout_3 (Dropout)         (None, 16)                0         
                                                                 
 dense_7 (Dense)             (None, 1)                 17        
                                                                 
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0aaa5a61d0>

In [48]:
model = keras.models.load_model("tfidf_2gram.keras")
print(f"Test acc: {model.evaluate(tfidf_2gram_test_ds)[1]:.3f}")

Test acc: 0.500


In [49]:
inputs = keras.Input(shape=(1,), dtype="string")
processed_inputs = text_vectorization(inputs)
outputs = model(processed_inputs)
inference_model = keras.Model(inputs, outputs)

In [50]:
import tensorflow as tf
raw_text_data = tf.convert_to_tensor([
    ["That was an excellent movie, I loved it."],
])
predictions = inference_model(raw_text_data)
print(f"{float(predictions[0] * 100):.2f} percent positive")
# 95.58 percent positive

100.00 percent positive
