# Chapter 8 Tokenization and Serialization 词条化和序列化

## Sector 8.1 Tokenization 词条化

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
sentence = [
    'I love my dog',
    'I log my cat'
]
tokenizer = Tokenizer(num_words = 100)
tokenizer.fit_on_texts(sentence)
word_index = tokenizer.word_index
print(word_index)

{'i': 1, 'my': 2, 'love': 3, 'dog': 4, 'log': 5, 'cat': 6}


## Sector 8.2 Serialization 序列化

In [6]:
# 基础的序列化模块(Tokenizer)使用 
from tensorflow.keras.preprocessing.text import Tokenizer

sentences = [
    'I love dog',
    'I love cats',
    'You love my dog?',
    'Do you think my dog is amazing!'
]

tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
sequence = tokenizer.texts_to_sequences(sentences)

print(word_index)
print(sequence)

testdata = [
    'I really love my dog',
    'my dog loves my sisiter'
]

test_seq = tokenizer.texts_to_sequences(testdata)
print(test_seq)


{'love': 1, 'dog': 2, 'i': 3, 'you': 4, 'my': 5, 'cats': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}
[[3, 1, 2], [3, 1, 6], [4, 1, 5, 2], [7, 4, 8, 5, 2, 9, 10]]
[[3, 1, 5, 2], [5, 2, 5]]


In [7]:
# 增加OOV_TOKEN解决测试数据集单词不在训练集字典里面
from tensorflow.keras.preprocessing.text import Tokenizer

sentences = [
    'I love dog',
    'I love cats',
    'You love my dog?',
    'Do you think my dog is amazing!'
]

tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
sequence = tokenizer.texts_to_sequences(sentences)

print(word_index)
print(sequence)

testdata = [
    'I really love my dog',
    'my dog loves my sisiter'
]

test_seq = tokenizer.texts_to_sequences(testdata)
print(test_seq)


{'<OOV>': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'my': 6, 'cats': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[4, 2, 3], [4, 2, 7], [5, 2, 6, 3], [8, 5, 9, 6, 3, 10, 11]]
[[4, 1, 2, 6, 3], [6, 3, 1, 6, 1]]


In [1]:
# 增加padding模块，使序列等长，便于机器学习
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = [
    'I love dog',
    'I love cats',
    'You love my dog?',
    'Do you think my dog is amazing!'
]

tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
sequence = tokenizer.texts_to_sequences(sentences)

padded = pad_sequences(sequence, padding = 'post', maxlen = 4, truncating = 'post')

print(word_index)
print(sequence)
print(pad_sequences(sequence))
testdata = [
    'I really love my dog',
    'my dog loves my sisiter'
]

test_seq = tokenizer.texts_to_sequences(testdata)
print(test_seq)


{'<OOV>': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'my': 6, 'cats': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[4, 2, 3], [4, 2, 7], [5, 2, 6, 3], [8, 5, 9, 6, 3, 10, 11]]
[[ 0  0  0  0  4  2  3]
 [ 0  0  0  0  4  2  7]
 [ 0  0  0  5  2  6  3]
 [ 8  5  9  6  3 10 11]]
[[4, 1, 2, 6, 3], [6, 3, 1, 6, 1]]
