# 用keras做文字的one-hot encoding

In [2]:
import tensorflow as tf
from tensorflow import keras

# Echo the installed TensorFlow version as the cell output, so readers can see
# which release the recorded results below were produced with.
tf.__version__

'2.1.0'

In [4]:
# Import Tokenizer from tf.keras rather than the standalone `keras` package, so
# the notebook relies on a single Keras implementation (the one bundled with
# the TensorFlow imported above) and does not need a second package installed.
from tensorflow.keras.preprocessing.text import Tokenizer

sample = ['The cat sat on the mat.', 'The dog ate my homework', 'The bitch yelled on the street']

# Restrict encoding to the most frequent words (vocabulary cap of 1000).
tokenizer = Tokenizer(num_words=1000)
# Build the word index from the corpus; indices are assigned by frequency and
# index 0 is reserved (never assigned to a word).
tokenizer.fit_on_texts(sample)
# Replace every word of every sentence with its integer index.
sequences = tokenizer.texts_to_sequences(sample)

print(sequences)

[[1, 3, 4, 2, 1, 5], [1, 6, 7, 8, 9], [1, 10, 11, 2, 1, 12]]


In [7]:
# Vectorize the sentences: one row per sentence, one column per word index,
# with a 1 wherever that word occurs in the sentence ('binary' mode).
one_hot_result = tokenizer.texts_to_matrix(sample, mode='binary')

print(one_hot_result.shape)
# Show the first 15 columns of each of the three sample rows.
for row_idx in range(3):
    print(one_hot_result[row_idx][:15])

(3, 1000)
[0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
[0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0.]


# 用keras的Embedding Layer實作文字嵌入法

In [8]:
# Embedding layer for a vocabulary of at most 1000 tokens, mapping each token
# index to a 64-dimensional dense vector.
embeding_layer = keras.layers.Embedding(input_dim=1000, output_dim=64)

In [13]:
# Use the tf.keras copies of the dataset and preprocessing helpers so this cell
# shares one Keras implementation with the `keras` name imported from
# tensorflow, instead of also depending on the standalone `keras` package.
from tensorflow.keras.datasets import imdb
from tensorflow.keras import preprocessing

max_features = 1000  # vocabulary size: keep only the most frequent words
maxlen = 20  # keep only the last 20 tokens of each review

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(x_train.shape)
# Turn the lists of integers into a 2D integer tensor of shape (samples, maxlen).
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
print(x_train.shape)
print(x_train[0])
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

(25000,)
(25000, 20)
[ 65  16  38   2  88  12  16 283   5  16   2 113 103  32  15  16   2  19
 178  32]


In [14]:
# Minimal binary sentiment classifier: embed each token, flatten the
# (maxlen, 8) embeddings into one vector, and feed it to a single sigmoid unit.
model = keras.Sequential([
    # input_dim is tied to max_features (the num_words passed to imdb.load_data)
    # instead of a duplicated literal, so the layer cannot fall out of sync
    # with the vocabulary size if that setting is changed.
    keras.layers.Embedding(max_features, 8, input_length=maxlen),
    keras.layers.Flatten(),
    keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

# Hold out the last 20% of the training data for validation.
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 8)             8000      
_________________________________________________________________
flatten (Flatten)            (None, 160)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 161       
Total params: 8,161
Trainable params: 8,161
Non-trainable params: 0
_________________________________________________________________
Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## 這種做法（Embedding 後直接 Flatten 再接 Dense）把每個字分開處理，不考慮文字順序和句子結構