In [1]:
import keras

Using TensorFlow backend.


In [4]:
# Word-level one-hot encoding
import numpy as np

# Input data: each element of the list is one sample (here, a sentence).
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Vocabulary: maps every distinct token to a unique integer, starting at 1
# so that index 0 is never assigned to a word.
token_index = {}
for sentence in samples:
    # Plain whitespace tokenization; a real pipeline would also strip
    # punctuation and special characters from the samples.
    for token in sentence.split():
        token_index.setdefault(token, len(token_index) + 1)

# Only the first max_length words of each sample are encoded.
max_length = 10

# Output tensor of shape (samples, max_length, vocabulary slots); the last
# axis has one extra slot because index 0 is left unused.
vocab_size = max(token_index.values()) + 1
results = np.zeros((len(samples), max_length, vocab_size))

for row, sentence in enumerate(samples):
    for col, token in enumerate(sentence.split()[:max_length]):
        results[row, col, token_index[token]] = 1

In [5]:
# Character-level one-hot encoding
import string

samples = ['The cat sat on the mat.', 'The dog ate my homework.']
characters = string.printable  # every printable ASCII character
# Number the characters from 1 upward; index 0 stays unassigned.
token_index = {char: position for position, char in enumerate(characters, start=1)}

# Only the first max_length characters of each sample are encoded.
max_length = 50
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for row, text in enumerate(samples):
    for col, char in enumerate(text[:max_length]):
        results[row, col, token_index[char]] = 1.

In [6]:
# Word-level one-hot encoding using Keras
from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Tokenizer configured to consider only the 1000 most common words.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)  # builds the word index from the samples

# The word index computed during fitting.
word_index = tokenizer.word_index
# Turn each string into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(samples)
# The one-hot binary representation can also be obtained directly.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

print('Found %s unique tokens.' % len(word_index))

Found 9 unique tokens.


In [7]:
# Word-level one-hot encoding with the hashing trick
import zlib

samples = ['The cat sat on the mat.', 'The dog ate my homework.']
# Each word is stored as a vector of size 1000. If the number of distinct
# words approaches 1000 (or exceeds it), hash collisions become frequent and
# reduce the accuracy of this encoding.
dimensionality = 1000
# Only the first max_length words of each sample are encoded.
max_length = 10
results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
    for j, word in enumerate(sample.split()[:max_length]):
        # Map the word to a bucket index in [0, dimensionality - 1].
        # The built-in hash() is salted per interpreter process for str
        # (PYTHONHASHSEED), so it would place the same word in a different
        # bucket after every kernel restart. CRC-32 of the UTF-8 bytes is
        # stable across runs, which keeps the encoding reproducible.
        index = zlib.crc32(word.encode('utf-8')) % dimensionality
        results[i, j, index] = 1.