<a href="https://colab.research.google.com/github/ailunguo/Test/blob/main/Tensorflow_Test/%E6%95%B0%E6%8D%AE%E9%A2%84%E5%A4%84%E7%90%86%E5%B1%82.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

文本向量化

In [1]:
import tensorflow as tf

In [2]:
# Example: map raw strings to fixed-length integer sequences.
max_features = 5000  # maximum vocabulary size
max_len = 4  # sequence length the outputs are padded to

# Small corpus the vocabulary is learned from, batched for adapt().
corpus = tf.data.Dataset.from_tensor_slices(['foo', 'bar', 'baz']).batch(64)

vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=max_len,
)
vectorize_layer.adapt(corpus)

# Wrap the layer in a model that takes one string per example.
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    vectorize_layer,
])

# 'qux' is not in the corpus, so it is an out-of-vocabulary token here.
input_data = [['foo qux bar'], ['qux baz']]
model.predict(input_data)



array([[2, 1, 4, 0],
       [1, 3, 0, 0]])

In [4]:
# Pass a ready-made vocabulary to the constructor instead of calling adapt().
max_len = 4
vocab_data = ['earth', 'wind', 'and', 'fire']

vectorize_layer = tf.keras.layers.TextVectorization(
    vocabulary=vocab_data,
    max_tokens=max_features,  # reuses the value set in the earlier cell
    output_mode='int',
    output_sequence_length=max_len,
)

# Show the vocabulary the layer ended up with.
vectorize_layer.get_vocabulary()

['', '[UNK]', 'earth', 'wind', 'and', 'fire']

数值特征预处理层


归一化层

In [5]:
import numpy as np

In [6]:
# Normalization layer.
# Rescales values towards a distribution centred on 0 with a standard
# deviation of 1. axis=None: one scalar mean/variance for the whole array.
normalizer = tf.keras.layers.Normalization(axis=None)

adapt_data = np.array([1., 2., 3., 4., 5.], dtype='float32')
normalizer.adapt(adapt_data)  # learn the mean and variance of adapt_data

input_data = np.array([1., 2., 3.], dtype='float32')
normalizer(input_data)

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([-1.4142135 , -0.70710677,  0.        ], dtype=float32)>

In [8]:
# axis=-1: compute a separate mean and variance for each index along the
# last axis (here, one pair of statistics per column).
adapt_data = np.array(
    [
        [0., 7., 4.],
        [2., 9., 6.],
        [0., 7., 4.],
        [2., 9., 6.],
    ],
    dtype='float32',
)
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(adapt_data)

input_data = np.array([[0., 7., 4.]], dtype='float32')
normalizer(input_data)

<tf.Tensor: shape=(1, 3), dtype=float32, numpy=array([[-1., -1., -1.]], dtype=float32)>

In [9]:
# Pass the mean and variance directly; no adapt() call is needed.
normalizer = tf.keras.layers.Normalization(mean=3., variance=2.)

input_data = np.array([[1.], [2.], [3.]], dtype='float32')
normalizer(input_data)

<tf.Tensor: shape=(3, 1), dtype=float32, numpy=
array([[-1.4142135 ],
       [-0.70710677],
       [ 0.        ]], dtype=float32)>

In [10]:
# invert=True: use the layer to de-normalize its inputs, i.e. apply the
# inverse transformation using the statistics learned from adapt_data.
adapt_data = np.array(
    [
        [0., 7., 4.],
        [2., 9., 6.],
        [0., 7., 4.],
        [2., 9., 6.],
    ],
    dtype='float32',
)
denormalizer = tf.keras.layers.Normalization(axis=-1, invert=True)
denormalizer.adapt(adapt_data)

input_data = np.array([[1., 2., 3.]], dtype='float32')
denormalizer(input_data)

<tf.Tensor: shape=(1, 3), dtype=float32, numpy=array([[ 2., 10.,  8.]], dtype=float32)>

离散化层

In [12]:
# Discretization layer.
# Bucket float values using explicitly provided bin boundaries.
# With boundaries [0., 1., 2.] the bins are
# (-inf, 0.), [0., 1.), [1., 2.), [2., +inf) -> indices 0, 1, 2, 3.
# The array is called `input_data` (not `input`) so this cell does not shadow
# Python's built-in input() for the rest of the kernel session.
input_data = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]])
layer = tf.keras.layers.Discretization(bin_boundaries=[0., 1., 2.])
layer(input_data)

<tf.Tensor: shape=(2, 4), dtype=int64, numpy=
array([[0, 2, 3, 1],
       [1, 3, 2, 1]])>

In [13]:
# Bucket float values by asking for a number of bins instead of boundaries.
# adapt() derives the bin boundaries from the data; epsilon is the error
# tolerance passed to the layer for that computation.
# The array is called `input_data` (not `input`) so this cell does not shadow
# Python's built-in input() for the rest of the kernel session.
input_data = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]])
layer = tf.keras.layers.Discretization(num_bins=4, epsilon=0.01)
layer.adapt(input_data)
layer(input_data)



<tf.Tensor: shape=(2, 4), dtype=int64, numpy=
array([[0, 2, 3, 2],
       [1, 3, 3, 1]])>

分类特征预处理层

类别编码层

In [14]:
# CategoryEncoding类
# 例子
# One-hot编码
layer = tf.keras.layers.CategoryEncoding(
    num_tokens=4, output_mode='one_hot')
layer([3, 2, 0, 1])

<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.]], dtype=float32)>

In [15]:
# Multi-hot编码
layer = tf.keras.layers.CategoryEncoding(
    num_tokens=4, output_mode='multi_hot')
layer([[0, 1], [0, 0], [1, 2], [3, 1]])

<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[1., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 1., 0.],
       [0., 1., 0., 1.]], dtype=float32)>

In [17]:
# Weighted inputs in 'count' mode: count_weights gives a weight for every id,
# and the weights (rather than plain occurrences) are summed per token.
id_rows = [[0, 1], [0, 0], [1, 2], [3, 1]]
count_weights = np.array([[.1, .2], [.1, .1], [.2, .3], [.4, .2]])

encoder = tf.keras.layers.CategoryEncoding(output_mode='count', num_tokens=4)
encoder(id_rows, count_weights=count_weights)

<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[0.1, 0.2, 0. , 0. ],
       [0.2, 0. , 0. , 0. ],
       [0. , 0.2, 0.3, 0. ],
       [0. , 0.2, 0. , 0.4]], dtype=float32)>

哈希层

In [18]:
# Hashing layer.
# FarmHash64 example (no salt given): every string is hashed into one of
# num_bins buckets.
letters = [['A'], ['B'], ['C'], ['D'], ['E']]
hasher = tf.keras.layers.Hashing(num_bins=3)
hasher(letters)

<tf.Tensor: shape=(5, 1), dtype=int64, numpy=
array([[1],
       [0],
       [1],
       [1],
       [2]])>

In [20]:
# Example with a mask value: inputs equal to mask_value ('' here) are treated
# as masked rather than hashed like ordinary strings.
letters = [['A'], ['B'], [''], ['C'], ['D']]
hasher = tf.keras.layers.Hashing(mask_value='', num_bins=3)
hasher(letters)

<tf.Tensor: shape=(5, 1), dtype=int64, numpy=
array([[1],
       [1],
       [0],
       [2],
       [2]])>

In [21]:
# SipHash64 example: salt is supplied as an extra argument, here as a pair
# of integers.
letters = [['A'], ['B'], ['C'], ['D'], ['E']]
hasher = tf.keras.layers.Hashing(salt=[133, 137], num_bins=3)
hasher(letters)

<tf.Tensor: shape=(5, 1), dtype=int64, numpy=
array([[1],
       [2],
       [1],
       [0],
       [2]])>

In [22]:
# Same as the salted example, but salt is supplied as a single integer.
letters = [['A'], ['B'], ['C'], ['D'], ['E']]
hasher = tf.keras.layers.Hashing(salt=133, num_bins=3)
hasher(letters)

<tf.Tensor: shape=(5, 1), dtype=int64, numpy=
array([[0],
       [0],
       [2],
       [1],
       [0]])>

散列交叉层

In [23]:
# HashedCrossing layer.
# Cross two scalar features: each (feat1, feat2) pair is hashed into one of
# num_bins buckets.
feat1 = tf.constant(['A', 'B', 'A', 'B', 'A'])
feat2 = tf.constant([101, 101, 101, 102, 102])

crosser = tf.keras.layers.HashedCrossing(num_bins=5)
crosser((feat1, feat2))

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 4, 1, 1, 3])>

In [24]:
# Cross two scalar features and one-hot encode the resulting bucket index.
feat1 = tf.constant(['A', 'B', 'A', 'B', 'A'])
feat2 = tf.constant([101, 101, 101, 102, 102])

crosser = tf.keras.layers.HashedCrossing(output_mode='one_hot', num_bins=5)
crosser((feat1, feat2))

<tf.Tensor: shape=(5, 5), dtype=float32, numpy=
array([[0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.]], dtype=float32)>

字符串查找层（StringLookup）

In [25]:
# StringLookup layer.
# Create a lookup layer from a known, pre-existing vocabulary.
# "z" is not in the vocabulary, so it is an out-of-vocabulary (OOV) token.
vocab = ["a", "b", "c", "d"]
lookup = tf.keras.layers.StringLookup(vocabulary=vocab)

tokens = tf.constant([["a", "c", "d"], ["d", "z", "b"]])
lookup(tokens)

<tf.Tensor: shape=(2, 3), dtype=int64, numpy=
array([[1, 3, 4],
       [4, 0, 2]])>

In [26]:
# Create a lookup layer with an adapted vocabulary: the vocabulary is
# generated by analysing the dataset.
# Note that the OOV token "[UNK]" is added to the vocabulary. The remaining
# tokens are sorted by frequency ("d", which occurs twice, comes first) and
# then in inverse sort order.
tokens = tf.constant([["a", "c", "d"], ["d", "z", "b"]])

lookup = tf.keras.layers.StringLookup()
lookup.adapt(tokens)
lookup.get_vocabulary()



['[UNK]', 'd', 'z', 'c', 'b', 'a']

In [27]:
# Adapt a lookup layer on the data, then use it to index that same data.
tokens = tf.constant([["a", "c", "d"], ["d", "z", "b"]])

lookup = tf.keras.layers.StringLookup()
lookup.adapt(tokens)
lookup(tokens)

<tf.Tensor: shape=(2, 3), dtype=int64, numpy=
array([[5, 3, 1],
       [1, 2, 4]])>

In [28]:
# Lookup with multiple OOV indices: with num_oov_indices=2 the unknown
# tokens ("m" and "z") are spread over two OOV slots.
vocab = ["a", "b", "c", "d"]
lookup = tf.keras.layers.StringLookup(num_oov_indices=2, vocabulary=vocab)

tokens = tf.constant([["a", "c", "d"], ["m", "z", "b"]])
lookup(tokens)

<tf.Tensor: shape=(2, 3), dtype=int64, numpy=
array([[2, 4, 5],
       [0, 1, 3]])>

In [29]:
# One-hot output: each token becomes a one-hot vector; "z" is OOV.
vocab = ["a", "b", "c", "d"]
lookup = tf.keras.layers.StringLookup(output_mode='one_hot', vocabulary=vocab)

tokens = tf.constant(["a", "b", "c", "d", "z"])
lookup(tokens)

<tf.Tensor: shape=(5, 5), dtype=float32, numpy=
array([[0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.]], dtype=float32)>

In [31]:
# Multi-hot output: one vector per row, with a 1 for every token present in
# that row (repeats do not add up).
vocab = ["a", "b", "c", "d"]
lookup = tf.keras.layers.StringLookup(output_mode='multi_hot', vocabulary=vocab)

tokens = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]])
lookup(tokens)

<tf.Tensor: shape=(2, 5), dtype=float32, numpy=
array([[0., 1., 0., 1., 1.],
       [1., 0., 1., 0., 1.]], dtype=float32)>

In [32]:
# Token count output: one vector per row holding how many times each token
# occurs in that row.
vocab = ["a", "b", "c", "d"]
lookup = tf.keras.layers.StringLookup(output_mode='count', vocabulary=vocab)

tokens = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]])
lookup(tokens)

<tf.Tensor: shape=(2, 5), dtype=float32, numpy=
array([[0., 1., 0., 1., 2.],
       [2., 0., 1., 0., 1.]], dtype=float32)>

In [33]:
# TF-IDF output: the vocabulary and one idf weight per vocabulary entry are
# set together via set_vocabulary(). No idf weight is given for the OOV token.
vocab = ["a", "b", "c", "d"]
idf_weights = [0.25, 0.75, 0.6, 0.4]

lookup = tf.keras.layers.StringLookup(output_mode="tf_idf")
lookup.set_vocabulary(vocab, idf_weights=idf_weights)

tokens = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]])
lookup(tokens)

<tf.Tensor: shape=(2, 5), dtype=float32, numpy=
array([[0.  , 0.25, 0.  , 0.6 , 0.8 ],
       [1.  , 0.  , 0.75, 0.  , 0.4 ]], dtype=float32)>

In [34]:
# To specify the idf weight for OOV values, pass the entire vocabulary
# including the leading OOV token ("[UNK]" with weight 0.9 here).
vocab = ["[UNK]", "a", "b", "c", "d"]
idf_weights = [0.9, 0.25, 0.75, 0.6, 0.4]

lookup = tf.keras.layers.StringLookup(output_mode="tf_idf")
lookup.set_vocabulary(vocab, idf_weights=idf_weights)

tokens = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]])
lookup(tokens)

<tf.Tensor: shape=(2, 5), dtype=float32, numpy=
array([[0.  , 0.25, 0.  , 0.6 , 0.8 ],
       [1.8 , 0.  , 0.75, 0.  , 0.4 ]], dtype=float32)>

In [35]:
# Inverse lookup: invert=True maps integer indices back to their string
# tokens.
vocab = ["a", "b", "c", "d"]
reverse_lookup = tf.keras.layers.StringLookup(invert=True, vocabulary=vocab)

indices = tf.constant([[1, 3, 4], [4, 0, 2]])
reverse_lookup(indices)

<tf.Tensor: shape=(2, 3), dtype=string, numpy=
array([[b'a', b'c', b'd'],
       [b'd', b'[UNK]', b'b']], dtype=object)>

整数查找层

In [36]:
# IntegerLookup layer - example.
# Create a lookup layer from a known, pre-existing vocabulary.
vocab = [12, 36, 1138, 42]
lookup = tf.keras.layers.IntegerLookup(vocabulary=vocab)

# 1000 is not in the vocabulary, so it is an OOV token.
values = tf.constant([[12, 1138, 42], [42, 1000, 36]])
lookup(values)

<tf.Tensor: shape=(2, 3), dtype=int64, numpy=
array([[1, 3, 4],
       [4, 0, 2]])>

In [37]:
# Create a lookup layer with an adapted vocabulary: the vocabulary is
# generated by analysing the dataset.
# Note that the OOV token -1 is added to the vocabulary. The remaining
# tokens are sorted by frequency (42, which occurs twice, comes first) and
# then in inverse sort order.
values = tf.constant([[12, 1138, 42], [42, 1000, 36]])

lookup = tf.keras.layers.IntegerLookup()
lookup.adapt(values)
lookup.get_vocabulary()

[-1, 42, 1138, 1000, 36, 12]

In [38]:
# Adapt a lookup layer on the data, then use it to index that same data.
values = tf.constant([[12, 1138, 42], [42, 1000, 36]])

lookup = tf.keras.layers.IntegerLookup()
lookup.adapt(values)
lookup(values)

<tf.Tensor: shape=(2, 3), dtype=int64, numpy=
array([[5, 2, 1],
       [1, 3, 4]])>

In [39]:
# Lookup with multiple OOV indices: with num_oov_indices=2 the unknown
# values (37 and 1000) are spread over two OOV slots.
vocab = [12, 36, 1138, 42]
lookup = tf.keras.layers.IntegerLookup(num_oov_indices=2, vocabulary=vocab)

values = tf.constant([[12, 1138, 42], [37, 1000, 36]])
lookup(values)

<tf.Tensor: shape=(2, 3), dtype=int64, numpy=
array([[2, 4, 5],
       [1, 0, 3]])>

In [40]:
# One-hot output: each value becomes a one-hot vector.
vocab = [12, 36, 1138, 42]
lookup = tf.keras.layers.IntegerLookup(output_mode='one_hot', vocabulary=vocab)

values = tf.constant([12, 36, 1138, 42, 7])  # 7 is an OOV token
lookup(values)

<tf.Tensor: shape=(5, 5), dtype=float32, numpy=
array([[0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.]], dtype=float32)>

In [41]:
# Multi-hot output: one vector per row, with a 1 for every value present in
# that row (repeats do not add up).
vocab = [12, 36, 1138, 42]
lookup = tf.keras.layers.IntegerLookup(output_mode='multi_hot', vocabulary=vocab)

# 7 is an OOV token.
values = tf.constant([
    [12, 1138, 42, 42],
    [42, 7, 36, 7],
])
lookup(values)

<tf.Tensor: shape=(2, 5), dtype=float32, numpy=
array([[0., 1., 0., 1., 1.],
       [1., 0., 1., 0., 1.]], dtype=float32)>

In [42]:
# Token count output: one vector per row holding how many times each value
# occurs in that row.
vocab = [12, 36, 1138, 42]
lookup = tf.keras.layers.IntegerLookup(output_mode='count', vocabulary=vocab)

# 7 is an OOV token.
values = tf.constant([
    [12, 1138, 42, 42],
    [42, 7, 36, 7],
])
lookup(values)

<tf.Tensor: shape=(2, 5), dtype=float32, numpy=
array([[0., 1., 0., 1., 2.],
       [2., 0., 1., 0., 1.]], dtype=float32)>

In [43]:
# TF-IDF output: the vocabulary and one idf weight per vocabulary entry are
# passed to the constructor. No idf weight is given for the OOV token.
vocab = [12, 36, 1138, 42]
idf_weights = [0.25, 0.75, 0.6, 0.4]
lookup = tf.keras.layers.IntegerLookup(
    vocabulary=vocab,
    idf_weights=idf_weights,
    output_mode='tf_idf',
)

# 7 is an OOV token.
values = tf.constant([
    [12, 1138, 42, 42],
    [42, 7, 36, 7],
])
lookup(values)

<tf.Tensor: shape=(2, 5), dtype=float32, numpy=
array([[0.  , 0.25, 0.  , 0.6 , 0.8 ],
       [1.  , 0.  , 0.75, 0.  , 0.4 ]], dtype=float32)>

In [44]:
# To specify the idf weight for the OOV token, pass the entire vocabulary
# including the leading OOV token (-1 with weight 0.9 here).
vocab = [-1, 12, 36, 1138, 42]
idf_weights = [0.9, 0.25, 0.75, 0.6, 0.4]
lookup = tf.keras.layers.IntegerLookup(
    vocabulary=vocab,
    idf_weights=idf_weights,
    output_mode='tf_idf',
)

# 7 is an OOV token.
values = tf.constant([
    [12, 1138, 42, 42],
    [42, 7, 36, 7],
])
lookup(values)

<tf.Tensor: shape=(2, 5), dtype=float32, numpy=
array([[0.  , 0.25, 0.  , 0.6 , 0.8 ],
       [1.8 , 0.  , 0.75, 0.  , 0.4 ]], dtype=float32)>

In [45]:
# Inverse lookup: invert=True maps integer indices back to the original
# vocabulary values.
vocab = [12, 36, 1138, 42]
reverse_lookup = tf.keras.layers.IntegerLookup(invert=True, vocabulary=vocab)

indices = tf.constant([[1, 3, 4], [4, 0, 2]])
reverse_lookup(indices)

<tf.Tensor: shape=(2, 3), dtype=int64, numpy=
array([[  12, 1138,   42],
       [  42,   -1,   36]])>