In [1]:
import tensorflow as tf

#### 数字列特征

In [18]:
import tensorflow as tf
# Keys of the four feature columns.
feature_names = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']

# Raw input data: every key maps to two examples, and each example is a
# 2x2 matrix filled with a single constant, which makes the values easy to
# trace in the printed tensor.
features = dict(
    SepalLength=[[[1, 1], [1, 1]], [[2, 2], [2, 2]]],
    SepalWidth=[[[4, 4], [4, 4]], [[5, 5], [5, 5]]],
    PetalLength=[[[8, 8], [8, 8]], [[9, 9], [9, 9]]],
    PetalWidth=[[[12, 12], [12, 12]], [[13, 13], [13, 13]]],
)
# Normalizer function handed to the numeric columns via `normalizer_fn`.
def fn(x):
    """Return ``x * 1``.

    The multiplication leaves the value unchanged; the function only
    serves as a placeholder to show where a real normalizer would go.
    """
    scaled = x * 1
    return scaled
# Build one numeric column per key. Each column declares a 2x2 value per
# example and passes its input through `fn`.
feature_columns = [
    tf.feature_column.numeric_column(key=column_key, shape=[2, 2], normalizer_fn=fn)
    for column_key in feature_names
]

# Use input_layer to turn the raw feature dict into a single dense tensor.
# In the recorded output the result has shape (2, 16) and the columns appear
# in alphabetical key order (PetalLength, PetalWidth, SepalLength,
# SepalWidth) rather than in `feature_names` order.
feature_tensor = tf.feature_column.input_layer(
    features=features,
    feature_columns=feature_columns,
)

with tf.Session() as sess:
    print("feature_tensor: ", feature_tensor)
    evaluated = sess.run([feature_tensor])  # a list holding one ndarray
    print(evaluated)
    

feature_tensor:  Tensor("input_layer_16/concat:0", shape=(2, 16), dtype=float32)
[array([[  8.,   8.,   8.,   8.,  12.,  12.,  12.,  12.,   1.,   1.,   1.,
          1.,   4.,   4.,   4.,   4.],
       [  9.,   9.,   9.,   9.,  13.,  13.,  13.,  13.,   2.,   2.,   2.,
          2.,   5.,   5.,   5.,   5.]], dtype=float32)]


#### 分区特征

In [20]:
import tensorflow as tf

# Raw input data: eight example years.
features = {'Year': [1956, 1963, 1978, 1984, 1986, 1989, 1990, 2017]}

# A numeric column for the raw input.
numric_feature_column = tf.feature_column.numeric_column("Year")

# Bucketize the numeric column on the years 1960, 1980 and 2000.
# Three boundaries yield a four-element one-hot vector per example:
# (-inf, 1960), [1960, 1980), [1980, 2000), [2000, +inf).
bucketized_feature_column = tf.feature_column.bucketized_column(
    source_column=numric_feature_column,
    boundaries=[1960, 1980, 2000])

# Use input_layer to obtain the dense input tensor.
# `feature_columns` is documented as an iterable of feature columns, so the
# single column is wrapped in a list (consistent with the other cells)
# instead of relying on the library tolerating a bare column.
feature_tensor = tf.feature_column.input_layer(
        features=features,
        feature_columns=[bucketized_feature_column])

with tf.Session() as sess:
    print("feature_tensor:", feature_tensor)
    print(sess.run([feature_tensor]))

feature_tensor: Tensor("input_layer_18/concat:0", shape=(8, 4), dtype=float32)
[array([[ 1.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.],
       [ 0.,  1.,  0.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  1.]], dtype=float32)]


#### 分类标识列

#### 分类词汇列

In [4]:
import tensorflow as tf
from tensorflow.python.estimator.inputs import numpy_io
import numpy as np
import collections
from tensorflow.python.framework import errors
from tensorflow.python.platform import test
from tensorflow.python.training import coordinator
from tensorflow import feature_column

from tensorflow.python.feature_column.feature_column import _LazyBuilder

# Five single-value examples; 'C' is not part of the vocabulary below.
# Alternative multi-valued input with 4 examples, kept for reference:
#   color_data = {'color': [['R', 'R'], ['G', 'R'], ['B', 'G'], ['A', 'A']]}
color_data = {'color': ['R', 'G', 'B', 'A', 'C']}

builder = _LazyBuilder(color_data)

# Map every string to its index in the vocabulary list; values that are not
# in the list receive `default_value` (-1).
color_column = feature_column.categorical_column_with_vocabulary_list(
    key='color',
    vocabulary_list=['R', 'G', 'B', 'A'],
    dtype=tf.string,
    default_value=-1,
)

# Sparse ids produced by the categorical column.
color_column_tensor = color_column._get_sparse_tensors(builder)

# Convert the sparse ids to a dense one-hot representation (it becomes
# multi-hot when an example carries several values).
color_column_identy = feature_column.indicator_column(color_column)
color_dense_tensor = feature_column.input_layer(color_data, [color_column_identy])

# The whole graph is built above, so a single session can evaluate both
# tensors; the vocabulary lookup tables must be initialised first.
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())

    print(session.run([color_column_tensor.id_tensor]))

    print("\ncolor_dense_tensor", color_dense_tensor)
    print(session.run([color_dense_tensor]))

[SparseTensorValue(indices=array([[0, 0],
       [1, 0],
       [2, 0],
       [3, 0],
       [4, 0]], dtype=int64), values=array([ 0,  1,  2,  3, -1], dtype=int64), dense_shape=array([5, 1], dtype=int64))]

color_dense_tensor Tensor("input_layer_2/concat:0", shape=(5, 4), dtype=float32)
[array([[ 1.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  0.]], dtype=float32)]


#### hash存储分区限制类别

In [6]:
import tensorflow as tf
from tensorflow.python.estimator.inputs import numpy_io
import numpy as np
import collections
from tensorflow.python.framework import errors
from tensorflow.python.platform import test
from tensorflow.python.training import coordinator
from tensorflow import feature_column

from tensorflow.python.feature_column.feature_column import _LazyBuilder

# Four examples, each a single-element list.
color_data = {'color': [['R'], ['G'], ['B'], ['A']]}

builder = _LazyBuilder(color_data)

# Hash every input string into one of 5 buckets; no vocabulary is required.
color_column = feature_column.categorical_column_with_hash_bucket(
    key='color',
    hash_bucket_size=5,
)

# Sparse bucket ids produced by the hashed column.
color_column_tensor = color_column._get_sparse_tensors(builder)

# Convert the sparse ids to a dense one-hot representation (it becomes
# multi-hot when an example carries several values).
color_column_identy = feature_column.indicator_column(color_column)
color_dense_tensor = feature_column.input_layer(color_data, [color_column_identy])

# The whole graph is built above, so a single session evaluates both tensors.
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())

    print(session.run([color_column_tensor.id_tensor]))

    print("\ncolor_dense_tensor:", color_dense_tensor)
    print(session.run([color_dense_tensor]))

[SparseTensorValue(indices=array([[0, 0],
       [1, 0],
       [2, 0],
       [3, 0]], dtype=int64), values=array([3, 1, 3, 4], dtype=int64), dense_shape=array([4, 1], dtype=int64))]

color_dense_tensor: Tensor("input_layer_4/concat:0", shape=(4, 5), dtype=float32)
[array([[ 0.,  0.,  0.,  1.,  0.],
       [ 0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.]], dtype=float32)]


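上述结果为什么不是<font color=red>一个样例占据唯一的一行</font>？因为 `categorical_column_with_hash_bucket` 先对输入值求哈希，再对 `hash_bucket_size`（这里是 5）取模得到类别 id，不同的输入值可能落入同一个桶（哈希冲突）。上面的输出中 'R' 和 'B' 的 id 都是 3，所以它们对应的 one-hot 行完全相同。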

#### 交叉特征

组合特征，这仅仅适用于 sparse（稀疏）特征，产生的依然是 sparse 特征。

In [13]:
# Four examples, each carrying one price grade and one color.
featrues = {
    'price': ['A', 'B', 'C', 'D'],
    'color': ['R', 'G', 'B', 'G'],
}

# The two categorical source columns, each backed by an explicit vocabulary.
price = feature_column.categorical_column_with_vocabulary_list(
    key='price',
    vocabulary_list=['A', 'B', 'C', 'D'],
)
color = feature_column.categorical_column_with_vocabulary_list(
    key='color',
    vocabulary_list=['R', 'G', 'B'],
)

# Cross price with color and hash each combination into 8 buckets.
p_x_c = feature_column.crossed_column(keys=[price, color], hash_bucket_size=8)

# Wrap the (sparse) crossed column in an indicator column so that
# input_layer can emit it as a dense tensor.
p_x_c_identy = feature_column.indicator_column(p_x_c)
p_x_c_identy_dense_tensor = feature_column.input_layer(featrues, [p_x_c_identy])

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())

    print("p_x_c_idengty_dense_tensor:", p_x_c_identy_dense_tensor)
    dense_values = session.run([p_x_c_identy_dense_tensor])
    print(dense_values)

p_x_c_idengty_dense_tensor: Tensor("input_layer_11/concat:0", shape=(?, 8), dtype=float32)
[array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.]], dtype=float32)]


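<font color=red>为什么显示的是4个样例</font>？因为输入的 `price` 和 `color` 各包含 4 个样例，`crossed_column` 只在同一个样例内部组合特征值（例如第一个样例的 ('A', 'R')），再把组合结果哈希到 `hash_bucket_size=8` 个桶中，并不会对两个词汇表做笛卡尔积，所以输出仍然是 4 行，每行 8 维。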

#### 指示器列与嵌入列

In [15]:
 color_data = {'color': [['R'], ['G'], ['B'], ['A']]}  # 4行样本

color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], 
        dtype=tf.string, 
        default_value=-1)

color_embeding = feature_column.embedding_column(color_column, 3)
color_embeding_dense_tensor = feature_column.input_layer(color_data, [color_embeding])

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())

    print("color_embeding_dense_tensor:", color_embeding_dense_tensor)
    print(session.run([color_embeding_dense_tensor]))

color_embeding_dense_tensor: Tensor("input_layer_13/concat:0", shape=(?, 3), dtype=float32)
[array([[-0.57735389, -0.34033021,  1.12373018],
       [-0.01710446,  0.29519728, -0.04550294],
       [ 0.52143204,  0.03952279, -0.55294937],
       [ 0.        ,  0.        ,  0.        ]], dtype=float32)]
