In [44]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras, feature_column
from sklearn import model_selection
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell


In [2]:
# 配置项
# 这个要放到设置中文之前否则还是小方框
plt.style.use("seaborn")

# 指定默认字体 用来正常显示中文标签
plt.rcParams['font.sans-serif'] = ['SimHei']
# 解决保存图像是负号'-'显示为方块的问题
plt.rcParams['axes.unicode_minus'] = False

# #全部行都能输出
InteractiveShell.ast_node_interactivity = "all"

In [3]:
URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
data = pd.read_csv(URL)
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


In [4]:
train, test = model_selection.train_test_split(data, test_size=0.2)
train, val = model_selection.train_test_split(train, test_size=0.2)

train.shape, val.shape, test.shape

((193, 14), (49, 14), (61, 14))

In [7]:
def df_to_dataset(df, shuffle=True, batch_size=32):
    df = df.copy()
    labels = df.target
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df))
    ds = ds.batch(batch_size)
    return ds

In [8]:
batch_size = 5
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [9]:
train_ds

<BatchDataset shapes: ({age: (None,), sex: (None,), cp: (None,), trestbps: (None,), chol: (None,), fbs: (None,), restecg: (None,), thalach: (None,), exang: (None,), oldpeak: (None,), slope: (None,), ca: (None,), thal: (None,), target: (None,)}, (None,)), types: ({age: tf.int32, sex: tf.int32, cp: tf.int32, trestbps: tf.int32, chol: tf.int32, fbs: tf.int32, restecg: tf.int32, thalach: tf.int32, exang: tf.int32, oldpeak: tf.float32, slope: tf.int32, ca: tf.int32, thal: tf.string, target: tf.int32}, tf.int32)>

In [10]:
for feature_batch, label_batch in train_ds.take(2):
  print('Every feature:', list(feature_batch.keys()))
  print('A batch of ages:', feature_batch['age'])
  print('A batch of targets:', label_batch )
  print('\n')

Every feature: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
A batch of ages: tf.Tensor([61 35 34 59 54], shape=(5,), dtype=int32)
A batch of targets: tf.Tensor([1 0 0 1 0], shape=(5,), dtype=int32)


Every feature: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
A batch of ages: tf.Tensor([46 56 51 41 61], shape=(5,), dtype=int32)
A batch of targets: tf.Tensor([0 0 1 0 0], shape=(5,), dtype=int32)




In [11]:
example_batch = next(iter(train_ds))[0]
example_batch

{'age': <tf.Tensor: id=127, shape=(5,), dtype=int32, numpy=array([61, 35, 34, 59, 54], dtype=int32)>,
 'sex': <tf.Tensor: id=135, shape=(5,), dtype=int32, numpy=array([1, 1, 1, 1, 0], dtype=int32)>,
 'cp': <tf.Tensor: id=130, shape=(5,), dtype=int32, numpy=array([4, 4, 1, 4, 2], dtype=int32)>,
 'trestbps': <tf.Tensor: id=140, shape=(5,), dtype=int32, numpy=array([148, 120, 118, 140, 132], dtype=int32)>,
 'chol': <tf.Tensor: id=129, shape=(5,), dtype=int32, numpy=array([203, 198, 182, 177, 288], dtype=int32)>,
 'fbs': <tf.Tensor: id=132, shape=(5,), dtype=int32, numpy=array([0, 0, 0, 0, 1], dtype=int32)>,
 'restecg': <tf.Tensor: id=134, shape=(5,), dtype=int32, numpy=array([0, 0, 2, 0, 2], dtype=int32)>,
 'thalach': <tf.Tensor: id=139, shape=(5,), dtype=int32, numpy=array([161, 130, 174, 162, 159], dtype=int32)>,
 'exang': <tf.Tensor: id=131, shape=(5,), dtype=int32, numpy=array([0, 1, 0, 1, 1], dtype=int32)>,
 'oldpeak': <tf.Tensor: id=133, shape=(5,), dtype=float32, numpy=array([0. , 

In [12]:
def demo(feature_column):
    feature_layer = keras.layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch).numpy())


In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null int64
sex         303 non-null int64
cp          303 non-null int64
trestbps    303 non-null int64
chol        303 non-null int64
fbs         303 non-null int64
restecg     303 non-null int64
thalach     303 non-null int64
exang       303 non-null int64
oldpeak     303 non-null float64
slope       303 non-null int64
ca          303 non-null int64
thal        303 non-null object
target      303 non-null int64
dtypes: float64(1), int64(12), object(1)
memory usage: 33.2+ KB


In [14]:
# 将dataframe的serise
age = feature_column.numeric_column('age')
age
demo(age)

NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)

[[61.]
 [35.]
 [34.]
 [59.]
 [54.]]


In [16]:
# bucketized_column 实际上跟pandas的分箱操作 + 独热编码
age_buckets = feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
demo(age_buckets)


[[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]]


In [24]:
# 将字符串特征列转化为独热编码
thal = feature_column.categorical_column_with_vocabulary_list( 'thal', ['fixed', 'normal', 'reversible'])
thal_one_hot = feature_column.indicator_column(thal)
demo(thal_one_hot)

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]


In [26]:
# 如果特征类不是只有几个类型的值,而是有成千个的话,这时候就需要借助嵌入列
thal_embedding = feature_column.embedding_column(thal, dimension=8)
demo(thal_embedding)

[[-0.52247787  0.19819711 -0.44407883  0.10441691  0.19361396  0.34262082
  -0.16495323  0.2127146 ]
 [-0.52247787  0.19819711 -0.44407883  0.10441691  0.19361396  0.34262082
  -0.16495323  0.2127146 ]
 [ 0.267103    0.24413322  0.11034889  0.48925242  0.5364142  -0.28747085
  -0.11804456 -0.00707208]
 [-0.52247787  0.19819711 -0.44407883  0.10441691  0.19361396  0.34262082
  -0.16495323  0.2127146 ]
 [ 0.267103    0.24413322  0.11034889  0.48925242  0.5364142  -0.28747085
  -0.11804456 -0.00707208]]


In [27]:
# 哈希处理特征列
thal_hashed = feature_column.categorical_column_with_hash_bucket('thal', hash_bucket_size=1000)
demo(feature_column.indicator_column(thal_hashed))

W0819 14:10:07.085359 4665783744 deprecation.py:323] From /anaconda3/lib/python3.7/site-packages/tensorflow/python/feature_column/feature_column_v2.py:4270: HashedCategoricalColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [28]:
# 组合特征列
crossed_feature = feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)
demo(feature_column.indicator_column(crossed_feature))

W0819 14:13:15.845556 4665783744 deprecation.py:323] From /anaconda3/lib/python3.7/site-packages/tensorflow/python/feature_column/feature_column_v2.py:4270: CrossedColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [42]:
feature_columns = []
# 数值列
for header in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']:
    feature_columns.append(feature_column.numeric_column(header))

# 分桶列
age_buckets = feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
feature_columns.append(age_buckets)

# 分类列
thal = feature_column.categorical_column_with_vocabulary_list('thal', 
                                                              vocabulary_list=['fixed', 'normal', 'reversible'])
feature_columns.append(feature_column.indicator_column(thal))

# 嵌入列
thal_embedding = feature_column.embedding_column(thal, dimension=8)
feature_columns.append(thal_embedding)

# 组合列
crossed_feature = feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)
feature_columns.append(feature_column.indicator_column(crossed_feature))


In [34]:
# 建立新的特征层
feature_layer = keras.layers.DenseFeatures(feature_columns)

batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)


In [41]:
# 建立模型训练
model = keras.Sequential([
  feature_layer,
  keras.layers.Dense(128, activation='relu'),
  keras.layers.Dense(128, activation='relu'),
  keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'],
              run_eagerly=True)

model.fit(train_ds,
          validation_data=val_ds,
          epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1a33b35cf8>

In [43]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.78688526
