In [0]:
import pandas as pd
import numpy as np
from random import shuffle
from numpy.linalg import inv
from math import floor, log
# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

import matplotlib.pyplot as plt

pd.options.display.max_rows = 10

In [0]:
def dataProcess_X(rawData):
  
  # 先去除掉 sex，如果有 income 就去除掉， 没有就算了
  # 因为 sex 是伯努利分布， 就只将 男女 变成 0，1 就好
  if 'income' in rawData.columns:
    Data = rawData.drop(['sex', 'income'], axis=1)
  else:
    Data = rawData.drop(['sex'], axis=1)
    
  # 将 object 的列先获取出来， ‘object’ 就是字符串的意思。 一般来说这是类别属性， 离散随机变量
  listObjectColumn = [col for col in Data.columns if Data[col].dtypes == 'object']
  # 将非 object 的列获取出来， 那就是数字类型的。 连续型的随机变量
  listNonObjectColumn = [x for x in list(Data) if x not in listObjectColumn]
  
  # 将数据进行切分， 把连续型的数据和非连续性的数据切分开来
  ObjectData = Data[listObjectColumn]
  NonObjectData = Data[listNonObjectColumn]
  
  # 将之前的 sex 列 变成 01 插入
  NonObjectData.insert(0, 'sex', (rawData['sex'] == 'Female').astype(np.int))
  
  # 将离散随机变量独热编码
  ObjectData = pd.get_dummies(ObjectData)
  
  # 再将内容组合到一起
  Data = pd.concat([NonObjectData, ObjectData], axis=1)
  Data_x = Data.astype('int64')
  
  # 中心化内容
  Data_x = (Data_x - Data_x.mean()) / Data_x.std()
  
  return Data_x

In [0]:
def dataProcess_Y(rawData):
  df_y = rawData['income']
  Data_y = pd.DataFrame((df_y == '>50K').astype('int64'), columns=['income'])
  return Data_y['income']

In [0]:
trainData = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", header=None, names=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'], skipinitialspace=True)
testData = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", header=None, names=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'], skipinitialspace=True, skiprows=1)

In [0]:
train_feature = dataProcess_X(trainData).drop(['native-country_Holand-Netherlands'], axis=1).values
test_feature = dataProcess_X(testData).values
train_label = dataProcess_Y(trainData).values
test_label = dataProcess_Y(testData).values

In [0]:
# 建立起来神经网络
baseline_model = keras.Sequential([
    # `input_shape` is only required here so that `.summary` works.
    keras.layers.Dense(1000, activation=tf.nn.relu, input_shape=(106,)),
    keras.layers.Dense(500, activation=tf.nn.relu),
    #keras.layers.Dropout(0.5),
    #keras.layers.Dense(16, activation=tf.nn.relu),
    #keras.layers.Dropout(0.5),
    #keras.layers.Dense(8, activation=tf.nn.relu),
    #keras.layers.Dropout(0.5),
    keras.layers.Dense(1, activation=tf.nn.sigmoid)
])

In [15]:
# 将神经网络进行参数设置

# Adam 是 一种目前公认的最好的自适应学习率的梯度下降法
baseline_model.compile(optimizer='adam',
                       # 指定了使用交叉熵作为损失函数
                       loss='binary_crossentropy',
                       metrics=['accuracy', 'binary_crossentropy'])

baseline_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 1000)              107000    
_________________________________________________________________
dense_11 (Dense)             (None, 500)               500500    
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 501       
Total params: 608,001
Trainable params: 608,001
Non-trainable params: 0
_________________________________________________________________


In [16]:
print(train_feature.shape)
print(train_label.shape)
# 训练模型
# model.fit(train_feature, train_label, epochs=5)
# test_loss, test_acc = model.evaluate(test_feature, test_label)
# print('Test accuracy:', test_acc)

baseline_history = baseline_model.fit(train_feature,
                                      train_label,
                                      epochs=20,
                                      batch_size=512,
                                      validation_data=(test_feature, test_label),
                                      verbose=2)

(32561, 106)
(32561,)
Train on 32561 samples, validate on 16281 samples
Epoch 1/20
32561/32561 - 3s - loss: 0.3557 - acc: 0.8342 - binary_crossentropy: 0.3557 - val_loss: 0.3857 - val_acc: 0.8377 - val_binary_crossentropy: 0.3857
Epoch 2/20
32561/32561 - 3s - loss: 0.3135 - acc: 0.8540 - binary_crossentropy: 0.3135 - val_loss: 0.5200 - val_acc: 0.7973 - val_binary_crossentropy: 0.5200
Epoch 3/20
32561/32561 - 3s - loss: 0.3028 - acc: 0.8578 - binary_crossentropy: 0.3028 - val_loss: 0.5547 - val_acc: 0.7912 - val_binary_crossentropy: 0.5547
Epoch 4/20
32561/32561 - 3s - loss: 0.2967 - acc: 0.8616 - binary_crossentropy: 0.2967 - val_loss: 0.5741 - val_acc: 0.7975 - val_binary_crossentropy: 0.5741
Epoch 5/20
32561/32561 - 3s - loss: 0.2925 - acc: 0.8635 - binary_crossentropy: 0.2925 - val_loss: 0.6063 - val_acc: 0.7855 - val_binary_crossentropy: 0.6063
Epoch 6/20
32561/32561 - 3s - loss: 0.2869 - acc: 0.8656 - binary_crossentropy: 0.2869 - val_loss: 0.6097 - val_acc: 0.7906 - val_binary_c

In [17]:
test_acc = baseline_model.evaluate(test_feature, test_label)
print(baseline_model.metrics_names)

print('Test accuracy:', test_acc)

['loss', 'acc', 'binary_crossentropy']
Test accuracy: [0.8841433085802768, 0.7992752, 0.88414353]
