In [1]:
import pandas as pd
import numpy as np
from random import shuffle
from numpy.linalg import inv
from math import floor, log

# Keep DataFrame previews short: show at most 10 rows when a frame is displayed.
pd.set_option("display.max_rows", 10)

In [2]:
def dataProcess_X(rawData):
    """Turn a raw census frame into a standardized numeric feature frame.

    Steps:
      1. Drop ``sex`` (re-added below as a 0/1 column) and, when present,
         the label column ``income``.
      2. Keep numeric columns as they are and one-hot encode every other
         (categorical / string) column.
      3. Insert ``sex`` as the first column, encoded as 1 for ``'Female'``
         and 0 otherwise.
      4. Cast everything to ``int64`` (fractional parts of numeric columns
         are truncated) and standardize each column to zero mean and unit
         sample standard deviation.

    Parameters
    ----------
    rawData : pandas.DataFrame
        Must contain a ``sex`` column; ``income`` is optional. The frame is
        not modified.

    Returns
    -------
    pandas.DataFrame
        Columns in the order: ``sex``, numeric columns, dummy columns.
        A column that is constant within ``rawData`` has zero standard
        deviation and therefore comes out as NaN.

    Notes
    -----
    Mean and standard deviation are computed from ``rawData`` itself, so two
    frames processed by separate calls are scaled with different statistics.
    """
    # 'sex' is binary, so a single 0/1 column is enough; 'income' is the
    # label rather than a feature and is dropped only if the frame has it.
    droppedColumns = ['sex', 'income'] if 'income' in rawData.columns else ['sex']
    Data = rawData.drop(droppedColumns, axis=1)

    # Numeric columns are continuous variables; everything else is treated
    # as a discrete category. Testing for "numeric" (instead of comparing the
    # dtype with 'object') also works when pandas stores text in a dedicated
    # string dtype.
    listNonObjectColumn = [col for col in Data.columns
                           if pd.api.types.is_numeric_dtype(Data[col])]
    listObjectColumn = [col for col in Data.columns if col not in listNonObjectColumn]

    # Work on an explicit copy so inserting 'sex' never touches rawData.
    NonObjectData = Data[listNonObjectColumn].copy()
    # np.int was removed from NumPy; the fixed-width 'int64' is used instead.
    NonObjectData.insert(0, 'sex', (rawData['sex'] == 'Female').astype('int64'))

    # One-hot encode the categorical columns.
    ObjectData = pd.get_dummies(Data[listObjectColumn])

    # Put both parts back together and make every column an integer.
    Data = pd.concat([NonObjectData, ObjectData], axis=1)
    Data_x = Data.astype('int64')

    # Standardize: subtract the column mean and divide by the column std.
    Data_x = (Data_x - Data_x.mean()) / Data_x.std()
    return Data_x

In [3]:
def dataProcess_Y(rawData):
  """Build the 0/1 label frame from the ``income`` column.

  A row is labelled 1 when its income is ``'>50K'`` and 0 otherwise.
  Surrounding whitespace and a trailing period are ignored, so the
  ``'>50K.'`` spelling is recognised as the positive class too; without
  that normalisation every such row would silently become 0.

  Parameters
  ----------
  rawData : pandas.DataFrame
      Frame containing an ``income`` column.

  Returns
  -------
  pandas.DataFrame
      Single ``int64`` column named ``income``, indexed like ``rawData``.
  """
  # Normalise the label text before comparing: "  >50K. " -> ">50K".
  labels = rawData['income'].astype(str).str.strip().str.rstrip('.')
  Data_y = (labels == '>50K').astype('int64').to_frame('income')
  return Data_y

def sigmoid(z):
  """Logistic function 1 / (1 + exp(-z)), kept strictly inside (0, 1).

  The result is clipped to [1e-8, 1 - 1e-8] so that taking log(y) or
  log(1 - y) afterwards never hits log(0).
  """
  lower_bound = 1e-8
  upper_bound = 1 - (1e-8)
  activation = 1 / (1.0 + np.exp(-z))
  # Clipping keeps the extreme values 0 and 1 from ever being returned.
  return np.clip(activation, lower_bound, upper_bound)

# 将顺序打乱
def _shuffle(X, Y):
  randomize = np.arange(X.shape[0])
  np.random.shuffle(randomize)
  return (X[randomize], Y[randomize])

def split_valid_set(X, Y, percentage):
  """Randomly split (X, Y) into a training part and a validation part.

  The rows are shuffled first; the leading floor(len(X) * percentage) rows
  become the validation set and the remaining rows the training set.

  Returns
  -------
  tuple
      (X_train, Y_train, X_valid, Y_valid)
  """
  # Shuffle before cutting so the held-out rows are a random sample.
  X, Y = _shuffle(X, Y)
  cut = int(floor(X.shape[0] * percentage))

  held_out = (X[:cut], Y[:cut])
  remaining = (X[cut:], Y[cut:])

  return remaining[0], remaining[1], held_out[0], held_out[1]

In [None]:
# With the learned weights we can compute the sigmoid model's predictions,
# compare them with the true labels and print the accuracy.
def valid(X, Y, w):
  """Print the accuracy of weights w on (X, Y) and return the predictions.

  Each row of X is scored with sigmoid(w . x) and rounded to a hard 0/1
  prediction; accuracy is the fraction of predictions equal to the
  squeezed labels Y.
  """
  probabilities = sigmoid(np.dot(w, X.T))
  predictions = np.around(probabilities)
  hits = np.squeeze(Y) == predictions
  accuracy = float(hits.sum()) / hits.shape[0]
  print('acc = %f' % accuracy)
  return predictions

In [None]:
import matplotlib.pyplot as plt
# Mini-batch gradient descent
def train(X_train, Y_train, l_rate=0.03, batch_size=32, epoch_num=2000, lambda_2=0.3):
  """Fit L2-regularized logistic regression with mini-batch Adagrad.

  Every epoch reshuffles the data and walks over all full batches, starting
  with batch 0. Rows left over after the last full batch are skipped for
  that epoch; because of the reshuffle they differ from epoch to epoch.

  The squared-gradient accumulator lives for the whole run and is kept per
  coordinate, so each weight gets its own decaying step size
  ``l_rate / (sqrt(sum of its past squared gradients) + eps)``.

  After training, the summed per-batch cross entropy of every epoch is
  plotted with matplotlib.

  Parameters
  ----------
  X_train : numpy.ndarray, shape (n_samples, n_features)
      Design matrix. No bias column is added here.
  Y_train : numpy.ndarray, shape (n_samples,) or (n_samples, 1)
      0/1 labels.
  l_rate : float
      Base learning rate.
  batch_size : int
      Number of rows per mini-batch.
  epoch_num : int
      Number of passes over the data.
  lambda_2 : float
      L2 penalty strength; it is applied to every weight.

  Returns
  -------
  numpy.ndarray, shape (n_features,)
      The learned weight vector (all zeros if there is no full batch).
  """
  eps = 1e-8  # keeps the Adagrad division finite when a gradient is exactly 0
  w = np.zeros(X_train.shape[1])
  s_grad = np.zeros_like(w)  # running sum of squared gradients, never reset
  step_num = len(X_train) // batch_size
  list_cost = []

  for epoch in range(epoch_num):
    total_loss = 0.0
    X_train, Y_train = _shuffle(X_train, Y_train)

    for idx in range(step_num):
      start = idx * batch_size
      X = X_train[start:start + batch_size]
      # Flatten so labels of shape (n, 1) and (n,) are handled alike.
      Y = Y_train[start:start + batch_size].reshape(-1)

      y = sigmoid(np.dot(X, w))

      cross_entropy = -(np.dot(Y, np.log(y)) + np.dot(1 - Y, np.log(1 - y))) / len(Y)
      total_loss += cross_entropy

      # Gradient of the summed cross entropy plus the L2 penalty term.
      grad = np.dot(X.T, y - Y) + lambda_2 * w

      # Adagrad: scale each coordinate by the root of its accumulated
      # squared gradients.
      s_grad += grad ** 2
      w = w - l_rate * grad / (np.sqrt(s_grad) + eps)

    list_cost.append(total_loss)

  plt.plot(np.arange(len(list_cost)), list_cost)
  plt.title("Train Process")
  plt.xlabel("epoch_num")
  plt.ylabel("Cost Function (Cross Entropy)")
  plt.show()

  return w

In [None]:
# Column names for the header-less Adult files, in file order.
adult_columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
# Options shared by both reads: no header row, explicit names, and the blank
# that follows each comma is dropped.
adult_read_options = dict(header=None, names=adult_columns, skipinitialspace=True)

trainData = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    **adult_read_options
)
# The first line of the test file is skipped.
testData = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
    skiprows=1,
    **adult_read_options
)

In [None]:

# One-hot encoding can yield columns that exist in only one of the two frames
# (the training data has a 'native-country_Holand-Netherlands' column that the
# test data lacks). Keep just the shared columns, in training order, so both
# matrices line up feature by feature instead of dropping a hard-coded name.
train_features = dataProcess_X(trainData)
test_features = dataProcess_X(testData)
shared_columns = [col for col in train_features.columns if col in test_features.columns]

x_train = train_features[shared_columns].values
x_test = test_features[shared_columns].values
y_train = dataProcess_Y(trainData).values
y_ans = dataProcess_Y(testData).values

# Prepend a constant column of ones so the first weight acts as the bias.
x_test = np.concatenate((np.ones((x_test.shape[0], 1)), x_test), axis=1)
x_train = np.concatenate((np.ones((x_train.shape[0], 1)), x_train), axis=1)

# Fix the seed so the split and the batch order are the same on every run.
np.random.seed(0)

valid_set_percentage = 0.1
X_train, Y_train, X_valid, Y_valid = split_valid_set(x_train, y_train, valid_set_percentage)

print('Training ...')
w_train = train(X_train, Y_train)
valid(X_train, Y_train, w_train)

# The held-out split gives an accuracy estimate on rows not used for fitting.
print('Validating ...')
valid(X_valid, Y_valid, w_train)

print('Testing ...')
y_ = valid(x_test, y_ans, w_train)
