In [0]:
import pandas as pd
import numpy as np
from random import shuffle
from numpy.linalg import inv
from math import floor, log

pd.options.display.max_rows = 10


In [0]:
def dataProcess_X(rawData):
  
  # 先去除掉 sex，如果有 income 就去除掉， 没有就算了
  # 因为 sex 是伯努利分布， 就只将 男女 变成 0，1 就好
  if 'income' in rawData.columns:
    Data = rawData.drop(['sex', 'income'], axis=1)
  else:
    Data = rawData.drop(['sex'], axis=1)
    
  # 将 object 的列先获取出来， ‘object’ 就是字符串的意思。 一般来说这是类别属性， 离散随机变量
  listObjectColumn = [col for col in Data.columns if Data[col].dtypes == 'object']
  # 将非 object 的列获取出来， 那就是数字类型的。 连续型的随机变量
  listNonObjectColumn = [x for x in list(Data) if x not in listObjectColumn]
  
  # 将数据进行切分， 把连续型的数据和非连续性的数据切分开来
  ObjectData = Data[listObjectColumn]
  NonObjectData = Data[listNonObjectColumn]
  
  # 将之前的 sex 列 变成 01 插入
  NonObjectData.insert(0, 'sex', (rawData['sex'] == 'Female').astype(np.int))
  
  # 将离散随机变量独热编码
  ObjectData = pd.get_dummies(ObjectData)
  
  # 再将内容组合到一起
  Data = pd.concat([NonObjectData, ObjectData], axis=1)
  Data_x = Data.astype('int64')
  
  # 中心化内容(标准化)
  Data_x = (Data_x - Data_x.mean()) / Data_x.std()
  
  return Data_x

In [0]:
def dataProcess_Y(rawData):
  df_y = rawData['income']
  Data_y = pd.DataFrame((df_y == '>50K').astype('int64'), columns=['income'])
  return Data_y

def sigmoid(z):
  res = 1 / (1.0 + np.exp(-z))
  # clip 的 目的是不让极端情况出现
  return np.clip(res, 1e-8, (1-(1e-8)))

# 将顺序打乱
def _shuffle(X, Y):
  randomize = np.arange(X.shape[0])
  np.random.shuffle(randomize)
  return (X[randomize], Y[randomize])

def split_valid_set(X, Y, percentage):
  all_size = X.shape[0]
  valid_size = int(floor(all_size * percentage))

  # 先打乱顺序， 再切分训练集和测试集
  X, Y = _shuffle(X, Y)
  X_valid, Y_valid = X[: valid_size], Y[: valid_size]
  X_train, Y_train = X[valid_size: ], Y[valid_size: ]

  return X_train, Y_train, X_valid, Y_valid

In [0]:
def valid(X, Y, mu1, mu2, shared_sigma, N1, N2):
  
  # 矩阵求逆是很耗时间的， 这里可有别的办法吗？
  # 哈哈， 告诉你答案吧， 因为协方差矩阵是对称矩阵
  # 所以求逆的好方法是使用 特征值分解，然后对对角矩阵求逆
  # 最后再生成逆矩阵
  #
  # 这里有特征值分解的视频（联系我要优惠券哦，相当于免费）：
  # https://edu.csdn.net/course/detail/24474
  sigma_inv = inv(shared_sigma)
  
  # 下面的这一段是推导出的内容，实际的可以看上课的笔记
  # 利用贝叶斯分类器
  w = np.dot((mu1-mu2), sigma_inv)
  X_t = X.T
  b = (-0.5) * np.dot(np.dot(mu1.T, sigma_inv), mu1) + (0.5) * np.dot(np.dot(mu2.T, sigma_inv), mu2) + np.log(float(N1)/N2)
  a = np.dot(w, X_t) + b
  y = sigmoid(a)
  
  # around 相当于是四舍五入
  #>>> np.around([0.37, 1.64])
  #array([ 0.,  2.])
  y_ = np.around(y)
  
  # 将Y拉伸成向量
  # >>> x = np.array([[[0], [1], [2]]])
  # >>> x.shape
  #(1, 3, 1)
  #
  result = (np.squeeze(Y) == y_)
  
  print('acc = %f' % (float(result.sum()) / result.shape[0]))
  
  return

    

In [0]:
def train(X_train, Y_train):
  
  train_data_size = X_train.shape[0]
  
  cnt1 = 0
  cnt2 = 0
  
  mu1 = np.zeros((106,))
  mu2 = np.zeros((106,))

  # 将所有的数据一行一行的加到 mu 上
  # 然后除以个数就是平均值
  for i in range(train_data_size):
    if Y_train[i] == 1:     # >50k
      mu1 += X_train[i]
      cnt1 += 1
    else:
      mu2 += X_train[i]
      cnt2 += 1
  mu1 /= cnt1
  mu2 /= cnt2
  # 你可以将打印打开看下， mu 是什么数据
  #   print(mu1)
  
  sigma1 = np.zeros((106, 106))
  sigma2 = np.zeros((106, 106))
  
  # 这里属于是求解协方差矩阵， 能够生成两个sigma，实际
  # 的生产中， 两个 sigma 加权平均是最好的共用的 协方差矩阵，
  # 所以最后会生成一个共享的协方差矩阵。
  for i in range(train_data_size):
    if Y_train[i] == 1:
      sigma1 += np.dot(np.transpose([X_train[i] - mu1]), [X_train[i] - mu1])
    else:
      sigma2 += np.dot(np.transpose([X_train[i] - mu2]), [X_train[i] - mu2])
      
  sigma1 /= cnt1
  sigma2 /= cnt2
  
  # 生成共享的协方差矩阵
  shared_sigma = (float(cnt1) / train_data_size) * sigma1 + (float(cnt2) / train_data_size) * sigma2
  
  N1 = cnt1
  N2 = cnt2
  
  return mu1, mu2, shared_sigma, N1, N2

In [0]:
trainData = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", header=None, names=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'], skipinitialspace=True)
testData = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", header=None, names=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'], skipinitialspace=True, skiprows=1)
trainData

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [0]:

#here is one more attribute in trainData
x_train = dataProcess_X(trainData).drop(['native-country_Holand-Netherlands'], axis=1).values
x_test = dataProcess_X(testData).values
y_train = dataProcess_Y(trainData).values
y_ans = dataProcess_Y(testData).values

vaild_set_percetange = 0.1

X_train, Y_train, X_valid, Y_valid = split_valid_set(x_train, y_train, vaild_set_percetange)
mu1, mu2, shared_sigma, N1, N2 = train(X_train, Y_train)
print('Train ...')
valid(X_valid, Y_valid, mu1, mu2, shared_sigma, N1, N2)

print('Test ...')
valid(x_test, y_ans, mu1, mu2, shared_sigma, N1, N2)

Train ...
acc = 0.848587
Test ...
acc = 0.815306
