In [74]:
# 逻辑回归多分类
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import scipy.io as sio

In [75]:
# Load the dataset.
# 15,000 images in total: 70% are used for training and 30% for testing.
# The split is stratified per volunteer: each volunteer's first 7 writing
# rounds go to the training set and the last 3 rounds go to the test set.
N_VOLUNTEERS = 100    # i in the file name: volunteer index
N_ROUNDS = 10         # j in the file name: writing-round index
N_TRAIN_ROUNDS = 7    # rounds 1..7 -> train, rounds 8..10 -> test
N_CLASSES = 15        # k in the file name: class label of the character (1-based)
N_PIXELS = 64 * 64    # every image is flattened into one row of this length

# Array sizes are derived from the counts above so they cannot drift apart.
n_train = N_VOLUNTEERS * N_TRAIN_ROUNDS * N_CLASSES
n_test = N_VOLUNTEERS * (N_ROUNDS - N_TRAIN_ROUNDS) * N_CLASSES

train_X = np.zeros((n_train, N_PIXELS))
train_Y = np.zeros((n_train, N_CLASSES))   # one-hot labels
test_X = np.zeros((n_test, N_PIXELS))
test_Y = np.zeros((n_test, N_CLASSES))     # one-hot labels

train_id = 0
test_id = 0
# i = volunteer index, j = writing-round index, k = class label of the written
# character; see the csv for details.
for i in range(1, N_VOLUNTEERS + 1):
    for j in range(1, N_ROUNDS + 1):
        for k in range(1, N_CLASSES + 1):
            img = mpimg.imread(f'./data/input_{i}_{j}_{k}.jpg')
            # NOTE(review): assumes every file decodes to exactly 64*64 values
            # (single channel); np.reshape raises otherwise.
            img = np.reshape(img, N_PIXELS)
            if j <= N_TRAIN_ROUNDS:
                train_X[train_id] = img
                train_Y[train_id][k-1] = 1
                train_id += 1
            else:
                test_X[test_id] = img
                test_Y[test_id][k-1] = 1
                test_id += 1

In [76]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    Uses the identity sigmoid(z) = exp(-log(1 + exp(-z))) and lets
    np.logaddexp compute log(exp(0) + exp(-z)). The naive formula overflows
    in np.exp(-z) for large negative z and emits a RuntimeWarning.

    Parameters:
        z: scalar or array-like of logits.

    Returns:
        Values in [0, 1] with the same shape as z.
    """
    return np.exp(-np.logaddexp(0, -z))

def costFunction(theta,X,y,lamda):
    """L2-regularized logistic-regression cost (mean cross-entropy + penalty).

    The log-likelihood terms are computed directly from the logits:
        log(sigmoid(z))     = -logaddexp(0, -z)
        log(1 - sigmoid(z)) = -logaddexp(0,  z)
    Computing sigmoid first and then taking np.log yields log(0) = -inf once
    the activation saturates, and 0 * -inf turns the whole cost into nan.

    Parameters:
        theta: 1-D parameter vector; theta[0] is the bias and is not penalized.
        X: design matrix with one row per sample (first column is the bias column).
        y: 0/1 (or boolean) target per sample.
        lamda: regularization strength.

    Returns:
        Scalar cost.
    """
    m = len(X)
    z = X @ theta

    log_h = -np.logaddexp(0, -z)          # log(sigmoid(z))
    log_one_minus_h = -np.logaddexp(0, z)  # log(1 - sigmoid(z))

    # The bias term theta[0] is excluded from the penalty.
    reg = theta[1:] @ theta[1:] * (lamda / (2 * m))
    return -np.sum(y * log_h + (1 - y) * log_one_minus_h) / m + reg

def gradient_reg(theta,X,y,lamda):
    """Gradient of the L2-regularized logistic-regression cost w.r.t. theta.

    Parameters:
        theta: 1-D parameter vector; theta[0] is the bias and is not penalized.
        X: design matrix with one row per sample.
        y: 0/1 (or boolean) target per sample.
        lamda: regularization strength.

    Returns:
        Array with the same length as theta.
    """
    m = len(X)

    # Data term: X^T (h - y) / m
    residual = sigmoid(X @ theta) - y
    data_grad = (X.T @ residual) / m

    # Penalty term: (lamda / m) * theta with a zero prepended for the bias slot.
    penalty = np.insert(theta[1:] * (lamda / m), 0, values=0, axis=0)

    return data_grad + penalty

In [77]:
# Training features, and integer class labels decoded from the one-hot rows.
# NOTE(review): argmax yields 0-based labels (0..14), while one_vs_all fits
# classifiers for y == 1..K and predict returns argmax + 1. Samples with label 0
# therefore never get a classifier of their own. Shifting the labels to 1..K
# would have to be done here and for the test labels together.
X = train_X
y = train_Y.argmax(axis=1)

In [78]:
# Prepend a column of ones (bias / intercept term) to the training features.
# Built from train_X rather than from X itself, so re-running this cell does
# not keep stacking extra columns onto X.
X = np.insert(train_X, 0, values=1, axis=1)
X.shape

(10500, 4097)

In [79]:
# Sanity check: there should be one integer label per training row.
y.shape

(10500,)

In [80]:
from scipy.optimize import minimize

def one_vs_all(X,y,lamda,k):
    """Fit k one-vs-rest regularized logistic-regression classifiers.

    Classifier i (for i = 1..k) is trained on the binary target (y == i), so
    labels are expected to take values in 1..k; samples with any other label
    are negatives for every classifier.

    Parameters:
        X: design matrix, one row per sample (bias column included).
        y: integer class label per sample.
        lamda: regularization strength forwarded to costFunction / gradient_reg.
        k: number of classes, i.e. number of classifiers to fit.

    Returns:
        Array of shape (k, X.shape[1]); row i-1 holds the parameters of the
        classifier for label i.
    """
    n=X.shape[1]

    # Use the parameter k: reading a module-level K here would make the
    # function depend on whatever global happens to exist at call time.
    theta_all=np.zeros((k,n))

    for i in range(1,k+1):
        theta_i=np.zeros(n,)

        res = minimize(fun=costFunction,
                       x0=theta_i,
                       args=(X,y==i,lamda),
                       method='TNC',
                       jac=gradient_reg)
        theta_all[i-1,:]=res.x

    return theta_all

In [81]:
# Regularization strength handed to one_vs_all.
# NOTE(review): unusually large value; presumably compensates for the pixel
# features not being rescaled before training — confirm.
lamda=600000
# Number of classes; matches the 15 one-hot columns of train_Y.
K=15
# Fit one classifier per class; the result has one row per class and one
# column per feature (bias column included).
theta_final=one_vs_all(X,y,lamda,K)
theta_final

array([[ 1.78876549e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -3.39178327e-07, -3.09132528e-07, -1.04139781e-07],
       [-3.76458598e-01,  0.00000000e+00,  0.00000000e+00, ...,
         2.02881918e-06,  4.18962802e-07, -5.10252787e-06],
       [-1.15687811e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -4.36517818e-06, -4.65821755e-06, -2.78160902e-06],
       ...,
       [-2.84940232e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -4.21979235e-08, -4.82040371e-07, -4.45446508e-06],
       [-3.69056157e+00,  0.00000000e+00,  0.00000000e+00, ...,
         4.69806793e-06,  5.03359914e-06,  2.62859568e-06],
       [-2.09456603e+01,  0.00000000e+00,  0.00000000e+00, ...,
        -5.77053092e-14,  1.36648802e-13, -4.23202177e-14]])

In [82]:
def predict(X,theta_final):
    """Predict a class label for every row of X.

    Parameters:
        X: design matrix, one row per sample (bias column included).
        theta_final: parameter matrix with one row per class.

    Returns:
        1-D integer array; each entry is the 1-based index of the class whose
        classifier produced the highest sigmoid activation for that row.
    """
    class_scores = sigmoid(X @ theta_final.T)
    # argmax is a 0-based row index of theta_final; labels are reported 1-based.
    return 1 + np.argmax(class_scores, axis=1)

In [83]:
# Accuracy on the training set: fraction of rows whose predicted label equals y.
# NOTE(review): predict returns values in 1..K while y comes from argmax
# (0..K-1), so rows with label 0 can never be counted as correct.
y_pred = predict(X, theta_final)
acc = (y_pred == y).mean()
print("训练集验证的准确度为：")
acc

训练集验证的准确度为：


0.548952380952381

In [84]:
# Prepare the test set the same way as the training set: prepend the bias
# column and decode the one-hot labels into integer labels (0-based argmax).
# Both steps overwrite their inputs, so each is guarded to keep the cell
# idempotent: an unguarded re-run would add a second bias column and call
# argmax(axis=1) on an already 1-D label vector, which raises.
if test_X.shape[1] == train_X.shape[1]:   # bias column not added yet
    test_X = np.insert(test_X,0,values=1,axis=1)
if test_Y.ndim == 2:                      # still one-hot encoded
    test_Y = np.argmax(test_Y, axis=1)

In [85]:
# Accuracy on the held-out test set: fraction of rows whose predicted label
# equals the decoded test label.
# NOTE(review): predict returns values in 1..K while test_Y comes from argmax
# (0..K-1), so rows with label 0 can never be counted as correct.
y_pred = predict(test_X, theta_final)
acc = (y_pred == test_Y).mean()
print("测试集验证的准确度为：")
acc

测试集验证的准确度为：


0.44622222222222224