In [1]:
import random
import numpy as np
import pandas as pd

In [2]:
class LogisticRegression_L1_PCD:
    """
    desc: binary logistic regression with an L1 penalty, fitted by proximal
          coordinate descent. Each iteration updates a single coefficient j:

              g_j     = mean over the batch of (sigmoid(x . theta) - y) * x_j
              z       = theta_j - g_j / L
              theta_j = soft_threshold(z, lamb / L)
    """

    class Model:
        """
        desc: the trained model
        """
        def __init__(self, theta):
            """
            desc: build a model from the coefficient vector theta
            """
            self.theta = theta

        def predict(self, x_test):
            """
            desc: predict 0/1 labels for the given feature rows
            """
            # Probability that each row belongs to class 1.
            prob = LogisticRegression_L1_PCD.sigmoid(np.dot(x_test, self.theta))
            # np.round maps p > 0.5 to 1 and p <= 0.5 to 0 (0.5 rounds to even).
            return np.round(prob)

    def __init__(self, alpha, theta, valve, lamb, L, rand, batch_n=-1, max_iter=10**3):
        """
        :param alpha: learning rate; stored for interface compatibility but not
                      read by the update rule, whose step size is 1 / L
        :param theta: initial coefficient vector; a private float copy is kept,
                      so the caller's array is never modified
        :param valve: threshold; training stops once the absolute gradient step
                      |g_j / L| of the coordinate just updated falls below it
        :param lamb: L1 regularisation strength
        :param L: divisor applied to the coordinate gradient and to lamb
                  (the step is g_j / L, the shrinkage is lamb / L)
        :param rand: if true the next coordinate is drawn at random, otherwise
                     coordinates are visited cyclically
        :param batch_n: number of samples per gradient evaluation; a value <= 0
                        or larger than the data set means "use every sample"
        :param max_iter: maximum number of coordinate updates
        """
        self.alpha = alpha
        # Copy as float: updating the caller's array in place would silently
        # overwrite it, and an integer array would truncate every update.
        self.theta = np.array(theta, dtype=float)
        self.valve = valve
        self.lamb = lamb
        self.L = L
        self.rand = rand
        self.batch_n = batch_n
        self.max_iter = max_iter

    def fit(self, x_data, y_data, coor):
        """
        Run coordinate descent starting from the current self.theta.

        :param x_data: feature matrix, one row per sample
        :param y_data: 0/1 labels, one per sample
        :param coor: index of the first coordinate to update, an integer in
                     [0, self.theta.shape[0] - 1] (both ends included)
        :return: a Model holding a copy of the fitted coefficients
        :raises ValueError: if x_data contains no samples
        """
        x_data = np.asarray(x_data, dtype=float)
        y_data = np.asarray(y_data, dtype=float).reshape(-1)
        data_len = x_data.shape[0]
        if data_len == 0:
            raise ValueError("x_data must contain at least one sample")

        # Resolved per call and NOT written back to self.batch_n, so that a
        # "-1 = all samples" setting still means all samples on the next fit.
        batch_n = self.__resolve_batch(data_len)
        n_coords = self.theta.shape[0]

        iter_cnt = 0
        # The budget is checked before the update so that at most max_iter
        # coordinate updates are applied.
        while iter_cnt < self.max_iter:
            keep_going = self.__coordinate_func(x_data, y_data, coor, batch_n)
            iter_cnt += 1
            if not keep_going:
                break
            if self.rand:
                coor = random.randint(0, n_coords - 1)
            elif coor < n_coords - 1:
                coor += 1
            else:
                coor = 0
        # Hand out a copy so a later fit() cannot change a returned model.
        return self.Model(self.theta.copy())

    def __resolve_batch(self, data_len):
        """Return the number of samples to use for a data set of data_len rows."""
        if self.batch_n <= 0 or self.batch_n > data_len:
            return data_len
        return self.batch_n

    def __coordinate_func(self, x_data, y_data, coor, batch_n=None):
        """
        Apply one proximal update to self.theta[coor].

        :param x_data: feature matrix (numpy array, one row per sample)
        :param y_data: label vector (numpy array)
        :param coor: index of the coefficient to update
        :param batch_n: requested batch size; resolved from self.batch_n if None
        :return: True while the absolute gradient step is still >= self.valve
        """
        data_len = x_data.shape[0]
        if batch_n is None:
            batch_n = self.__resolve_batch(data_len)

        # Evenly strided subsample of roughly batch_n rows.
        stride = max(1, round(data_len / batch_n))
        rows = np.arange(0, data_len, stride)
        x_batch = x_data[rows]

        # Predicted probabilities h_theta(x_i); flattened so that a
        # column-shaped theta cannot broadcast against the label vector.
        h_theta = np.reshape(
            LogisticRegression_L1_PCD.sigmoid(np.dot(x_batch, self.theta)), -1
        )
        # Mean of (h_theta(x_i) - y_i) * x_ij over the rows actually visited.
        # The stride may select a number of rows different from batch_n, so
        # the average uses the true count.
        j_ = np.mean((h_theta - y_data[rows]) * x_batch[:, coor])

        para_change = j_ / self.L
        z = self.theta[coor] - para_change

        # Soft-thresholding: shrink z towards zero by lamb / L.
        shrink = self.lamb / self.L
        if shrink < z:
            self.theta[coor] = z - shrink
        elif z < -shrink:
            self.theta[coor] = z + shrink
        else:
            self.theta[coor] = 0

        # NOTE(review): this test uses the raw gradient step rather than the
        # realised change of theta[coor], and fit() stops as soon as a single
        # coordinate passes it. Kept as is to preserve the documented meaning
        # of `valve`.
        return np.abs(para_change) >= self.valve

    @staticmethod
    def sigmoid(x):
        """
        Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

        :param x: scalar or array
        :return: scalar for scalar input, otherwise an array of the same shape
        """
        x = np.asarray(x, dtype=float)
        # exp is only ever taken of a non-positive number, so it cannot overflow.
        decay = np.exp(-np.abs(x))
        result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
        # Indexing with () turns a 0-d array back into a scalar and leaves
        # higher-dimensional arrays unchanged.
        return result[()]

#### 用鸢尾花数据集进行验证

In [3]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Stack the features and the label into one array; the label is the last column.
iris_data = np.column_stack(load_iris(return_X_y=True))
# The classifier is binary, so only classes 0 and 1 are kept.
iris_data = iris_data[iris_data[:,-1]<2]
# Shuffle the rows in place with a locally seeded generator: an unseeded
# shuffle would make the split below differ on every run even though
# train_test_split itself is given a fixed random_state.
np.random.default_rng(2).shuffle(iris_data)
# Split into a training part and a 30% test part.
train_x,test_x,train_y,test_y = train_test_split(iris_data[:,:-1],iris_data[:,-1],test_size=0.3,random_state=2)

In [4]:
# Start every coefficient at 1.0, one per feature column.
ini_theta = np.array([1]*train_x.shape[1],dtype=float)
# Draw the starting coordinate from a locally seeded generator so that a
# Restart & Run All reproduces the same fit (cyclic descent depends on where
# it starts) without touching the global `random` state.
coor = random.Random(0).randint(0,ini_theta.shape[0]-1)
l = LogisticRegression_L1_PCD(alpha=0.01,theta=ini_theta,valve=0.00001,lamb=0.01,L=1.5,rand=False,batch_n=-1,max_iter=1000)  # coordinate descent: batch_n=-1 uses every sample
m = l.fit(train_x,train_y,coor)
pred_y = m.predict(test_x)

In [6]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Score the predictions of the hand-written model on the test split.
acc, rec, pre, f1 = (
    metric(test_y, pred_y)
    for metric in (accuracy_score, recall_score, precision_score, f1_score)
)

# Report accuracy, precision, recall and F1 (in that order), four decimals each.
report = (
    ('预测准确率为:{:.4f}', acc),
    ('预测查准率为:{:.4f}', pre),
    ('预测召回率为:{:.4f}', rec),
    ('预测f1-score为:{:.4f}', f1),
)
for template, value in report:
    print(template.format(value))
print('模型beta参数估计为',m.theta)

预测准确率为:1.0000
预测查准率为:1.0000
预测召回率为:1.0000
预测f1-score为:1.0000
模型beta参数估计为 [-1.61742813 -0.46257661  3.5239059   0.56606718]


In [29]:
# Echo the predicted labels currently held in `pred_y` as the cell output.
pred_y

array([1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0.,
       1., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1.])

In [30]:
from sklearn.linear_model import LogisticRegression

# Reference model: scikit-learn's L1-penalised logistic regression (liblinear).
lr_l1 = LogisticRegression(penalty="l1", solver="liblinear", max_iter=1000).fit(train_x, train_y)
pred_y_lr = lr_l1.predict(test_x)

# Score the reference predictions on the same test split.
acc, rec, pre, f1 = (
    metric(test_y, pred_y_lr)
    for metric in (accuracy_score, recall_score, precision_score, f1_score)
)

# Report accuracy, precision, recall and F1 (in that order), four decimals each.
report = (
    ('预测准确率为:{:.4f}', acc),
    ('预测查准率为:{:.4f}', pre),
    ('预测召回率为:{:.4f}', rec),
    ('预测f1-score为:{:.4f}', f1),
)
for template, value in report:
    print(template.format(value))
print('采用sklearn模型系数为',lr_l1.coef_)

预测准确率为:1.0000
预测查准率为:1.0000
预测召回率为:1.0000
预测f1-score为:1.0000
采用sklearn模型系数为 [[ 0.         -2.26425929  2.56759734  0.        ]]


#### 用第五次作业数据进行验证

In [7]:
from sklearn.model_selection import train_test_split


def _split_label_first(frame):
    """Return (features, labels) for a frame whose first column is the label."""
    return frame.iloc[:, 1:].values, np.array(frame.iloc[:, 0])


data = pd.read_csv('data.csv')
# Hold out 20% of the rows for testing, with a fixed seed.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Training data
train_x, train_y = _split_label_first(train_data)
# Test data
test_x, test_y = _split_label_first(test_data)

In [8]:
# Start every coefficient at 1.0, one per feature column.
ini_theta = np.array([1]*train_x.shape[1],dtype=float)
l = LogisticRegression_L1_PCD(alpha=0.01,theta=ini_theta,valve=0.00001,lamb=0.01,L=1,rand=False,batch_n=-1,max_iter=1000)  # coordinate descent: batch_n=-1 uses every sample
# Draw the starting coordinate from a locally seeded generator so that a
# Restart & Run All reproduces the same fit (cyclic descent depends on where
# it starts) without touching the global `random` state.
coor = random.Random(0).randint(0,ini_theta.shape[0]-1)
m = l.fit(train_x,train_y,coor)
pred_y = m.predict(test_x)

In [9]:
# Score the predictions of the hand-written model on the test split.
acc, rec, pre, f1 = (
    metric(test_y, pred_y)
    for metric in (accuracy_score, recall_score, precision_score, f1_score)
)

# Report accuracy, precision, recall and F1 (in that order), four decimals each.
report = (
    ('预测准确率为:{:.4f}', acc),
    ('预测查准率为:{:.4f}', pre),
    ('预测召回率为:{:.4f}', rec),
    ('预测f1-score为:{:.4f}', f1),
)
for template, value in report:
    print(template.format(value))
print('模型系数为',m.theta)

预测准确率为:0.7250
预测查准率为:0.7000
预测召回率为:0.7368
预测f1-score为:0.7179
模型系数为 [ 1.0821291   1.29101239  0.31384235 -0.88241532 -1.86872265]


In [34]:
from sklearn.linear_model import LogisticRegression

# Reference model: scikit-learn's L1-penalised logistic regression (liblinear).
lr_l1 = LogisticRegression(penalty="l1", solver="liblinear", max_iter=1000).fit(train_x, train_y)
pred_y_lr = lr_l1.predict(test_x)

# Score the reference predictions on the same test split.
acc, rec, pre, f1 = (
    metric(test_y, pred_y_lr)
    for metric in (accuracy_score, recall_score, precision_score, f1_score)
)

# Report accuracy, precision, recall and F1 (in that order), four decimals each.
report = (
    ('预测准确率为:{:.4f}', acc),
    ('预测查准率为:{:.4f}', pre),
    ('预测召回率为:{:.4f}', rec),
    ('预测f1-score为:{:.4f}', f1),
)
for template, value in report:
    print(template.format(value))
print('采用sklearn模型系数为',lr_l1.coef_)

预测准确率为:0.7250
预测查准率为:0.6667
预测召回率为:0.8421
预测f1-score为:0.7442
采用sklearn模型系数为 [[ 1.20277134  1.42406339  0.40366944 -1.10677098 -1.98585048]]
