In [19]:
import random
import numpy as np
import pandas as pd

In [71]:
class LogisticRegression_L1:
    
    class Model:
        """
        desc:训练出的模型
        """
        def __init__(self,theta):
            """
            desc:根据传入的 θ 构建模型 
            """
            self.theta = theta
        def predict(self,x_test):
            """
            desc:根据传入的特征,利用模型预测数据
            """
            t = np.dot(x_test, self.theta)
            # 得出类别为 1 的概率
            result = LogisticRegression_L1.sigmoid(t)
            # 将所有数据取整  p > 0.5 return: 1 , p <= 0.5 return: 0
            return np.round(result)
    
    def __init__(self,alpha,theta,valve,lamb,rand,batch_n=-1,max_iter=10**3):
        """
        :param: alpha: 学习率,也称梯度下降中每一步的步长
        :param: theta: 初始化的 θ 向量
        :param: valve: 阀值,训练过程中,如梯度值的绝对值小于阀值就会跳出训练
        :param: lamb: 正则化超参数（注意带默认值的参数要放在不带默认值参数的后面）
        :param: rand: 坐标下降的过程中是否采用随机选择坐标下降。若为False则为循环坐标下降
        :param: batch_n: 取一个整数表示微梯度下降中采用的样本数量,-1表示每次迭代采用全部数据
        :param: max_iter: 最大的迭代次数,超过时自动跳出训练
        """
        self.alpha = alpha
        self.theta = theta
        self.valve = valve
        self.lamb = lamb
        self.rand = rand
        self.batch_n = batch_n
        self.max_iter = max_iter
        
    
    def fit(self,x_data,y_data,coor):
        """
        :param x_data: 特征值
        :param y_data: 标签值
        :param coor: 初始的迭代坐标轴编号。取值为(0,self.theta.shape[0]-1)范围内任一整数（前后均包含）
        """
        iter_cnt = 0  #初始化迭代次数为0
        data_len = x_data.shape[0]  #样本量n
        # 验证样参数是否满足要求,并使其满足要求
        self.batch_n = data_len if self.batch_n <= 0 or self.batch_n > data_len else self.batch_n
        # 开始迭代
        while self.__coordinate_func(x_data,y_data,coor) and iter_cnt < self.max_iter:
            if self.rand == True:
                coor = random.randint(0,self.theta.shape[0]-1)
            else:
                if coor < self.theta.shape[0]-1:
                    coor += 1
                else:
                    coor = 0
           
            iter_cnt += 1  #可以用loss_list记录下损失
        return self.Model(self.theta)
    
    def __coordinate_func(self,x_data,y_data,coor):
        """
        :param x_data: 特征值
        :param y_data: 标签值
        """
        data_len = x_data.shape[0]  #样本量n
        fetature_len = self.theta.shape[0]  #特征数p

        j_ = 0
        # 遍历样本数据
        for i in range(0,data_len,round(data_len/self.batch_n)):  #按照批量进行计算
            # 计算θx_1 + θx_2 + ... + θx_n    也就是公式中的(θ^TX)
            t1 = np.dot(x_data[i], self.theta)
            # 计算预测值 h_θ(x_i)
            h_theta = LogisticRegression_L1.sigmoid(t1)
            # 结果累加起来 (h_θ(x_i) - y_i)x_ij
            j_ += (h_theta-y_data[i])*x_data[i][coor]
        # 除以处理的样本数量,也就是取均值
        j_ /= self.batch_n
        
        # 更新全局 θ 的值,也就是朝着下山的方向走一步
        para_change = self.alpha*j_ + self.lamb*np.sign(self.theta[coor])
        self.theta[coor] = self.theta[coor] - para_change
        # 判断参数theta的改变是否大于阈值
        return para_change >= self.valve
    
    @staticmethod
    def sigmoid(x):
        t = 1 + np.exp(-x)
        result = np.divide(1,t)
        return result

#### 用鸢尾花数据集进行验证

In [69]:
from sklearn.datasets import load_iris
iris_data = np.column_stack(load_iris(return_X_y=True))
# 这里我们实现的只是二分类算法,左右只需要类别0和类别1两种即可
iris_data = iris_data[iris_data[:,-1]<2]
# 将数据打乱
np.random.shuffle(iris_data)
# 将数据分成两份
train_data,test_data= np.split(iris_data,2,axis=0)
# 训练数据
train_x = train_data[:,:-1]
train_y = train_data[:,-1]
# 测试数据
test_x = test_data[:,:-1]
test_y = test_data[:,-1]

In [142]:
ini_theta = np.array([1]*train_x.shape[1],dtype=float)
coor = random.randint(0,ini_theta.shape[0]-1)
l = LogisticRegression_L1(0.01,ini_theta,0.000000001,0.01,rand=False,batch_n=train_x.shape[0],max_iter=1000)  #坐标梯度下降，batch_n应当取全部样本
m = l.fit(train_x,train_y,coor)
pred_y = m.predict(test_x)

0.011219953259276848
[1.         1.         1.         0.98878005]
0
1
0.03461906140287065
[0.96538094 1.         1.         0.98878005]
1
2
0.026659268980284256
[0.96538094 0.97334073 1.         0.98878005]
2
3
0.01729961923872122
[0.96538094 0.97334073 0.98270038 0.98878005]
3
4
0.011219936939311328
[0.96538094 0.97334073 0.98270038 0.97756011]
0
5
0.034618742949287185
[0.9307622  0.97334073 0.98270038 0.97756011]
1
6
0.026659020879313676
[0.9307622  0.94668171 0.98270038 0.97756011]
2
7
0.01729948897351205
[0.9307622  0.94668171 0.96540089 0.97756011]
3
8
0.011219914870090622
[0.9307622  0.94668171 0.96540089 0.96634019]
0
9
0.034618315931772135
[0.89614388 0.94668171 0.96540089 0.96634019]
1
10
0.02665868819944127
[0.89614388 0.92002302 0.96540089 0.96634019]
2
11
0.017299313848195372
[0.89614388 0.92002302 0.94810158 0.96634019]
3
12
0.011219885006016337
[0.89614388 0.92002302 0.94810158 0.95512031]
0
13
0.03461774314786235
[0.86152614 0.92002302 0.94810158 0.95512031]
1
14
0.0266

In [143]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

acc = accuracy_score(test_y,pred_y)
rec = recall_score(test_y,pred_y)
pre = precision_score(test_y,pred_y)
f1 = f1_score(test_y,pred_y)

print('预测准确率为:{:.4f}'.format(acc))
print('预测查准率为:{:.4f}'.format(pre))
print('预测召回率为:{:.4f}'.format(rec))
print('预测f1-score为:{:.4f}'.format(f1))

预测准确率为:0.4800
预测查准率为:0.4800
预测召回率为:1.0000
预测f1-score为:0.6486


In [144]:
pred_y

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

#### 用第五次作业数据进行验证

In [163]:
data = pd.read_csv('data.csv')

from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# 训练数据
train_x = train_data.iloc[:,1:].values 
train_y = np.array(train_data.iloc[:,0])
# 测试数据
test_x = test_data.iloc[:,1:].values
test_y = np.array(test_data.iloc[:,0])

In [161]:
ini_theta = np.array([1]*train_x.shape[1],dtype=float)
l = LogisticRegression_L1(0.01,ini_theta,0.0001,0.01,rand=False,batch_n=-1,max_iter=1000)  #坐标梯度下降，batch_n应当取全部样本
coor = random.randint(0,ini_theta.shape[0]-1)
m = l.fit(train_x,train_y,coor)
pred_y = m.predict(test_x)

0.014230031620933167
[1.         1.         1.         1.         0.98576997]
0
1
0.011283481738944232
[0.98871652 1.         1.         1.         0.98576997]
1
2
0.011807014379692126
[0.98871652 0.98819299 1.         1.         0.98576997]
2
3
0.013229272426984583
[0.98871652 0.98819299 0.98677073 1.         0.98576997]
3
4
0.014172262431050332
[0.98871652 0.98819299 0.98677073 0.98582774 0.98576997]
4
5
0.014224058870231138
[0.98871652 0.98819299 0.98677073 0.98582774 0.97154591]
0
6
0.011278377069678824
[0.97743814 0.98819299 0.98677073 0.98582774 0.97154591]
1
7
0.011801661634477476
[0.97743814 0.97639132 0.98677073 0.98582774 0.97154591]
2
8
0.013221261611824025
[0.97743814 0.97639132 0.97354947 0.98582774 0.97154591]
3
9
0.014165133700036735
[0.97743814 0.97639132 0.97354947 0.9716626  0.97154591]
4
10
0.01421791395332837
[0.97743814 0.97639132 0.97354947 0.9716626  0.957328  ]
0
11
0.01127310701041742
[0.96616503 0.97639132 0.97354947 0.9716626  0.957328  ]
1
12
0.0117961320933

In [162]:
acc = accuracy_score(test_y,pred_y)
rec = recall_score(test_y,pred_y)
pre = precision_score(test_y,pred_y)
f1 = f1_score(test_y,pred_y)

print('预测准确率为:{:.4f}'.format(acc))
print('预测查准率为:{:.4f}'.format(pre))
print('预测召回率为:{:.4f}'.format(rec))
print('预测f1-score为:{:.4f}'.format(f1))

预测准确率为:0.4500
预测查准率为:0.4118
预测召回率为:0.3684
预测f1-score为:0.3889


In [17]:
x = np.array([1]*train_x.shape[1])
coor = random.randint(0,x.shape[0]-1)
coor

3

In [152]:
from sklearn.linear_model import LogisticRegression

In [164]:
lr_l1 = LogisticRegression(penalty="l1", solver="liblinear", max_iter=1000)
lr_l1.fit(train_x,train_y)
pred_y_lr = lr_l1.predict(test_x)

In [165]:
acc = accuracy_score(test_y,pred_y_lr)
rec = recall_score(test_y,pred_y_lr)
pre = precision_score(test_y,pred_y_lr)
f1 = f1_score(test_y,pred_y_lr)

print('预测准确率为:{:.4f}'.format(acc))
print('预测查准率为:{:.4f}'.format(pre))
print('预测召回率为:{:.4f}'.format(rec))
print('预测f1-score为:{:.4f}'.format(f1))

预测准确率为:0.7250
预测查准率为:0.6667
预测召回率为:0.8421
预测f1-score为:0.7442


In [166]:
lr_l1.coef_

array([[ 1.20261266,  1.42430799,  0.40336021, -1.10654598, -1.98588844]])