In [1]:
import random
import numpy as np
import pandas as pd

In [2]:
class LogisticRegression:
    """Binary logistic regression trained by (mini-)batch gradient descent.

    ``fit`` repeatedly updates ``self.theta`` and returns a ``Model`` wrapping
    the final parameter vector. ``self.theta`` is kept between calls, so a
    second ``fit`` continues from the previously learned parameters.
    """

    class Model:
        """Trained model: holds the fitted parameter vector θ."""

        def __init__(self, theta):
            """
            :param theta: parameter vector θ used for prediction
            """
            self.theta = theta

        def predict(self, x_test):
            """
            Predict a 0/1 label for the given features.

            :param x_test: feature values; combined with θ through ``np.dot``
            :return: ``np.round`` of the class-1 probability, i.e. 1.0 when
                     p > 0.5 and 0.0 when p < 0.5 (``np.round`` rounds half to
                     even, so exactly 0.5 becomes 0.0)
            """
            # Probability that each sample belongs to class 1.
            probability = LogisticRegression.sigmoid(np.dot(x_test, self.theta))
            return np.round(probability)

    def __init__(self, alpha, theta, valve, batch_n=-1, max_iter=10**3):
        """
        :param alpha: learning rate, i.e. the step size of each descent step
        :param theta: initial θ vector (1-D, one entry per feature)
        :param valve: threshold; training stops once the gradient magnitude
                      (sum of absolute components) drops below it
        :param batch_n: integer number of samples used for the gradient;
                        a value <= 0 or larger than the data set means
                        "use every sample"
        :param max_iter: maximum number of update steps
        """
        self.alpha = alpha
        self.theta = theta
        self.valve = valve
        self.batch_n = batch_n
        self.max_iter = max_iter

    def fit(self, x_data, y_data):
        """
        Train on the given data and return the resulting ``Model``.

        :param x_data: feature values, one row per sample
        :param y_data: label values (0 or 1), one per sample
        :return: ``Model`` built from the final ``self.theta``
        :raises ValueError: if there are no samples, or the number of labels
                            does not match the number of rows
        """
        x_data = np.asarray(x_data)
        # Flatten so an (n, 1) label column cannot broadcast against (n,).
        y_data = np.asarray(y_data).reshape(-1)
        data_len = x_data.shape[0]
        if data_len == 0:
            raise ValueError("x_data must contain at least one sample")
        if y_data.shape[0] != data_len:
            raise ValueError("x_data and y_data must have the same number of samples")

        # Resolve the batch size locally instead of overwriting self.batch_n,
        # so a later fit on a larger data set is not stuck with a stale size.
        batch_size = int(self.batch_n)
        if batch_size <= 0 or batch_size > data_len:
            batch_size = data_len

        # Exactly batch_size evenly spaced row indices (floor(k * n / m)), so
        # the mean taken in __gradient_func divides by the true sample count.
        batch_rows = np.arange(batch_size) * data_len // batch_size
        x_batch = x_data[batch_rows]
        y_batch = y_data[batch_rows]

        # Check the iteration budget first so that at most max_iter updates run.
        iter_cnt = 0
        while iter_cnt < self.max_iter and self.__gradient_func(x_batch, y_batch):
            iter_cnt += 1
        return self.Model(self.theta)

    def __gradient_func(self, x_data, y_data):
        """
        Perform one gradient-descent update of ``self.theta``.

        Every row passed in contributes to the step; batch selection is done
        by the caller.

        :param x_data: feature values of the samples used for this step
        :param y_data: 1-D label values of the same samples
        :return: True while the gradient magnitude is still >= ``valve``
                 (keep training), False once it has dropped below it
        """
        # h_θ(x_i) for every sample at once.
        predicted = LogisticRegression.sigmoid(np.dot(x_data, self.theta))
        # Mean over samples of (h_θ(x_i) - y_i) * x_ij, for every feature j.
        gradient_vector = np.dot(predicted - y_data, x_data) / x_data.shape[0]

        # Step downhill. Rebinding (not in-place) keeps previously returned
        # Model objects untouched and lets an integer initial theta become float.
        self.theta = self.theta - self.alpha * gradient_vector

        # Use the sum of absolute components: a signed sum lets positive and
        # negative components cancel and would stop training prematurely.
        return np.abs(gradient_vector).sum() >= self.valve

    @staticmethod
    def sigmoid(x):
        """
        Logistic function 1 / (1 + e^(-x)), applied element-wise.

        :param x: scalar or NumPy array
        :return: value(s) of the logistic function
        """
        return 1.0 / (1.0 + np.exp(-x))

In [3]:
from sklearn.datasets import load_iris
# Fixed seed so the shuffle, and therefore the train/test split, is the same
# on every Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

# Stack the features and the labels into one array; the label is the last column.
iris_data = np.column_stack(load_iris(return_X_y=True))
# Only a binary classifier is implemented here, so keep just classes 0 and 1.
iris_data = iris_data[iris_data[:, -1] < 2]
# Shuffle the rows in place with the seeded generator.
rng.shuffle(iris_data)
# Split the rows into two equal halves.
train_data, test_data = np.split(iris_data, 2, axis=0)
# Training data: all columns but the last are features, the last is the label.
train_x = train_data[:, :-1]
train_y = train_data[:, -1]
# Test data, same layout.
test_x = test_data[:, :-1]
test_y = test_data[:, -1]

In [5]:
# Start from an all-ones integer parameter vector, one entry per feature column.
initial_theta = np.ones(train_x.shape[1], dtype=int)

# Configure the trainer, fit it on the training half, then label the test half.
l = LogisticRegression(
    alpha=0.04,
    theta=initial_theta,
    valve=0.0003,
    batch_n=20,
    max_iter=1000,
)
m = l.fit(train_x, train_y)
pred_y = m.predict(test_x)

In [6]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Score the predictions against the true test labels.
acc = accuracy_score(test_y, pred_y)
pre = precision_score(test_y, pred_y)
rec = recall_score(test_y, pred_y)
f1 = f1_score(test_y, pred_y)

# Report accuracy, precision, recall and F1 (in that order) to four decimals.
for template, score in (
    ('预测准确率为:{:.4f}', acc),
    ('预测查准率为:{:.4f}', pre),
    ('预测召回率为:{:.4f}', rec),
    ('预测f1-score为:{:.4f}', f1),
):
    print(template.format(score))

预测准确率为:1.0000
预测查准率为:1.0000
预测召回率为:1.0000
预测f1-score为:1.0000
