# LASSO回归

## 基于Numpy的实现

In [1]:
import numpy as np

### 模型主体

In [2]:
### 定义符号函数
def sign(x):
    """
    输入:
    x: 浮点数值
    输出:
    整数数值
    """
    if x > 0:
        return 1
    elif x < 0:
        return -1
    else:
        return 0
# 对符号函数进行向量化
vec_sign = np.vectorize(sign)

In [18]:
### 定义LASSO回归损失函数
def l1_loss(X, y, w, b, alpha):
    """
    输入:
    X: 输入变量矩阵
    y: 输出标签向量
    w: 变量参数权重矩阵
    b: 偏置
    alpha: 正则化系数
    输出:
    y_hat: 线性模型预测输出
    loss: 均方损失值
    dw: 权重系数一阶偏导
    db: 偏置一阶偏导
    """
    # 训练样本量
    num_train = X.shape[0]
    # 回归模型预测输出
    y_hat = np.dot(X, w) + b
    # L1损失函数
    loss = np.sum((y_hat - y) ** 2) / num_train + alpha * np.sum(abs(w))
    # 基于向量化符号函数的参数梯度计算
    dw = 2 * np.dot(X.T, y_hat - y) / num_train + alpha * vec_sign(w)
    db = 2 * np.sum(y_hat - y) / num_train
    return y_hat, loss, dw, db

### 训练过程

In [4]:
### 初始化模型参数
def initialize_params(dims):
    """
    输入:
    dims: 训练数据变量维度
    输出:
    w: 初始化权重系数值
    b: 初始化偏置系数值
    """
    # 初始化权重系数为零向量
    w = np.zeros((dims, 1))
    # 初始化偏置系数为零
    b = 0
    return w, b

In [5]:
### 定义LASSO回归模型的训练过程
def lasso_train(X, y, learning_rate=0.01, epochs=1000):
    """
    输入:
    X: 输入变量矩阵
    y: 输出标签向量
    learning_rate: 学习率
    epochs: 训练迭代次数
    输出:
    loss_his: 每次迭代的L1损失列表
    params: 优化后的参数字典
    grads: 优化后的参数梯度字典
    """
    # 训练特征数
    num_feature = X.shape[1]
    # 初始化模型参数
    w, b = initialize_params(num_feature)
    # 记录训练损失的空列表
    loss_his = []
    # 迭代训练
    for i in range(1, epochs):
        # 计算当前迭代的预测值、损失和梯度
        y_hat, loss, dw, db = l1_loss(X, y, w, b, 0.1)
        # 基于梯度下降法的参数更新
        w -= learning_rate * dw
        b -= learning_rate * db
        # 记录当前迭代的损失
        loss_his.append(loss)
        # 每300次迭代打印当前损失信息
        if i % 300 == 0:
            print('epoch %d loss %f' % (i, loss))
        # 将当前迭代步优化后的参数保存到字典中
        params = {
            'w': w,
            'b': b
        }
        grads = {
            'dw': dw,
            'db': db
        }
    return loss_his, params, grads

### 数据测试

In [6]:
# 读取示例数据
data = np.genfromtxt('example.dat', delimiter=',')
data.shape

(101, 101)

In [7]:
data[0]

array([-1.14558, -1.29249,  0.84911,  0.36008,  0.26068,  2.51167,
        2.31855,  0.60805,  0.3428 , -0.28903,  0.70398,  1.18534,
       -1.44321, -0.72979, -0.06026, -0.50449, -0.3148 ,  0.94552,
       -0.32453, -0.09248,  0.84448,  0.32551,  0.57684, -0.12461,
       -0.5531 , -1.37074, -0.85719,  0.05802,  0.27486, -0.09269,
        0.28742, -1.29854,  0.66856,  0.02223,  0.28599, -0.28722,
        0.54304,  0.67301, -0.67343,  1.19857,  1.35595,  1.53356,
       -0.61245,  1.91698,  1.89642,  1.28004,  0.60073, -0.37792,
        0.34903,  2.17043, -1.52004, -0.75143, -0.54607, -0.46285,
        0.28424,  0.02458, -2.31083, -0.43165,  0.87581, -0.07552,
        0.66682,  1.65492, -0.71924,  1.39364,  0.5405 ,  0.55335,
        1.00319, -0.76292, -0.26241, -1.32323, -0.15256, -0.30767,
        0.37729,  0.02091,  1.05892, -0.30399,  1.76204, -1.38097,
        0.96642, -0.34674,  0.92688, -1.05426, -0.50192,  0.11954,
        0.97006,  2.01984,  0.00975,  0.65729, -1.01224,  0.93

In [8]:
# 选择特征与标签
x = data[:, 0:100]
y = data[:, 100].reshape(-1, 1)
# 加一列
X = np.column_stack((np.ones((x.shape[0], 1)), x))
# 划分训练集与测试集
X_train, y_train = X[:70], y[:70]
X_test, y_test = X[70:], y[70:]
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(70, 101) (70, 1) (31, 101) (31, 1)


In [19]:
# 执行训练示例
loss_list, params, grads = lasso_train(X_train, y_train, 0.01, 3000)

epoch 300 loss 1.493162
epoch 600 loss 1.442268
epoch 900 loss 1.432396
epoch 1200 loss 1.429992
epoch 1500 loss 1.429493
epoch 1800 loss 1.429318
epoch 2100 loss 1.429034
epoch 2400 loss 1.428779
epoch 2700 loss 1.428903


In [20]:
# 设置NumPy数组显示格式为小数点后3位，且不采用科学计数法
# np.set_printoptions(precision=3, suppress=True)
# 获取训练参数
params

{'w': array([[-9.22702902e-04],
        [-8.25860170e-04],
        [ 6.46744693e-01],
        [ 6.68951202e-01],
        [ 6.17984707e-04],
        [ 1.01942048e+00],
        [-1.66905271e-03],
        [ 8.86360520e-01],
        [-4.07988397e-01],
        [ 3.99613172e-05],
        [ 8.62022139e-04],
        [ 8.43519764e-01],
        [-1.20409623e-04],
        [ 9.08904827e-01],
        [-4.09865995e-02],
        [-2.60022570e-01],
        [-1.62156628e-03],
        [ 6.66290418e-01],
        [ 1.11334411e-03],
        [ 8.56564513e-01],
        [ 3.09814641e-02],
        [ 1.61042865e-03],
        [ 2.15128183e-05],
        [ 5.59260279e-01],
        [-5.59365055e-04],
        [-3.94477432e-02],
        [-9.78579177e-04],
        [-3.78734945e-03],
        [-1.09087999e-03],
        [ 4.71287713e-01],
        [ 4.59012091e-04],
        [ 7.31624751e-04],
        [-4.77603403e-05],
        [ 1.02406516e-03],
        [-6.65114210e-04],
        [-1.28726319e-03],
        [-1.09959981e-0

## 基于sklearn的实现

In [11]:
from sklearn import linear_model

In [12]:
# 创建LASSO回归模型实例
sk_LASSO = linear_model.Lasso(alpha=0.1)
# 对训练集进行拟合
sk_LASSO.fit(X_train, y_train)
# 打印模型相关系数
print("sklearn LASSO intercept:", sk_LASSO.intercept_)
print("sklearn LASSO coefficients:", sk_LASSO.coef_)
print("sklearn LASSO number of iterations:", sk_LASSO.n_iter_)

sklearn LASSO intercept: [-0.23824815]
sklearn LASSO coefficients: [ 0.         -0.          0.59804516  0.64230683  0.          1.0070841
 -0.          0.81815409 -0.22756851  0.          0.          0.793601
  0.          0.74108335 -0.         -0.1250168  -0.          0.79407074
  0.          0.81867433  0.          0.         -0.          0.56664364
 -0.         -0.         -0.         -0.         -0.          0.49526526
  0.          0.          0.          0.         -0.         -0.
 -0.         -0.         -0.         -0.          0.         -0.
  0.         -0.         -0.0078254   0.          0.         -0.
 -0.          0.01986066  0.         -0.          0.         -0.
  0.         -0.06797763  0.24581414  0.         -0.04180909 -0.
  0.10542471  0.03156005  0.          0.          0.         -0.
 -0.          0.         -0.          0.12548825  0.2340209  -0.
  0.          0.16875552  0.          0.01596168  0.         -0.
  0.          0.         -0.          0.20050804 -0