# 线性回归

In [59]:
import pandas as pd
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

In [49]:
# 使用datasets.load_diabetes载入糖尿病数据
X, y = datasets.load_diabetes(return_X_y=True)
# 使用train_test_split将训练数据集和测试数据集分开
X_train, X_test,  y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

# 实例化模型
# 一般线性回归模型
lr = linear_model.LinearRegression()
# 使用训练数据（train_X, train_y）拟合模型
lr.fit(X_train, y_train)
# 使用lr_pred变量接收测试数据的预测结果
lr_pred = lr.predict(X_test)

In [82]:
# 岭回归：增加了正则项 L2，防止过拟合。
# linear_model.Ridge 手动设置alpha
# ridge_lr = linear_model.Ridge(alpha=0.5)

# linear_model.Ridge_cv 通过cross-validation, 在列表中选择最优alpha参数。
ridge_lr = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13))
ridge_lr.fit(X_train, y_train)
ridge_lr_pred = ridge_lr.predict(X_test)
ridge_lr.alpha_

  w = ((singvals_sq + alpha) ** -1) - (alpha ** -1)


0.001

In [101]:
# Lasso回归：增加了正则项 L1。
# lasso_lr = linear_model.Lasso(alpha=0.1)

lasso_lr = linear_model.LassoCV(alphas=np.logspace(-6, 6, 13))
lasso_lr.fit(X_train, y_train)
lasso_lr_pred = ridge_lr.predict(X_test)
lasso_lr.alpha_

0.01

In [54]:
# 模型中各个变量的系数
print("回归系数是： \n", lr.coef_)
print("截距是： ", lr.intercept_)
print("均方误差是： ", mean_squared_error(y_test, lr_pred))
print("r2 score: ", r2_score(y_test, lr_pred))

回归系数是： 
 [  -40.4344303   -255.5975599    561.20624771   346.6618945
 -1034.96857344   726.22785318   167.0441535    127.47639878
   830.27619811    51.23966951]
截距是：  153.71773657902813
均方误差是：  3461.6357411723743
r2 score:  0.417982093511879


In [104]:
df_coef = pd.DataFrame()
df_coef['linear'] = np.append(lr.intercept_, lr.coef_)
df_coef['ridge_linear'] = np.append(ridge_lr.intercept_, ridge_lr.coef_)
df_coef['lasso_linear'] = np.append(lasso_lr.intercept_, lasso_lr.coef_)

In [105]:
df_coef

Unnamed: 0,linear,ridge_linear,lasso_linear
0,153.717737,153.715523,153.723669
1,-40.43443,-39.045984,-29.480344
2,-255.59756,-254.150976,-243.153954
3,561.206248,562.008173,566.603069
4,346.661894,346.141859,339.508608
5,-1034.968573,-904.629698,-542.189152
6,726.227853,621.513754,330.606861
7,167.044153,111.103719,-38.164434
8,127.476399,114.344774,75.181117
9,830.276198,779.680089,643.162715


In [106]:
df_metric = pd.DataFrame(index=['linear',
                                'ridge_linear',
                                'lasso_linear'])
df_metric['mean_squared_error'] = [
    mean_squared_error(y_test, lr_pred),
    mean_squared_error(y_test, ridge_lr_pred),
    mean_squared_error(y_test, lasso_lr_pred)
]
df_metric['r2_score'] = [
    r2_score(y_test, lr_pred),
    r2_score(y_test, ridge_lr_pred),
    r2_score(y_test, lasso_lr_pred)
]

In [107]:
df_metric

Unnamed: 0,mean_squared_error,r2_score
linear,3461.635741,0.417982
ridge_linear,3448.12283,0.420254
lasso_linear,3448.12283,0.420254


# 逻辑回归