In [1]:
from sklearn.datasets import load_boston

In [2]:
# Load the Boston housing dataset into `data`.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the pinned scikit-learn version before re-running this notebook.
data = load_boston() # type(data) is sklearn.utils.Bunch

* data.data.shape : (506, 13)
* data.target.shape: (506,)
* data.feature_names :array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')
* type(data.data):numpy.ndarray
* type(data.target):numpy.ndarray

In [3]:
# Display the raw feature matrix (NumPy abbreviates the repr of large arrays).
data.data

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [4]:
from sklearn.model_selection import train_test_split
# Feature matrix and regression target taken straight from the loaded dataset.
X, y = data.data, data.target

# Hold out 25% of the samples for testing; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=33
)

* y.shape :(506,)
* y_train.shape:(379,)
* y_test.shape :(127,)

In [5]:
import numpy as np
# Print the maximum, minimum and mean of the regression target.
for label, stat in (
    ("The max target value is : ", np.max),
    ("The min target value is : ", np.min),
    ("The average target value is : ", np.mean),
):
    print(label + str(stat(data.target)))
# The target (house price) values vary widely, so both the features and the
# target are standardised in the following cells.

The max target value is : 50.0
The min target value is : 5.0
The average target value is : 22.532806324110677


In [6]:
from sklearn.preprocessing import StandardScaler
# Standardise the features: the scaler is fitted on the training split only and
# the same fitted scaler is then applied to the test split.
# NOTE: X_train / X_test are overwritten in place, so this cell should be run
# exactly once after the train/test split.
ss_X = StandardScaler()
X_train = ss_X.fit_transform(X_train)
X_test = ss_X.transform(X_test)

# Separate scaler for the target; it is only created here, not fitted.
ss_y = StandardScaler()

In [7]:
# Standardise the target with ss_y: fit on the training targets, then apply the
# same fitted scaler to the test targets. The 1-D targets are reshaped into a
# single column, shape (n_samples, 1), before being passed to the scaler.
y_train=ss_y.fit_transform(y_train.reshape(-1,1))
y_test=ss_y.transform(y_test.reshape(-1,1)) 
# In reshape, -1 is a placeholder for "whatever size fits": NumPy infers that
# dimension from the array's total size and the other given dimensions.
# It is computed deterministically, not assigned at random.

* y_train.shape:(379, 1)
* y_test.shape : (127, 1)
* y.reshape(-1,1).shape : (506, 1)

## 👇线性回归器 LinearRegression

In [8]:
from sklearn.linear_model import LinearRegression 
# Initialise the LinearRegression estimator with its default configuration.
lr=LinearRegression()

In [9]:
# Fit the linear model on the standardised training data and predict the
# (standardised) targets of the test split in one chained call.
lr_y_pred = lr.fit(X_train, y_train).predict(X_test)

## 👇线性回归器 SGDRegressor

In [10]:
from sklearn.linear_model import SGDRegressor

# Initialise the SGD-based linear regressor with explicit stopping criteria
# (not the pure defaults):
#   max_iter     -- upper bound on the number of passes (epochs) over the data.
#   tol          -- stopping tolerance: training stops early once the loss stops
#                   improving by at least `tol` for several consecutive epochs.
#   random_state -- seeds the data shuffling so repeated runs give the same
#                   model; 33 matches the seed used for train_test_split.
sgdr=SGDRegressor(max_iter=1000, tol=1e-3, random_state=33)


In [11]:
# Check the target's shape: after scaling it is a 2-D single-column array.
y_train.shape

(379, 1)

In [12]:
# numpy.ravel returns a contiguous flattened array of shape (a.size,); it is
# used here to pass the single-column target to fit as a 1-D vector.
sgdr.fit(X_train,y_train.ravel()) 
sgdr_y_pred=sgdr.predict(X_test)
# The default number of iterations of SGDRegressor differs between scikit-learn
# versions, which triggers a warning. To silence it, max_iter is set explicitly
# where sgdr is constructed (this notebook uses max_iter=1000); pick a value
# that suits your needs.


### 👇使用三种回归评测机制，以及两种调用R-squared评价模块的方法，对本节模型的回归性能做出评价

In [13]:
# Import r2_score, mean_absolute_error and mean_squared_error from sklearn.metrics
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [14]:
# Evaluate the LinearRegression model on the held-out test split.

# 1) The estimator's built-in score() method.
print("The value of default measurement of LinearRegression is : "+str(lr.score(X=X_test,y=y_test)))

# 2) R-squared computed explicitly with sklearn.metrics.r2_score.
print('The value of R-squared of LinearRegression is : '+str(r2_score(y_true=y_test,y_pred=lr_y_pred)))

# MSE / MAE are reported in the original target units, so the standardisation
# of both the true and the predicted targets is undone first.
y_test_orig = ss_y.inverse_transform(y_test)
lr_y_pred_orig = ss_y.inverse_transform(lr_y_pred)

# 3) Mean squared error.
print('The mean squared error of LinearRegression is : '+str(mean_squared_error(y_true=y_test_orig,
                                                                                y_pred=lr_y_pred_orig)))
# 4) Mean absolute error.
print('The mean absolute error of LinearRegression is : '+str(mean_absolute_error(y_true=y_test_orig,
                                                                                 y_pred=lr_y_pred_orig)))

The value of dafault measurement of LinearRegression is0.675795501452948
The value of R-squared of LinearRegression is : 0.675795501452948
The mean squared error of LinearRegression is : 25.139236520353457
The mean absolute error of LinearRegression is : 3.5325325437053983


* 可以看出前两种是等价的，后续有关回归模型的评价，我们都会采用第一种方式，即回归模型自带的评估模块来完成性能的评估
* 另外也可以看出，尽管三种评价方式 R-squared、MSE、MAE 在具体评估结果上不同，但是在评价总体优劣程度的趋势上是一致的。

In [15]:
# Evaluate the SGDRegressor model on the held-out test split.

# 1) The estimator's built-in score() method.
print('The value of default measurement of SGDRegressor is :'+str(sgdr.score(X_test,y_test)))

# 2) R-squared computed explicitly with sklearn.metrics.r2_score.
print('The value of R-squared of Stochastic Gradient Descent Regressor is : '+str(r2_score(y_true=y_test,y_pred=sgdr_y_pred)))

# MSE / MAE are reported in the original target units, so the standardisation
# is undone first. sgdr was fitted on a flattened (1-D) target, so its
# predictions are 1-D as well; they are reshaped into a single column because
# recent scikit-learn versions reject 1-D input to inverse_transform.
y_test_orig = ss_y.inverse_transform(y_test)
sgdr_y_pred_orig = ss_y.inverse_transform(sgdr_y_pred.reshape(-1, 1))

# 3) Mean squared error.
print('The mean squared error of  Stochastic Gradient Descent Regressor is : '+str(mean_squared_error(y_true=y_test_orig,
                                                                                y_pred=sgdr_y_pred_orig)))
# 4) Mean absolute error.
print('The mean absolute error of  Stochastic Gradient Descent Regressor is : '+str(mean_absolute_error(y_true=y_test_orig,
                                                                                 y_pred=sgdr_y_pred_orig)))

The value of default measurement of SGDRegressor is :0.6690607414549097
The value of R-squared of Stochastic Gradient Descent Regressor is : 0.6690607414549097
The mean squared error of  Stochastic Gradient Descent Regressor is : 25.6614585291697
The mean absolute error of  Stochastic Gradient Descent Regressor is : 3.518428342898226
