In [30]:
#美国波士顿地区房价数据描述

#从sklearn.datasets导入波士顿房价数据读取器
from sklearn.datasets import load_boston
#从读取房价数据存储在变量boston中
boston=load_boston()
#输出数据描述
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [27]:
#美国波士顿地区房价数据分割

#从sklearn.model_selection导入数据分割器
from sklearn.model_selection import train_test_split

#导入numpy并重命名为np
import numpy as np

X=boston.data
y=boston.target

#随机采样25%的数据构建测试样本，其余作为训练样本
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=33,test_size=0.25)

#分析回归目标值的差异
print("The max target value is",np.max(boston.target))
print("The min target value is",np.min(boston.target))
print("The average target value is",np.mean(boston.target))

The max target value is 50.0
The min target value is 5.0
The average target value is 22.532806324110677


In [3]:
#训练与测试数据标准化处理

#从sklearn.preprocessing导入数据标准化模块
from sklearn.preprocessing import StandardScaler

#分别初始化对特征和目标值的标准化器
ss_X=StandardScaler()
ss_y=StandardScaler()

#分别对训练和测试数据的特征以及目标值进行标准化处理
X_train=ss_X.fit_transform(X_train)
X_test=ss_X.transform(X_test)
y_train=ss_y.fit_transform(y_train.reshape(-1, 1))
y_test=ss_y.transform(y_test.reshape(-1, 1))

In [4]:
#使用线性回归模型SGDRegressor（随机梯度下降线性回归）对美国波士顿地区房价进行预测

#从sklearn.linear_model导入SGDRegressor
from sklearn.linear_model import SGDRegressor
#使用默认配置初始化线性回归器SGDRegressor
sgdr=SGDRegressor()
#使用训练数据进行参数估计
sgdr.fit(X_train,y_train)
#对测试数据进行回归预测
sgdr_y_predict=sgdr.predict(X_test)

  y = column_or_1d(y, warn=True)


In [5]:
#从sklearn.metrics依次导入r2_score、mean_squared_error以及mean_absolute_error用于回归性能的评估
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

#使用r2_score模块，并输出评估结果
print("The R-squared value of SGDRegressor is",r2_score(y_test,sgdr_y_predict))

#使用mean_squared_error模块，并输出评估结果
print('The mean squared error of SGDRegressor is',mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(sgdr_y_predict)))

#使用mean_absolute_error模块，并输出评估结果
print('The mean absolute error of SGDRegressor is',mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(sgdr_y_predict)))

The R-squared value of SGDRegressor is 0.662098094672368
The mean squared error of SGDRegressor is 26.201351174270005
The mean absolute error of SGDRegressor is 3.501662383934226


In [6]:
#径向基核函数配置的支持向量机回归

#从sklearn.svm中导入支持向量机（回归）模型
from sklearn.svm import SVR

#使用径向基核函数配置的支持向量机进行回归训练，并且对测试样本进行预测
rbf_svr=SVR(kernel='rbf')
rbf_svr.fit(X_train,y_train)
rbf_svr_y_predict=rbf_svr.predict(X_test)

  y = column_or_1d(y, warn=True)


In [7]:
#使用R-squared、MSE和MAE指标进行性能评估

print('R-squared value of RBF SVR is',rbf_svr.score(X_test,y_test))
print('The mean squared error of RBF SVR is',mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(rbf_svr_y_predict)))
print('The mean absolute error of RBF SVR is',mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(rbf_svr_y_predict)))

R-squared value of RBF SVR is 0.7559887416340947
The mean squared error of RBF SVR is 18.920948861538715
The mean absolute error of RBF SVR is 2.6067819999501114


In [8]:
#使用据距离加权回归的K近邻回归模型对美国波士顿房价数据进行回归预测

# 从sklearn.neighbors导入KNeighborRegressor(K近邻回归器)
from sklearn.neighbors import KNeighborsRegressor

# 初试化K近邻回归器,并且调整配置,使得预测的方式为根据距离加权回归:weights='distance'
dis_knr = KNeighborsRegressor(weights='distance')
dis_knr.fit(X_train, y_train)
dis_knr_y_predict = dis_knr.predict(X_test)

In [9]:
# 使用R-squared、MSE以及MAE三种指标对根据距离加权回归配置的K近邻模型在测试集上进行性能评估。
print('R-squared value of distance-weighted KNeighorRegression:', dis_knr.score(X_test, y_test))
print('The mean squared error of distance-weighted KNeighorRegression:', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(dis_knr_y_predict)))
print('The mean absoluate error of distance-weighted KNeighorRegression:', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(dis_knr_y_predict)))

R-squared value of distance-weighted KNeighorRegression: 0.7201094821421603
The mean squared error of distance-weighted KNeighorRegression: 21.703073090490353
The mean absoluate error of distance-weighted KNeighorRegression: 2.801125502210876


In [10]:
#使用回归树对美国波士顿房价训练数据进行学习，并对测试数据进行预测

#从sklearn.tree中导入DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor
#使用默认配置初始化DecisionTreeRegressor
dtr=DecisionTreeRegressor()
#用波士顿房价的训练数据构建回归树
dtr.fit(X_train,y_train)
#使用默认配置的单一回归树对测试数据进行预测，并将预测值存储在变量dtr_y_predict中
dtr_y_predict=dtr.predict(X_test)

In [11]:
#对单一回归树模型在美国波士顿房价测试数据上的预测性能进行评估
#使用R-squared、MSE以及MAE指标对默认配置的回归树在测试集上进行性能评估
print('R-squared value error of DecisionTreeRegressor:',dtr.score(X_test,y_test))
print('The mean squared error of DecisionTreeRegressor:',mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(dtr_y_predict)))
print('The mean absolute error of DecisionTreeRegressor:',mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(dtr_y_predict)))

R-squared value error of DecisionTreeRegressor: 0.6794098355181922
The mean squared error of DecisionTreeRegressor: 24.858976377952757
The mean absolute error of DecisionTreeRegressor: 3.2007874015748032


In [12]:
#pca降维处理

from sklearn.decomposition import PCA
#对数据进行标准化处理
X_std=StandardScaler().fit_transform(X)
#实例化PCA
pca = PCA(n_components = 3)
#训练数据
pca.fit(X_std)
#使用PCA的属性查看特征值
pca.singular_values_

array([55.6793095 , 26.93022859, 25.07516773])

In [13]:
#使用PCA的属性查看特征值对应的特征向量
pca.components_

array([[ 0.2509514 , -0.25631454,  0.34667207,  0.00504243,  0.34285231,
        -0.18924257,  0.3136706 , -0.32154387,  0.31979277,  0.33846915,
         0.20494226, -0.20297261,  0.30975984],
       [-0.31525237, -0.3233129 ,  0.11249291,  0.45482914,  0.21911553,
         0.14933154,  0.31197778, -0.34907   , -0.27152094, -0.23945365,
        -0.30589695,  0.23855944, -0.07432203],
       [ 0.24656649,  0.29585782, -0.01594592,  0.28978082,  0.12096411,
         0.59396117, -0.01767481, -0.04973627,  0.28725483,  0.22074447,
        -0.32344627, -0.3001459 , -0.26700025]])

In [36]:
#对原始的数据集进行转换
new_X = X.dot(pca.components_.T)
new_X

array([[ 38.89018107,  32.93532391, -51.87396066],
       [ 33.02343232,  54.79866941, -71.20799688],
       [ 26.53873512,  48.76840918, -67.85363879],
       ...,
       [ 49.08729488,  52.32240905, -64.55434463],
       [ 49.49001494,  50.78673003, -63.82014701],
       [ 46.64886906,  48.71888423, -65.55410705]])

In [15]:
#随机采样25%的数据构建测试样本，其余作为训练样本
X_train,X_test,y_train,y_test=train_test_split(new_X,y,random_state=33,test_size=0.25)
#分析回归目标值的差异
print("The max target value is",np.max(boston.target))
print("The min target value is",np.min(boston.target))
print("The average target value is",np.mean(boston.target))

The max target value is 50.0
The min target value is 5.0
The average target value is 22.532806324110677


In [16]:
#训练与测试数据标准化处理

#从sklearn.preprocessing导入数据标准化模块
from sklearn.preprocessing import StandardScaler

#分别初始化对特征和目标值的标准化器
ss_X=StandardScaler()
ss_y=StandardScaler()

#分别对训练和测试数据的特征以及目标值进行标准化处理
X_train=ss_X.fit_transform(X_train)
X_test=ss_X.transform(X_test)
y_train=ss_y.fit_transform(y_train.reshape(-1, 1))
y_test=ss_y.transform(y_test.reshape(-1, 1))

In [17]:
#使用线性回归模型SGDRegressor（随机梯度下降线性回归）对美国波士顿地区房价进行预测

#从sklearn.linear_model导入SGDRegressor
from sklearn.linear_model import SGDRegressor
#使用默认配置初始化线性回归器SGDRegressor
sgdr=SGDRegressor()
#使用训练数据进行参数估计
sgdr.fit(X_train,y_train)
#对测试数据进行回归预测
sgdr_y_predict=sgdr.predict(X_test)

  y = column_or_1d(y, warn=True)


In [18]:
#从sklearn.metrics依次导入r2_score、mean_squared_error以及mean_absolute_error用于回归性能的评估
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

#使用r2_score模块，并输出评估结果
print("The R-squared value of SGDRegressor is",r2_score(y_test,sgdr_y_predict))

#使用mean_squared_error模块，并输出评估结果
print('The mean squared error of SGDRegressor is',mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(sgdr_y_predict)))

#使用mean_absolute_error模块，并输出评估结果
print('The mean absolute error of SGDRegressor is',mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(sgdr_y_predict)))

The R-squared value of SGDRegressor is 0.3799652737632211
The mean squared error of SGDRegressor is 48.078295346160374
The mean absolute error of SGDRegressor is 4.977813019570107


In [19]:
#径向基核函数配置的支持向量机回归

#从sklearn.svm中导入支持向量机（回归）模型
from sklearn.svm import SVR

#使用径向基核函数配置的支持向量机进行回归训练，并且对测试样本进行预测
rbf_svr=SVR(kernel='rbf')
rbf_svr.fit(X_train,y_train)
rbf_svr_y_predict=rbf_svr.predict(X_test)

  y = column_or_1d(y, warn=True)


In [20]:
#使用R-squared、MSE和MAE指标进行性能评估

print('R-squared value of RBF SVR is',rbf_svr.score(X_test,y_test))
print('The mean squared error of RBF SVR is',mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(rbf_svr_y_predict)))
print('The mean absolute error of RBF SVR is',mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(rbf_svr_y_predict)))

R-squared value of RBF SVR is 0.42949220951690764
The mean squared error of RBF SVR is 44.237912632101356
The mean absolute error of RBF SVR is 4.416075229362109


In [21]:
#使用据距离加权回归的K近邻回归模型对美国波士顿房价数据进行回归预测

# 从sklearn.neighbors导入KNeighborRegressor(K近邻回归器)
from sklearn.neighbors import KNeighborsRegressor

# 初试化K近邻回归器,并且调整配置,使得预测的方式为根据距离加权回归:weights='distance'
dis_knr = KNeighborsRegressor(weights='distance')
dis_knr.fit(X_train, y_train)
dis_knr_y_predict = dis_knr.predict(X_test)

In [22]:
# 使用R-squared、MSE以及MAE三种指标对根据距离加权回归配置的K近邻模型在测试集上进行性能评估。
print('R-squared value of distance-weighted KNeighorRegression:', dis_knr.score(X_test, y_test))
print('The mean squared error of distance-weighted KNeighorRegression:', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(dis_knr_y_predict)))
print('The mean absoluate error of distance-weighted KNeighorRegression:', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(dis_knr_y_predict)))

R-squared value of distance-weighted KNeighorRegression: 0.6087503556174598
The mean squared error of distance-weighted KNeighorRegression: 30.338003922574796
The mean absoluate error of distance-weighted KNeighorRegression: 3.813225414372116


In [23]:
#使用回归树对美国波士顿房价训练数据进行学习，并对测试数据进行预测

#从sklearn.tree中导入DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor
#使用默认配置初始化DecisionTreeRegressor
dtr=DecisionTreeRegressor()
#用波士顿房价的训练数据构建回归树
dtr.fit(X_train,y_train)
#使用默认配置的单一回归树对测试数据进行预测，并将预测值存储在变量dtr_y_predict中
dtr_y_predict=dtr.predict(X_test)

In [24]:
#对单一回归树模型在美国波士顿房价测试数据上的预测性能进行评估
#使用R-squared、MSE以及MAE指标对默认配置的回归树在测试集上进行性能评估
print('R-squared value error of DecisionTreeRegressor:',dtr.score(X_test,y_test))
print('The mean squared error of DecisionTreeRegressor:',mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(dtr_y_predict)))
print('The mean absolute error of DecisionTreeRegressor:',mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(dtr_y_predict)))

R-squared value error of DecisionTreeRegressor: 0.09427076606071061
The mean squared error of DecisionTreeRegressor: 70.23141732283466
The mean absolute error of DecisionTreeRegressor: 5.629133858267716
