# Boston Housing

In [1]:
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
ds = datasets.load_boston()
print(ds.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [3]:
import pandas as pd
X=pd.DataFrame(ds.data, columns=ds.feature_names)
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [4]:
y =ds.target
X.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
dtype: int64

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [6]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((379, 13), (127, 13), (379,), (127,))

In [7]:
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [8]:
X_train = scaler.transform(X_train) #transform 轉換

In [9]:
X_test = scaler.transform(X_test)

In [10]:
lr = LinearRegression(normalize=True)

In [11]:
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [12]:
lr.intercept_

22.457519788918248

In [13]:
lr.coef_

array([-0.71826994,  1.11895458,  0.20948069,  0.87221605, -2.44487824,
        2.24290763,  0.16646051, -3.2930857 ,  2.67745005, -2.00906082,
       -2.18534015,  0.71104233, -4.01297435])

In [14]:
y_pred = lr.predict(X_test)

In [15]:
lr.score(X_test, y_test)

0.8112643498911201

In [16]:
import numpy as np
np.argsort(abs(lr.coef_))

array([ 6,  2, 11,  0,  3,  1,  9, 10,  5,  4,  8,  7, 12], dtype=int64)

In [17]:
y_pred = lr.predict(X_test[0:3])
y_pred

array([20.53660952, 27.05957048, 24.14121233])

In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mean_squared_error(y_test, lr.predict(X_test))

15.533613337409037

In [19]:
r2_score(y_test, lr.predict(X_test))

0.8112643498911201