In [0]:
# ライブラリのインポート
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [0]:
# Load the housing-price dataset.
# The file has no header row and its fields are separated by runs of
# whitespace, so the separator is given as a regular expression. It is written
# as a raw string because '\s' inside a normal string literal is an invalid
# escape sequence (DeprecationWarning on older Python 3, SyntaxWarning from 3.12).
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data',
    header=None,
    sep=r'\s+',
)

# Name the 14 columns; MEDV (the last one) is the house price used as the target.
df.columns=['CRIM','ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

# Shape of the data frame
print('dfの形状', df.shape)

dfの形状 (506, 14)


In [0]:
# Feature matrix: the first 13 columns of the frame, as a NumPy array.
X = df.iloc[:, :13].values
# Target vector: the house price column (MEDV).
y = df.loc[:, 'MEDV'].values

# Hold out 20% of the rows as the test set; random_state=0 fixes the shuffle
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Report the shape of each of the four resulting arrays.
print(
    'X_trainの形状：', X_train.shape,
    ' y_trainの形状：', y_train.shape,
    ' X_testの形状：', X_test.shape,
    ' y_testの形状：', y_test.shape,
)

X_trainの形状： (404, 13)  y_trainの形状： (404,)  X_testの形状： (102, 13)  y_testの形状： (102,)


In [0]:
# Expand the features into degree-2 polynomial terms.
# include_bias=False leaves out the constant (all-ones) column.
POLY = PolynomialFeatures(degree=2, include_bias=False)

# Fit the transformer on the training split only, then apply the same
# expansion to both splits.
POLY.fit(X_train)
X_train_pol = POLY.transform(X_train)
X_test_pol = POLY.transform(X_test)

# Cell output: the shapes of the expanded training and test matrices.
(X_train_pol.shape, X_test_pol.shape)

((404, 104), (102, 104))

In [0]:
# Standardize the polynomial features.
# The scaler is fitted on the training data only ...
sc = StandardScaler().fit(X_train_pol)

# ... and the fitted scaler is then applied to both splits, so the test data
# is scaled with the training statistics.
X_train_std = sc.transform(X_train_pol)
X_test_std = sc.transform(X_test_pol)

# Cell output: the first standardized training row.
X_train_std[0]

array([-0.37257438, -0.49960763, -0.70492455,  3.66450153, -0.42487874,
        0.93567804,  0.69366877, -0.4372179 , -0.16224243, -0.56165616,
       -0.48463784,  0.3716906 , -0.41100022, -0.15828849, -0.40401763,
       -0.39164747,  0.33171589, -0.38055514, -0.35959919, -0.36424113,
       -0.42094632, -0.37099583, -0.37823372, -0.3728625 , -0.30087791,
       -0.34898538, -0.37113272, -0.5143492 , -0.11385718, -0.51929827,
       -0.49074131, -0.53669855, -0.44053652, -0.50213112, -0.4858122 ,
       -0.49819439, -0.49972522, -0.50517267, -0.76007475,  1.61685027,
       -0.7154799 , -0.59913188, -0.46469757, -0.99867195, -0.51405503,
       -0.71221238, -0.73736458, -0.51569491, -0.66108295,  3.66450153,
        3.14216742,  3.85101113,  4.22343713,  2.99684973,  2.36571936,
        2.70949049,  3.59687362,  3.7339312 ,  2.66589749, -0.47445708,
        0.06394559,  0.18922796, -0.615943  , -0.29062094, -0.58350917,
       -0.56510422,  0.05179377, -0.48913402,  0.90550258,  1.06

In [0]:
# Create the linear regression model and train it on the standardized
# training features. fit() returns the estimator itself, so construction and
# training can be chained.
model = LinearRegression().fit(X_train_std, y_train)

# Cell output: the fitted estimator's repr.
model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [0]:
# Compute the mean squared error (MSE) on both splits.
# Predictions from the fitted model for the training and test features.
y_train_pred = model.predict(X_train_std)
y_test_pred = model.predict(X_test_std)

# MSE of each set of predictions against its ground-truth prices.
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)

# A test MSE far above the training MSE would indicate overfitting.
print('MSE train: %.2f, test: %.2f' % (mse_train, mse_test))

MSE train: 4.34, test: 31.28
