# 多項式回帰（特徴量：ALL）の学習→予測→評価


In [None]:
# # To pin the library versions on Colab, uncomment the lines below and run this cell
# !pip install pandas==2.0.3
# !pip install numpy==1.25.2
# !pip install matplotlib==3.7.1
# !pip install scikit-learn==1.2.2

In [1]:
# ライブラリのインポート
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Report the installed version of each library this notebook relies on
import matplotlib
import sklearn

# Same order as before: pandas, numpy, matplotlib, scikit-learn
for module in (pd, np, matplotlib, sklearn):
    print(module.__version__)

2.0.3
1.25.2
3.7.1
1.2.2


In [3]:
# Load the dataset: a whitespace-delimited file with no header row.
# The separator is written as a raw string so that `\s` reaches pandas as a
# regex token instead of being parsed as an (invalid) Python string escape,
# which raises a DeprecationWarning / SyntaxWarning on recent Python versions.
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data', header=None, sep=r'\s+')
# The file carries no column names, so assign them explicitly (MEDV is used as the target below).
df.columns=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [4]:
# Separate the target column (MEDV) from the remaining feature columns
y = df['MEDV']
X = df.drop(columns=['MEDV'])
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33


In [5]:
# Hold out 20% of the rows as the test set (shuffled, with a fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)
# Show the shape of each resulting split
print(
    'X_trainの形状：', X_train.shape,
    ' y_trainの形状：', y_train.shape,
    ' X_testの形状：', X_test.shape,
    ' y_testの形状：', y_test.shape,
)

X_trainの形状： (404, 13)  y_trainの形状： (404,)  X_testの形状： (102, 13)  y_testの形状： (102,)


In [6]:
# Expand the features into degree-2 polynomial terms.
# include_bias=False leaves out the constant (all-ones) column.
poly = PolynomialFeatures(degree=2, include_bias=False)

# Fit the expansion on the training split, then apply the same mapping to the test split
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
print(X_train_poly.shape)
print(X_test_poly.shape)
print(X_train_poly[0]) # polynomial features of the first training row (not yet standardized)

(404, 104)
(102, 104)
[3.58090000e-01 0.00000000e+00 6.20000000e+00 1.00000000e+00
 5.07000000e-01 6.95100000e+00 8.85000000e+01 2.86170000e+00
 8.00000000e+00 3.07000000e+02 1.74000000e+01 3.91700000e+02
 9.71000000e+00 1.28228448e-01 0.00000000e+00 2.22015800e+00
 3.58090000e-01 1.81551630e-01 2.48908359e+00 3.16909650e+01
 1.02474615e+00 2.86472000e+00 1.09933630e+02 6.23076600e+00
 1.40263853e+02 3.47705390e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 3.84400000e+01 6.20000000e+00
 3.14340000e+00 4.30962000e+01 5.48700000e+02 1.77425400e+01
 4.96000000e+01 1.90340000e+03 1.07880000e+02 2.42854000e+03
 6.02020000e+01 1.00000000e+00 5.07000000e-01 6.95100000e+00
 8.85000000e+01 2.86170000e+00 8.00000000e+00 3.07000000e+02
 1.74000000e+01 3.91700000e+02 9.71000000e+00 2.57049000e-01
 3.52415700e+00 4.48695000e+01 1.45088190e+00 4.05600000e+00
 1

In [7]:
# Standardize the polynomial features
from sklearn.preprocessing import StandardScaler

# Build the scaler and estimate its parameters from the training split only
scaler = StandardScaler().fit(X_train_poly)

# NOTE(review): X_train / X_test are rebound here to the scaled polynomial arrays,
# so the raw feature frames are no longer reachable under these names. Re-running
# the polynomial-expansion cell after this one would expand the scaled arrays
# instead of the raw features.
X_train = scaler.transform(X_train_poly)  # scaled training features
X_test = scaler.transform(X_test_poly)  # scaled test features, using the training statistics

print(X_train[0]) # standardized features of the first training row

[-0.37257438 -0.49960763 -0.70492455  3.66450153 -0.42487874  0.93567804
  0.69366877 -0.4372179  -0.16224243 -0.56165616 -0.48463784  0.3716906
 -0.41100022 -0.15828849 -0.40401763 -0.39164747  0.33171589 -0.38055514
 -0.35959919 -0.36424113 -0.42094632 -0.37099583 -0.37823372 -0.3728625
 -0.30087791 -0.34898538 -0.37113272 -0.5143492  -0.11385718 -0.51929827
 -0.49074131 -0.53669855 -0.44053652 -0.50213112 -0.4858122  -0.49819439
 -0.49972522 -0.50517267 -0.76007475  1.61685027 -0.7154799  -0.59913188
 -0.46469757 -0.99867195 -0.51405503 -0.71221238 -0.73736458 -0.51569491
 -0.66108295  3.66450153  3.14216742  3.85101113  4.22343713  2.99684973
  2.36571936  2.70949049  3.59687362  3.7339312   2.66589749 -0.47445708
  0.06394559  0.18922796 -0.615943   -0.29062094 -0.58350917 -0.56510422
  0.05179377 -0.48913402  0.90550258  1.06526813 -0.29984672 -0.04670497
 -0.36215209  0.34746246  0.7180842  -0.23877562  0.66519085  0.38721668
 -0.05791863 -0.14583958  0.43628907  0.90350607 -0.1

In [8]:
# Train the unregularized model
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training can be chained
model = LinearRegression().fit(X_train, y_train)
model.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}

In [9]:
# Predict on the test split and report the root-mean-squared error
y_test_pred = model.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_pred)
print('RMSE test: %.2f' % (mse_test ** 0.5))

RMSE test: 5.59


In [10]:
# Summary statistics of the target variable (MEDV) in the test split,
# shown to put the RMSE values into context
y_test.describe()

count    102.000000
mean      22.219608
std        9.068333
min        5.600000
25%       17.100000
50%       20.550000
75%       23.875000
max       50.000000
Name: MEDV, dtype: float64

In [11]:
# Learned parameters of the model trained without regularization:
# the coefficient vector and the intercept term
print('回帰係数 w = [w1, w2, … , w104]:', model.coef_)
print('定数項 w0:', model.intercept_)

回帰係数 w = [w1, w2, … , w104]: [-1.73238190e+01  5.57338146e+00 -4.13882888e+01  4.23848940e+00
  4.11601424e+00  5.43841210e+00  3.14362235e+01 -2.49746252e+01
  1.26131503e+01  8.46984514e+00  8.70501997e+00 -1.02385225e+01
 -1.00437829e+01  7.49340538e-01  6.03265356e-01  9.30886117e+01
  1.03782088e+00 -1.63308876e+00  4.51534131e+00 -3.15456518e+00
  1.17984392e-01  1.17567940e+02 -2.51251952e+02  5.39960257e+01
 -1.11244473e-01  2.58276185e+00 -2.32841294e+00 -5.18666792e-01
 -3.54721300e-01 -1.65608607e+01  2.81941405e+00  3.35383652e-02
 -5.81334892e-01 -2.87674801e-01  4.78801041e+00 -5.16168764e+00
  1.34093216e+01 -6.33107807e-01  9.44367683e+00  6.90675278e-01
  7.27080180e-02  1.60511042e+01  5.19002722e+00  2.56668287e+00
 -1.27338170e+01  7.29286601e+00 -5.56744148e+00  1.32188945e+01
 -3.06558318e+00  4.23848940e+00 -5.26449706e+00 -1.01815811e+01
  5.65967555e-01  1.81819433e-01  1.96816854e+00 -2.07646183e+00
 -4.55665998e+00  1.14434016e+01 -1.30255041e+00 -1.45182921e

# Lasso回帰（特徴量：ALL）の学習→予測→評価

In [12]:
# Train the Lasso model (regularization strength alpha=0.1)
from sklearn.linear_model import Lasso

# fit() returns the estimator itself, so construction and training can be chained
model2 = Lasso(alpha=0.1).fit(X_train, y_train)
model2.get_params()

{'alpha': 0.1,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': 1000,
 'positive': False,
 'precompute': False,
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}

In [13]:
# Learned parameters of the regularized (Lasso) model:
# the coefficient vector and the intercept term
print('回帰係数 w = [w1, w2, … , w104]:', model2.coef_)
print('定数項 w0:', model2.intercept_)

回帰係数 w = [w1, w2, … , w104]: [-0.          0.          0.          0.         -0.          0.
  0.         -0.          0.          0.         -0.          0.
 -0.          0.          0.24633868 -0.          0.56721098 -0.26764504
 -0.71696315 -0.         -0.         -0.         -0.         -0.
 -0.         -0.          0.48958407 -0.46755873  0.11857147  0.
  0.00713714  0.          0.          0.         -0.          0.
  0.         -0.07437602  0.06232768 -0.         -0.         -0.
  0.         -0.11989191  0.43944794  0.          0.          0.
 -0.          0.         -0.          0.          0.          0.09828279
  0.          0.          0.          0.         -0.         -0.
 -1.21751701 -0.         -0.98619835  0.         -0.         -0.
  0.         -0.          5.87856207 -0.         -0.         -0.
 -0.         -1.92687309  0.         -5.22072694  0.         -0.
  0.          0.          0.          0.         -0.         -0.
  0.23376925 -0.66826934 -0.         -0.     

In [14]:
# Predict on the test split with the Lasso model and report the root-mean-squared error
y_test_pred = model2.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_pred)
print('RMSE test: %.2f' % (mse_test ** 0.5))

RMSE test: 4.89


# Ridge回帰（特徴量：ALL）の学習→予測→評価

In [15]:
# Train the Ridge model (regularization strength alpha=0.1)
from sklearn.linear_model import Ridge

# fit() returns the estimator itself, so construction and training can be chained
model3 = Ridge(alpha=0.1).fit(X_train, y_train)
model3.get_params()

{'alpha': 0.1,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'positive': False,
 'random_state': None,
 'solver': 'auto',
 'tol': 0.0001}

In [16]:
# Learned parameters of the regularized (Ridge) model:
# the coefficient vector and the intercept term
print('回帰係数 w = [w1, w2, … , w104]:', model3.coef_)
print('定数項 w0:', model3.intercept_)

回帰係数 w = [w1, w2, … , w104]: [ -1.68109343  -1.94538393  -8.94528429   0.98419248   0.21929137
   7.37302181  10.95221396  -8.6689005    6.48196995   1.55641279
   3.81670682   3.98265108  -1.01830403   0.95154594   0.12541689
   5.17044851   1.37021892  -5.34345668   1.98985454  -3.32458645
   0.13379436  -0.37083529   0.97420037  -1.31428844  -0.0293938
   1.69627587   0.61197575  -0.02156588  -0.23756631  -5.00756689
   1.77608527  -0.26877594  -1.58296721   0.27572459   1.9287441
   1.22549082   2.94255716  -0.46869993   4.35477181   0.54112959
   5.17211013   2.51106484   4.22691405   0.86653326   4.76198536
   3.22062788  -4.90122941  -0.88873817  -5.32112529   0.98419248
  -4.02402261  -8.00368259   1.18283028   1.2194583    0.23982604
  -0.57690603  -1.03417738   9.50700527  -1.54981377  -0.88326248
   2.6282334   -1.66160449   3.02847231  -4.36528816  -0.05270825
  -6.33167779  -0.26124706   3.79708422   5.68126776  -7.95983824
   2.74886088  -2.58834094 -10.40283842  -8.29264

In [17]:
# Predict on the test split with the Ridge model and report the root-mean-squared error
y_test_pred = model3.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_pred)
print('RMSE test: %.2f' % (mse_test ** 0.5))

RMSE test: 5.20
