<a href="https://colab.research.google.com/github/applejxd/colaboratory/blob/master/LinearModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# sklearn.linear_model で回帰タスクをするサンプル
- [ボストンの住宅価格に関する回帰タスク](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_boston.html)
- [説明変数をDataFrame形式で用意](https://scikit-learn.org/stable/datasets/index.html#boston-dataset)

## 前準備

説明変数を DataFrame で用意

In [None]:
import pandas as pd
from sklearn.datasets import load_boston

# Load the Boston housing dataset object provided by scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the installed version still provides it.
data = load_boston()

# Wrap the feature matrix in a DataFrame whose columns carry the feature names.
train_x: pd.DataFrame = pd.DataFrame(
    data=data.data,
    columns=data.feature_names,
)

# Report the number of rows, then preview the first five of them.
print(train_x.shape[0])
display(train_x.head(5))

506


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


前処理で標準化（StandardScaler で各列をスケーリング）

In [None]:
from sklearn.preprocessing import StandardScaler

# Create the standardizer and fit it on the feature table in one step
# (fit() returns the fitted scaler itself).
scaler = StandardScaler().fit(train_x)

# Replace every column with its transformed values, keeping the column labels.
train_x = pd.DataFrame(
    data=scaler.transform(train_x),
    columns=train_x.columns.values,
)
display(train_x.head(5))

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.419782,0.28483,-1.287909,-0.272599,-0.144217,0.413672,-0.120013,0.140214,-0.982843,-0.666608,-1.459,0.441052,-1.075562
1,-0.417339,-0.487722,-0.593381,-0.272599,-0.740262,0.194274,0.367166,0.55716,-0.867883,-0.987329,-0.303094,0.441052,-0.492439
2,-0.417342,-0.487722,-0.593381,-0.272599,-0.740262,1.282714,-0.265812,0.55716,-0.867883,-0.987329,-0.303094,0.396427,-1.208727
3,-0.41675,-0.487722,-1.306878,-0.272599,-0.835284,1.016303,-0.809889,1.077737,-0.752922,-1.106115,0.113032,0.416163,-1.361517
4,-0.412482,-0.487722,-1.306878,-0.272599,-0.835284,1.228577,-0.51118,1.077737,-0.752922,-1.106115,0.113032,0.441052,-1.026501


目的変数も DataFrame で用意

In [None]:
# Build a single-column DataFrame ("MEDV") from the dataset's target values.
train_y: pd.DataFrame = pd.DataFrame(
    data=data.target,
    columns=["MEDV"],
)
display(train_y.head(5))

Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


学習を行って学習済みモデルを返す関数を定義

In [None]:
from typing import Dict, Optional

import sklearn.linear_model as lm

def Fit(tr_x: pd.DataFrame, tr_y: pd.Series,
        va_x: pd.DataFrame, va_y: pd.Series,
        params: Optional[Dict] = None):
    """Train a ridge regression model and return it.

    :param tr_x: Explanatory variables used for training.
    :param tr_y: Target variable used for training.
    :param va_x: Explanatory variables for validation
        (accepted but not used by this function).
    :param va_y: Target variable for validation
        (accepted but not used by this function).
    :param params: Optional keyword arguments forwarded to ``lm.Ridge``.
        ``alpha`` defaults to 0.5 when it is not supplied.
    :return: The fitted model.
    """
    # Build a fresh dict on every call: a mutable ``{}`` default would be
    # shared between calls, and the caller's dict must not be modified.
    ridge_kwargs = {"alpha": 0.5}
    if params:
        ridge_kwargs.update(params)

    # Run the training.
    model = lm.Ridge(**ridge_kwargs).fit(tr_x, tr_y)
    return model

クロスバリデーションの準備

In [None]:
from sklearn.model_selection import KFold
# 4-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
kf = KFold(
    n_splits=4,
    shuffle=True,
    random_state=71,
)

## 学習の実施

最初の fold でデータを学習用・バリデーション用に分割し、学習を実施

In [None]:
# Split into training and validation data using only the first fold.
# KFold.split is a generator, so next() takes the first fold without
# materializing the index arrays of every fold.
tr_idx, va_idx = next(kf.split(train_x))
tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]

# Train the model on the training part of the fold.
model = Fit(tr_x, tr_y, va_x, va_y)

バリデーションの実施

In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error

# Predict the target for the validation rows with the trained model.
va_pred: np.ndarray = model.predict(va_x)

# Score the predictions with the mean absolute error and print it.
score: float = mean_absolute_error(y_true=va_y, y_pred=va_pred)
print(f'MAE: {score:.4f}')

MAE: 3.7505
