# 単純なパイプラインの構築をやってみる

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score

from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

import xgboost as xgb

- datetime – 日時
- season – 季節
    - 1 = 春
    - 2 = 夏
    - 3 = 秋
    - 4 = 冬
- holiday – その日が休日であるかどうか
- workingday – その日が週末でも休日でもない日かどうか
- weather – 天気
    - 1：晴れ、雲が少ない、ところにより曇り
    - 2：霧＋曇り、霧＋ちぎれ雲、霧＋雲が少ない、霧
    - 3：小雪、小雨＋雷雨＋まばらな雲、小雨＋まばらな雲
    - 4：大雨＋凍雨（氷の粒）＋雷雨＋霧、雪＋霧
- temp – 気温（摂氏）
- atemp – 体感温度
- humidity – 相対湿度
- windspeed – 風の速さ
- casual – 非登録ユーザーによるレンタル開始数
- registered – 登録ユーザーによるレンタル開始数
- count – 総レンタル数

In [4]:
# Load the bike-share dataset; the `datetime` column becomes a parsed index.
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/bikeshare.csv'
df = pd.read_csv(url, index_col='datetime', parse_dates=True)

# Target: the `casual` rental count.
y = df['casual']
# Features: every column except the three rental-count columns.
X = df.drop(columns=['casual', 'registered', 'count'])

# Preview the first three rows of the features, then of the target.
display(X.head(3), y.head(3))

datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0


datetime
2011-01-01 00:00:00    3
2011-01-01 01:00:00    8
2011-01-01 02:00:00    5
Name: casual, dtype: int64

In [5]:
# Quantitative (numeric) feature columns.
nums = [
    'temp',
    'atemp',
    'humidity',
    'windspeed',
]
# Qualitative (categorical) feature columns.
cats = [
    'season',
    'holiday',
    'workingday',
    'weather',
]

In [10]:
# Preview the first three rows of the numeric subset, then of the categorical subset.
display(X.loc[:, nums].head(3), X.loc[:, cats].head(3))

datetime,temp,atemp,humidity,windspeed
2011-01-01 00:00:00,9.84,14.395,81,0.0
2011-01-01 01:00:00,9.02,13.635,80,0.0
2011-01-01 02:00:00,9.02,13.635,80,0.0


datetime,season,holiday,workingday,weather
2011-01-01 00:00:00,1,0,0,1
2011-01-01 01:00:00,1,0,0,1
2011-01-01 02:00:00,1,0,0,1


In [11]:
# Split into training and test data: 30% held out for testing, fixed seed
# so the split is reproducible.
split_kwargs = {'test_size': 0.3, 'random_state': 123}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

## 量的変数のみ

### make_pipelineで構築するパターン

In [17]:
# Numeric-only pipeline: mean imputation -> standardization -> XGBoost regressor.
# The steps are passed unnamed to make_pipeline.
num_steps = [SimpleImputer(strategy='mean'), StandardScaler(), xgb.XGBRegressor()]
num_pipeline = make_pipeline(*num_steps)

# Fit the pipeline on the numeric columns of the training data.
num_pipeline.fit(X_train[nums], y_train)

In [18]:
# Predict the target from the numeric columns of the test data.
pred_y = num_pipeline.predict(X_test.loc[:, nums])

# R^2 (coefficient of determination) on the test data.
r2_score(y_true=y_test, y_pred=pred_y)

0.32076388597488403

### Pipelineで構築するパターン

In [20]:
# Numeric-only pipeline built with Pipeline and explicitly named steps:
# mean imputation -> standardization -> XGBoost regressor.
num_steps = [
    ("impute", SimpleImputer(strategy='mean')),
    ("scale", StandardScaler()),
    ("regressor", xgb.XGBRegressor()),
]
num_pipeline = Pipeline(num_steps)

# Fit the pipeline on the numeric columns of the training data.
num_pipeline.fit(X_train[nums], y_train)

In [21]:
# Predict the target from the numeric columns of the test data.
X_test_num = X_test[nums]
pred_y = num_pipeline.predict(X_test_num)

# R^2 (coefficient of determination) on the test data.
r2_score(y_test, y_pred=pred_y)

0.32076388597488403

## 質的変数のみ

### make_pipelineで構築するパターン

In [24]:
# Categorical-only pipeline: most-frequent imputation -> one-hot encoding
# (handle_unknown='ignore') -> XGBoost regressor.
# The steps are passed unnamed to make_pipeline.
cat_steps = [
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown='ignore'),
    xgb.XGBRegressor(),
]
cat_pipeline = make_pipeline(*cat_steps)

# Fit the pipeline on the categorical columns of the training data.
cat_pipeline.fit(X_train[cats], y_train)

In [25]:
# Predict the target from the categorical columns of the test data.
pred_y = cat_pipeline.predict(X_test.loc[:, cats])

# R^2 (coefficient of determination) on the test data.
r2_score(y_true=y_test, y_pred=pred_y)

0.21600550413131714

### Pipelineで構築するパターン

In [26]:
# Categorical-only pipeline built with Pipeline and explicitly named steps:
# most-frequent imputation -> one-hot encoding -> XGBoost regressor.
cat_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("encode", OneHotEncoder(handle_unknown='ignore')),
    ("regressor", xgb.XGBRegressor()),
])

# Fit the pipeline on the categorical columns of the training data.
cat_pipeline.fit(X_train[cats], y_train)

In [27]:
# Predict the target from the categorical columns of the test data.
X_test_cat = X_test[cats]
pred_y = cat_pipeline.predict(X_test_cat)

# R^2 (coefficient of determination) on the test data.
r2_score(y_test, y_pred=pred_y)

0.21600550413131714

## 量的変数と質的変数

In [30]:
# Transformer-only pipeline for the numeric columns:
# mean imputation, then standardization (no estimator at the end).
num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy='mean')),
    ("scale", StandardScaler()),
])

# Transformer-only pipeline for the categorical columns:
# most-frequent imputation, then one-hot encoding (handle_unknown='ignore').
cat_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("encode", OneHotEncoder(handle_unknown='ignore')),
])

# Combined transformer: send each column group through its own pipeline.
# remainder='drop' discards any column that is in neither `nums` nor `cats`.
trans = ColumnTransformer(
    [
        ("num_trans", num_pipeline, nums),
        ("cat_trans", cat_pipeline, cats),
    ],
    remainder='drop',
)

In [39]:
# Fit the column transformer on the training data.
trans.fit(X_train, y=y_train)

In [35]:
# Fit the column transformer on the training data and transform it in one call.
X_transformed = trans.fit_transform(X_train)

# Show the shape of the transformed matrix, then its contents.
print(X_transformed.shape, X_transformed, sep='\n')

(7620, 15)
[[ 0.57160903  0.79624767 -0.11200374 ...  0.          1.
   0.        ]
 [ 0.25560742  0.25966023  0.5664706  ...  1.          0.
   0.        ]
 [-1.008399   -1.17182991 -0.52952642 ...  0.          1.
   0.        ]
 ...
 [-1.21906674 -0.99296743 -1.41676209 ...  1.          0.
   0.        ]
 [-1.008399   -0.90324104 -0.05981341 ...  1.          0.
   0.        ]
 [ 0.78227677  0.61738519  1.08837394 ...  1.          0.
   0.        ]]


In [36]:
# Chain the column transformer with the estimator into a single pipeline.
full_steps = [
    ("preprocessor", trans),
    ("regressor", xgb.XGBRegressor()),
]
full_pipeline = Pipeline(full_steps)

# Fit the whole pipeline on the training data (all feature columns).
full_pipeline.fit(X_train, y_train)

In [42]:
# Predict the target for the test data with the full pipeline.
pred_y = full_pipeline.predict(X_test)

# R^2 (coefficient of determination) on the test data.
r2_score(y_true=y_test, y_pred=pred_y)

0.6472477912902832

### ハイパーパラメータの探索

In [43]:
# Search space. The `regressor__` prefix routes each parameter to the
# pipeline step named "regressor".
params = {
    'regressor__max_depth': list(range(2, 11, 2)),        # 2, 4, ..., 10
    'regressor__n_estimators': list(range(10, 100, 10)),  # 10, 20, ..., 90
    'regressor__min_child_weight': [1, 2, 4, 6, 8, 10],
}

# Exhaustive grid search over `params` with 10-fold cross-validation;
# n_jobs=-1 runs the fits in parallel.
gs = GridSearchCV(estimator=full_pipeline, param_grid=params, cv=10, n_jobs=-1)

# Run the grid search on the training data.
gs.fit(X_train, y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


In [44]:
# Show the best parameter combination found by the grid search.
best_params = gs.best_params_
print(best_params)

{'regressor__max_depth': 6, 'regressor__min_child_weight': 6, 'regressor__n_estimators': 50}


In [45]:
# Best pipeline found by the grid search.
best_pipeline = gs.best_estimator_

# Predict the target for the test data with the best pipeline.
pred_y = best_pipeline.predict(X_test)

# R^2 (coefficient of determination) on the test data.
r2_score(y_true=y_test, y_pred=pred_y)

0.648542046546936