# 自作関数をFunctionTransformerで変換器にする

In [24]:
# 基本的なモジュール
import numpy as np
import pandas as pd
# データ分割用の関数
from sklearn.model_selection import train_test_split
# 評価指標
from sklearn.metrics import r2_score
# サンプルデータ
from sklearn.datasets import fetch_california_housing
# パイプライン構築のための道具
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# 今回、変換器として利用
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
# 今回、推定器として利用
import xgboost as xgb

## シンプルな関数

In [11]:
def custom_func(X):
    """Return the input multiplied by two.

    Parameters
    ----------
    X : any object supporting ``* 2``
        The notebook passes a NumPy array, in which case the
        multiplication is applied element-wise.

    Returns
    -------
    The result of ``X * 2``.
    """
    doubled = X * 2
    return doubled

In [12]:
# Sample input: the integers 0 through 4 as a 1-D array.
X = np.arange(5)
print(custom_func(X))

[0 2 4 6 8]


In [13]:
# Wrap the plain function in a FunctionTransformer so it can be used through
# the transformer interface. FunctionTransformer must be imported in the
# import cell (sklearn.preprocessing), otherwise this cell raises NameError
# on a fresh kernel.
cft = FunctionTransformer(func=custom_func)

# transform() is called here without a prior fit(); it applies custom_func to X.
X_trans = cft.transform(X)
print(X_trans)

[0 2 4 6 8]


## パラメータ付きの関数

In [14]:
def custom_func_arg(X, arg):
    """Return the input multiplied by a caller-supplied factor.

    Parameters
    ----------
    X : any object supporting ``*``
        The notebook passes a NumPy array.
    arg : multiplier
        The factor ``X`` is multiplied by.

    Returns
    -------
    The result of ``X * arg``.
    """
    scaled = X * arg
    return scaled

In [15]:
# Run the same input through the function with two different multipliers
# to show the effect of `arg`.
X = np.array([0,1,2,3,4])
for arg in (2, 3):
    print(custom_func_arg(X, arg))

[0 2 4 6 8]
[ 0  3  6  9 12]


In [16]:
# Extra keyword arguments for the wrapped function are passed via kw_args.
cft = FunctionTransformer(
    func=custom_func_arg,
    kw_args={'arg': 2},
)
X_trans = cft.transform(X)
print(X_trans)

[0 2 4 6 8]


In [17]:
# Replace the multiplier on the existing transformer and transform again.
# set_params() returns the estimator itself, so the calls can be chained.
X_trans = cft.set_params(kw_args={'arg': 3}).transform(X)
print(X_trans)

[ 0  3  6  9 12]


In [18]:
# Show the transformer's current parameter settings (returned as a dict).
cft.get_params()

{'accept_sparse': False,
 'check_inverse': True,
 'feature_names_out': None,
 'func': <function __main__.custom_func_arg(X, arg)>,
 'inv_kw_args': None,
 'inverse_func': None,
 'kw_args': {'arg': 3},
 'validate': False}

## 逆変換付きの変換器

In [19]:
# Forward function
def custom_func_arg(X, arg):
    """Return ``X`` multiplied by ``arg``."""
    product = X * arg
    return product


# Inverse function
def custom_func_arg_inv(X, arg):
    """Return ``X`` divided by ``arg``.

    With the same ``arg`` this undoes ``custom_func_arg``.
    """
    quotient = X / arg
    return quotient

In [20]:
# Pair the forward function with its inverse; each receives its own
# keyword arguments.
cft = FunctionTransformer(
    func=custom_func_arg,              # forward function
    inverse_func=custom_func_arg_inv,  # inverse function
    kw_args={'arg': 3},                # parameters for the forward function
    inv_kw_args={'arg': 3},            # parameters for the inverse function
)

In [21]:
# Forward transform: applies the function passed as func (with kw_args) to X.
X_trans = cft.transform(X)
print(X_trans)

[ 0  3  6  9 12]


In [22]:
# Inverse transform: applies the inverse function (with inv_kw_args) to the
# transformed values, mapping them back to the original scale.
X_trans_inv = cft.inverse_transform(X_trans)
print(X_trans_inv)

[0. 1. 2. 3. 4.]


In [23]:
# Show the parameters again; inverse_func and inv_kw_args are now set.
cft.get_params()

{'accept_sparse': False,
 'check_inverse': True,
 'feature_names_out': None,
 'func': <function __main__.custom_func_arg(X, arg)>,
 'inv_kw_args': {'arg': 3},
 'inverse_func': <function __main__.custom_func_arg_inv(X, arg)>,
 'kw_args': {'arg': 3},
 'validate': False}

## FunctionTransformerを使ったパイプラインの構築

- MedInc - 所得の中央値
- HouseAge - 築年数の中央値
- AveRooms - 1世帯あたりの平均部屋数
- AveBedrms - 1世帯あたりの平均寝室数
- Population - 人口
- AveOccup - 1世帯あたりの平均人数
- Latitude - 緯度
- Longitude - 経度

In [28]:
# Load the California housing dataset; as_frame=True returns pandas objects.
california_housing = fetch_california_housing(as_frame=True)
X, y = california_housing.data, california_housing.target

# Preview the first three rows of the features and of the target.
display(X.head(3), y.head(3))

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24


0    4.526
1    3.585
2    3.521
Name: MedHouseVal, dtype: float64

In [29]:
# Split into training and test data: 30% of the rows are held out for testing,
# and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)

In [38]:
# feature1: every column name except the last two.
# feature2: the last two column names ('Latitude', 'Longitude').
feature1, feature2 = X.columns.values[:-2], X.columns.values[-2:]

print(feature1, feature2, sep="\n")

['MedInc' 'HouseAge' 'AveRooms' 'AveBedrms' 'Population' 'AveOccup']
['Latitude' 'Longitude']


In [42]:
# Custom function
# NOTE: this reuses the name custom_func and therefore replaces the doubling
# function defined earlier in the notebook.
def custom_func(X):
    """Apply ``np.log1p`` to ``X`` and return the result.

    Parameters
    ----------
    X : array-like accepted by ``np.log1p``

    Returns
    -------
    ``np.log1p(X)``, i.e. log(1 + x) computed element-wise.
    """
    logged = np.log1p(X)
    return logged

In [43]:
# Custom transformer wrapping custom_func
cft = FunctionTransformer(func=custom_func)

# Column-wise preprocessing: apply the custom transformer to the feature1
# columns and pass all remaining columns through unchanged.
log_trans = ColumnTransformer(
    [("cft", cft, feature1)],
    remainder='passthrough',
)

# Full pipeline: the preprocessing step followed by the XGBoost regressor.
num_pipeline = Pipeline(
    [
        ("log_trans", log_trans),
        ("regressor", xgb.XGBRegressor()),
    ]
)

In [44]:
# Fit the whole pipeline (preprocessing + regressor) on the training data.
num_pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [45]:
# Predict the target y for the test data.
pred_y = num_pipeline.predict(X_test)

# R2 (coefficient of determination) of the predictions against the test targets.
r2_score(y_test, pred_y)

0.8381599506635184