# Custom Transformer 구축
- 적용 및 검증
- pipeline 구축 및 cross validation 사용

In [30]:
import pandas as pd
import numpy as np
from IPython.display import display
from warnings import filterwarnings
filterwarnings('ignore')

from sklearn.datasets import load_iris
# Load the iris dataset and combine its features and label into one DataFrame.
iris = load_iris()

df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(
    target=iris.target
)

In [31]:
# Blank out randomly chosen feature cells to simulate missing data.
# The bounds are taken from the DataFrame itself rather than hard-coded, so the
# cell keeps working if the dataset changes. 'target' is the last column and is
# excluded so labels are never removed.
# NOTE: the draws are unseeded, so the affected cells differ between runs, and
# repeated (row, column) pairs mean fewer than 30 cells may end up missing.
n_rows, n_cols = df.shape
row_null = np.random.randint(0, n_rows, 30)
col_null = np.random.randint(0, n_cols - 1, 30)

for x, y in zip(row_null, col_null):
    df.iloc[x, y] = np.nan
df.isnull().sum()

sepal length (cm)     7
sepal width (cm)     10
petal length (cm)     3
petal width (cm)     10
target                0
dtype: int64

In [85]:
# Define the custom transformer
from sklearn.base import BaseEstimator, TransformerMixin

class fillna_median(BaseEstimator, TransformerMixin):
    """Fill missing values per column with statistics learned in ``fit``.

    Columns named in ``continuous_cols`` are filled with their median and
    columns named in ``categorical_cols`` with their mode (the first one
    pandas returns when several values tie). The statistics come only from
    the data passed to ``fit``; ``transform`` reuses them on whatever data it
    receives.

    Parameters
    ----------
    continuous_cols : list of column labels
        Columns to fill with the fitted median. May be empty.
    categorical_cols : list of column labels
        Columns to fill with the fitted mode. May be empty.

    Attributes
    ----------
    median : pandas.Series
        Per-column medians, set by ``fit``.
    mode : pandas.Series
        Per-column modes, set by ``fit``. Holds NaN for a column with no
        observed value, which leaves that column's gaps unfilled.

    Notes
    -----
    ``X`` must support label-based column selection (a pandas DataFrame).
    """

    def __init__(self, continuous_cols, categorical_cols):
        # Store the arguments untouched under their own names so that
        # BaseEstimator.get_params / clone can rebuild the estimator.
        self.continuous_cols = continuous_cols
        self.categorical_cols = categorical_cols

    def fit(self, X, y=None):
        """Learn the fill values from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data containing the configured columns.
        y : ignored
            Accepted only for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.median = X[self.continuous_cols].median()

        modes = X[self.categorical_cols].mode()
        if len(modes):
            self.mode = modes.iloc[0]
        else:
            # mode() yields zero rows when no categorical column was given or
            # when every value is missing, and ``.iloc[0]`` would then raise
            # IndexError. Reindexing to a single row gives an all-NaN fill
            # value of the same shape instead.
            self.mode = modes.reindex([0]).iloc[0]

        # Returning self is what makes fit_transform and chaining work.
        return self

    def transform(self, X_test):
        """Return a copy of ``X_test`` with missing values filled.

        Parameters
        ----------
        X_test : pandas.DataFrame
            Data containing the configured columns. It is not modified.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X_test`` filled with the fitted median / mode.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called. This is a subclass of both
            ValueError and AttributeError.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, ["median", "mode"])

        X_transformed = X_test.copy()
        # Skip a group entirely when it has no columns configured.
        if len(self.continuous_cols):
            X_transformed[self.continuous_cols] = (
                X_transformed[self.continuous_cols].fillna(self.median)
            )
        if len(self.categorical_cols):
            X_transformed[self.categorical_cols] = (
                X_transformed[self.categorical_cols].fillna(self.mode)
            )
        return X_transformed

In [92]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as a test split (fixed seed for repeatability).
features = df.drop('target', axis=1)
labels = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=42
)

# Missing-value counts per column for each split, side by side.
null_dist = pd.DataFrame({
    'train': X_train.isnull().sum(),
    'test': X_test.isnull().sum(),
})
display(null_dist)

Unnamed: 0,train,test
sepal length (cm),5,2
sepal width (cm),7,3
petal length (cm),2,1
petal width (cm),9,1


In [93]:
# Positional (row, column) indices of every missing cell in the test split.
missing_mask = X_test.isnull().to_numpy()
null_idx = np.nonzero(missing_mask)
null_idx

(array([12, 16, 23, 30, 30, 30, 36]), array([1, 1, 0, 0, 2, 3, 1]))

In [94]:
# Per-column median and first mode of the training split, kept for comparison
# with the values the transformer fills in.
fill_val = pd.DataFrame({
    'median': X_train.median(),
    'mode': X_train.mode().iloc[0],
})
fill_val

Unnamed: 0,median,mode
sepal length (cm),5.8,5.1
sepal width (cm),3.0,3.0
petal length (cm),4.4,1.4
petal width (cm),1.3,0.2


In [95]:
# Treat petal width as "categorical" (mode fill) and the other three features
# as continuous (median fill).
median_filled = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)']
mode_filled = ['petal width (cm)']
myTransform = fillna_median(
    continuous_cols=median_filled,
    categorical_cols=mode_filled,
)

In [96]:
# Learn the fill values on the training split and apply them to it, then
# confirm that no missing values remain.
X_train = myTransform.fit(X_train).transform(X_train)
X_train.isna().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

In [98]:
# Check that each previously missing test cell now holds the expected fill value.
X_test_transformed = myTransform.transform(X_test)

display(fill_val)

# null_idx is a (row positions, column positions) pair; walk it cell by cell.
for x, y in zip(*null_idx):
    print(f'idx : ({x},{y}), {df.columns[y]} : {X_test_transformed.iloc[x,y]}')


Unnamed: 0,median,mode
sepal length (cm),5.8,5.1
sepal width (cm),3.0,3.0
petal length (cm),4.4,1.4
petal width (cm),1.3,0.2


idx : (12,1), sepal width (cm) : 3.0
idx : (16,1), sepal width (cm) : 3.0
idx : (23,0), sepal length (cm) : 5.8
idx : (30,0), sepal length (cm) : 5.8
idx : (30,2), petal length (cm) : 4.4
idx : (30,3), petal width (cm) : 0.2
idx : (36,1), sepal width (cm) : 3.0


In [105]:
# pipeline을 이용한 전처리
# cross-validation을 이용한 모델 성능 평가

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Impute inside the pipeline so each CV fold learns its fill values from its
# own training portion, then score a logistic regression with 5-fold CV.
imputer = fillna_median(
    continuous_cols=['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)'],
    categorical_cols=['petal width (cm)'],
)
pipe = Pipeline(steps=[
    ('fillna', imputer),
    ('lr', LogisticRegression()),
])
cross_val_score(pipe, df.drop('target', axis=1), df['target'], cv=5, scoring='accuracy')

array([0.96666667, 1.        , 0.86666667, 0.96666667, 1.        ])