# 데이터 전처리 및 특성공학에 유용한 사이킷런 모듈

In [2]:
# 명령 결과 모두 보기
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# 버전 경고 무시
import warnings
warnings.filterwarnings('ignore')

# 판다스와 넘피
import pandas as pd
import numpy as np

# 그림 그리기
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['font.family']='Malgun Gothic' # 한글폰트

In [3]:
def summary(df) :
    tf = pd.DataFrame({'데이터형태(dtypes)' : df.dtypes,
                       '비 결측치 수(notnull': df.notnull().sum(),
                       '결측치 수(null)' : df.isnull().sum(),
                       '고유값 수(nunique)' : df.nunique()})
    return tf

# <font color='purple'> 1. sklearn.compose.ColumnTransformer

### Applies transformers to columns of an array or pandas DataFrame.

This estimator allows different columns or column subsets of the input to be transformed separately and the features generated by each transformer will be concatenated to form a single feature space. This is useful for heterogeneous or columnar data, to combine several feature extraction mechanisms or transformations into a single transformer.

- https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html

<font color='brown'> ColumnTransformer 모듈은 () 안에 list of tuples 형태로 닉네임('str'), 자료변형 모듈(사이킷런 Estimator API), 해당컬럼 

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer
ct = ColumnTransformer(
    [("norm1", Normalizer(norm='l1'), [0, 1]),
     ("norm2", Normalizer(norm='l1'), slice(2, 4))])

X = np.array([[0., 1., 2., 2.],
              [1., 1., 0., 1.]])
X

array([[0., 1., 2., 2.],
       [1., 1., 0., 1.]])

Normalizer scales each row of X to unit norm. A separate scaling is applied for the two first and two last elements of each row independently.

In [4]:
X[:, 0:2]
X[:, slice(2, 4)] # 2뻔째 부터 3번째(4-1)을 slice

array([[0., 1.],
       [1., 1.]])

array([[2., 2.],
       [0., 1.]])

In [5]:
ct.fit_transform(X)

array([[0. , 1. , 0.5, 0.5],
       [0.5, 0.5, 0. , 1. ]])

데이터프레임에도 적용 가능

In [6]:
df = pd.DataFrame(X)
df
ct.fit_transform(df)

Unnamed: 0,0,1,2,3
0,0.0,1.0,2.0,2.0
1,1.0,1.0,0.0,1.0


array([[0. , 1. , 0.5, 0.5],
       [0.5, 0.5, 0. , 1. ]])

In [7]:
df = pd.DataFrame(X, columns=['a','b','c','d'])
df
ct.fit_transform(df)

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,2.0
1,1.0,1.0,0.0,1.0


array([[0. , 1. , 0.5, 0.5],
       [0.5, 0.5, 0. , 1. ]])

In [8]:
ct = ColumnTransformer(
    [("norm1", Normalizer(norm='l1'), ['a', 'b']),
     ("norm2", Normalizer(norm='l1'), slice(2, 4))])

In [9]:
ct.fit_transform(df)

array([[0. , 1. , 0.5, 0.5],
       [0.5, 0.5, 0. , 1. ]])

## 예제 1. Column Transformer with Mixed Types
https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#sphx-glr-auto-examples-compose-plot-column-transformer-mixed-types-py

In [10]:
def summary(df) :
    tf = pd.DataFrame({'데이터형태(dtypes)' : df.dtypes,
                       '비 결측치 수(notnull': df.notnull().sum(),
                       '결측치 수(null)' : df.isnull().sum(),
                       '고유값 수(nunique)' : df.nunique()})
    
    
    return tf

In [11]:
# Author: Pedro Morales <part.morales@gmail.com>
#
# License: BSD 3 clause

import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

np.random.seed(0)

# Load data from https://www.openml.org/d/40945
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Alternatively X and y can be obtained directly from the frame attribute:
# X = titanic.frame.drop('survived', axis=1)
# y = titanic.frame['survived']

In [12]:
X.head()
X.shape

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


(1309, 13)

In [13]:
X = X.drop(['cabin', 'embarked', 'boat', 'body', 'home.dest'], axis=1)
summary(X)

Unnamed: 0,데이터형태(dtypes),비 결측치 수(notnull,결측치 수(null),고유값 수(nunique)
pclass,float64,1309,0,3
name,object,1309,0,1307
sex,category,1309,0,2
age,float64,1046,263,98
sibsp,float64,1309,0,7
parch,float64,1309,0,8
ticket,object,1309,0,929
fare,float64,1308,1,281


### Use ColumnTransformer by selecting column by names
We will train our classifier with the following features:

- Numeric Features:
     * age: float;
     * fare: float.
     
     
- Categorical Features:
     * sex: categories encoded as strings {'female', 'male'};
     * pclass: ordinal integers {1, 2, 3}.

We create the preprocessing pipelines for both numeric and categorical data. Note that pclass could either be treated as a categorical or numeric feature.

In [14]:
from sklearn import set_config
set_config(display='diagram')

In [19]:
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
numeric_transformer

In [20]:
categorical_features = ['sex', 'pclass']
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
categorical_transformer

In [21]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

'''preprocessor = ColumnTransformer(
        [('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])
        '''
preprocessor

"preprocessor = ColumnTransformer(\n        [('num', numeric_transformer, numeric_features),\n        ('cat', categorical_transformer, categorical_features)])\n        "

In [22]:
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])
clf

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

In [24]:
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.798


### Using the prediction pipeline in a grid search
<font color='red'> 연쇄 닉네임에 '__' 두줄로 연결하여야함

In [29]:
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],   # 연쇄 닉네임에 '__' 두줄로 연결하여야함
    'classifier_C': [0.1, 1.0, 10, 100],
}

grid_search = GridSearchCV(clf, param_grid, cv=10)
grid_search

In [32]:
grid_search.fit(X_train, y_train)

print(f"Best params:")
print(grid_search.best_params_)

Best params:
{'classifier__C': 1.0, 'preprocessor__num__imputer__strategy': 'mean'}


In [33]:
print(f"Internal CV score: {grid_search.best_score_:.3f}")

Internal CV score: 0.787


In [34]:
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results = cv_results.sort_values("mean_test_score", ascending=False)
cv_results[["mean_test_score", "std_test_score",
            "param_preprocessor__num__imputer__strategy",
            "param_classifier__C"
            ]].head(5)

Unnamed: 0,mean_test_score,std_test_score,param_preprocessor__num__imputer__strategy,param_classifier__C
2,0.787051,0.038639,mean,1.0
3,0.787051,0.038639,median,1.0
5,0.78609,0.037744,median,10.0
7,0.78609,0.037744,median,100.0
4,0.785137,0.039387,mean,10.0


## 예제 2. ColumnTransformer for heterogeneous data
https://scikit-learn.org/stable/modules/compose.html#columntransformer-for-heterogeneous-data

Many datasets contain features of different types, say text, floats, and dates, where each type of feature requires separate preprocessing or feature extraction steps. Often it is easiest to preprocess data before applying scikit-learn methods, for example using pandas. Processing your data before passing it to scikit-learn might be problematic for one of the following reasons:
Incorporating statistics from test data into the preprocessors makes cross-validation scores unreliable (known as data leakage), for example in the case of scalers or imputing missing values.
You may want to include the parameters of the preprocessors in a parameter search.

<font color='blue'> The ColumnTransformer helps performing different transformations for different columns of the data, within a Pipeline that is safe from data leakage and that can be parametrized. ColumnTransformer works on arrays, sparse matrices, and pandas DataFrames.
To each column, a different transformation can be applied, such as preprocessing or a specific feature extraction method:
    
    
https://scikit-learn.org/stable/modules/compose.html#columntransformer-for-heterogeneous-data

In [35]:
X = pd.DataFrame(
    {'city': ['London', 'London', 'Paris', 'Sallisaw'],
     'title': ["His Last Bow", "How Watson Learned the Trick",
               "A Moveable Feast", "The Grapes of Wrath"],
     'expert_rating': [5, 3, 4, 5],
     'user_rating': [4, 5, 4, 3]})
X

Unnamed: 0,city,title,expert_rating,user_rating
0,London,His Last Bow,5,4
1,London,How Watson Learned the Trick,3,5
2,Paris,A Moveable Feast,4,4
3,Sallisaw,The Grapes of Wrath,5,3


For this data, we might want to encode the 'city' column as a categorical variable using OneHotEncoder but apply a CountVectorizer to the 'title' column. As we might use multiple feature extraction methods on the same column, we give each transformer a unique name, say 'city_category' and 'title_bow'. By default, the remaining rating columns are ignored (remainder='drop'):

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
column_trans = ColumnTransformer(
    [('city_category', OneHotEncoder(dtype='int'),['city']),
     ('title_bow', CountVectorizer(), 'title')],
    
    remainder='drop')
column_trans

In [38]:
column_trans.fit(X)

In [39]:
column_trans.get_feature_names()

['city_category__x0_London',
 'city_category__x0_Paris',
 'city_category__x0_Sallisaw',
 'title_bow__bow',
 'title_bow__feast',
 'title_bow__grapes',
 'title_bow__his',
 'title_bow__how',
 'title_bow__last',
 'title_bow__learned',
 'title_bow__moveable',
 'title_bow__of',
 'title_bow__the',
 'title_bow__trick',
 'title_bow__watson',
 'title_bow__wrath']

In [40]:
column_trans.transform(X).toarray()

array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1]], dtype=int64)

In [41]:
pd.DataFrame(column_trans.transform(X).toarray(), columns = column_trans.get_feature_names())

Unnamed: 0,city_category__x0_London,city_category__x0_Paris,city_category__x0_Sallisaw,title_bow__bow,title_bow__feast,title_bow__grapes,title_bow__his,title_bow__how,title_bow__last,title_bow__learned,title_bow__moveable,title_bow__of,title_bow__the,title_bow__trick,title_bow__watson,title_bow__wrath
0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,1,0,1,0,0,1,1,1,0
2,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0
3,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,1


Apart from a scalar or a single item list, the column selection can be specified as a list of multiple items, an integer array, a slice, a boolean mask, or with a make_column_selector. The make_column_selector is used to select columns based on data type or column name:

In [43]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector
ct = ColumnTransformer([
      ('scale', StandardScaler(),
      make_column_selector(dtype_include=np.number)),
    
      ('onehot',
      OneHotEncoder(),
      make_column_selector(pattern='city', dtype_include=object))])
ct

In [44]:
ct.fit_transform(X)

array([[ 0.90453403,  0.        ,  1.        ,  0.        ,  0.        ],
       [-1.50755672,  1.41421356,  1.        ,  0.        ,  0.        ],
       [-0.30151134,  0.        ,  0.        ,  1.        ,  0.        ],
       [ 0.90453403, -1.41421356,  0.        ,  0.        ,  1.        ]])

We can keep the remaining rating columns by setting remainder='passthrough'. The values are appended to the end of the transformation:

In [45]:
column_trans = ColumnTransformer(
    [('city_category', OneHotEncoder(dtype='int'),['city']),
     ('title_bow', CountVectorizer(), 'title')],
    remainder='passthrough')
column_trans

In [46]:
X
column_trans.fit_transform(X)

Unnamed: 0,city,title,expert_rating,user_rating
0,London,His Last Bow,5,4
1,London,How Watson Learned the Trick,3,5
2,Paris,A Moveable Feast,4,4
3,Sallisaw,The Grapes of Wrath,5,3


array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 5, 4],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 3, 5],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 4],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 5, 3]],
      dtype=int64)

In [47]:
from sklearn.preprocessing import MinMaxScaler
column_trans = ColumnTransformer(
    [('city_category', OneHotEncoder(), ['city']),
     ('title_bow', CountVectorizer(), 'title')],
    remainder=MinMaxScaler())

column_trans.fit_transform(X)[:, -2:]

array([[1. , 0.5],
       [0. , 1. ],
       [0.5, 0.5],
       [1. , 0. ]])

# <font color='purple'> sklearn.compose.make_column_transformer
### Construct a ColumnTransformer from the given transformers. This is a shorthand for the ColumnTransformer constructor; 
    - It does not require, and does not permit, naming the transformers. Instead, they will be given names automatically based on their types
The make_column_transformer function is available to more easily create a ColumnTransformer object. Specifically, the names will be given automatically. The equivalent for the above example would be:
    
    
https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_transformer.html#sklearn.compose.make_column_transformer

In [48]:
from sklearn.compose import make_column_transformer
column_trans = make_column_transformer(
    (OneHotEncoder(), ['city']),
    (CountVectorizer(), 'title'),
    remainder=MinMaxScaler())
column_trans

# End