## 데이터 전처리
- Label encoding(레이블 인코딩)

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [3]:
# Sample categorical data; duplicates are intentional so that repeated
# values can be seen mapping to the same code.
items = [
    'TV', '냉장고', '전자레인지', '컴퓨터',
    '선풍기', '선풍기', '믹서', '믹서',
]

In [5]:
# Create the encoder and learn the label vocabulary in one step.
# fit() collects the unique values, sorts them, and uses each value's
# position in that sorted list as its integer code.
encoder = LabelEncoder().fit(items)
# transform() replaces every string with its integer code.
labels = encoder.transform(items)
labels

array([0, 1, 4, 5, 3, 3, 2, 2], dtype=int64)

In [10]:
# Inspect the learned vocabulary: the sorted unique values, where each
# value's index is the integer code assigned to it.
encoder.classes_

array(['TV', '냉장고', '믹서', '선풍기', '전자레인지', '컴퓨터'], dtype='<U5')

In [11]:
# Convert the integer-encoded labels back into their original strings.
encoder.inverse_transform(labels)

array(['TV', '냉장고', '전자레인지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서'], dtype='<U5')

In [15]:
# Any list of known integer codes can be decoded, not only the array
# produced by transform().
encoder.inverse_transform([3,3,0,3,2,1])

array(['선풍기', '선풍기', 'TV', '선풍기', '믹서', '냉장고'], dtype='<U5')

- 원-핫 인코딩(one-hot encoding) 

In [25]:
from sklearn.preprocessing import OneHotEncoder

items = [
    'TV', '냉장고', '전자레인지', '컴퓨터',
    '선풍기', '선풍기', '믹서', '믹서',
]

# Label-encode the strings into integer codes before one-hot encoding.
# NOTE(review): recent scikit-learn releases let OneHotEncoder consume string
# categories directly, so this step is probably optional — confirm against
# the installed version.
encoder = LabelEncoder().fit(items)  # learns the sorted unique values
# The integer codes come back as a 1-D array; reshape them into a single
# column (2-D) because that is the layout the one-hot encoder is given below.
labels = encoder.transform(items).reshape(-1, 1)

In [28]:
# Apply one-hot encoding
oh_encoder = OneHotEncoder()   # create the encoder object

In [29]:
# Why the labels were reshaped to 2-D: OneHotEncoder.fit expects a 2-D
# array-like of shape (n_samples, n_features). The IPython help below
# shows the docstring.
oh_encoder.fit?

In [30]:
oh_encoder.fit(labels) # only the category mapping is learned here; nothing is transformed yet

OneHotEncoder()

In [31]:
# Perform the actual transformation using the fitted encoder.
oh_labels = oh_encoder.transform(labels)
oh_labels

<8x6 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [32]:
# transform() returned a sparse matrix; convert it to a dense array so the
# one-hot rows can be inspected directly.
oh_labels.toarray()

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [33]:
# (number of samples, number of distinct categories)
oh_labels.shape

(8, 6)

In [35]:
# pandas offers get_dummies() as a convenient way to one-hot encode a
# DataFrame; build a small single-column frame to try it on.
df = pd.DataFrame(
    ['TV', '냉장고', '전자레인지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서'],
    columns=['items'],
)
df

Unnamed: 0,items
0,TV
1,냉장고
2,전자레인지
3,컴퓨터
4,선풍기
5,선풍기
6,믹서
7,믹서


In [36]:
# One-hot encode the string column: every distinct value becomes its own
# indicator column named <column>_<value>.
pd.get_dummies(df)

Unnamed: 0,items_TV,items_냉장고,items_믹서,items_선풍기,items_전자레인지,items_컴퓨터
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,0,0,0,1,0,0
5,0,0,0,1,0,0
6,0,0,1,0,0,0
7,0,0,1,0,0,0


### 피처 스케일링과 정규화
- StandardScaler => Standardization

In [38]:
from sklearn.datasets import load_iris
import pandas as pd

# Load the iris dataset and wrap its feature matrix in a DataFrame whose
# columns carry the dataset's feature names.
iris = load_iris()
iris_data = iris.data
iris_df = pd.DataFrame(iris_data, columns=iris.feature_names)
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [39]:
# Per-column mean
iris_df.mean()

sepal length (cm)    5.843333
sepal width (cm)     3.057333
petal length (cm)    3.758000
petal width (cm)     1.199333
dtype: float64

In [40]:
# Per-column variance
iris_df.var()  # shows how spread out the values of each column are

sepal length (cm)    0.685694
sepal width (cm)     0.189979
petal length (cm)    3.116278
petal width (cm)     0.581006
dtype: float64

In [42]:
from sklearn.preprocessing import StandardScaler

# Create the scaler and learn the statistics the transform needs
# (each column's mean and standard deviation).
scaler = StandardScaler().fit(iris_df)
# Standardize every column with the learned statistics.
iris_scaled = scaler.transform(iris_df)
type(iris_scaled)

numpy.ndarray

In [46]:
# transform() returned a plain ndarray; put it back into a DataFrame with
# the original feature names so the columns stay labelled.
iris_df_scaled = pd.DataFrame(iris_scaled, columns=iris.feature_names)
iris_df_scaled

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832


In [47]:
iris_df_scaled.mean()      # after standardization each column's mean is (numerically) close to 0

sepal length (cm)   -1.690315e-15
sepal width (cm)    -1.842970e-15
petal length (cm)   -1.698641e-15
petal width (cm)    -1.409243e-15
dtype: float64

In [48]:
# NOTE(review): the result is slightly above 1 rather than exactly 1,
# presumably because pandas' var() defaults to the sample variance (ddof=1)
# while StandardScaler scales by the population standard deviation (ddof=0),
# giving n/(n-1) — confirm against the library docs.
iris_df_scaled.var()     # after standardization each column's variance is close to 1

sepal length (cm)    1.006711
sepal width (cm)     1.006711
petal length (cm)    1.006711
petal width (cm)     1.006711
dtype: float64

- MinMaxScaler  => Normalization

In [49]:
# MinMaxScaler: rescales each feature into the range 0 to 1 (normalization)
from sklearn.preprocessing import MinMaxScaler

# Create the scaler object
scaler = MinMaxScaler()

# fit() learns each column's min and max values, which the transform needs
scaler.fit(iris_df)

MinMaxScaler()

In [50]:
# Apply the fitted scaler to the data
iris_scaled = scaler.transform(iris_df)
type(iris_scaled)

numpy.ndarray

In [51]:
# Rebuild a DataFrame from the scaled ndarray, reusing the original
# feature names as column labels.
iris_df_scaled = pd.DataFrame(iris_scaled, columns=iris.feature_names)
iris_df_scaled.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.222222,0.625,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.5,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667


In [52]:
iris_df_scaled.min()  # per-column minimum

sepal length (cm)    0.0
sepal width (cm)     0.0
petal length (cm)    0.0
petal width (cm)     0.0
dtype: float64

In [53]:
iris_df_scaled.max()  # per-column maximum

sepal length (cm)    1.0
sepal width (cm)     1.0
petal length (cm)    1.0
petal width (cm)     1.0
dtype: float64