--- 
4.1 특성 스케일 바꾸기

--- 

In [57]:
## 사이킷런의 MinMaxScaler를 사용해 특성 배열의 스케일을 조정
import numpy as np 
from sklearn import preprocessing 

In [58]:
# Single-column feature matrix: five observations of one feature.
feature = np.array([-500.5, -100.1, 0, 100.1, 900.9]).reshape(-1, 1)

feature

array([[-500.5],
       [-100.1],
       [   0. ],
       [ 100.1],
       [ 900.9]])

In [59]:
# Min-max scaling: learn the column's minimum and maximum, then map the
# values onto the requested range (0 to 1).
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
minmax_scale.fit(feature)
scaled_feature = minmax_scale.transform(feature)

scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

--- 
4.2 특성을 표준화하기 

--- 

In [60]:
# Standardize the feature (zero mean, unit variance).
x = np.array([-1000.1, -200.3, 500.5, 600.6, 9000.9]).reshape(-1, 1)

scaler = preprocessing.StandardScaler()
standardized = scaler.fit(x).transform(x)

standardized

array([[-0.76057496],
       [-0.54179224],
       [-0.35009065],
       [-0.32270862],
       [ 1.97516647]])

In [61]:
# Sanity check: print the mean and the standard deviation of the
# standardized values, each rounded to the nearest integer, one per line.
print(round(standardized.mean()), round(standardized.std()), sep="\n")

0
1


In [62]:
# Note: standardization works poorly when the data contains many outliers.
# RobustScaler is the alternative: it rescales using the median and the
# interquartile range.
x = np.array([-1000.1, -200.3, 500.5, 600.6, 9000.9]).reshape(-1, 1)

robust_scaler = preprocessing.RobustScaler()
r_scaled_feature = robust_scaler.fit(x).transform(x)

r_scaled_feature

array([[-1.87364215],
       [-0.87501561],
       [ 0.        ],
       [ 0.12498439],
       [10.61355975]])

--- 
4.3 정규화하기

--- 

In [63]:
from sklearn.preprocessing import Normalizer 

In [64]:
# Two-feature observations, one per row.
feature2 = np.array([
    [0.5, 0.5],
    [1.1, 3.4],
    [1.5, 20.2],
    [1.63, 34.4],
    [10.9, 3.3],
])

# Rescale every row to unit norm. Choices for `norm`:
#   'l2'  -> Euclidean norm; the one most commonly used.
#   'l1'  -> Manhattan (taxicab) norm: distance measured by moving one step
#            at a time along the axes rather than in a straight line.
#   'max' -> divide each row by its maximum value.
Normalizer(norm="l2").fit_transform(feature2)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

--- 
4.5 특성 변환하기 

--- 

In [65]:
from sklearn.preprocessing import FunctionTransformer

In [66]:
# Small integer feature matrix that will be transformed by applying a function.
feature3 = np.array([[2, 3], [3, 4], [4, 5]])

def add_ten(x):
    """Return ``x + 10``.

    Works for anything that supports adding an integer, so arrays and
    DataFrame columns are shifted element-wise.
    """
    offset = 10
    return x + offset

# Wrap add_ten in a transformer and apply it to the whole feature matrix.
FunctionTransformer(func=add_ten).transform(feature3)

array([[12, 13],
       [13, 14],
       [14, 15]])

In [67]:
# Note: the same transformation can be written more simply with pandas and
# DataFrame.apply; build the DataFrame here first.
import pandas as pd

df = pd.DataFrame(data=feature3, columns=list("AB"))
df

Unnamed: 0,A,B
0,2,3
1,3,4
2,4,5


In [68]:
# Apply add_ten column by column (axis=0 is the pandas default).
df.apply(add_ten, axis=0)

Unnamed: 0,A,B
0,12,13
1,13,14
2,14,15


---
4.6 이상치 감지하기 

--- 

In [69]:
from sklearn.covariance import EllipticEnvelope 
from sklearn.datasets import make_blobs 

In [70]:
# Generate 10 two-feature samples around a single centre with a fixed seed;
# the second return value (the labels) is discarded.
features, _ = make_blobs(n_samples=10, n_features=2, centers=1, random_state=1)

features

array([[-1.83198811,  3.52863145],
       [-2.76017908,  5.55121358],
       [-1.61734616,  4.98930508],
       [-0.52579046,  3.3065986 ],
       [ 0.08525186,  3.64528297],
       [-0.79415228,  2.10495117],
       [-1.34052081,  4.15711949],
       [-1.98197711,  4.02243551],
       [-2.18773166,  3.33352125],
       [-0.19745197,  2.34634916]])

In [71]:
# Plant an extreme observation: overwrite the first two columns of the
# first row with a very large value.
features[0, :2] = 100000

features

array([[ 1.00000000e+05,  1.00000000e+05],
       [-2.76017908e+00,  5.55121358e+00],
       [-1.61734616e+00,  4.98930508e+00],
       [-5.25790464e-01,  3.30659860e+00],
       [ 8.52518583e-02,  3.64528297e+00],
       [-7.94152277e-01,  2.10495117e+00],
       [-1.34052081e+00,  4.15711949e+00],
       [-1.98197711e+00,  4.02243551e+00],
       [-2.18773166e+00,  3.33352125e+00],
       [-1.97451969e-01,  2.34634916e+00]])

In [72]:
# Build the outlier detector, fit it, and use it to predict outliers.
# The printed result recorded with this cell marks the planted row as -1
# and every other row as 1.
outlier_detector = EllipticEnvelope(contamination=0.1).fit(features)

outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

--- 
4.7 이상치 다루기 

--- 

In [73]:
import numpy as np 
import pandas as pd 

# Toy housing data; the last row is deliberately extreme in every column.
houses = pd.DataFrame({
    'Price': [534433, 392333, 293222, 4322032],
    'Bathrooms': [2, 3.5, 2, 116],
    'Square_Feet': [1500, 2500, 1500, 48000],
})

houses

Unnamed: 0,Price,Bathrooms,Square_Feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500
3,4322032,116.0,48000


In [74]:
# Flag outliers: 0 where Bathrooms is below 20, 1 everywhere else.
houses['Outlier'] = np.where(~(houses['Bathrooms'] < 20), 1, 0)
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


In [75]:
# Log-transform the Square_Feet column.
# np.log is applied to the whole column in one vectorized call rather than
# element by element in a Python loop; the resulting values are the same.
houses['Log_of_Square_Feet'] = np.log(houses['Square_Feet'])
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier,Log_of_Square_Feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


--- 
4.8 특성 이산화하기

--- 

In [76]:
from sklearn.preprocessing import Binarizer

# Create the feature: one column of ages, one observation per row.
age = np.array([6, 12, 20, 36, 65]).reshape(-1, 1)

# Discretize against a single cut-off value (18).
# NOTE: despite its name, `binarizer` holds the transformed array returned
# by the transformer, not the Binarizer estimator itself.
binarizer = Binarizer(threshold=18).fit(age).transform(age)

In [77]:
# The same discretization done with np.digitize and a single bin edge.
# With right=False (the default) a value equal to the edge falls in the
# upper bin, so 18 itself would map to 1.
# NOTE(review): Binarizer appears to use a strict "greater than" test, so the
# two approaches may differ exactly at the threshold -- confirm.
np.digitize(age, bins=[18], right=False)

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [78]:
# Discretize against several cut-off values: each age is replaced by the
# index of the bin it falls into (0 for values below the first edge).
np.digitize(age, bins=[20, 30, 40], right=False)

array([[0],
       [0],
       [1],
       [2],
       [3]])

--- 
4.9 군집으로 샘플을 그룹핑하기

--- 

In [79]:
import numpy as np
import pandas as pd 
from sklearn.datasets import make_blobs 
from sklearn.cluster import KMeans 

In [80]:
# Generate 50 two-feature samples around three centres with a fixed seed;
# the second return value (the labels) is discarded.
features, _ = make_blobs(n_samples=50, n_features=2, centers=3, random_state=1)

features

array([[ -9.87755355,  -3.33614544],
       [ -7.28721033,  -8.35398617],
       [ -6.94306091,  -7.0237442 ],
       [ -7.44016713,  -8.79195851],
       [ -6.64138783,  -8.07588804],
       [ -0.79415228,   2.10495117],
       [ -2.76017908,   5.55121358],
       [ -9.94690475,  -4.59034419],
       [ -0.52579046,   3.3065986 ],
       [ -1.98197711,   4.02243551],
       [ -5.8659643 ,  -7.96807169],
       [ -6.83478745,  -7.39121692],
       [ -6.74924724, -10.17542932],
       [-10.75211044,  -2.70048039],
       [ -8.50899599,  -8.65769397],
       [ -2.33080604,   4.39382527],
       [ -0.19745197,   2.34634916],
       [  0.08525186,   3.64528297],
       [-10.20660674,  -3.36672536],
       [ -9.15872909,  -3.02224647],
       [ -1.34052081,   4.15711949],
       [ -1.83198811,   3.52863145],
       [ -9.80679702,  -1.85309341],
       [ -0.75870396,   3.72276201],
       [-11.1402307 ,  -4.30269127],
       [ -7.8121371 ,  -5.34984488],
       [ -2.35122066,   4.00973634],
 

In [81]:
# Put the coordinates in a DataFrame so the cluster label can sit next to them.
df = pd.DataFrame(features, columns=['x', 'y'])

# K-means clustering with k=3.
# n_clusters is passed by keyword for readability. n_init is pinned
# explicitly because its default differs between scikit-learn releases;
# 10 restarts is the long-standing default.
clusterer = KMeans(n_clusters=3, n_init=10, random_state=0)
clusterer.fit(features)

# Attach each sample's predicted cluster index as a new column.
df['group'] = clusterer.predict(features)
df.head(10)

Unnamed: 0,x,y,group
0,-9.877554,-3.336145,0
1,-7.28721,-8.353986,2
2,-6.943061,-7.023744,2
3,-7.440167,-8.791959,2
4,-6.641388,-8.075888,2
5,-0.794152,2.104951,1
6,-2.760179,5.551214,1
7,-9.946905,-4.590344,0
8,-0.52579,3.306599,1
9,-1.981977,4.022436,1


--- 
4.10 누락된 값을 가진 샘플 삭제하기 

--- 

In [82]:
# Feature matrix whose last row has a missing value in the first column.
features_w_nan = np.array([
    [1.1, 11.1],
    [2.2, 22.2],
    [3.3, 33.3],
    [4.4, 44.4],
    [float("nan"), 55.0],
])

features_w_nan

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4],
       [ nan, 55. ]])

In [83]:
# Same data as a DataFrame with named columns.
df_w_nan = pd.DataFrame(data=features_w_nan, columns=["X1", "X2"])
df_w_nan

Unnamed: 0,X1,X2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4
4,,55.0


In [84]:
# Drop every row that contains at least one missing value (pandas defaults,
# written out explicitly). Returns a new DataFrame; df_w_nan is unchanged.
df_w_nan.dropna(axis=0, how="any")

Unnamed: 0,X1,X2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


--- 
4.11 누락된 값 채우기 
 
--- 

In [85]:
from sklearn.impute import SimpleImputer
from sklearn.datasets import make_blobs 
from sklearn.preprocessing import StandardScaler 

In [86]:
# Generate 1000 two-feature samples with a fixed seed; `centers` is left at
# the library default and the returned labels are discarded.
features, _ = make_blobs(n_samples=1000, n_features=2, random_state=1)

features

array([[-3.05837272,  4.48825769],
       [-8.60973869, -3.72714879],
       [ 1.37129721,  5.23107449],
       ...,
       [-1.91854276,  4.59578307],
       [-1.79600465,  4.28743568],
       [-6.97684609, -8.89498834]])

In [87]:
# Build data containing a missing value: standardize the features, then
# blank out the first value of the first row.
standarized_features = StandardScaler().fit(features).transform(features)
standarized_features[0, 0] = float("nan")
standarized_features

array([[        nan,  1.31426523],
       [-0.67073178, -0.22369263],
       [ 2.1048424 ,  1.45332359],
       ...,
       [ 1.18998798,  1.33439442],
       [ 1.22406396,  1.27667052],
       [-0.21664919, -1.19113343]])

In [90]:
# Fill in the missing value with scikit-learn's SimpleImputer.
#
# `strategy` options:
#   'mean'          - the mean value (default)
#   'median'        - the median value
#   'most_frequent' - the most frequent value
#   'constant'      - a fixed value, e.g.
#                     SimpleImputer(strategy='constant', fill_value=1)
features_simple_imputed = SimpleImputer(strategy='mean').fit_transform(standarized_features)
features_simple_imputed

array([[-8.73892504e-04,  1.31426523e+00],
       [-6.70731775e-01, -2.23692628e-01],
       [ 2.10484240e+00,  1.45332359e+00],
       ...,
       [ 1.18998798e+00,  1.33439442e+00],
       [ 1.22406396e+00,  1.27667052e+00],
       [-2.16649193e-01, -1.19113343e+00]])