# Chapter 6. 머신러닝 데이터 살펴보기

## 6.1 머신러닝에 사용할 데이터 소개

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as ply
from sklearn import datasets

### 6.1.1 집값 예측하기

In [2]:
# feature 데이터와 target 데이터 합치기
# -> pd.concat() 함수를 이용하여 합치고 나면 컬럼명이 사라지고 인덱스로 남음 
#    따라서 컬럼명을 붙여주는 추가적인 작업이 필요함 
raw_boston = datasets.load_boston()
X_boston = pd.DataFrame(raw_boston.data)
y_boston = pd.DataFrame(raw_boston.target)
df_boston = pd.concat([X_boston, y_boston], axis = 1)


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: Function load_boston is deprecated; `load_boston` is deprecated in 1.0 and will be removed in 1.2.

    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_housing
        housing = fetch_california_housing()

    for the California housing dataset and::

        from sklearn.datasets import fetch_openml
        housing = fetch_openml(name="house_prices", as_frame=True)

    for the Ames housing dataset.
    
  warnings.warn(msg, category=FutureWarning)

In [3]:
# df_boston의 관측치 개수(row 개수) 확인
print(len(df_boston), "\n",  df_boston.shape)

506 
 (506, 14)


In [4]:
# 보스턴 데이터 info 확인
df_boston.info

<bound method DataFrame.info of           0     1      2    3      4      5     6       7    8      9     10  \
0    0.00632  18.0   2.31  0.0  0.538  6.575  65.2  4.0900  1.0  296.0  15.3   
1    0.02731   0.0   7.07  0.0  0.469  6.421  78.9  4.9671  2.0  242.0  17.8   
2    0.02729   0.0   7.07  0.0  0.469  7.185  61.1  4.9671  2.0  242.0  17.8   
3    0.03237   0.0   2.18  0.0  0.458  6.998  45.8  6.0622  3.0  222.0  18.7   
4    0.06905   0.0   2.18  0.0  0.458  7.147  54.2  6.0622  3.0  222.0  18.7   
..       ...   ...    ...  ...    ...    ...   ...     ...  ...    ...   ...   
501  0.06263   0.0  11.93  0.0  0.573  6.593  69.1  2.4786  1.0  273.0  21.0   
502  0.04527   0.0  11.93  0.0  0.573  6.120  76.7  2.2875  1.0  273.0  21.0   
503  0.06076   0.0  11.93  0.0  0.573  6.976  91.0  2.1675  1.0  273.0  21.0   
504  0.10959   0.0  11.93  0.0  0.573  6.794  89.3  2.3889  1.0  273.0  21.0   
505  0.04741   0.0  11.93  0.0  0.573  6.030  80.8  2.5050  1.0  273.0  21.0   

       

In [5]:
# 보스턴 데이터 head 확인
df_boston.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,0.1
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [6]:
# 피처의 컬럼명 추출
feature_boston = raw_boston.feature_names
print(feature_boston)

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [7]:
# 컬럼명 넣기 
# -> 피처의 컬럼명 + 'target'
col_boston = np.append(feature_boston, ['target'])
df_boston.columns = col_boston
df_boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


### 6.1.2 꽃 구분하기

In [8]:
# feature 데이터와 target 데이터 합치기
# -> pd.concat() 함수를 이용하여 합치고 나면 컬럼명이 사라지고 인덱스로 남음 
#    따라서 컬럼명을 붙여주는 추가적인 작업이 필요함 
raw_iris = datasets.load_iris()
X_iris = pd.DataFrame(raw_iris.data)
y_iris = pd.DataFrame(raw_iris.target)
df_iris = pd.concat([X_iris, y_iris], axis = 1)

In [9]:
# 피처의 컬럼명 추출
feature_iris = raw_iris.feature_names
print(feature_iris)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [10]:
# 컬럼명 넣기 
# -> 피처의 컬럼명 + 'target'
col_iris = np.append(feature_iris, ['target'])
df_iris.columns = col_iris
df_iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


### 6.1.3 와인 구분하기

In [11]:
# feature 데이터와 target 데이터 합치기
# -> pd.concat() 함수를 이용하여 합치고 나면 컬럼명이 사라지고 인덱스로 남음 
#    따라서 컬럼명을 붙여주는 추가적인 작업이 필요함 
raw_wine = datasets.load_wine()
X_wine = pd.DataFrame(raw_wine.data)
y_wine = pd.DataFrame(raw_wine.target)
df_wine = pd.concat([X_wine, y_wine], axis = 1)

In [12]:
# 피처의 컬럼명 추출
feature_wine = raw_wine.feature_names
print(feature_wine)

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [13]:
# 컬럼명 넣기 
# -> 피처의 컬럼명 + 'target'
col_wine = np.append(feature_wine, ['target'])
df_wine.columns = col_wine
df_wine.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


### 6.1.4 당뇨병 예측하기

In [14]:
# feature 데이터와 target 데이터 합치기
# -> pd.concat() 함수를 이용하여 합치고 나면 컬럼명이 사라지고 인덱스로 남음 
#    따라서 컬럼명을 붙여주는 추가적인 작업이 필요함 
raw_diab = datasets.load_diabetes()
X_diab = pd.DataFrame(raw_diab.data)
y_diab = pd.DataFrame(raw_diab.target)
df_diab = pd.concat([X_diab, y_diab], axis = 1)

In [15]:
# 피처의 컬럼명 추출
feature_diab = raw_diab.feature_names
print(feature_diab)

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']


In [16]:
# 컬럼명 넣기 
# -> 피처의 컬럼명 + 'target'
col_diab = np.append(feature_diab, ['target'])
df_diab.columns = col_diab
df_diab.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


### 6.1.5 유방암 예측하기

In [17]:
# feature 데이터와 target 데이터 합치기
# -> pd.concat() 함수를 이용하여 합치고 나면 컬럼명이 사라지고 인덱스로 남음 
#    따라서 컬럼명을 붙여주는 추가적인 작업이 필요함 
raw_bc = datasets.load_breast_cancer()
X_bc = pd.DataFrame(raw_bc.data)
y_bc = pd.DataFrame(raw_bc.target)
df_bc = pd.concat([X_bc, y_bc], axis = 1)
df_bc.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,0.1
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [18]:
# 피처의 컬럼명 추출
feature_bc = raw_bc.feature_names
print(feature_bc)

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [19]:
# 컬럼명 넣기 
# -> 피처의 컬럼명 + 'target'
col_bc = np.append(feature_bc, ['target'])
df_bc.columns = col_bc
df_bc.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


## 6.2 데이터 전처리

### 6.2.1 결측치 처리

In [20]:
import numpy as np
import pandas as pd

In [21]:
df = pd.DataFrame([
    [42, 'male', 12, 'reading', 'class2'],
    [35, 'unknown', 3, 'cooking', 'class1'],
    [1000, 'female', 7, 'cycling', 'calss3'],
    [1000, 'unknown', 21, 'unknown', 'unknown']
])

In [22]:
df.columns = ['age', 'gender', 'month_birth', 'hobby', 'target']

In [23]:
df

Unnamed: 0,age,gender,month_birth,hobby,target
0,42,male,12,reading,class2
1,35,unknown,3,cooking,class1
2,1000,female,7,cycling,calss3
3,1000,unknown,21,unknown,unknown


In [24]:
df['age'].unique()

array([  42,   35, 1000], dtype=int64)

In [26]:
df['gender'].unique()

array(['male', 'unknown', 'female'], dtype=object)

In [27]:
df['month_birth'].unique()

array([12,  3,  7, 21], dtype=int64)

In [28]:
df['hobby'].unique()

array(['reading', 'cooking', 'cycling', 'unknown'], dtype=object)

In [30]:
df['target'].unique()

array(['class2', 'class1', 'calss3', 'unknown'], dtype=object)

In [32]:
df.loc[df['age'] > 150, ['age']] = np.nan

In [33]:
df.loc[df['gender'] == 'unknown', ['gender']] = np.nan

In [34]:
df.loc[df['month_birth'] > 12, ['month_birth']] = np.nan

In [35]:
df.loc[df['hobby'] == 'unknown', ['hobby']] = np.nan

In [36]:
df.loc[df['target'] == 'unkown', ['target']] = np.nan

In [37]:
df

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,,3.0,cooking,class1
2,,female,7.0,cycling,calss3
3,,,,,unknown


In [38]:
df.isnull().sum()

age            2
gender         2
month_birth    1
hobby          1
target         0
dtype: int64

In [39]:
# 결측치를 포함한 행(row) 삭제
df2 = df.dropna(axis = 0)
df2

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2


In [40]:
# 결측치를 포함한 열(column) 삭제
df3 = df.dropna(axis = 1)
df3

Unnamed: 0,target
0,class2
1,class1
2,calss3
3,unknown


In [41]:
# 모든 값이 결측치인 행 삭제
df4 = df.dropna(how = 'all')
df4

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,,3.0,cooking,class1
2,,female,7.0,cycling,calss3
3,,,,,unknown


In [42]:
# 값이 2개 미만인 행 삭제
df5 = df.dropna(thresh = 2)
df5

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,,3.0,cooking,class1
2,,female,7.0,cycling,calss3


In [43]:
# 특정 열에 결측치가 있는 경우 행 삭제
df6 = df.dropna(subset = ['gender'])
df6

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
2,,female,7.0,cycling,calss3


In [46]:
# 결측치 대체하기 
alter_values = {'age': 0, 
                'gender': 'U',
                'month_birth': 0,
                'hobby': 'U',
                'target': 'class4'}
df7 = df.fillna(value = alter_values)

In [47]:
df7

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,U,3.0,cooking,class1
2,0.0,female,7.0,cycling,calss3
3,0.0,U,0.0,U,unknown


### 6.2.2 클래스 라벨 설정

In [48]:
from sklearn.preprocessing import LabelEncoder
df8 = df7
class_label = LabelEncoder()
data_value = df8['target'].values
y_new = class_label.fit_transform(data_value)
y_new

array([2, 1, 0, 3])

In [49]:
df8['target'] = y_new
df8

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,2
1,35.0,U,3.0,cooking,1
2,0.0,female,7.0,cycling,0
3,0.0,U,0.0,U,3


In [50]:
y_ori = class_label.inverse_transform(y_new)
y_ori
df8['target'] = y_ori
df8

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,U,3.0,cooking,class1
2,0.0,female,7.0,cycling,calss3
3,0.0,U,0.0,U,unknown


In [66]:
# 클래스 라벨링
y_arr = df8['target'].values
y_arr.sort()
y_arr

array(['calss3', 'class1', 'class2', 'unknown'], dtype=object)

In [67]:
num_y = 0

In [68]:
dic_y = {}
for ith_y in y_arr:
    dic_y[ith_y] = num_y
    num_y += 1

In [69]:
dic_y

{'calss3': 0, 'class1': 1, 'class2': 2, 'unknown': 3}

In [70]:
df8['target'] = df8['target'].replace(dic_y)
df8

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,0
1,35.0,U,3.0,cooking,1
2,0.0,female,7.0,cycling,2
3,0.0,U,0.0,U,3


### 6.2.3 원-핫 인코딩

In [71]:
# 판다스의 get_dummies() 함수를 이용한 더미변수 생성 
df9 = df8
df9['target'] = df9['target'].astype(str)
df10 = pd.get_dummies(df9['target'])
print(df10)

   0  1  2  3
0  1  0  0  0
1  0  1  0  0
2  0  0  1  0
3  0  0  0  1


In [73]:
# 벡터의 길이를 하나 줄인 벡터를 이용한 원-핫 인코딩
df9['target'] = df9['target'].astype(str)
df11 = pd.get_dummies(df9['target'], drop_first = True)
print(df11)

   1  2  3
0  0  0  0
1  1  0  0
2  0  1  0
3  0  0  1


In [74]:
# 특정 열이 아닌 데이터 프레임에 존재하는 모든 열에 대한 원-핫 인코딩
df12 = df8
df13 = pd.get_dummies(df12)
df13

Unnamed: 0,age,month_birth,gender_U,gender_female,gender_male,hobby_U,hobby_cooking,hobby_cycling,hobby_reading,target_0,target_1,target_2,target_3
0,42.0,12.0,0,0,1,0,0,0,1,1,0,0,0
1,35.0,3.0,1,0,0,0,1,0,0,0,1,0,0
2,0.0,7.0,0,1,0,0,0,1,0,0,0,1,0
3,0.0,0.0,1,0,0,1,0,0,0,0,0,0,1


#### (1) 사이킷런 라이브러리를 이용한 원-핫 인코딩

In [75]:
from sklearn.preprocessing import OneHotEncoder
hot_encoder = OneHotEncoder()
y = df7[['target']]
y_hot = hot_encoder.fit_transform(y)
print(y_hot.toarray())

[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]


1. 사이킷런 라이브러리에서 OneHotEncoder를 불러옴
2. 원-핫 인코더를 설정
3. 원-핫 인코딩할 변수 설정
4. fit_transform을 이용하여 주어진 데이터를 원-핫 인코딩함
5. 결과 확인

#### (2) 텐서플로 라이브러리를 이용한 원-핫 인코딩

In [76]:
from tensorflow.keras.utils import to_categorical
y_hotec = to_categorical(y)
print(y_hotec)

[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]


### 6.2.4 데이터 스케일링

#### (1) 표준화 스케일링

$$ \frac{x_i - \bar{x}}{\sigma}$$

In [78]:
# 표준화 스케일링
from sklearn.preprocessing import StandardScaler

std = StandardScaler()
std.fit(df8[['month_birth']])
x_std = std.transform(df8[['month_birth']])

1. 표준화 스케일링을 위한 StandardScaler 함수를 불러옴
2. 표준화 스케일러는 std라는 이름으로 정함
3. 표준화 스케일러에 month_birth 열을 적합시킴
4. 적합된 표준화 스케일러를 기준으로 month_birth 열 데이터 값을 변형시킴

In [79]:
x_std2 = std.fit_transform(df8[['month_birth']])
x_std2

array([[ 1.44444444],
       [-0.55555556],
       [ 0.33333333],
       [-1.22222222]])

In [80]:
print("Mean of x_std: ", np.mean(x_std))
print("Stadard deviation of x_std: ", np.std(x_std))

Mean of x_std:  -5.551115123125783e-17
Stadard deviation of x_std:  1.0


#### (2) 로버스트 스케일링

$$ \frac{x_i - q_2}{q_3 - q_1} $$

In [81]:
from sklearn.preprocessing import RobustScaler

robust = RobustScaler()
robust.fit(df8[['month_birth']])
x_robust = robust.transform(df8[['month_birth']])
x_robust

array([[ 1.16666667],
       [-0.33333333],
       [ 0.33333333],
       [-0.83333333]])

#### (3) 최소-최대 스케일링

$$ \frac{x_i - min(x)}{max(x) - min(x)} $$

In [82]:
from sklearn.preprocessing import MinMaxScaler

minmax = MinMaxScaler()
minmax.fit(df8[['month_birth']])
x_minmax = minmax.transform(df8[['month_birth']])
x_minmax

array([[1.        ],
       [0.25      ],
       [0.58333333],
       [0.        ]])

#### (4) 노멀 스케일링

- 노멀 스케일링(normalizer)은 벡터의 유클라디안 길이가 1이 되도록 데이터값을 변경
- 노멀 스케일은 주로 벡터 길이는 상관없고, 방향(각도)만 고려할 때 사용
- 표준화, 로버스트, 최대-최소 스케일링과 다르게 행(row) 기준임
- 노멀 스케일링은 normalization이라고도 함

$$ new({x_i}) = \frac{x_i}{\sqrt{x_i^2 + y_i^2 + z_i^2}} $$

In [83]:
from sklearn.preprocessing import Normalizer

normal = Normalizer()
normal.fit(df8[['age', 'month_birth']])
x_normal = normal.transform(df8[['age', 'month_birth']])
x_normal

array([[0.96152395, 0.27472113],
       [0.99634665, 0.08540114],
       [0.        , 1.        ],
       [0.        , 0.        ]])

- 데이터 스케일링 과정에서 fit() 메소드는 트레이닝 데이터 셋에 대해서만 사용하며, 테스트 데이터 셋에는 fit()를 하지 않고 transform()만 사용함

- test 셋에 의한 scaling polution 방지
