## **Preprocessing**

In [2]:
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
import sklearn as sk
sk.__version__

'1.4.1.post1'

### Data Load

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# data load
path = './data/DataPreprocess.csv'
df1 = pd.read_csv(path)
df1.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [2]:
df1.shape

(10, 4)

In [5]:
df1

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### **1. data/label 나누기**

In [4]:
x = df1.values[:, :-1] # data
y = df1.values[:, -1] # label

x, y

(array([['France', 44.0, 72000.0],
        ['Spain', 27.0, 48000.0],
        ['Germany', 30.0, 54000.0],
        ['Spain', 38.0, 61000.0],
        ['Germany', 40.0, nan],
        ['France', 35.0, 58000.0],
        ['Spain', nan, 52000.0],
        ['France', 48.0, 79000.0],
        ['Germany', 50.0, 83000.0],
        ['France', 37.0, 67000.0]], dtype=object),
 array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
       dtype=object))

### **2. Preprocessing - Missing Data 처리**
#### 1. Pandas DF의 메소드 이용해서 null 처리
- isna() , fillna(), dropna()

In [6]:
# 1. data load
df_new = pd.DataFrame(x)
df_new

Unnamed: 0,0,1,2
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [8]:
# 2. null값 검색
# df_new.isna()
df_new.isna().sum()

0    0
1    1
2    1
dtype: int64

- fillna()로 null값 대체 : 0으로

In [9]:
df_new = df_new.fillna(0)
df_new

  df_new = df_new.fillna(0)


Unnamed: 0,0,1,2
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,0.0
5,France,35.0,58000.0
6,Spain,0.0,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


- fillna()로 null대체 : 평균값으로 대체
- 각 컬럼의 mean()값으로 대체

In [17]:
df_new = pd.DataFrame(x)
df_new

Unnamed: 0,0,1,2
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [18]:
# index 1 컬럼 대체
# df_new.loc[:,1].mean() : 열의 평균
df_new.loc[:, 1] = df_new.loc[:, 1].fillna(df_new.loc[:,1].mean())

  df_new.loc[:, 1] = df_new.loc[:, 1].fillna(df_new.loc[:,1].mean())


In [19]:
# index 2 컬럼 대체
df_new.loc[:, 2] = df_new.loc[:, 2].fillna(df_new.loc[:,2].mean())

  df_new.loc[:, 2] = df_new.loc[:, 2].fillna(df_new.loc[:,2].mean())


In [20]:
df_new

Unnamed: 0,0,1,2
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


#### **2. scikit learn 의 SimpleImputer class 이용해서 null 데이터 처리**
**str 클래스의 null 대치값**
- strategy='mean' 평균값으로 대치(default)
- strategy='median' 중앙값으로 대치
- strategy='most_frequent' 최빈값(mode)로 대치
- strategy='constant', fill_value=1 특정값으로 대치
- transformer = SimpleImputer(strategy='constant', fill_value=1)

## 데이터 전처리 - Feature scaling
- 1) 표준화 standardization -> StandardScaler
    - (xi - mean(x)) / stcev(x)
  2) 정규화 Normalization -> MinMaxScaler
      - (xi -min(x)) / (max(x)-min(x))


### 1) Standardization 표준화 - 사이킷런 StandardScaler class

In [5]:
x = df1.values[:, :-1]
y = df1.values[:, -1] # label
x, y

(array([['France', 44.0, 72000.0],
        ['Spain', 27.0, 48000.0],
        ['Germany', 30.0, 54000.0],
        ['Spain', 38.0, 61000.0],
        ['Germany', 40.0, nan],
        ['France', 35.0, 58000.0],
        ['Spain', nan, 52000.0],
        ['France', 48.0, 79000.0],
        ['Germany', 50.0, 83000.0],
        ['France', 37.0, 67000.0]], dtype=object),
 array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
       dtype=object))

In [6]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
imputer = imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3]) # transform(): 결과를 numpy array로 리턴
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

### 1)표준화

In [7]:
from sklearn.preprocessing import StandardScaler # 표준화 지원 클래스

sc_x = StandardScaler()
sc_x.fit_transform(x[:, 1:3])  # 1,2 컬럼만 표준화
x[:, 1:3] = sc_x.transform(x[:, 1:3])
x

array([['France', 0.758874361590019, 0.7494732544921677],
       ['Spain', -1.7115038793306814, -1.4381784072687531],
       ['Germany', -1.2755547779917342, -0.8912654918285229],
       ['Spain', -0.1130238410878753, -0.253200423814921],
       ['Germany', 0.17760889313808945, 6.632191985654332e-16],
       ['France', -0.5489729424268225, -0.5266568815350361],
       ['Spain', 0.0, -1.0735697969752662],
       ['France', 1.3401398300419485, 1.3875383225057696],
       ['Germany', 1.6307725642679132, 1.7521469327992565],
       ['France', -0.2583402082008577, 0.29371249162530916]], dtype=object)

In [8]:
# 평균이 0(0에 가깝고)이고 표준편차가 1인 정규분포로 바뀌었는지 확인
print('평균: ', x[:,1].mean(), x[:, 2].mean())
print('표준편차: ', x[:,1].var(), x[:,2].var())

평균:  -8.881784197001253e-17 4.274358644806853e-16
표준편차:  1.0 1.0000000000000002


### 2) 정규화 - 사이킷런의 MinMaxScaler

In [13]:
x = df1.values[:, :-1]
y = df1.values[:, -1]
x, y

(array([['France', 44.0, 72000.0],
        ['Spain', 27.0, 48000.0],
        ['Germany', 30.0, 54000.0],
        ['Spain', 38.0, 61000.0],
        ['Germany', 40.0, nan],
        ['France', 35.0, 58000.0],
        ['Spain', nan, 52000.0],
        ['France', 48.0, 79000.0],
        ['Germany', 50.0, 83000.0],
        ['France', 37.0, 67000.0]], dtype=object),
 array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
       dtype=object))

In [14]:
# Null값 처리
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
imputer = imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3]) # transform(): 결과를 numpy array로 리턴
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [15]:
# 정규화(Normalization)
# 서로 다른 feature 크기 통일하기 위해 크기변환
from sklearn.preprocessing import MinMaxScaler

mmsc_x = MinMaxScaler()
mmsc_x.fit_transform(x[:, 1:3])  # 1,2번째 피처를 Normalization
x[:,1:3] = mmsc_x.transform(x[:,1:3])
x

array([['France', 0.7391304347826089, 0.6857142857142855],
       ['Spain', 0.0, 0.0],
       ['Germany', 0.1304347826086958, 0.17142857142857149],
       ['Spain', 0.4782608695652175, 0.37142857142857144],
       ['Germany', 0.5652173913043479, 0.45079365079365075],
       ['France', 0.34782608695652173, 0.2857142857142856],
       ['Spain', 0.5120772946859904, 0.11428571428571432],
       ['France', 0.9130434782608696, 0.8857142857142857],
       ['Germany', 1.0, 1.0],
       ['France', 0.43478260869565233, 0.5428571428571427]], dtype=object)

In [17]:
# feature 크기 통일 되었나 각 컬럼 최대 최소 확인
print('최소값 : ', x[:,1].min(), x[:, 2].min())
print('최대값 : ', x[:,1].max(), x[:, 2].max())

최소값 :  0.0 0.0
최대값 :  1.0 1.0


## 데이터 인코딩 - 레이블/원핫
- 카테고리 피처 -> 코드형 숫자

#### 데이터 로드

In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# data load
path = "./data/DataPreprocess.csv"
df1 = pd.read_csv(path)
df1.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


#### data/label나누기

In [19]:
x = df1.values[:, :-1]
y = df1.values[:, -1]
x, y

(array([['France', 44.0, 72000.0],
        ['Spain', 27.0, 48000.0],
        ['Germany', 30.0, 54000.0],
        ['Spain', 38.0, 61000.0],
        ['Germany', 40.0, nan],
        ['France', 35.0, 58000.0],
        ['Spain', nan, 52000.0],
        ['France', 48.0, 79000.0],
        ['Germany', 50.0, 83000.0],
        ['France', 37.0, 67000.0]], dtype=object),
 array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
       dtype=object))

#### **1)LabelEncoder**

In [20]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(x[:,0])
new_x = le.transform(x[:,0])
new_x

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0])

In [22]:
# 원본 인코딩 클래스를 볼수 있다.
le.classes_

array(['France', 'Germany', 'Spain'], dtype=object)

In [23]:
# 디코딩 원본 값을 보여준다.
le.inverse_transform([0, 2, 1, 2, 1, 0, 2, 0, 1, 0])

array(['France', 'Spain', 'Germany', 'Spain', 'Germany', 'France',
       'Spain', 'France', 'Germany', 'France'], dtype=object)

In [21]:
x[:,0] = new_x  # 첫번째 컬럼 Label encoding한 값으로 대체
pd.DataFrame(x, columns=['Country', 'Age', 'Salary'])

Unnamed: 0,Country,Age,Salary
0,0,44.0,72000.0
1,2,27.0,48000.0
2,1,30.0,54000.0
3,2,38.0,61000.0
4,1,40.0,
5,0,35.0,58000.0
6,2,,52000.0
7,0,48.0,79000.0
8,1,50.0,83000.0
9,0,37.0,67000.0


#### **2) One-Hot Encoding**
- 1. 레이블 인코딩(결과: 1차원)
  2. 2차원 데이터 형태로 변환
  3. 원-핫 인코딩

In [28]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# 1) Label Encoding
le = LabelEncoder()
le.fit(x[:, 0])
new_x = le.transform(x[:, 0]) 
new_x
# 결과 : array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0]) 1차원

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0])

In [29]:
# 2) 2차원 데이터 형태로 변환 (세로로 됨)
new_x = new_x.reshape(-1,1)
new_x

array([[0],
       [2],
       [1],
       [2],
       [1],
       [0],
       [2],
       [0],
       [1],
       [0]])

In [30]:
# 3) 원핫 인코딩
ohe = OneHotEncoder()
ohe.fit(new_x)
new_ohe = ohe.transform(new_x)
new_ohe.toarray()

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [31]:
new_ohe.toarray().shape

(10, 3)

In [32]:
pd.DataFrame(new_ohe.toarray(), columns = ['French', 'Germany', 'Spain'])

Unnamed: 0,French,Germany,Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,0.0,1.0,0.0
5,1.0,0.0,0.0
6,0.0,0.0,1.0
7,1.0,0.0,0.0
8,0.0,1.0,0.0
9,1.0,0.0,0.0


### **3) pandas의 get_dummies()로 원핫인코딩 구현**
- 원핫인코딩이 생각보다 사이킷런에서 쉽지 않음 그럴 때 이것 사용
- 숫자형 값으로 변환없이도 바로 원핫 인코딩 가능

In [33]:
df1

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [36]:
df1.iloc[:,0]

0     France
1      Spain
2    Germany
3      Spain
4    Germany
5     France
6      Spain
7     France
8    Germany
9     France
Name: Country, dtype: object

In [35]:
import pandas as pd

pd.get_dummies(df1.iloc[:, 0])
# true / false (1,0) 원핫인코딩해줌

Unnamed: 0,France,Germany,Spain
0,True,False,False
1,False,False,True
2,False,True,False
3,False,False,True
4,False,True,False
5,True,False,False
6,False,False,True
7,True,False,False
8,False,True,False
9,True,False,False
