## **Preprocessing**

In [1]:
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
import sklearn as sk
sk.__version__

'1.4.1.post1'

### Data Load

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# data load
path = './data/DataPreprocess.csv'
df1 = pd.read_csv(path)
df1.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [2]:
df1.shape

(10, 4)

In [5]:
df1

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### **1. data/label 나누기**

In [4]:
x = df1.values[:, :-1] # data
y = df1.values[:, -1] # label

x, y

(array([['France', 44.0, 72000.0],
        ['Spain', 27.0, 48000.0],
        ['Germany', 30.0, 54000.0],
        ['Spain', 38.0, 61000.0],
        ['Germany', 40.0, nan],
        ['France', 35.0, 58000.0],
        ['Spain', nan, 52000.0],
        ['France', 48.0, 79000.0],
        ['Germany', 50.0, 83000.0],
        ['France', 37.0, 67000.0]], dtype=object),
 array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
       dtype=object))

### **2. Preprocessing - Missing Data 처리**
#### 1. Pandas DF의 메소드 이용해서 null 처리
- isna() , fillna(), dropna()

In [6]:
# 1. data load
df_new = pd.DataFrame(x)
df_new

Unnamed: 0,0,1,2
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [8]:
# 2. null값 검색
# df_new.isna()
df_new.isna().sum()

0    0
1    1
2    1
dtype: int64

- fillna()로 null값 대체 : 0으로

In [9]:
df_new = df_new.fillna(0)
df_new

  df_new = df_new.fillna(0)


Unnamed: 0,0,1,2
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,0.0
5,France,35.0,58000.0
6,Spain,0.0,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


- fillna()로 null대체 : 평균값으로 대체
- 각 컬럼의 mean()값으로 대체

In [17]:
df_new = pd.DataFrame(x)
df_new

Unnamed: 0,0,1,2
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [18]:
# index 1 컬럼 대체
# df_new.loc[:,1].mean() : 열의 평균
df_new.loc[:, 1] = df_new.loc[:, 1].fillna(df_new.loc[:,1].mean())

  df_new.loc[:, 1] = df_new.loc[:, 1].fillna(df_new.loc[:,1].mean())


In [19]:
# index 2 컬럼 대체
df_new.loc[:, 2] = df_new.loc[:, 2].fillna(df_new.loc[:,2].mean())

  df_new.loc[:, 2] = df_new.loc[:, 2].fillna(df_new.loc[:,2].mean())


In [20]:
df_new

Unnamed: 0,0,1,2
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


#### **2. scikit learn 의 SimpleImputer class 이용해서 null 데이터 처리**
**str 클래스의 null 대치값**
- strategy='mean' 평균값으로 대치(default)
- strategy='median' 중앙값으로 대치
- strategy='most_frequent' 최빈값(mode)로 대치
- strategy='constant', fill_value=1 특정값으로 대치
- transformer = SimpleImputer(strategy='constant', fill_value=1)