## 1. 데이터 불러오기

In [1]:
import pandas as pd

# Relative location of the raw dataset (resolved against the notebook's working directory).
path = "dbs/DataPreprocess.csv"

# Parse the CSV into a DataFrame; the bare name on the last line renders it as the cell output.
df1 = pd.read_csv(filepath_or_buffer=path)
df1

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [2]:
# Feature matrix: select the three predictor columns and take the underlying
# NumPy array in a single expression.
x = df1.loc[:, ['Country', 'Age', 'Salary']].values
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [3]:
# Target vector: the 'Purchased' column as a NumPy array.
y = df1.loc[:, 'Purchased'].values
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [4]:
# Columns 1 and 2 of x (Age and Salary) for every row.
x[:, 1:3]

array([[44.0, 72000.0],
       [27.0, 48000.0],
       [30.0, 54000.0],
       [38.0, 61000.0],
       [40.0, nan],
       [35.0, 58000.0],
       [nan, 52000.0],
       [48.0, 79000.0],
       [50.0, 83000.0],
       [37.0, 67000.0]], dtype=object)

## 2. 누락된 데이터 처리

In [5]:
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed
# in 0.22. `sklearn.impute.SimpleImputer` is its replacement; it always imputes
# column-wise, so the old `axis=0` argument no longer exists.
from sklearn.impute import SimpleImputer

# `missing_values` defaults to NaN, which is how the empty CSV cells appear in x.
imputer = SimpleImputer(strategy='mean')
imputer = imputer.fit(x[:, 1:3])        # fit: learn the mean of each column
new_x = imputer.transform(x[:, 1:3])    # transform: replace NaN with the learned mean
new_x

array([[4.40000000e+01, 7.20000000e+04],
       [2.70000000e+01, 4.80000000e+04],
       [3.00000000e+01, 5.40000000e+04],
       [3.80000000e+01, 6.10000000e+04],
       [4.00000000e+01, 6.37777778e+04],
       [3.50000000e+01, 5.80000000e+04],
       [3.87777778e+01, 5.20000000e+04],
       [4.80000000e+01, 7.90000000e+04],
       [5.00000000e+01, 8.30000000e+04],
       [3.70000000e+01, 6.70000000e+04]])

In [6]:
# `Imputer` was removed in scikit-learn 0.22; `SimpleImputer` is the replacement.
# Imported here so this cell does not depend on which imputer an earlier cell loaded.
from sklearn.impute import SimpleImputer

# Mean-impute the Age/Salary columns and write the result back into x in one step.
# `missing_values` defaults to NaN and imputation is always per column.
imputer = SimpleImputer(strategy='mean')
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## 3. K-Fold Cross Validation

1. 단순하게 데이터 나누기

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

# Show the training portion (features, labels).
(x_train, y_train)

(array([['Germany', 40.0, 63777.77777777778],
        ['France', 37.0, 67000.0],
        ['Spain', 27.0, 48000.0],
        ['Spain', 38.77777777777778, 52000.0],
        ['France', 48.0, 79000.0],
        ['Spain', 38.0, 61000.0],
        ['France', 44.0, 72000.0],
        ['France', 35.0, 58000.0]], dtype=object),
 array(['Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes'], dtype=object))

In [8]:
# Show the held-out portion (features, labels).
(x_test, y_test)

(array([['Germany', 30.0, 54000.0],
        ['Germany', 50.0, 83000.0]], dtype=object),
 array(['No', 'No'], dtype=object))

2. KFold Cross Validation 나누기

In [9]:
from sklearn.model_selection import KFold

# 3-fold splitter: rows are shuffled once with a fixed seed, then partitioned.
cv = KFold(n_splits=3, shuffle=True, random_state=0)

# Each iteration yields the row indices for the training folds and the held-out fold.
for train_index, test_index in cv.split(x):
    print(train_index, test_index, sep="\n")

[0 1 3 5 6 7]
[2 4 8 9]
[0 2 3 4 5 8 9]
[1 6 7]
[1 2 4 6 7 8 9]
[0 3 5]


## 4. 피쳐 스케일링(Feature Scaling)

In [10]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the Age/Salary columns (fit returns the scaler itself) ...
sc = StandardScaler().fit(x[:, 1:3])

# ... then apply it; x itself is left untouched in this cell.
new_x = sc.transform(x[:, 1:3])
new_x



array([[ 7.58874362e-01,  7.49473254e-01],
       [-1.71150388e+00, -1.43817841e+00],
       [-1.27555478e+00, -8.91265492e-01],
       [-1.13023841e-01, -2.53200424e-01],
       [ 1.77608893e-01,  6.63219199e-16],
       [-5.48972942e-01, -5.26656882e-01],
       [ 0.00000000e+00, -1.07356980e+00],
       [ 1.34013983e+00,  1.38753832e+00],
       [ 1.63077256e+00,  1.75214693e+00],
       [-2.58340208e-01,  2.93712492e-01]])

In [11]:
# Same scaling as before, but this time the standardized values overwrite
# the Age/Salary columns of x.
sc = StandardScaler().fit(x[:, 1:3])
x[:, 1:3] = sc.transform(x[:, 1:3])
x



array([['France', 0.758874361590019, 0.7494732544921677],
       ['Spain', -1.7115038793306814, -1.4381784072687531],
       ['Germany', -1.2755547779917342, -0.8912654918285229],
       ['Spain', -0.1130238410878753, -0.253200423814921],
       ['Germany', 0.17760889313808945, 6.632191985654332e-16],
       ['France', -0.5489729424268225, -0.5266568815350361],
       ['Spain', 0.0, -1.0735697969752662],
       ['France', 1.3401398300419485, 1.3875383225057696],
       ['Germany', 1.6307725642679132, 1.7521469327992565],
       ['France', -0.2583402082008577, 0.29371249162530916]], dtype=object)

## 5. 원 핫 인코딩(One-hot encoding)

In [12]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct values in column 0 (Country); fit returns the encoder itself.
le = LabelEncoder().fit(x[:, 0])

# Map each country to its integer code, then overwrite column 0 of x with the codes.
new_x = le.transform(x[:, 0])
x[:, 0] = new_x
x

array([[0, 0.758874361590019, 0.7494732544921677],
       [2, -1.7115038793306814, -1.4381784072687531],
       [1, -1.2755547779917342, -0.8912654918285229],
       [2, -0.1130238410878753, -0.253200423814921],
       [1, 0.17760889313808945, 6.632191985654332e-16],
       [0, -0.5489729424268225, -0.5266568815350361],
       [2, 0.0, -1.0735697969752662],
       [0, 1.3401398300419485, 1.3875383225057696],
       [1, 1.6307725642679132, 1.7521469327992565],
       [0, -0.2583402082008577, 0.29371249162530916]], dtype=object)

In [13]:
# `OneHotEncoder(categorical_features=...)` was deprecated in scikit-learn 0.20 and
# removed in 0.22. Selecting which columns to encode is now done by ColumnTransformer.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
ct = ColumnTransformer(
    transformers=[('country', ohe, [0])],  # one-hot encode column 0 only
    remainder='passthrough',               # keep the other columns, appended after the encoded ones
    sparse_threshold=0,                    # always return a dense array
)

# The passthrough columns come from an object-dtype array, so cast the stacked
# result to float to get a plain numeric matrix.
x = ct.fit_transform(x).astype(float)
pd.DataFrame(x)

Unnamed: 0,0,1,2,3,4
0,1.0,0.0,0.0,0.758874,0.7494733
1,0.0,0.0,1.0,-1.711504,-1.438178
2,0.0,1.0,0.0,-1.275555,-0.8912655
3,0.0,0.0,1.0,-0.113024,-0.2532004
4,0.0,1.0,0.0,0.177609,6.632192e-16
5,1.0,0.0,0.0,-0.548973,-0.5266569
6,0.0,0.0,1.0,0.0,-1.07357
7,1.0,0.0,0.0,1.34014,1.387538
8,0.0,1.0,0.0,1.630773,1.752147
9,1.0,0.0,0.0,-0.25834,0.2937125


In [14]:
# Same matrix, displayed with readable column names:
# three one-hot country indicators followed by the two scaled numeric columns.
pd.DataFrame(
    data=x,
    columns=['France', 'Germany', 'Spain', 'Age', 'Salary'],
)

Unnamed: 0,France,Germany,Spain,Age,Salary
0,1.0,0.0,0.0,0.758874,0.7494733
1,0.0,0.0,1.0,-1.711504,-1.438178
2,0.0,1.0,0.0,-1.275555,-0.8912655
3,0.0,0.0,1.0,-0.113024,-0.2532004
4,0.0,1.0,0.0,0.177609,6.632192e-16
5,1.0,0.0,0.0,-0.548973,-0.5266569
6,0.0,0.0,1.0,0.0,-1.07357
7,1.0,0.0,0.0,1.34014,1.387538
8,0.0,1.0,0.0,1.630773,1.752147
9,1.0,0.0,0.0,-0.25834,0.2937125


In [16]:
from sklearn.preprocessing import LabelEncoder

# Encode the target labels as integers. Note that this rebinds both `le`
# (a new encoder instance) and `y` (now the encoded array).
le = LabelEncoder()
y = le.fit_transform(y)  # learn the distinct labels and encode y in a single call
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

## 6. 전처리 데이터 저장하기

In [18]:
# Assemble the final table: named feature columns plus the encoded target.
# Note that this rebinds `df1`, which previously held the raw data.
df1 = pd.DataFrame(
    x,
    columns=['France', 'Germany', 'Spain', 'Age', 'Salary'],
).assign(Purchased=y)

# Persist the preprocessed data (the row index is written as the first CSV column).
df1.to_csv('dbs/DataProcess_.csv')
df1

Unnamed: 0,France,Germany,Spain,Age,Salary,Purchased
0,1.0,0.0,0.0,0.758874,0.7494733,0
1,0.0,0.0,1.0,-1.711504,-1.438178,1
2,0.0,1.0,0.0,-1.275555,-0.8912655,0
3,0.0,0.0,1.0,-0.113024,-0.2532004,0
4,0.0,1.0,0.0,0.177609,6.632192e-16,1
5,1.0,0.0,0.0,-0.548973,-0.5266569,1
6,0.0,0.0,1.0,0.0,-1.07357,0
7,1.0,0.0,0.0,1.34014,1.387538,1
8,0.0,1.0,0.0,1.630773,1.752147,0
9,1.0,0.0,0.0,-0.25834,0.2937125,1
