# ✅ 훈련 데이터셋과 테스트 데이터셋으로 나누기

In [146]:
import pandas as pd
import numpy as np

df = pd.read_csv('../data/wine.csv')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,bad
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,bad
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,bad
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,good
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,bad


## 📌Target 변수 인코딩

In [147]:
pd.get_dummies(df['quality'], prefix ='quality')

Unnamed: 0,quality_bad,quality_good
0,1,0
1,1,0
2,1,0
3,0,1
4,1,0
...,...,...
1594,1,0
1595,0,1
1596,0,1
1597,1,0


### ❗ 기존 DataFrame에 원-핫 인코딩을 적용한 특성을 반영

In [148]:
df_wine = pd.get_dummies(data = df, columns = ['quality'], prefix = 'quality')
df_wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality_bad,quality_good
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,1,0
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,1,0
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,1,0
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,0,1
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,1,0
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,0,1
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,0,1
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,1,0


### 📌훈련셋, 테스트셋 분할

> * train_test_split을 사용해 랜덤하게 훈련 데이터셋과 테스트 데이터셋 분할

> * 테스트 데이터셋에 많은 데이터 주면 안됨

> * 실전에서 많이 사용하는 비율 = 60:40, 70:30, 80:20

In [149]:
X = np.arange(20).reshape(10,2)
X

array([[ 0,  1],
       [ 2,  3],
       [ 4,  5],
       [ 6,  7],
       [ 8,  9],
       [10, 11],
       [12, 13],
       [14, 15],
       [16, 17],
       [18, 19]])

In [150]:
#예제로 사용할 2차원 numpy array와 1차원 numpy array 생성
y = np.arange(10)
y

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

### 1️⃣순차적으로 분할

* 1. test_size = 전체 데이터에서 test data 세트의 크기
> ex) (....test_size = 0.2)같은 경우 80%의 train, 20%의 test 데이터 세트 추출

* 2. shuffle = 데이터를 순차적으로 분리할지, 무작위로 분리할지 설정, (기본값 = True)
> ex) (...shuffle = False)같은 경우 순차적으로 분할

* 3. random_state = random 값 고정하는 역할
> ex) 1 ~ 100의 값을 랜덤으로 5개 불러 3,2,1,5,4를 불렀을 때, random_state 지정하면
> 다음에도 같은 순서로 5개를 불러옴

>  어떤 값을 지정해도 상관없음

* 4. stratify = 원래 데이터의 분포와 유사하게 추출

In [151]:
from sklearn.model_selection import train_test_split

#shuffle = false
X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y,
                                                   test_size = 0.4,
                                                   shuffle = False
                                                   )
print('1)훈련용 Feature 데이터셋:',X_train.shape)
print('2)테스트용 Feature 데이터셋:',X_test.shape)
print('3)훈련용 Label 데이터셋:',y_train.shape)
print('4)테스트용 Label 데이터셋:',y_test.shape)

1)훈련용 Feature 데이터셋: (6, 2)
2)테스트용 Feature 데이터셋: (4, 2)
3)훈련용 Label 데이터셋: (6,)
4)테스트용 Label 데이터셋: (4,)


In [152]:
X_train

array([[ 0,  1],
       [ 2,  3],
       [ 4,  5],
       [ 6,  7],
       [ 8,  9],
       [10, 11]])

In [153]:
X_test

array([[12, 13],
       [14, 15],
       [16, 17],
       [18, 19]])

In [154]:
y_train

array([0, 1, 2, 3, 4, 5])

In [155]:
y_test

array([6, 7, 8, 9])

### 2️⃣ 무작위 추출로 분할

In [156]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y,
                                                   test_size = 0.4,
                                                   shuffle = True,
                                                   random_state =1004)

In [157]:
X_train

array([[ 2,  3],
       [ 8,  9],
       [ 6,  7],
       [14, 15],
       [10, 11],
       [ 4,  5]])

In [158]:
X_test

array([[ 0,  1],
       [12, 13],
       [16, 17],
       [18, 19]])

In [159]:
y_train

array([1, 4, 3, 7, 5, 2])

In [160]:
y_test

array([0, 6, 8, 9])

### 3️⃣ 계층적 데이터 추출 옵션

In [165]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y,
                                                   test_size = 0.4,
                                                   shuffle = True,
                                                   random_state =1004,
                                                   stratify = y)

### 4️⃣ 와인 데이터셋으로 실습

In [164]:
X, y = df_wine.iloc[:,1:].values, df_wine.iloc[:,0].values

X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y,
                                                   test_size = 0.3,
                                                   random_state = 1,
                                                   )
X_train

array([[ 1.33 ,  0.   ,  1.7  , ..., 10.9  ,  1.   ,  0.   ],
       [ 0.49 ,  0.1  ,  2.6  , ..., 10.5  ,  1.   ,  0.   ],
       [ 0.84 ,  0.   ,  1.4  , ...,  9.7  ,  1.   ,  0.   ],
       ...,
       [ 0.725,  0.09 ,  5.5  , ..., 10.8  ,  0.   ,  1.   ],
       [ 0.63 ,  0.   ,  1.9  , ...,  9.   ,  0.   ,  1.   ],
       [ 0.4  ,  0.5  ,  1.8  , ..., 12.5  ,  0.   ,  1.   ]])

# ✅특성 스케일 맞추기

### 1️⃣ 정규화

 1. 최소-최대 스케일 변환
 
> * 특성의 스케일을 [0,1]범위에 맞춤

> * 각 특성의 열마다 적용

> * **특정 범위의 값이 필요할 때 유요하게 사용할 수 있음**

> * 사이킷런의 MinMaxScalar 사용

In [169]:
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
X_train_norm

array([[0.18181818, 0.45      , 0.07534247, ..., 0.12307692, 0.        ,
        1.        ],
       [0.32231405, 0.64      , 0.10273973, ..., 0.52307692, 0.        ,
        1.        ],
       [0.55371901, 0.04      , 0.0890411 , ..., 0.30769231, 0.        ,
        1.        ],
       ...,
       [0.46694215, 0.        , 0.10958904, ..., 0.33846154, 0.        ,
        1.        ],
       [0.23140496, 0.49      , 0.10958904, ..., 0.15384615, 1.        ,
        0.        ],
       [0.15702479, 0.32      , 0.06849315, ..., 0.64615385, 0.        ,
        1.        ]])

### 2️⃣표준화

> * 표준화 사용시 특성의 평균을 0에 맞추고 표준편차를 1로 만들어 0과 1사이의 값을 가지게 하는 정규분포의 특징을 갖도록함

> * 사이킷런의 StandardScalar 사용

In [171]:
from sklearn.preprocessing import StandardScaler

stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_train_std

array([[-1.08973838,  0.89845403, -0.41016183, ..., -1.15025682,
        -0.96414598,  0.96414598],
       [-0.12369587,  1.85737689, -0.13354777, ...,  1.29914922,
        -0.96414598,  0.96414598],
       [ 1.46743297, -1.17080054, -0.2718548 , ..., -0.01976172,
        -0.96414598,  0.96414598],
       ...,
       [ 0.87075966, -1.37267904, -0.06439425, ...,  0.16865412,
        -0.96414598,  0.96414598],
       [-0.7487822 ,  1.10033253, -0.06439425, ..., -0.96184097,
         1.03718734, -1.03718734],
       [-1.26021647,  0.24234892, -0.47931534, ...,  2.05281261,
        -0.96414598,  0.96414598]])