# 훈련 데이터(Train data)와 테스트 데이터(Test data) 나누기

## 필요한 라이브러리 import

In [1]:
# In this lesson we will explore the train_test_split function
# Therefore we need nothing more than the function itself and NumPy
import numpy as np
from sklearn.model_selection import train_test_split

실습에 필요한 샘플 데이터를 생성한다.

In [2]:
# Build the sample NumPy array 'a' holding every integer from 1 through 100.
# np.arange behaves like the built-in range() (the stop value 101 is excluded) but returns an ndarray.
a = np.arange(1, 101)

In [3]:
# Display 'a' to confirm it holds the integers 1 through 100
a

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100])

In [4]:
# Build a second ndarray 'b' holding every integer from 501 through 600.
# The offset is deliberate: each element of 'b' is exactly 500 larger than the element of 'a'
# at the same position, which makes it easy to see whether the two arrays stay aligned after a split.
b = np.arange(501, 601)
b

array([501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513,
       514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526,
       527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539,
       540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552,
       553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565,
       566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578,
       579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591,
       592, 593, 594, 595, 596, 597, 598, 599, 600])

## Split the data

train_test_split() 함수를 사용하면 데이터를 훈련 데이터와 테스트 데이터로 분리할 수 있다.

참고: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [5]:
# Call train_test_split with only the data and no options.
# The result is a list [train, test]; the output below shows the elements shuffled
# and divided into 75 training and 25 test elements.
train_test_split(a)

[array([ 58,  68,  72,  46,  60,  47,  10,  66,   8,  99,  84,  13,  28,
         67,  53,  16,  43,  91,  31,  52,  80,  59,   1,  57,  45,  87,
         88,  24,  12,  11,  95,  70,  93,   6,  79,  97,  73,  44,  34,
         63,  38,   7,   5,  81,  78,  98,  65,  48,  39,  17,  54,  32,
         69,  30,  14,   2,  19,  94,  50,  86,  51,  25,  29,  92,  27,
         82,  71,  83, 100,  23,  77,  36,  56,  15,  62]),
 array([21, 35,  4, 76, 49, 64, 20, 85, 40, 41,  9, 42, 96, 37, 90,  3, 55,
        74, 33, 89, 75, 22, 61, 26, 18])]

# 다음과 같은 옵션을 확인하자

**train_test_split 함수의 parameter**

* test_size : 테스트 데이터셋의 비율(float) 또는 개수(int) (default = None, train_size도 지정하지 않으면 0.25로 설정된다)
* train_size : 학습 데이터셋의 비율(float) 또는 개수(int) (default = None, test_size를 제외한 나머지로 설정된다)
* random_state : 데이터를 분할할 때 이루어지는 셔플에 사용되는 시드값 (int 또는 RandomState 인스턴스로 입력)
* shuffle : 분할 전 셔플 여부 설정 (default = True)
* stratify : 지정한 데이터의 클래스 비율을 유지한다. 예를 들어, Label Set인 Y가 25%의 0과 75%의 1로 이루어진 Binary Set일 때, stratify=Y로 설정하면 나누어진 데이터셋들도 0과 1을 각각 25%, 75%로 유지한 채 분할된다.



In [6]:
# Split 'a' and 'b' in a single call so that both arrays are divided at the same positions.
# test_size=0.2 holds out 20% of the elements for testing;
# random_state=365 fixes the shuffle seed so the split is reproducible.
a_train, a_test, b_train, b_test = train_test_split(
    a,
    b,
    test_size=0.2,
    random_state=365,
)

## 나눈 결과 확인

In [7]:
# Check the shapes of the two parts of 'a' to see the effect of 'test_size':
# with test_size=0.2 on 100 elements we expect 80 training and 20 test elements.
a_train.shape, a_test.shape

((80,), (20,))

In [8]:
# Inspect the training part of 'a': 80 elements, in shuffled order
a_train

array([ 25,  32,  99,  73,  91,  66,   3,  59,  94,   1,   8,  15,  90,
        54,  31,  20,  77,  82,  30,  35,  95,  42,  38,   7,  11,  50,
        21,  48,   2,  17,  10,  58,  68,  43,  41,  16,  88,  72,  79,
       100,  80,  39,  24,  86,  22,  23,  62,  76,  18,  47,  55,  26,
        60,  19,  71,  64,  51,  63,  65,  28,  12,  78,  13,  44,  75,
        87,  40,   4,  29,  49,  37,  57,  27,  74,   6,  45,  92,  34,
        53,  83])

In [9]:
# Inspect the test part of 'a': the 20 held-out elements, none of which appear in a_train
a_test

array([ 9, 69, 81, 56, 33, 93, 84, 61, 46, 89, 85, 67, 97,  5, 70, 36, 98,
       96, 14, 52])

In [10]:
# 'b' was split in the same call as 'a', so its parts should have the same sizes
b_train.shape, b_test.shape

((80,), (20,))

In [11]:
# Inspect the training part of 'b': each element should be the matching a_train element plus 500
b_train

array([525, 532, 599, 573, 591, 566, 503, 559, 594, 501, 508, 515, 590,
       554, 531, 520, 577, 582, 530, 535, 595, 542, 538, 507, 511, 550,
       521, 548, 502, 517, 510, 558, 568, 543, 541, 516, 588, 572, 579,
       600, 580, 539, 524, 586, 522, 523, 562, 576, 518, 547, 555, 526,
       560, 519, 571, 564, 551, 563, 565, 528, 512, 578, 513, 544, 575,
       587, 540, 504, 529, 549, 537, 557, 527, 574, 506, 545, 592, 534,
       553, 583])

In [12]:
# Inspect the test part of 'b': each element should be the matching a_test element plus 500
b_test

array([509, 569, 581, 556, 533, 593, 584, 561, 546, 589, 585, 567, 597,
       505, 570, 536, 598, 596, 514, 552])

##  데이터셋 나누기

wine dataset을 읽어 들이고 이 데이터셋을 훈련 데이터와 테스트 데이터로 나누어 본다.

In [1]:
from sklearn.datasets import load_wine

# Load the wine dataset, then pull out the feature matrix (X) and the class labels (y).
wine = load_wine()
X, y = wine.data, wine.target

In [4]:
# Inspect the target labels: one class index (0, 1 or 2) per sample
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [2]:
# Print the description text that ships with the dataset (attributes, classes, summary statistics)
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [13]:
# Check the dimensions: X is (samples, features) and y has one label per sample
print(X.shape, y.shape)

(178, 13) (178,)


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the wine samples for testing; random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Confirm how many samples ended up in each part.
print(X_train.shape, X_test.shape)

(124, 13) (54, 13)


In [7]:
# The test labels should have one entry per row of X_test
y_test.shape

(54,)