#### TensorFlow에서 데이터셋 관련 모듈
- tensorflow.data 서브 모듈
- 데이터를 모델 입력에 적합한 형태의 Tensor로 변환해 주는 기능을 제공하는 모듈
- Dataset 클래스 사용
    * 구성 : 피쳐 + 라벨 [지도 학습], 피쳐 [비지도/강화 학습]

[1] 모듈 로딩 및 데이터 준비<hr>

In [1]:
#모듈 로딩
import tensorflow as tf #텐서 관련
from tensorflow.data import Dataset # 데이터셋 관련
import numpy as np

In [2]:
# Prepare data -> build an arbitrary sample tensor of two identical 2x4 blocks.
data = tf.constant(
    [
        [[10, 12, 14, 16], [1, 3, 5, 7]],
        [[10, 12, 14, 16], [1, 3, 5, 7]],
    ]
)
data

<tf.Tensor: shape=(2, 2, 4), dtype=int32, numpy=
array([[[10, 12, 14, 16],
        [ 1,  3,  5,  7]],

       [[10, 12, 14, 16],
        [ 1,  3,  5,  7]]])>

[2] 데이터셋 텐서 생성 2가지 방법 <hr>
* 입력 전체를 하나의 원소로 갖는 Dataset 생성 : Dataset.from_tensors()
* 입력을 첫 번째 축(axis 0) 기준으로 잘라 여러 개의 원소를 갖는 Dataset 생성 : Dataset.from_tensor_slices()

In [3]:
# Create a dataset - method (1): from_tensors()
# The whole `data` tensor becomes a single dataset element (see element_spec in the output).
dataset0 = Dataset.from_tensors(data)
print(dataset0)

<TensorDataset element_spec=TensorSpec(shape=(2, 2, 4), dtype=tf.int32, name=None)>


In [4]:
# A Dataset is iterable, so a for loop can walk its elements directly.
# dataset0 yields exactly one element here: the full (2, 2, 4) tensor.
for ts in dataset0:
    print(ts)

tf.Tensor(
[[[10 12 14 16]
  [ 1  3  5  7]]

 [[10 12 14 16]
  [ 1  3  5  7]]], shape=(2, 2, 4), dtype=int32)


In [5]:
# Create a dataset - method (2): from_tensor_slices()
# `data` is sliced along its first axis, so each element has shape (2, 4).
dataset1 = Dataset.from_tensor_slices(data)
print(dataset1)

<TensorSliceDataset element_spec=TensorSpec(shape=(2, 4), dtype=tf.int32, name=None)>


In [6]:
# Iterate the sliced dataset: it yields two elements, each of shape (2, 4).
for ts in dataset1:
    print(ts)

tf.Tensor(
[[10 12 14 16]
 [ 1  3  5  7]], shape=(2, 4), dtype=int32)
tf.Tensor(
[[10 12 14 16]
 [ 1  3  5  7]], shape=(2, 4), dtype=int32)


[3] 데이터셋에서 데이터 추출<hr>

In [7]:
# Read the elements of dataset0 as tf.Tensor objects.
# The loop variable is named `ts` (as in the earlier cells) rather than `data`,
# so the source tensor `data` is not overwritten by the loop.
for ts in dataset0:
    print(ts)

tf.Tensor(
[[[10 12 14 16]
  [ 1  3  5  7]]

 [[10 12 14 16]
  [ 1  3  5  7]]], shape=(2, 2, 4), dtype=int32)


In [8]:
# as_numpy_iterator() yields each element as a numpy.ndarray instead of a tf.Tensor.
# A dedicated loop name is used so the source tensor `data` is not overwritten.
for arr in dataset0.as_numpy_iterator():
    print(arr, type(arr))

[[[10 12 14 16]
  [ 1  3  5  7]]

 [[10 12 14 16]
  [ 1  3  5  7]]] <class 'numpy.ndarray'>


In [9]:
# Same NumPy-based extraction for the sliced dataset: two (2, 4) ndarrays.
# A dedicated loop name is used so the source tensor `data` is not overwritten.
for arr in dataset1.as_numpy_iterator():
    print(arr, type(arr))

[[10 12 14 16]
 [ 1  3  5  7]] <class 'numpy.ndarray'>
[[10 12 14 16]
 [ 1  3  5  7]] <class 'numpy.ndarray'>


In [10]:
### Shuffle the elements of a dataset ==> shuffle()
# NOTE(review): this tensor is not read by any later line visible here — confirm whether it is still needed.
data =tf.constant( [11,13,15,17] )
# Rebind dataset0 to a dataset of the integers 0..2999.
dataset0=Dataset.range(3000)
#for data in dataset0.as_numpy_iterator(): print(data, end=' ')

In [17]:
# Shuffle the elements.
# NOTE: dataset0 is rebound to its own shuffled version, so every re-run of this
# cell stacks another shuffle stage on top of the previous one.
# A buffer of 2 only moves elements a short distance; per the tf.data docs a
# buffer at least as large as the dataset is needed for a full shuffle.
dataset0 = dataset0.shuffle(2, seed=5)

# Read the elements one by one, printed on a single line.
for value in dataset0.as_numpy_iterator():
    print(value, end=' ')
print()  # end the line so the list below starts on its own line

# Materialize the dataset's NumPy values into a Python list.
print(list(dataset0.as_numpy_iterator()))

1 2 3 4 5 0 7 6 9 10 11 8 13 12 14 15 16 17 18 19 20 21 23 24 25 22 27 26 29 30 28 32 33 31 34 35 36 37 39 38 41 42 40 44 43 45 46 48 47 49 51 52 53 50 55 54 56 58 59 57 60 62 63 64 61 65 66 67 69 68 71 70 72 74 75 76 73 78 77 80 81 82 83 84 85 86 87 79 89 88 90 92 93 94 91 96 95 97 99 100 98 102 101 104 103 105 107 106 108 109 111 110 112 113 115 116 117 118 119 120 121 122 123 114 125 124 126 127 129 128 131 132 133 130 134 135 137 136 139 138 141 142 140 143 144 146 145 148 149 147 150 151 152 153 155 156 154 158 159 160 157 162 161 163 165 164 167 166 169 170 171 168 172 173 174 175 176 177 179 180 178 181 183 184 182 186 187 188 185 189 191 192 190 193 195 196 197 198 199 194 200 201 203 202 205 204 206 207 209 208 211 210 212 213 215 216 217 214 218 220 221 219 223 224 225 222 227 228 226 230 229 232 231 234 233 236 237 235 238 240 241 242 243 244 245 239 247 246 248 249 250 252 251 253 255 256 257 254 258 260 261 259 263 264 265 266 267 268 262 269 270 272 271 273 274 275 276 27

In [18]:
### Group elements into fixed-size chunks ==> batch() method
# - batch_size : number of consecutive elements combined into one batch
# - drop_remainder : controls what happens to the elements left over when the
#   dataset size is not a multiple of batch_size (False, the default, keeps
#   them as a final smaller batch; True drops them)

# Show the elements before batching.
print( list( dataset0.as_numpy_iterator() ))

[1, 0, 3, 2, 4, 5, 6, 7, 9, 10, 8, 11, 12, 14, 15, 16, 17, 18, 19, 13, 20, 22, 21, 24, 23, 26, 25, 28, 29, 27, 30, 31, 33, 34, 35, 36, 32, 38, 39, 40, 37, 41, 43, 42, 45, 46, 44, 47, 48, 50, 51, 52, 49, 53, 55, 56, 57, 54, 58, 60, 59, 62, 63, 64, 65, 61, 66, 68, 69, 67, 71, 72, 70, 73, 75, 76, 77, 74, 79, 80, 78, 82, 83, 81, 85, 86, 84, 88, 89, 87, 90, 92, 91, 94, 93, 96, 97, 95, 98, 100, 99, 102, 101, 104, 103, 105, 107, 108, 109, 110, 106, 111, 113, 114, 115, 112, 117, 118, 119, 120, 116, 122, 121, 124, 123, 125, 126, 128, 127, 129, 131, 132, 130, 133, 135, 134, 137, 138, 139, 140, 136, 142, 143, 144, 145, 141, 147, 148, 149, 146, 151, 152, 150, 154, 155, 156, 153, 158, 159, 157, 161, 160, 162, 164, 165, 166, 167, 168, 169, 163, 171, 172, 170, 174, 175, 173, 177, 176, 179, 178, 180, 182, 181, 183, 184, 186, 185, 187, 189, 188, 191, 190, 192, 193, 194, 195, 197, 196, 199, 200, 201, 198, 203, 202, 205, 206, 204, 208, 207, 209, 211, 210, 213, 212, 214, 215, 216, 217, 218, 219, 221, 222,

In [11]:
# Batches of 8 elements; each batch comes back as one ndarray.
list(dataset0.batch(8).as_numpy_iterator())

[array([0, 1, 2, 3, 4, 5, 6, 7], dtype=int64),
 array([ 8,  9, 10, 11, 12, 13, 14, 15], dtype=int64),
 array([16, 17, 18, 19, 20, 21, 22, 23], dtype=int64),
 array([24, 25, 26, 27, 28, 29, 30, 31], dtype=int64),
 array([32, 33, 34, 35, 36, 37, 38, 39], dtype=int64),
 array([40, 41, 42, 43, 44, 45, 46, 47], dtype=int64),
 array([48, 49, 50, 51, 52, 53, 54, 55], dtype=int64),
 array([56, 57, 58, 59, 60, 61, 62, 63], dtype=int64),
 array([64, 65, 66, 67, 68, 69, 70, 71], dtype=int64),
 array([72, 73, 74, 75, 76, 77, 78, 79], dtype=int64),
 array([80, 81, 82, 83, 84, 85, 86, 87], dtype=int64),
 array([88, 89, 90, 91, 92, 93, 94, 95], dtype=int64),
 array([ 96,  97,  98,  99, 100, 101, 102, 103], dtype=int64),
 array([104, 105, 106, 107, 108, 109, 110, 111], dtype=int64),
 array([112, 113, 114, 115, 116, 117, 118, 119], dtype=int64),
 array([120, 121, 122, 123, 124, 125, 126, 127], dtype=int64),
 array([128, 129, 130, 131, 132, 133, 134, 135], dtype=int64),
 array([136, 137, 138, 139, 140, 

In [12]:
# Same dataset, this time in batches of 3 elements.
list(dataset0.batch(3).as_numpy_iterator())

[array([0, 1, 2], dtype=int64),
 array([3, 4, 5], dtype=int64),
 array([6, 7, 8], dtype=int64),
 array([ 9, 10, 11], dtype=int64),
 array([12, 13, 14], dtype=int64),
 array([15, 16, 17], dtype=int64),
 array([18, 19, 20], dtype=int64),
 array([21, 22, 23], dtype=int64),
 array([24, 25, 26], dtype=int64),
 array([27, 28, 29], dtype=int64),
 array([30, 31, 32], dtype=int64),
 array([33, 34, 35], dtype=int64),
 array([36, 37, 38], dtype=int64),
 array([39, 40, 41], dtype=int64),
 array([42, 43, 44], dtype=int64),
 array([45, 46, 47], dtype=int64),
 array([48, 49, 50], dtype=int64),
 array([51, 52, 53], dtype=int64),
 array([54, 55, 56], dtype=int64),
 array([57, 58, 59], dtype=int64),
 array([60, 61, 62], dtype=int64),
 array([63, 64, 65], dtype=int64),
 array([66, 67, 68], dtype=int64),
 array([69, 70, 71], dtype=int64),
 array([72, 73, 74], dtype=int64),
 array([75, 76, 77], dtype=int64),
 array([78, 79, 80], dtype=int64),
 array([81, 82, 83], dtype=int64),
 array([84, 85, 86], dtype=in

<hr>

- [MNIST 내장 데이터셋 사용해서 실습]

In [13]:
#모듈 로딩
from tensorflow.keras.datasets import mnist

In [14]:
# Load the built-in dataset, unpacked into train and test (features, labels) pairs.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

In [15]:
# Build a single Dataset whose elements are (feature, label) pairs, as used for
# supervised learning; X_train and y_train are sliced together along axis 0.
train_XY = Dataset.from_tensor_slices((X_train, y_train))

In [16]:
# Extract only the first (feature, label) element of the dataset.
for data, label in train_XY.take(1):
    print(data.shape, label)

(28, 28) tf.Tensor(5, shape=(), dtype=uint8)


In [19]:
# Shuffle with a 10000-element buffer, then read the data in batches of 1000 samples.
# NOTE(review): if train_XY holds more than 10000 samples (presumably it does
# for MNIST), this buffer gives only a partial shuffle — confirm that is intended.
for data, label in train_XY.shuffle(10000).batch(1000):
    print(data.shape, label)

(1000, 28, 28) tf.Tensor(
[1 1 7 9 7 0 9 8 5 9 2 7 2 6 9 5 0 7 2 1 9 8 1 5 8 1 2 7 6 4 9 0 1 0 2 5 0
 6 8 7 8 6 3 4 3 4 9 0 3 3 7 7 1 9 0 2 7 4 9 6 7 5 3 2 0 9 4 9 1 3 5 6 5 3
 8 0 9 4 0 7 5 7 5 6 4 5 7 2 2 4 5 1 3 0 8 6 8 4 3 0 4 8 4 6 7 9 6 1 0 0 5
 6 9 5 7 7 7 4 9 1 2 4 2 1 3 7 4 3 8 0 2 7 8 6 4 5 6 4 8 9 7 0 5 5 3 1 9 6
 5 5 8 7 7 0 7 2 3 9 7 2 2 3 2 5 3 2 1 6 1 0 0 8 0 6 1 0 9 7 0 7 4 5 9 0 1
 4 2 8 3 8 6 4 0 0 9 2 6 9 2 5 3 3 5 3 1 7 4 4 6 2 9 4 9 1 1 6 1 9 9 6 3 3
 6 9 3 8 5 4 1 3 9 0 2 4 3 7 6 8 0 1 4 5 7 7 2 4 8 5 4 6 3 1 2 2 8 7 9 7 1
 8 0 1 7 1 4 3 8 7 2 4 9 7 1 1 9 9 3 7 3 4 3 9 3 8 0 5 2 6 5 8 7 2 6 9 5 5
 1 0 5 9 1 9 9 9 2 7 5 7 9 7 3 9 3 6 4 8 5 1 7 5 6 1 9 1 0 0 8 7 8 0 7 8 0
 1 1 9 0 5 6 4 4 2 2 5 3 7 6 5 8 1 8 9 6 6 3 3 2 7 9 4 4 5 0 2 0 3 6 4 1 2
 3 4 8 3 6 0 0 4 0 2 0 6 5 5 6 1 8 4 6 5 2 2 3 5 0 7 1 7 1 2 8 1 9 6 4 7 2
 8 6 7 3 3 7 8 7 1 3 1 2 4 4 2 4 6 1 3 0 4 0 3 6 3 4 6 7 3 0 6 3 2 9 2 1 6
 1 0 2 1 1 0 7 7 6 2 6 8 4 7 0 3 0 4 9 8 9 3 6 9 7 9 8 2 2 6 0 2 1 2 1 3 1