# Pipeline
using tf.data api vs ImageDataGenerator

In [None]:
!pip install -q kaggle

In [None]:
!pip install split-folders

Collecting split-folders
  Downloading split_folders-0.5.1-py3-none-any.whl (8.4 kB)
Installing collected packages: split-folders
Successfully installed split-folders-0.5.1


In [None]:
import tensorflow as tf

import os
import sys

import numpy as np
import pandas as pd

import splitfolders

from tensorflow.keras.applications.resnet50 import ResNet50

## 캐글에서 데이터셋 다운로드받기

In [None]:
# .kaggle 폴더 생성
!mkdir -p ~/.kaggle/
# kaggle.josn .kaggle로 복사
!cp kaggle.json ~/.kaggle/
# 복사 확인
!ls ~/.kaggle

kaggle.json


In [None]:
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
#downloading dataset
!kaggle datasets download -d prasunroy/natural-images 
!unzip -q natural-images.zip

Downloading natural-images.zip to /content
100% 341M/342M [00:02<00:00, 172MB/s]
100% 342M/342M [00:02<00:00, 163MB/s]


In [None]:
BATCH_SIZE = 32
IMG_HEIGHT = 256
IMG_WIDTH = 256

In [None]:
# CPU 기반 스레드 동적 할당
AUTOTUNE = tf.data.experimental.AUTOTUNE 

In [None]:
dir_path = "natural_images"
CLASS_NAMES = np.array(os.listdir(dir_path))

In [None]:
CLASS_NAMES

array(['airplane', 'car', 'person', 'cat', 'fruit', 'motorbike', 'flower',
       'dog'], dtype='<U9')

## 저장된 폴더를 Train-Val-Test Split하기

In [None]:
splitfolders.ratio('natural_images', output='dataset', seed=77, ratio=(0.6, 0.2, 0.2))

Copying files: 6899 files [00:01, 4444.06 files/s]


In [None]:
n_train = 0

path = '/content/dataset/train'

for dir in os.listdir(path):
    n_train += len(os.listdir(path+'/'+dir))
n_train

4136

In [None]:
n_test = 0

path = '/content/dataset/test'

for dir in os.listdir(path):
    n_test += len(os.listdir(path+'/'+dir))
n_test

1386

In [None]:
n_val = 0

path = '/content/dataset/val'

for dir in os.listdir(path):
    n_val += len(os.listdir(path+'/'+dir))
n_val

1377

## ImageDataGenerator

In [None]:
###Image data Generator class
ImageFlow = tf.keras.preprocessing.image.ImageDataGenerator(
                                                            rotation_range=40,
                                                            width_shift_range=0.2,
                                                            height_shift_range=0.2,
                                                            shear_range=0.2,
                                                            zoom_range=0.2
                                                        )
##We are fitting the data to Image data generator.
# flow_from_directory : 폴더로부터 데이터 불러오기
ImageGenerator = ImageFlow.flow_from_directory(dir_path, target_size=(256,256),seed=10,batch_size=32)

Found 6899 images belonging to 8 classes.


### imagedataGenerator를 이용해 data load하는 시간 측정

In [None]:
import time

for t in range(3): 
    start = time.time()
    total_batches = 0

    batches = 0
    for x_batch, y_batch in ImageGenerator:
        batches += 1
        if batches >= n_train/BATCH_SIZE:
            total_batches += batches
            break 
    end = time.time()
    ex_time = end - start
    print("{} batches: {} s".format(total_batches, ex_time))
    print("{:0.5f} Images/s".format(BATCH_SIZE*total_batches/ex_time))

130 batches: 80.1123948097229 s
51.92705 Images/s
130 batches: 76.45631980895996 s
54.41015 Images/s
130 batches: 77.82978677749634 s
53.44997 Images/s


## tf.data

### Step 1 경로 안에 있는 대용량 데이터 불러오기 (feat. Dataset.list_files)

In [None]:
train_list = tf.data.Dataset.list_files(str('/content/dataset/'+'train'+'/*/*'),shuffle=False)
val_list = tf.data.Dataset.list_files(str('/content/dataset/'+'val'+'/*/*'),shuffle=False)
test_list = tf.data.Dataset.list_files(str('/content/dataset/'+'test'+'/*/*'),shuffle=False)

In [None]:
# list_ds 확인
for f in train_list.take(5):
    print(f.numpy())

b'/content/dataset/train/airplane/airplane_0001.jpg'
b'/content/dataset/train/airplane/airplane_0003.jpg'
b'/content/dataset/train/airplane/airplane_0006.jpg'
b'/content/dataset/train/airplane/airplane_0007.jpg'
b'/content/dataset/train/airplane/airplane_0010.jpg'


### Step 2 전처리를 위한 데이터셋 변환

In [None]:
def process_path(file_path):
    # load the raw data from the file as a string
    img = tf.io.read_file(file_path)
    # a 3D uint8 tensor로 변환
    img = tf.image.decode_jpeg(img, channels=3)
    # float32로 변환
    img = tf.image.convert_image_dtype(img, tf.float32)
    # resize
    img = tf.image.resize(img, [IMG_WIDTH, IMG_HEIGHT])


    # 파일경로 -> 경로 단위로 split하여 리스트로
    # content/dataset/train/class1/image1.jpg -> [content, dataset, train, class1, image1.jpg]
    parts = tf.strings.split(file_path, os.path.sep)
    # 뒤에서 두번째 요소가 클래스별 폴더명
    label = parts[-2] == CLASS_NAMES
    return img, label

In [None]:
train_labeled = train_list.map(process_path, num_parallel_calls=AUTOTUNE)
val_labeled = val_list.map(process_path, num_parallel_calls=AUTOTUNE)
test_labeled = test_list.map(process_path, num_parallel_calls=AUTOTUNE)

In [None]:
def augment_image(image, label):
  im_shape = image.shape
  image=tf.image.random_flip_left_right(image)
  image=tf.image.random_flip_up_down(image)

  return image, tf.cast(label,tf.float32)

In [None]:
train_augmented = train_labeled.map(augment_image, num_parallel_calls=AUTOTUNE)
val_augmented = val_labeled.map(augment_image, num_parallel_calls=AUTOTUNE)
test_augmented = test_labeled.map(augment_image, num_parallel_calls=AUTOTUNE)

### Step 3 데이터셋 Iterate

In [None]:
def prepare_for_training(ds, cache=True, shuffle_buffer_size=1000):
    # This is a small dataset, only load it once, and keep it in memory.
    # use `.cache(filename)` to cache preprocessing work for datasets that don't
    # fit in memory.
    if cache:
        if isinstance(cache, str):
            ds = ds.cache(cache)
        else:
            ds = ds.cache()

    ds = ds.shuffle(buffer_size=shuffle_buffer_size)

    # Repeat forever
    ds = ds.repeat()

    ds = ds.batch(BATCH_SIZE)

    # `prefetch` lets the dataset fetch batches in the background while the model
    # is training.
    ds = ds.prefetch(buffer_size=AUTOTUNE)

    return ds

In [None]:
train_ds = prepare_for_training(train_augmented)
val_ds = prepare_for_training(val_augmented)
test_ds = prepare_for_training(test_augmented)

### Step 4. tf.data를 이용해 data load하는 시간 측정

In [None]:
for i in range(3):
    t = 0
    start = time.time()
    for x, y in train_ds.take(BATCH_SIZE):
        pass
    end = time.time()
    ex_time = end - start
    print("{} batches: {} s".format(total_batches, ex_time))
    print("{:0.5f} Images/s".format(BATCH_SIZE*total_batches/ex_time))

# imagedatagenerator와 비교
# 130 batches: 80.1123948097229 s
# 51.92705 Images/s
# 130 batches: 76.45631980895996 s
# 54.41015 Images/s
# 130 batches: 77.82978677749634 s
# 53.44997 Images/s

130 batches: 10.258490085601807 s
405.51777 Images/s
130 batches: 3.1857516765594482 s
1305.81427 Images/s
130 batches: 3.1746747493743896 s
1310.37046 Images/s


###  Step 5. Model Train and Test

In [None]:
def create_model():
    base_model = ResNet50(include_top=False, input_shape = (256, 256 ,3), weights = 'imagenet')

    inputs = tf.keras.Input(shape=(256, 256, 3))

    x = base_model(inputs, training=False) # batchnorm 부분 update 방지

    x = tf.keras.layers.Flatten(input_shape=base_model.output_shape[1:])(x)
    x = tf.keras.layers.Dense(256, activation='relu')(x)
    x= tf.keras.layers.Dropout(0.5)(x)
    outputs = tf.keras.layers.Dense(8, activation='softmax')(x)

    model = tf.keras.Model(inputs, outputs)

    model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='categorical_crossentropy',
    metrics=['acc'])

    return model

In [None]:
model = create_model()

model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 256, 256, 3)]     0         
                                                                 
 resnet50 (Functional)       (None, 8, 8, 2048)        23587712  
                                                                 
 flatten_2 (Flatten)         (None, 131072)            0         
                                                                 
 dense_4 (Dense)             (None, 256)               33554688  
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 dense_5 (Dense)             (None, 8)                 2056      
                                                                 
Total params: 57,144,456
Trainable params: 57,091,336
Non-t

In [None]:
history = model.fit(train_ds,
                    epochs=10, 
                    steps_per_epoch=n_train/BATCH_SIZE,
                    validation_steps=n_val/BATCH_SIZE,
                    validation_data=val_ds)