### Setting

In [1]:
import glob
import shutil
import os
import random

# Root of the input volume; the full dataset and its sampled copy both live under it.
input_dir = "/opt/ml/input"
# Full dataset (source of the copy).
data_dir = os.path.join(input_dir, "data")
# Reduced copy of the dataset (destination of the copy).
sample_dir = os.path.join(input_dir, "sample_data")

# Class names; each one is also used as a sub-directory name under every split.
categories = [
    "Battery",
    "Clothing",
    "Glass",
    "Metal",
    "Paper",
    "Paperpack",
    "Plastic",
    "Plasticbag",
    "Styrofoam",
]

In [2]:
# Make the sample_data directory tree: <sample_dir>/<split>/<category>.
# The root plus the "train" and "val" split directories are created explicitly;
# the "test" split directory only comes into existence as a parent of its
# per-category directories below (os.makedirs creates missing parents).
for top_level in (
    sample_dir,
    os.path.join(sample_dir, "train"),
    os.path.join(sample_dir, "val"),
):
    os.makedirs(top_level, exist_ok=True)

for category in categories:
    for split in ("train", "test", "val"):
        os.makedirs(os.path.join(sample_dir, split, category), exist_ok=True)

### 데이터 개수 확인

In [3]:
# Per-category table of how many entries the full dataset has in the
# train and val splits, plus their sum.
print(f"{'Category':10} | train |   val | total")
print("--------------------------------------")
for category in categories:
    # Everything directly inside <data_dir>/<split>/<category> is counted.
    train_imgs = glob.glob(os.path.join(data_dir, "train", category, "*"))
    val_imgs = glob.glob(os.path.join(data_dir, "val", category, "*"))
    n_train, n_val = len(train_imgs), len(val_imgs)

    print(f"{category:10} | {n_train:5} | {n_val:5} | {n_train+n_val}")

Category   | train |   val | total
--------------------------------------
Battery    |   261 |    87 | 348
Clothing   |   676 |   225 | 901
Glass      |  1169 |   389 | 1558
Metal      |  1669 |   556 | 2225
Paper      |  6895 |  2298 | 9193
Paperpack  |  1458 |   486 | 1944
Plastic    |  3793 |  1264 | 5057
Plasticbag |  6970 |  2323 | 9293
Styrofoam  |  1560 |   520 | 2080


### 일부 데이터(sample data) 복사
- `/opt/ml/input/data` -> `/opt/ml/input/sample_data` 폴더
- train/val data는 category별 비율 유지하면서 일부 복사
- test data는 모두 복사

In [6]:
times = 0.1  # fraction of each category's files to copy (0.1 == 10%)
seed = 42  # fixed seed so that re-running this cell selects the same files


def _pick_files(src_dir, fraction, rng):
    """Return a reproducible random subset of the entries directly under src_dir.

    Args:
        src_dir: directory whose direct children are candidates.
        fraction: share of the entries to keep; the count is
            int(len(entries) * fraction), i.e. rounded down.
        rng: random.Random instance that drives the shuffle.

    Returns:
        List of selected paths (empty when src_dir has no entries).
    """
    # glob order depends on the file system, so sort first: the same seed
    # then always yields the same selection.
    paths = sorted(glob.glob(os.path.join(src_dir, "*")))
    rng.shuffle(paths)
    return paths[:int(len(paths) * fraction)]


def _copy_files(paths, dst_dir):
    """Copy every path in `paths` into dst_dir, creating dst_dir if needed."""
    # Without an existing directory, shutil.copy would write a *file* named
    # dst_dir (or fail), so make sure the directory is there.
    os.makedirs(dst_dir, exist_ok=True)
    for path in paths:
        shutil.copy(path, dst_dir)


# A private generator keeps the selection independent of any other use of the
# global `random` state in the kernel.
rng = random.Random(seed)

for category in categories:
    # train / val: copy a `times` fraction of each category, so the
    # per-category proportions of the full dataset are kept.
    for split in ("train", "val"):
        picked = _pick_files(os.path.join(data_dir, split, category), times, rng)
        _copy_files(picked, os.path.join(sample_dir, split, category))

    # test: copy everything
    test_paths = glob.glob(os.path.join(data_dir, "test", category, "*"))
    _copy_files(test_paths, os.path.join(sample_dir, "test", category))

# NOTE: files already present in sample_dir from an earlier run with a
# different seed or fraction are not removed by this cell.
print("Done.")
        
print("Done.")

Done.


### 복사된 데이터(sample data) 개수 확인

In [7]:
# Table of how many entries ended up in the sampled train / val directories.
print(f"Copy {times*100}% of each category\n")
print(f"{'category':10} | train | validation")
print("-" * 30)

for category in categories:
    # Count everything directly inside <sample_dir>/<split>/<category>.
    n_train = len(glob.glob(os.path.join(sample_dir, "train", category, "*")))
    n_val = len(glob.glob(os.path.join(sample_dir, "val", category, "*")))
    # One row per category: name, train count, validation count.
    print(f"{category:10} | {n_train:4}  |{n_val:5}")

Copy 10.0% of each category

category   | train | validation
------------------------------
Battery    |   26  |    8
Clothing   |   67  |   22
Glass      |  116  |   38
Metal      |  166  |   55
Paper      |  689  |  229
Paperpack  |  145  |   48
Plastic    |  379  |  126
Plasticbag |  697  |  232
Styrofoam  |  156  |   52
