### Setting

In [1]:
import glob
import shutil
import os
import random

# Root of the input tree; the full dataset ("data") and the reduced copy
# ("sample_data") are sibling folders underneath it.
input_dir = "/opt/ml/input"
data_dir, sample_dir = (
    os.path.join(input_dir, sub_dir) for sub_dir in ("data", "sample_data")
)

# Per-class folder names used under the train/ and val/ splits.
categories = [
    "Battery",
    "Clothing",
    "Glass",
    "Metal",
    "Paper",
    "Paperpack",
    "Plastic",
    "Plasticbag",
    "Styrofoam",
]

In [2]:
# Create the sample_data directory tree: the root, test/NoLabel, and one
# train/<category> plus val/<category> folder for every category.
# exist_ok=True means already-existing folders are not an error.
sample_subdirs = [sample_dir, os.path.join(sample_dir, "test", "NoLabel")]
for category in categories:
    sample_subdirs.extend(
        os.path.join(sample_dir, split, category) for split in ("train", "val")
    )

for new_dir in sample_subdirs:
    os.makedirs(new_dir, exist_ok=True)

### 데이터 개수 확인

In [3]:
# Print a per-category table with the number of entries in the original
# train and val folders and their sum.
print(f"{'Category':10} | train |   val | total")
print("--------------------------------------")
for category in categories:
    # Count the entries matched by <data_dir>/<split>/<category>/* for each split.
    n_train, n_val = (
        len(glob.glob(os.path.join(data_dir, split, category, "*")))
        for split in ("train", "val")
    )
    print(f"{category:10} | {n_train:5} | {n_val:5} | {n_train+n_val}")

Category   | train |   val | total
--------------------------------------
Battery    |   261 |    87 | 348
Clothing   |   676 |   225 | 901
Glass      |  1169 |   389 | 1558
Metal      |  1669 |   556 | 2225
Paper      |  6895 |  2298 | 9193
Paperpack  |  1458 |   486 | 1944
Plastic    |  3793 |  1264 | 5057
Plasticbag |  6970 |  2323 | 9293
Styrofoam  |  1560 |   520 | 2080


### 일부 데이터(sample data) 복사
- `/opt/ml/input/data` -> `/opt/ml/input/sample_data` 폴더
- train data
    - Battery  : 모두(261 개) 복사
    - Clothing : 모두(676 개) 복사
    - 그 외 : 1000개 복사
- val data
    - 복사한 train data 개수의 30%만큼 복사
- test data는 모두 복사

In [4]:
# Copy every entry of the unlabeled test split into the sample tree.

test_subpath = ("test", "NoLabel")
src_dir = os.path.join(data_dir, *test_subpath)
dst_dir = os.path.join(sample_dir, *test_subpath)
test_pattern = os.path.join(src_dir, "*")
for src_path in glob.iglob(test_pattern):
    shutil.copy(src_path, dst_dir)

print("Done.")

Done.


In [None]:
# Copy a capped sample of each category's training files into
# sample_data/train/<category>.
#
# glob.glob() yields entries in whatever order the file system lists them, so
# the listing is sorted before slicing; the same files are then selected on
# every run instead of an arbitrary subset.

num = 1000  # per-category cap; categories with fewer files are copied in full
for category in categories:
    src_dir = os.path.join(data_dir, "train", category)
    dst_dir = os.path.join(sample_dir, "train", category)

    data = sorted(glob.glob(os.path.join(src_dir, "*")))
    for jpgfile in data[:num]:
        shutil.copy(jpgfile, dst_dir)

print("Done.")

In [None]:
# Copy validation files into sample_data/val/<category>. For each category the
# number copied is 30% of the number of entries currently present in that
# category's sampled train folder.

times = 0.3  # validation sample size as a fraction of the sampled train count

for category in categories:
    sample_train_dir = os.path.join(sample_dir, "train", category)
    src_dir = os.path.join(data_dir, "val", category)
    dst_dir = os.path.join(sample_dir, "val", category)

    sample_train_data = glob.glob(os.path.join(sample_train_dir, "*"))
    # Sorted so the slice below picks the same files on every run; raw glob
    # order depends on the file system.
    data = sorted(glob.glob(os.path.join(src_dir, "*")))
    num = int(len(sample_train_data) * times)  # int() drops the fractional part
    for jpgfile in data[:num]:
        shutil.copy(jpgfile, dst_dir)

print("Done.")

### 복사된 데이터(sample data) 개수 확인

In [None]:
# Print how many entries each category has in the sampled train and val folders.
print(f"{'category':10} | train | validation")
print("-" * 30)

for category in categories:
    # Count the entries matched by <sample_dir>/<split>/<category>/* for each split.
    n_train, n_val = (
        len(glob.glob(os.path.join(sample_dir, split, category, "*")))
        for split in ("train", "val")
    )
    # One row per category: train count, then validation count.
    print(f"{category:10} | {n_train:4}  |{n_val:5}")

### 복사된 데이터(sample data) 일부 파일명 확인

In [None]:
# Show the first few file names of the sampled train and validation sets,
# paired row by row, for every category.
print(f"{'train':^33} | {'validation':^30}")
print(f"{'-'*66}")

num = 10  # number of file names shown per category
for category in categories:
    print(category)
    category_dir = os.path.join(sample_dir, "train", category)
    sample_train_data = glob.glob(os.path.join(category_dir, "*"))

    category_dir = os.path.join(sample_dir, "val", category)
    sample_val_data = glob.glob(os.path.join(category_dir, "*"))
    # zip() stops at the shorter list, so fewer than `num` rows are printed
    # when either folder holds fewer than `num` entries.
    for i, (train_img, val_img) in enumerate(zip(sample_train_data[:num], sample_val_data[:num])):
        # os.path.basename honours the platform's path separator, unlike a
        # manual split on '/'.
        train_fn = os.path.basename(train_img)
        val_fn = os.path.basename(val_img)
        print(f"{i:3}: {train_fn} | {val_fn}")
    print(f"{'-'*66}")