In [1]:
from PIL import Image
from tqdm import tqdm
import hashlib
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
import Augmentor

from sklearn.model_selection import StratifiedKFold, KFold

In [2]:
# Oversampling multiplier: each train/test split is expanded to
# AUGMENT_FACTOR * len(split) samples when the folds are generated.
AUGMENT_FACTOR = 3
# Target size passed to the pipeline's resize step; every processed image is
# checked against the shape (*IMG_SHAPE, 3).
IMG_SHAPE = (224, 224)

In [3]:
# Dataset root (relative path, resolved against the current working directory).
# Files are globbed three levels below it, and the name of each file's parent
# directory is used as its category label.
path = Path("dataset_updated")

In [4]:
# Index every readable image under `path`: one row per file, keyed by the MD5
# of its absolute path, with the parent directory name as the class label.
img_list = []
for img in path.glob('*/*/*'):
    try:
        # Image.open only identifies the file; the context manager releases the
        # file handle right away instead of leaving it to the garbage collector.
        with Image.open(img):
            pass
    except Exception:
        # Best-effort scan: anything PIL cannot open (directories, non-image or
        # corrupt files) is skipped. Unlike a bare `except:`, this still lets
        # KeyboardInterrupt / SystemExit propagate so the scan can be aborted.
        continue
    cat = img.parts[-2]
    pathstr = str(img.absolute())
    img_id = hashlib.md5(pathstr.encode('utf-8')).hexdigest()
    img_list.append((img_id, pathstr, cat))

# Build the frame in one expression (no in-place mutation) so re-running the
# cell always yields the same result: index 'id', columns 'path' and 'category'.
img_df = pd.DataFrame(img_list, columns=['id', 'path', 'category']).set_index('id')
img_df['category'] = img_df['category'].astype('category')
img_df.head()

  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))


Unnamed: 0_level_0,path,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1
96cb78a545ae9bff37048375762a43c4,/home/asm/dev/budokai/dataset_updated/validati...,iconography
c8c4ef86d86573cacb5991e362a4ac2d,/home/asm/dev/budokai/dataset_updated/validati...,iconography
1eac68a564617590992bae5e51fd4d8e,/home/asm/dev/budokai/dataset_updated/validati...,iconography
0ddef4a9979cb84a1855c6e5133bd88c,/home/asm/dev/budokai/dataset_updated/validati...,iconography
08d8de80b60f2373689afa401ad4c547,/home/asm/dev/budokai/dataset_updated/validati...,iconography


In [5]:
def apply_pipeline(df, sample_size):
    """Resize the images listed in ``df`` and return them as stacked arrays.

    An ``Augmentor.DataFramePipeline`` is built over the ``'path'`` and
    ``'category'`` columns of ``df`` with a single resize operation
    (probability 1, target size ``IMG_SHAPE``). Every image in the pipeline is
    processed once; if ``sample_size`` exceeds the number of images, the
    shortfall is filled by drawing images with replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing a ``'path'`` column and a ``'category'`` column.
    sample_size : int
        Minimum number of samples to generate. Values smaller than the number
        of images do not truncate the output: every image is still processed.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` where ``X`` stacks the processed images, each of shape
        ``(*IMG_SHAPE, 3)``, and ``y`` stacks the ``categorical_label`` of the
        corresponding pipeline image.

    Raises
    ------
    ValueError
        If the pipeline contains no images.
    AssertionError
        If a processed image does not end up with shape ``(*IMG_SHAPE, 3)``;
        the message is the offending image path.

    Notes
    -----
    Oversampling uses the global NumPy random state (``np.random``), so seed it
    beforehand if reproducible draws are required.
    """
    p = Augmentor.DataFramePipeline(df, 'path', 'category')
    p.resize(1, *IMG_SHAPE)

    samples = list(p.augmentor_images)
    if not samples:
        # Fail with a clear message rather than an opaque unpacking error later.
        raise ValueError("apply_pipeline: the pipeline contains no images to sample from")

    shortfall = sample_size - len(samples)
    if shortfall > 0:
        # Oversample with replacement. Drawing indices (instead of handing the
        # image objects to NumPy) avoids coercing them into an object array.
        extra_idx = np.random.choice(len(samples), shortfall)
        samples = samples + [samples[j] for j in extra_idx]

    def process(img):
        result = p._execute(img, save_to_disk=False)
        # Palette ('P'), grey+alpha ('LA'), CMYK, ... images do not store RGB
        # values in their raw array, so let PIL perform the colour conversion.
        # For 'L' and 'RGBA' inputs this matches the manual handling below.
        if isinstance(result, Image.Image) and result.mode != 'RGB':
            result = result.convert('RGB')
        aug_img = np.array(result)
        # Fallbacks kept for results that are not PIL images.
        if aug_img.ndim < 3:
            aug_img = np.stack([aug_img] * 3, -1)
        if aug_img.shape[-1] > 3:
            aug_img = aug_img[..., :3]
        assert aug_img.shape == (*IMG_SHAPE, 3), img.image_path
        return aug_img, img.categorical_label

    results = []
    with tqdm(total=len(samples), desc="Generating samples", unit="samples") as pbar:
        for img in samples:
            results.append(process(img))
            pbar.update(1)

    X, y = zip(*results)
    return np.array(X), np.array(y)

In [6]:
# Write one .npz archive per stratified fold. Both the training and the test
# split of each fold are expanded to AUGMENT_FACTOR times their size.
skf = StratifiedKFold(5)
fold_splits = skf.split(img_df.path, img_df.category)
for i, (train, test) in enumerate(fold_splits):
    train_df, test_df = img_df.iloc[train], img_df.iloc[test]
    X_train, y_train = apply_pipeline(train_df, AUGMENT_FACTOR * len(train))
    X_test, y_test = apply_pipeline(test_df, AUGMENT_FACTOR * len(test))
    fold_arrays = {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test,
    }
    np.savez(f'fold{i}_data.npz', **fold_arrays)
    # Blank line so the progress bars of consecutive folds stay separated.
    print()

Generating samples:   0%|          | 25/20580 [00:00<01:23, 246.60samples/s]

Initialised with 6860 image(s) found.
Output directory set to output.

Generating samples: 100%|██████████| 20580/20580 [00:50<00:00, 404.84samples/s]
Generating samples:   1%|          | 44/5151 [00:00<00:11, 427.76samples/s]

Initialised with 1717 image(s) found.
Output directory set to output.

Generating samples: 100%|██████████| 5151/5151 [00:12<00:00, 416.29samples/s]





Generating samples:   0%|          | 43/20583 [00:00<00:48, 427.42samples/s]

Initialised with 6861 image(s) found.
Output directory set to output.

Generating samples: 100%|██████████| 20583/20583 [00:50<00:00, 410.19samples/s]
Generating samples:   1%|          | 27/5148 [00:00<00:19, 266.20samples/s]

Initialised with 1716 image(s) found.
Output directory set to output.

Generating samples: 100%|██████████| 5148/5148 [00:12<00:00, 404.04samples/s]





Generating samples:   0%|          | 44/20583 [00:00<00:48, 426.93samples/s]

Initialised with 6861 image(s) found.
Output directory set to output.

Generating samples: 100%|██████████| 20583/20583 [00:49<00:00, 411.91samples/s]
Generating samples:   1%|          | 28/5148 [00:00<00:18, 276.94samples/s]

Initialised with 1716 image(s) found.
Output directory set to output.

Generating samples: 100%|██████████| 5148/5148 [00:13<00:00, 395.44samples/s]





Generating samples:   0%|          | 44/20586 [00:00<00:48, 427.31samples/s]

Initialised with 6862 image(s) found.
Output directory set to output.

Generating samples: 100%|██████████| 20586/20586 [00:50<00:00, 409.64samples/s]
Generating samples:   1%|          | 31/5145 [00:00<00:16, 300.97samples/s]

Initialised with 1715 image(s) found.
Output directory set to output.

Generating samples: 100%|██████████| 5145/5145 [00:12<00:00, 409.39samples/s]





Generating samples:   0%|          | 44/20592 [00:00<00:48, 424.93samples/s]

Initialised with 6864 image(s) found.
Output directory set to output.

Generating samples: 100%|██████████| 20592/20592 [00:49<00:00, 414.82samples/s]
Generating samples:   1%|          | 34/5139 [00:00<00:15, 323.40samples/s]

Initialised with 1713 image(s) found.
Output directory set to output.

Generating samples: 100%|██████████| 5139/5139 [00:12<00:00, 415.91samples/s]



