In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# import sys
# sys.path.append('/home/thanhbinh/code')
# from butils.base import print_progress
# from butils.imutils import load_image_file, resize

from glob import glob
import numpy as np
import pandas as pd
import tensorflow as tf
import h5py

from PIL import Image

In [3]:
data_dir = 'data/original/'
out_dir = 'data/'
image_size = (299, 299)

In [4]:
filepaths = [f for f in glob(os.path.join(data_dir, '**'), recursive=True)
             if os.path.isfile(f) and os.path.splitext(f)[-1].lower() in ['.jpg', '.png']]
relpaths = [os.path.relpath(f, data_dir) for f in filepaths]
relpaths[:5]

['No_RA/IMG_5932.JPG',
 'No_RA/IMG_2821.JPG',
 'No_RA/IMG_2801.JPG',
 'No_RA/IMG_6875.JPG',
 'No_RA/IMG_2791.JPG']

In [5]:
classnames = [f.split('/', 1)[0] for f in relpaths]
filenames = [f.split('/', 1)[1] for f in relpaths]
df = pd.DataFrame(data=zip(filenames, classnames), columns=['filename', 'classname'])
df.head()

Unnamed: 0,filename,classname
0,IMG_5932.JPG,No_RA
1,IMG_2821.JPG,No_RA
2,IMG_2801.JPG,No_RA
3,IMG_6875.JPG,No_RA
4,IMG_2791.JPG,No_RA


In [6]:
test_df = None
train_df = None
for classname in df['classname'].unique():
    test_tmp_df = df[df['classname']==classname].sample(50, replace=False, random_state=1234)
    train_tmp_df = df[(df['classname']==classname) & ~(df['filename'].isin(test_tmp_df['filename']))]
    test_df = test_tmp_df if test_df is None else pd.concat([test_df, test_tmp_df])
    train_df = train_tmp_df if train_df is None else pd.concat([train_df, train_tmp_df])

print(train_df.shape, test_df.shape)

(1358, 2) (100, 2)


In [7]:
def load_images(df):
    X = np.zeros((df.shape[0], image_size[0], image_size[1], 3))

    for i, (filename, classname) in enumerate(df[['filename', 'classname']].values):
        fin = os.path.join(data_dir, classname, filename)
        print(f'[{i+1}/{df.shape[0]}] {fin}')
#         img = load_image_file(fin, mode='RGB')
#         img = resize(img, image_size, keep_aspect=False, crop=False)
        img = Image.open(fin)
        img = img.resize(image_size)
        X[i] = np.array(img)
    
    return X

X_train = load_images(train_df)
y_train = np.array([0 if x=='No_RA' else 1 for x in train_df['classname'].values])

X_test = load_images(test_df)
y_test = np.array([0 if x=='No_RA' else 1 for x in test_df['classname'].values])

[1/1358] data/original/No_RA/IMG_5932.JPG
[2/1358] data/original/No_RA/IMG_2801.JPG
[3/1358] data/original/No_RA/IMG_6875.JPG
[4/1358] data/original/No_RA/IMG_2800.JPG
[5/1358] data/original/No_RA/IMG_2785.JPG
[6/1358] data/original/No_RA/IMG_1025.JPG
[7/1358] data/original/No_RA/IMG_2877.JPG
[8/1358] data/original/No_RA/IMG_2881.JPG
[9/1358] data/original/No_RA/IMG_2885.JPG
[10/1358] data/original/No_RA/IMG_2856.JPG
[11/1358] data/original/No_RA/IMG_2774.JPG
[12/1358] data/original/No_RA/IMG_2883.JPG
[13/1358] data/original/No_RA/IMG_2837.JPG
[14/1358] data/original/No_RA/IMG_2913.JPG
[15/1358] data/original/No_RA/IMG_2786.JPG
[16/1358] data/original/No_RA/P1000629.JPG
[17/1358] data/original/No_RA/IMG_2905.JPG
[18/1358] data/original/No_RA/IMG_2784.JPG
[19/1358] data/original/No_RA/IMG_2914.JPG
[20/1358] data/original/No_RA/IMG_2768.JPG
[21/1358] data/original/No_RA/IMG_2923.JPG
[22/1358] data/original/No_RA/IMG_2859.JPG
[23/1358] data/original/No_RA/IMG_2886.JPG
[24/1358] data/origi

In [8]:
h5f = h5py.File(os.path.join(out_dir, 'RockAI_299x299_testset50_images.h5'), 'w')
h5f['X_train'] = X_train
h5f['y_train'] = y_train
h5f['X_test'] = X_test
h5f['y_test'] = y_test
h5f.close()

In [9]:
train_df.to_csv(os.path.join(out_dir, 'RockAI_299x299_testset50_trainset.csv'), index=False, header=True)
test_df.to_csv(os.path.join(out_dir, 'RockAI_299x299_testset50_testset.csv'), index=False, header=True)