In [None]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('/home/thanhbinh/code')
from butils.base import print_progress
from butils.imutils import load_image_file, resize

import os
from glob import glob
import numpy as np
import pandas as pd
import tensorflow as tf
import h5py

In [3]:
data_dir = 'data/original/'
out_dir = 'data/'
image_size = (224, 224)

In [4]:
filepaths = [f for f in glob(os.path.join(data_dir, '**'), recursive=True)
             if os.path.isfile(f) and os.path.splitext(f)[-1].lower() in ['.jpg', '.png']]
relpaths = [os.path.relpath(f, data_dir) for f in filepaths]
relpaths[:5]

['No_RA/IMG_3445.JPG',
 'No_RA/IMG_2914.JPG',
 'No_RA/IMG_2855.JPG',
 'No_RA/IMG_1388.JPG',
 'No_RA/IMG_0903.JPG']

In [5]:
classnames = [f.split('/', 1)[0] for f in relpaths]
filenames = [f.split('/', 1)[1] for f in relpaths]
df = pd.DataFrame(data=zip(filenames, classnames), columns=['filename', 'classname'])
df.head()

Unnamed: 0,filename,classname
0,IMG_3445.JPG,No_RA
1,IMG_2914.JPG,No_RA
2,IMG_2855.JPG,No_RA
3,IMG_1388.JPG,No_RA
4,IMG_0903.JPG,No_RA


In [6]:
test_df = None
train_df = None
for classname in df['classname'].unique():
    test_tmp_df = df[df['classname']==classname].sample(50, replace=False)
    train_tmp_df = df[(df['classname']==classname) & ~(df['filename'].isin(test_tmp_df['filename']))]
    test_df = test_tmp_df if test_df is None else pd.concat([test_df, test_tmp_df])
    train_df = train_tmp_df if train_df is None else pd.concat([train_df, train_tmp_df])

print(train_df.shape, test_df.shape)

(1258, 2) (100, 2)


In [7]:
def load_images(df):
    X = np.zeros((df.shape[0], image_size[0], image_size[1], 3))

    for i, (filename, classname) in enumerate(df[['filename', 'classname']].values):
        fin = os.path.join(data_dir, classname, filename)
        print_progress(f'[{i+1}/{df.shape[0]}] {fin}')
        img = load_image_file(fin, mode='RGB')
        img = resize(img, image_size, keep_aspect=False, crop=False)
        X[i] = img
    
    return X

X_train = load_images(train_df)
y_train = np.array([0 if x=='No_RA' else 1 for x in train_df['classname'].values])

X_test = load_images(test_df)
y_test = np.array([0 if x=='No_RA' else 1 for x in test_df['classname'].values])

[100/100] data/original/RA/IMG_5671.JPG   

In [8]:
h5f = h5py.File(os.path.join(out_dir, 'RockAI_test_50_images.h5'), 'w')
h5f['X_train'] = X_train
h5f['y_train'] = y_train
h5f['X_test'] = X_test
h5f['y_test'] = y_test
h5f.close()

In [9]:
train_df.to_csv(os.path.join(out_dir, 'RockAI_test_50_images_trainset.csv'), index=False, header=True)
test_df.to_csv(os.path.join(out_dir, 'RockAI_test_50_images_testset.csv'), index=False, header=True)