# Preprocessing Images to Generate a Dataset

In [1]:
# Install the Cloud Storage client and TensorFlow into the notebook environment.
# NOTE(review): versions are unpinned, and scikit-learn / scikit-image (both
# imported by this notebook) are not installed here — presumably the base image
# already provides them; confirm before running on a clean environment.
!pip install google-cloud-storage tensorflow --quiet

In [1]:
import os

from google.cloud import storage
import tensorflow as tf
from sklearn.preprocessing import LabelBinarizer
from datetime import datetime as dt
from multiprocessing import Pool
from skimage.io import imread
from tensorflow.keras.applications import vgg16
import numpy as np
import json

from tensorflow.python.lib.io import file_io
import _pickle as pickle

In [2]:
# Point the Google client libraries at a service-account key file.
# setdefault keeps a value that is already exported in the environment, so the
# notebook can run on another machine without editing this cell; the path below
# is only the fallback used when nothing has been configured.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/home/jovyan/work/bigdata-217213-55b1dfc31b66.json',
)

In [3]:
# Timestamp (second resolution) taken when this cell runs; it names the run's
# output folder.
train_id = '{:%Y%m%d_%H%M%S}'.format(dt.now())

In [4]:
# Root GCS prefix for every artifact written by this run. It ends with "/" so
# file and folder names can be appended to it directly.
basepath = 'gs://bigdata-allanbatista-com-br/image-classifier/{}/'.format(train_id)
basepath

'gs://bigdata-allanbatista-com-br/image-classifier/20181118_165025/'

In [5]:
# Storage client built without explicit credentials (it relies on the
# environment), and a handle to the bucket that holds both the source images
# and this run's outputs.
gs = storage.Client()
bucket = gs.bucket('bigdata-allanbatista-com-br')

def list_blobs(pattern):
    """Return the names of all objects in ``bucket`` whose name starts with ``pattern``.

    ``pattern`` is handed to ``bucket.list_blobs`` as a plain prefix; it is not
    interpreted as a glob or a regular expression.
    """
    names = []
    for blob in bucket.list_blobs(prefix=pattern):
        names.append(blob.name)
    return names

def list_images_with_labels(pattern):
    """List object paths under ``pattern`` together with their class labels.

    The label is the third "/"-separated component of the object name, so with
    the prefixes used in this notebook ("dataset/train", "dataset/test") the
    expected layout is ``dataset/<split>/<label>/<file>``.

    Two kinds of names are skipped instead of being returned as images:
    names ending in "/" (zero-byte "folder" placeholder objects, which cannot be
    decoded as images) and names with fewer than three components (which have
    no label component and previously raised IndexError).

    Args:
        pattern: name prefix forwarded to ``list_blobs``.

    Returns:
        ``(paths, labels)``: two parallel lists of strings.
    """
    paths = []
    labels = []
    for path in list_blobs(pattern):
        parts = path.split("/")
        if path.endswith("/") or len(parts) < 3:
            continue
        paths.append(path)
        labels.append(parts[2])

    return paths, labels

In [6]:
# Object paths and class labels for each split, derived from the bucket layout.
train_paths, train_labels = list_images_with_labels("dataset/train")
test_paths, test_labels = list_images_with_labels("dataset/test")

In [7]:
# Binarize the string labels. The binarizer is fitted on the training labels
# only and then reused for the test labels, so both splits share one class order.
binarizer = LabelBinarizer()
y_train = binarizer.fit_transform(train_labels)
y_test = binarizer.transform(test_labels)

# Store the fitted binarizer under this run's prefix — presumably so label
# vectors can be mapped back to class names later; verify against the consumer.
with file_io.FileIO('{}binarizer.pickle'.format(basepath), 'wb+') as f:
    f.write(pickle.dumps(binarizer))

In [8]:
# VGG16 loaded with ImageNet weights and without its classifier head
# (include_top=False), declared for 256x256 3-channel input. It is only used
# for prediction in this notebook, i.e. as a fixed feature extractor.
vgg_conv = vgg16.VGG16(weights='imagenet',
                       include_top=False,
                       input_shape=(256, 256, 3))

In [9]:
# Describe the generated dataset in a small JSON file stored under the run prefix.
# 8 * 8 * 512 is the flattened length of one feature vector; it matches the
# reshape applied to the VGG16 output when the records are written.
# NOTE(review): the key 'input_dimention' is misspelled; it is left untouched
# because readers of metadata.json presumably look it up by this exact name.
with file_io.FileIO('{}metadata.json'.format(basepath), 'wb+') as f:
    f.write(json.dumps({
        'input_dimention': 8 * 8 * 512,
        'train_samples_count': len(y_train),
        'test_samples_count': len(y_test),
        'classes_count': len(binarizer.classes_)
    }))

In [10]:
# Number of images processed per batch; each batch is written to its own
# .tfrecord file.
chunck_size = 100

In [11]:
# NOTE(review): not referenced anywhere else in the visible notebook — looks
# like a leftover; confirm before removing.
images_features = np.array([])

def create_record(features, label):
    """Wrap one feature vector and its label in a ``tf.train.Example``.

    Args:
        features: array-like exposing ``tobytes()``; its raw bytes are stored
            under the key 'features'. Shape and dtype are not recorded, so the
            reader has to know them in order to decode the bytes.
        label: iterable of integers stored under the key 'label'.

    Returns:
        The assembled ``tf.train.Example``.
    """
    label_feature = tf.train.Feature(int64_list=tf.train.Int64List(value=label))
    bytes_feature = tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[features.tobytes()])
    )
    payload = tf.train.Features(feature={
        'label': label_feature,
        'features': bytes_feature,
    })
    return tf.train.Example(features=payload)

def read_image(path):
    """Read one object from the image bucket and decode it with ``imread``.

    Args:
        path: object name inside the bucket (without the ``gs://`` scheme or
            the bucket name).

    Returns:
        Whatever ``imread`` returns for the opened file.
    """
    gcs_uri = 'gs://bigdata-allanbatista-com-br/{}'.format(path)
    with file_io.FileIO(gcs_uri, 'rb') as handle:
        return imread(handle)
    
def create_and_write_record(data, x, y, dest_path):
    """Extract VGG16 features for one chunk of images and write one TFRecord file.

    Args:
        data: pair ``(start, index)`` — offset of the chunk inside ``x``/``y``
            and the chunk's ordinal, which becomes the zero-padded file name.
        x: sequence of object paths accepted by ``read_image``.
        y: sequence of label vectors aligned with ``x``.
        dest_path: prefix the file name is appended to (callers pass a value
            ending in "/").

    Side effects:
        Writes ``<dest_path><index, 5 digits>.tfrecord`` and prints the elapsed
        time together with the file name.
    """
    i = data[0]
    index = data[1]
    start = dt.now()
    filename = "{}{}.tfrecord".format(dest_path, str(index).zfill(5))

    # Fetch and decode the chunk's images in parallel worker processes.
    with Pool(10) as p:
        images = np.array(p.map(read_image, x[i:i+chunck_size]))

    # NOTE(review): images are fed to the network exactly as decoded, without
    # vgg16.preprocess_input — confirm this is intentional.
    features = vgg_conv.predict(images)
    # Flatten each feature map to one vector of 8 * 8 * 512 values.
    features = np.reshape(features, (len(features), 8 * 8 * 512))

    # tf.python_io exists only in TensorFlow 1.x; later releases expose the same
    # writer as tf.io.TFRecordWriter. Keep the original class when it is there.
    if hasattr(tf, 'python_io'):
        writer_cls = tf.python_io.TFRecordWriter
    else:
        writer_cls = tf.io.TFRecordWriter

    with writer_cls(filename) as writer:
        for feature, label in zip(features, y[i:i+chunck_size]):
            record = create_record(feature, label)
            writer.write(record.SerializeToString())

    print("diff %ds: %s" % ((dt.now() - start).total_seconds(), filename))

    
def create_dataset_to_gs(x, y, dest_path):
    """Write the whole dataset as a numbered series of TFRecord files.

    The samples are processed in consecutive slices of ``chunck_size`` items
    (the number of slices is derived from ``len(y)``); every slice is handed to
    ``create_and_write_record`` together with its start offset and ordinal.

    Args:
        x: sequence of image object paths.
        y: sequence of label vectors aligned with ``x``.
        dest_path: prefix under which the files are written.
    """
    for chunk_no, offset in enumerate(range(0, len(y), chunck_size)):
        create_and_write_record((offset, chunk_no), x, y, dest_path)

In [12]:
# Extract features for the training split and write them under <basepath>trainset/.
create_dataset_to_gs(train_paths, y_train, "{}trainset/".format(basepath))

diff 146s: gs://bigdata-allanbatista-com-br/image-classifier/20181118_165025/trainset/00000.tfrecord
diff 147s: gs://bigdata-allanbatista-com-br/image-classifier/20181118_165025/trainset/00001.tfrecord
diff 92s: gs://bigdata-allanbatista-com-br/image-classifier/20181118_165025/trainset/00002.tfrecord
diff 79s: gs://bigdata-allanbatista-com-br/image-classifier/20181118_165025/trainset/00003.tfrecord
diff 79s: gs://bigdata-allanbatista-com-br/image-classifier/20181118_165025/trainset/00004.tfrecord
diff 79s: gs://bigdata-allanbatista-com-br/image-classifier/20181118_165025/trainset/00005.tfrecord
diff 81s: gs://bigdata-allanbatista-com-br/image-classifier/20181118_165025/trainset/00006.tfrecord
diff 80s: gs://bigdata-allanbatista-com-br/image-classifier/20181118_165025/trainset/00007.tfrecord
diff 79s: gs://bigdata-allanbatista-com-br/image-classifier/20181118_165025/trainset/00008.tfrecord
diff 82s: gs://bigdata-allanbatista-com-br/image-classifier/20181118_165025/trainset/00009.tfrecord

In [13]:
# Extract features for the test split and write them under <basepath>testset/.
create_dataset_to_gs(test_paths, y_test, "{}testset/".format(basepath))

diff 82s: gs://bigdata-allanbatista-com-br/image-classifier/20181118_165025/testset/00000.tfrecord
diff 74s: gs://bigdata-allanbatista-com-br/image-classifier/20181118_165025/testset/00001.tfrecord
diff 75s: gs://bigdata-allanbatista-com-br/image-classifier/20181118_165025/testset/00002.tfrecord
diff 76s: gs://bigdata-allanbatista-com-br/image-classifier/20181118_165025/testset/00003.tfrecord
diff 79s: gs://bigdata-allanbatista-com-br/image-classifier/20181118_165025/testset/00004.tfrecord
diff 78s: gs://bigdata-allanbatista-com-br/image-classifier/20181118_165025/testset/00005.tfrecord
diff 74s: gs://bigdata-allanbatista-com-br/image-classifier/20181118_165025/testset/00006.tfrecord
diff 75s: gs://bigdata-allanbatista-com-br/image-classifier/20181118_165025/testset/00007.tfrecord
diff 75s: gs://bigdata-allanbatista-com-br/image-classifier/20181118_165025/testset/00008.tfrecord
diff 74s: gs://bigdata-allanbatista-com-br/image-classifier/20181118_165025/testset/00009.tfrecord
diff 76s: 