# Preprocessing Images to Generate a Dataset

In [15]:
!pip install google-cloud-storage tensorflow --quiet

  Could not find a version that satisfies the requirement cv2 (from versions: )
No matching distribution found for cv2


In [74]:
import os

from google.cloud import storage
import tensorflow as tf
from sklearn.preprocessing import LabelBinarizer
from datetime import datetime as dt
from multiprocessing import Pool
from skimage.io import imread
from tensorflow.keras.applications import vgg16
import numpy as np

from tensorflow.python.lib.io import file_io
import _pickle as pickle

In [2]:
# Point the Google client libraries at the service-account key file, but only
# when the environment does not already provide one, so that credentials
# configured outside the notebook are not overwritten by this local path.
if 'GOOGLE_APPLICATION_CREDENTIALS' not in os.environ:
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/jovyan/work/image-classifier/bigdata-217213-55b1dfc31b66.json'

In [3]:
# Every run writes under its own timestamped prefix (YYYYmmdd_HHMMSS), so
# re-executing this cell starts a new output location.
run_stamp = dt.now().strftime('%Y%m%d_%H%M%S')
basepath = 'gs://bigdata-allanbatista-com-br/trainer/{}/'.format(run_stamp)
basepath

'gs://bigdata-allanbatista-com-br/trainer/20181110_162649/'

In [4]:
# Storage client and bucket handle shared by the listing helper in this notebook.
gs = storage.Client()
bucket = gs.bucket(bucket_name='bigdata-allanbatista-com-br')

def list_blobs(pattern):
    """Return the names of the blobs in `bucket` listed with prefix `pattern`."""
    names = []
    for blob in bucket.list_blobs(prefix=pattern):
        names.append(blob.name)
    return names

In [5]:
def list_images_with_labels(pattern):
    """List blob paths under `pattern` together with a label for each one.

    The label is the third "/"-separated component of the blob path
    (index 2), so a path with fewer than three components raises IndexError.

    Returns:
        (paths, labels): two lists of equal length, aligned by position.
    """
    blob_paths = list_blobs(pattern)
    class_names = [blob_path.split("/")[2] for blob_path in blob_paths]
    return blob_paths, class_names

In [6]:
# Blob paths of the training images and the label taken from each path.
paths, labels = list_images_with_labels(pattern="dataset/train")

In [42]:
# Fit the label encoder on the string labels, then encode them;
# `y_train` has one row per entry of `labels`.
binarizer = LabelBinarizer().fit(labels)
y_train = binarizer.transform(labels)

In [43]:
# Store the fitted binarizer as a pickle under this run's base path.
binarizer_path = '{}binarizer.pickle'.format(basepath)
with file_io.FileIO(binarizer_path, 'wb+') as f:
    serialized = pickle.dumps(binarizer)
    f.write(serialized)

In [66]:
# Feature extractor: VGG16 with ImageNet weights and without the classifier
# head (include_top=False), built for 256x256 RGB inputs.
vgg_conv = vgg16.VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=(256, 256, 3),
)

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [89]:
# filename = 'gs://bigdata-allanbatista-com-br/{}'.format(paths[0])
# with file_io.FileIO(filename, 'rb') as file:
#     image = imread(file)
    
# features_batch = vgg_conv.predict(np.array([image]))

# features = tf.train.Features(feature={
#     'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[1])),
#     'features': tf.train.Feature(bytes_list=tf.train.BytesList(value=[features_batch[0].tobytes()]))
# })

# tf.train.Example(features=features)
# None

In [None]:
# Number of images read, predicted and written per TFRecord file.
chunck_size = 10

def create_record(features, label):
    """Build a `tf.train.Example` holding one feature vector and its label.

    Args:
        features: object exposing `.tobytes()`; its raw bytes are stored as a
            single-element bytes list under the key 'features'. Dtype and
            shape are not recorded in the example.
        label: iterable of integers stored as an int64 list under 'label'.

    Returns:
        The assembled `tf.train.Example`.
    """
    label_feature = tf.train.Feature(
        int64_list=tf.train.Int64List(value=label))
    bytes_feature = tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[features.tobytes()]))

    payload = tf.train.Features(feature={
        'label': label_feature,
        'features': bytes_feature,
    })
    return tf.train.Example(features=payload)

def read_image(path):
    """Read the image stored at blob `path` of the project bucket.

    Prints the full `gs://` URI before reading and returns whatever
    `imread` decodes from the opened file.
    """
    uri = 'gs://bigdata-allanbatista-com-br/{}'.format(path)
    print("\rread: %s" % uri)
    with file_io.FileIO(uri, 'rb') as handle:
        return imread(handle)
    
def create_and_write_record(data, x, y, dest_path):
    """Extract VGG16 features for one chunk of images and write a TFRecord file.

    Args:
        data: pair ``(i, index)``. ``i`` is the offset of the chunk's first
            item in ``x`` / ``y``; ``index`` is the chunk's ordinal, used
            (zero-padded to 5 digits) as the output file name.
        x: sequence of image paths; each one is passed to ``read_image``.
        y: sequence of labels aligned with ``x``.
        dest_path: prefix to which ``NNNNN.tfrecord`` is appended.
    """
    i = data[0]
    index = data[1]
    start = dt.now()
    filename = "{}{}.tfrecord".format(dest_path, str(index).zfill(5))

    # Slice the chunk once so images and labels are guaranteed to line up.
    batch_paths = x[i:i + chunck_size]
    batch_labels = y[i:i + chunck_size]

    # Read the chunk's images in parallel worker processes.
    with Pool(10) as p:
        images = np.array(p.map(read_image, batch_paths))

    # NOTE(review): the images are passed to the network exactly as read,
    # without `vgg16.preprocess_input` -- confirm this is intentional.
    print("predicting")
    features = vgg_conv.predict(images)
    # Flatten every feature map to one vector. The width is inferred (-1)
    # rather than hard-coded, so it follows the model's actual output shape.
    features = np.reshape(features, (len(features), -1))

    print("writing in doc")
    with tf.python_io.TFRecordWriter(filename) as writer:
        for feature, label in zip(features, batch_labels):
            record = create_record(feature, label)
            writer.write(record.SerializeToString())

    print("diff %ds: %s" % ((dt.now() - start).total_seconds(), filename))

    
def create_dataset_to_gs(x, y, dest_path):
    """Write the whole dataset as consecutive TFRecord chunks under `dest_path`.

    Items are processed `chunck_size` at a time. For each chunk, the pair
    ``(offset, index)`` -- offset of its first item and its 0-based ordinal --
    is handed to `create_and_write_record`. The number of chunks is derived
    from ``len(y)``.

    Args:
        x: sequence of image paths.
        y: sequence of labels aligned with ``x``.
        dest_path: output prefix forwarded to `create_and_write_record`.
    """
    offsets = range(0, len(y), chunck_size)
    total = len(offsets)

    for index, offset in enumerate(offsets):
        # Report which chunk is being processed (the message used to claim
        # "first" on every iteration).
        print("create chunk %d of %d" % (index + 1, total))
        create_and_write_record((offset, index), x, y, dest_path)

    
# Write the training set under "<basepath>trainset/".
trainset_path = "{}trainset/".format(basepath)
create_dataset_to_gs(paths, y_train, trainset_path)

create first chunck
read: gs://bigdata-allanbatista-com-br/dataset/train/bluebell/137077663_0b060d0f61_47_11376310@N00.jpg_square.jpg
read: gs://bigdata-allanbatista-com-br/dataset/train/bluebell/139670903_b4f5ccfde8_48_25305687@N00.jpg_square.jpg
read: gs://bigdata-allanbatista-com-br/dataset/train/bluebell/135915541_8c29fff756_47_46279749@N00.jpg_square.jpg
read: gs://bigdata-allanbatista-com-br/dataset/train/bluebell/13678273393_7ea0af8afb.jpg_square.jpg
read: gs://bigdata-allanbatista-com-br/dataset/train/bluebell/139735556_8b2e953a7c_55_15109233@N00.jpg_square.jpg
read: gs://bigdata-allanbatista-com-br/dataset/train/bluebell/140427774_a71f6ab885_47_59528684@N00.jpg_square.jpg
read: gs://bigdata-allanbatista-com-br/dataset/train/bluebell/13819455105_8ae437ae67.jpg_square.jpg
read: gs://bigdata-allanbatista-com-br/dataset/train/bluebell/139150131_3e72c41180_56_86874036@N00.jpg_square.jpg
read: gs://bigdata-allanbatista-com-br/dataset/train/bluebell/118814378_d873c185ec_54_37726231@N