## TensorFlow for machine learning
* Writing images to TFRecord (binary) files so they can be read faster during the learning step

In [8]:
import glob
import os
import tensorflow as tf
from itertools import groupby
from collections import defaultdict

In [4]:
# Alternative dataset location:
#image_filenames = glob.glob('./imagenet-dogs/Images/n02*/*.jpg')
# Collect every JPEG under the per-breed directories (n02*).
# glob returns paths in an arbitrary, filesystem-dependent order; sorting makes
# the listing deterministic and keeps all files of one directory contiguous.
image_filenames = sorted(glob.glob('/media/mac/winstorage/Images/Images/n02*/*.jpg'))

In [6]:
# Peek at the first two matched paths.
image_filenames[:2]

['/media/mac/winstorage/Images/Images/n02085620-Chihuahua/n02085620_10074.jpg',
 '/media/mac/winstorage/Images/Images/n02085620-Chihuahua/n02085620_10131.jpg']

In [9]:
# Two independent breed -> list buckets; a missing breed key starts as an empty list.
train_data, test_data = defaultdict(list), defaultdict(list)

In [10]:
# Pair every image path with its breed as (breed, full_path).
# The breed is the name of the directory that directly contains the image, so
# this no longer depends on how deep the dataset root sits in the filesystem
# (a fixed split("/") index had to be changed whenever the root moved).
# A list (not a lazy map object) is built so the result can be sliced and
# iterated more than once.
image_filename_with_breed = [
    (os.path.basename(os.path.dirname(filename)), filename)
    for filename in image_filenames
]

In [11]:
# Peek at the first four (breed, path) pairs.
image_filename_with_breed[:4]

[('n02085620-Chihuahua',
  '/media/mac/winstorage/Images/Images/n02085620-Chihuahua/n02085620_10074.jpg'),
 ('n02085620-Chihuahua',
  '/media/mac/winstorage/Images/Images/n02085620-Chihuahua/n02085620_10131.jpg'),
 ('n02085620-Chihuahua',
  '/media/mac/winstorage/Images/Images/n02085620-Chihuahua/n02085620_10621.jpg'),
 ('n02085620-Chihuahua',
  '/media/mac/winstorage/Images/Images/n02085620-Chihuahua/n02085620_1073.jpg')]

In [12]:
# Start from empty splits so that re-running this cell does not append the
# same images a second time.
train_data.clear()
test_data.clear()


def _breed_of(breed_and_path):
    """Return the breed component of a (breed, path) pair."""
    return breed_and_path[0]


# groupby only merges *consecutive* items with equal keys, so sort by breed
# first. sorted() is stable, which keeps the per-breed file order unchanged.
images_by_breed = groupby(sorted(image_filename_with_breed, key=_breed_of), _breed_of)

for dog_breed, breed_images in images_by_breed:
    for i, breed_image in enumerate(breed_images):
        # Every 5th image of a breed (positions 0, 5, 10, ...) goes to the
        # test split; the remaining ones go to the training split.
        if i % 5 == 0:
            test_data[dog_breed].append(breed_image)
        else:
            train_data[dog_breed].append(breed_image)

In [14]:
# To list every breed key instead: test_data.keys()
# Show the first two test entries stored for the papillon breed.
test_data['n02086910-papillon'][:2]

[('n02086910-papillon',
  '/media/mac/winstorage/Images/Images/n02086910-papillon/n02086910_10147.jpg'),
 ('n02086910-papillon',
  '/media/mac/winstorage/Images/Images/n02086910-papillon/n02086910_1199.jpg')]

In [None]:
# train_data.keys()

In [20]:
# for breed, images_filenames in test_data.items():
#     for image_filename in images_filenames:            
#         print breed, ' | ', image_filename[1]

In [24]:
def write_records_file(dataset, record_location):
    """Convert images to grayscale, resize them and write them to TFRecord shards.

    Each image is read from disk, decoded as JPEG, converted to grayscale,
    resized with size=[250, 151], cast to uint8 and serialized as a
    ``tf.train.Example`` with two bytes features:

    * ``'label'`` - the breed key, UTF-8 encoded
    * ``'image'`` - the raw bytes of the processed image array

    A new shard file named ``"<record_location>-<current_index>.tfrecords"`` is
    started whenever the running image counter is a multiple of 100.

    Args:
        dataset: mapping of breed -> iterable of (breed, image_path) pairs;
            only element ``[1]`` (the path) of each pair is read.
        record_location: path prefix for the shard files. The function does
            not create the containing directory.

    Images that cannot be read or decoded are printed and skipped; they still
    advance the counter, so a shard may hold fewer than 100 records.
    If ``dataset`` is empty nothing is written.
    """
    # Build the preprocessing pipeline ONCE in a private graph and feed the
    # filename through a placeholder. Creating new ops for every image would
    # grow the graph without bound and get slower with every iteration.
    graph = tf.Graph()
    with graph.as_default():
        filename_input = tf.placeholder(tf.string, shape=[])
        image_file = tf.read_file(filename_input)
        # channels=3 forces RGB output so rgb_to_grayscale also works for
        # JPEGs that are stored with a single channel.
        image = tf.image.decode_jpeg(image_file, channels=3)
        # convert to gray scale and resize images
        grayscale_image = tf.image.rgb_to_grayscale(image)
        resized_image = tf.image.resize_images(grayscale_image, size=[250, 151])
        image_uint8 = tf.cast(resized_image, tf.uint8)

    writer = None
    current_index = 0

    # The context manager closes the session; the finally block closes the
    # last open shard even if an unexpected error interrupts the loop.
    with tf.Session(graph=graph) as sess:
        try:
            for breed, images_filenames in dataset.items():

                # Progress output; prints "<breed> <index>" on Python 2 and 3.
                print("{0} {1}".format(breed, current_index))

                for image_filename in images_filenames:

                    # Rotate to a new shard file every 100 images.
                    if current_index % 100 == 0:
                        if writer:
                            writer.close()

                        record_filename = "{record_location}-{current_index}.tfrecords".format(
                            record_location=record_location,
                            current_index=current_index)

                        writer = tf.python_io.TFRecordWriter(record_filename)

                    current_index += 1

                    # Reading and decoding only happen inside sess.run, so this
                    # is the place where a missing or corrupt file raises.
                    try:
                        image_array = sess.run(
                            image_uint8,
                            feed_dict={filename_input: image_filename[1]})
                    except tf.errors.OpError:
                        print(image_filename)
                        continue

                    image_bytes = image_array.tobytes()
                    image_label = breed.encode('utf-8')

                    example = tf.train.Example(features=tf.train.Features(feature={
                                'label': tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_label])),
                                'image': tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_bytes]))
                            }))

                    writer.write(example.SerializeToString())
        finally:
            # writer is still None when the dataset contained no images.
            if writer:
                writer.close()

In [None]:
# Write the training split. The shard prefix is "training-image" so the files
# in the training-images directory are no longer labelled as testing data.
# NOTE(review): if an existing reader matches "testing-image-*" inside this
# directory, update its pattern accordingly.
write_records_file(train_data, "/media/mac/winstorage/output/training-images/training-image")

n02086910-papillon 0
n02087394-Rhodesian_ridgeback 156
n02112018-Pomeranian 293
n02088094-Afghan_hound 468
n02110063-malamute 659


In [None]:
# Run the same export for the test split.
write_records_file(
    dataset=test_data,
    record_location="/media/mac/winstorage/output/testing-images/testing-image",
)

In [None]:
# for breed, images_filenames in test_data.items():
#     if breed == 'n02086910-papillon':
#         for image_filename in images_filenames:
#             print image_filename

In [None]:
# k = train_data.keys()[0]
# train_data[k][0][1]

In [None]:
# t1 = (1,2)
# t2 = (3,4)
# from itertools import product
# for a, b in product(t1,t2):
#     print a,b