In [1]:
import numpy as np
import pandas as pd
import io
import bson
import matplotlib.pyplot as plt
from skimage.data import imread
import multiprocessing as mp
import pickle
import tensorflow as tf

In [2]:
# Notebook-wide configuration and shared state.
# NCORE is not referenced in the visible cells - presumably a worker count
# for multiprocessing; TODO confirm or remove.
NCORE = 2

# not referenced in the visible cells
all_categories_array = np.array([])

# category lookup tables; they start empty and are rebound by
# load_categ_to_int_dicts() from a pickled state file
categ_to_int = {}
int_to_categ = {}

# dataset sizes as published on the Kaggle competition page
n_train = 7069896 #from kaggle page
n_test = 1768182 #from kaggle page
n_example = 100 #from kaggle page

# '{0}' in all_categories_filename_format is the data file name without '.bson'
# (see load_categ_to_int_dicts)
all_categories_filename_format = 'allcategoriesdata_{0}.p'
# '{0}' is the category folder; '{1}'/'{2}' are presumably a product id and an
# image index - TODO confirm against the code that wrote the jpeg files
train_data_batch_file_format = 'training_batches/{0}/train_{0}_{1}_{2}.jpeg'
test_data_batch_file_format = 'testing_batches/{0}/test_{0}_{1}_{2}.jpeg'

train_category_folder_path_format = 'training_batches/{0}'
test_category_folder_path_format = 'testing_batches/{0}'
test_category_folder_name_format = 'folder_{0}'

# not referenced in the visible cells - presumably a progress reporting interval
show_every = 10000

# not referenced in the visible cells
mini_batch = 1000

In [3]:
import time
import os.path

In [4]:
def load_categ_to_int_dicts(data_file_path):
    """
    reloads the categ_to_int and int_to_categ lookup tables from their pickled state file
    The state file name is built from the data file name: the directory part and
    every '.bson' occurrence are dropped and the rest is inserted into
    all_categories_filename_format.
    : data_file_path: actual data file path - to represent the mode (train or train example)
    : return: void - the module level categ_to_int and int_to_categ are rebound
    """
    global categ_to_int, int_to_categ

    base_name = data_file_path.rsplit('/', 1)[-1]
    mode_suffix = base_name.replace('.bson', '')
    state_file_path = all_categories_filename_format.format(mode_suffix)

    # NOTE(review): pickle.load executes code embedded in the file - only load trusted state files
    with open(state_file_path, 'rb') as state_file:
        restored = pickle.load(state_file)

    categ_to_int, int_to_categ = restored

In [5]:
def create_one_hot_label(original_label, label_length, one_hot_labels):
    """
    builds the one hot vector for a single label and appends it to one_hot_labels
    : original_label: category id; categ_to_int gives the position that is set to 1
    : label_length: length of label to initialize the array
    : one_hot_labels: list that collects the one hot vectors, mutated in place
    : return: void
    """
    encoded = np.zeros(label_length, dtype='int16')
    hot_position = categ_to_int[original_label]
    encoded[hot_position] = 1
    one_hot_labels.append(encoded)

def one_hot_encode(data_batch, n_classes):
    """
    creates one hot encoded labels for the given batch of category ids
    : data_batch: sequence of category ids; each entry is passed through int() first,
                  so numeric strings (e.g. category folder names) are accepted
    : n_classes: width of every one hot row
    : return: int16 array of shape (len(data_batch), n_classes); an empty batch
              yields shape (0, n_classes) so it still matches the labels placeholder
    : raises: KeyError when a category id is missing from categ_to_int
    """
    batch_len = len(data_batch)
    # allocate the whole matrix once instead of stacking per-label vectors;
    # this also keeps the second dimension for an empty batch
    one_hot_labels = np.zeros((batch_len, n_classes), dtype='int16')
    for row in range(batch_len):
        original_label = int(data_batch[row])
        one_hot_labels[row, categ_to_int[original_label]] = 1
    return one_hot_labels

In [6]:
# Rebind categ_to_int / int_to_categ from the pickled state file that belongs
# to the train data; only the file name part ('train') of the path is used.
load_categ_to_int_dicts('data/train.bson')

In [7]:
# number of known categories; train() and test() use this as n_classes
len(categ_to_int)

5270

In [8]:
def normalize(x):
    """
    Normalize a list of sample image data in the range of 0 to 1
    : x: Numpy array of image data with values in 0..255
    : return: float64 Numpy array holding x / 255
    """
    xmax = 255 #image max value
    # np.float was only an alias of the builtin float (float64) and has been
    # removed from NumPy, so the concrete dtype is spelled out
    return x.astype(np.float64) / float(xmax)


In [9]:
# Every sub-folder of the training data directory is treated as one class.
# data_dir must keep its trailing '/' because the paths below are built by
# plain string concatenation.
data_dir = 'training_batches/'
contents = os.listdir(data_dir)
classes = [each for each in contents if os.path.isdir(data_dir + each)]

In [10]:
def load_image(path):
    """
    loads the image from the given path and returns it scaled to the 0..1 range
    Images that are not 180x180 are center-cropped to a square along their
    shorter edge and then resized to 180x180.
    : path: image file path
    : returns: float image data; 180x180 in its first two dimensions
    """
    # imported here because the notebook only imports `imread` from skimage,
    # so the bare name `skimage` was never defined and the resize branch
    # failed with a NameError
    from skimage.transform import resize

    img = imread(path) / 255.0
    height, width = img.shape[:2]
    if height == 180 and width == 180:
        return img

    short_edge = min(height, width)
    yy = (height - short_edge) // 2
    xx = (width - short_edge) // 2
    crop_img = img[yy: yy + short_edge, xx: xx + short_edge]
    # resize to 180, 180
    return resize(crop_img, (180, 180), mode='constant')

# Manual smoke test for load_image; the call is kept commented out because it
# needs the extracted training images on disk. The string below is a bare
# expression statement, not a comment.
'''Test Method below'''
#load_image('training_batches/1000000237/train_1000000237_12600_0.jpeg')
print('Image Load function done')

Image Load function done


In [11]:
# pickle file that stores the (image paths, labels) pair of the training data
file_label_mapping = 'file_label_mapping.p'
# '{0}' parent folder, '{1}' label folder, '{2}' image file name
file_path_format = '{0}/{1}/{2}'
label_folder_format = '{0}/{1}'

def fetch_filenames_labels_train(folder_path, min_images_per_label=10):
    """
    fetches all the filenames and their labels and dumps them into a pickle file
    Every sub-folder of folder_path is one label and every entry inside it is
    recorded as an image of that label. The result is written to
    file_label_mapping as a tuple (inputs, labels) of two parallel lists.
    : folder_path: path of the parent folder
    : min_images_per_label: label folders holding fewer files than this are skipped
    : returns: void
    """
    inputs = list()
    labels = list()
    for label_folder in os.listdir(folder_path):
        label_path = label_folder_format.format(folder_path, label_folder)
        if not os.path.isdir(label_path):
            continue
        img_files = os.listdir(label_path)
        if len(img_files) < min_images_per_label:
            continue
        inputs.extend(file_path_format.format(folder_path, label_folder, each) for each in img_files)
        labels.extend([label_folder] * len(img_files))
    # the context manager closes (and thereby flushes) the pickle file
    with open(file_label_mapping, 'wb') as f:
        pickle.dump((inputs, labels), f)

In [12]:
# scan the training folders and write the (paths, labels) pickle file
fetch_filenames_labels_train('training_batches')

In [12]:
def restore_filenames_labels_train(file_path):
    """
    reads back the pickled pair of image file paths and their labels
    : file_path: pickle file path
    : returns: tuple (inputs, labels), or None when file_path does not exist
    """
    if not os.path.exists(file_path):
        return None

    with open(file_path, 'rb') as mapping_file:
        restored_inputs, restored_labels = pickle.load(mapping_file)
    return restored_inputs, restored_labels

In [13]:
# parallel lists of image paths and labels; the unpacking fails if the
# mapping file is missing because the helper then returns None
inputs, labels = restore_filenames_labels_train(file_label_mapping)

In [14]:
from collections import Counter
# images per label, then the number of labels that have at least 10 images
labels_ctr = Counter(labels)
n_classes_considered = len([ e for e,c in labels_ctr.most_common() if c >= 10])
n_classes_considered

3940

In [15]:
# numpy arrays so that the split below can select rows by index arrays
inputs_array = np.array(inputs)
labels_array = np.array(labels)

In [16]:
from sklearn.model_selection import StratifiedShuffleSplit

def get_training_val_test_sets(inputs_array, labels_array, test_size=0.4, random_state=None):
    """
    splits the data into training, validation and test sets
    A single stratified shuffle split separates the training set from a
    hold-out set; the hold-out indices are then cut in half, the first half
    becoming the validation set and the second half the test set.
    : inputs_array: numpy array of inputs (image file paths)
    : labels_array: numpy array of labels, used for stratification
    : test_size: share of the data that goes to the hold-out (validation + test) part
    : random_state: seed forwarded to StratifiedShuffleSplit; pass an int to get
                    the same split on every run (None keeps the split random)
    : returns: [[train_x, train_y], [val_x, val_y], [test_x, test_y]]
    """
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_idx, holdout_idx = next(splitter.split(inputs_array, labels_array))

    # NOTE(review): only the train/hold-out split is stratified; the halves
    # below are a plain cut of the hold-out index array
    half_holdout_len = len(holdout_idx) // 2
    val_idx, test_idx = holdout_idx[:half_holdout_len], holdout_idx[half_holdout_len:]

    train_x, train_y = inputs_array[train_idx], labels_array[train_idx]
    val_x, val_y = inputs_array[val_idx], labels_array[val_idx]
    test_x, test_y = inputs_array[test_idx], labels_array[test_idx]

    return [[train_x, train_y], [val_x, val_y], [test_x, test_y]]

In [17]:
#inspired by resnet50 (infact, trying to recreate resnet50)
def identity_block(input_tensor, kernel_size, filters, stage, block):
    """
    builds a residual block whose shortcut is the unchanged input (no conv layer at shortcut)
    The main branch is 1x1 conv -> kernel_size conv -> 1x1 conv, each followed
    by batch normalization; the input is added to its result, so input_tensor
    must already have as many channels as the last filter size.
    : input_tensor: input tensor
    : kernel_size: kernel size of the middle conv layer
    : filters: list of integers, filter sizes of three conv layers
    : stage: current stage, integer, used for creating names
    : block: current block, character, used for creating names
    : returns: output tensor of the block
    """
    # NOTE(review): batch_normalization is called without `training=`, so it
    # runs with the API default - confirm that this is intended while training
    n_reduce, n_middle, n_expand = filters
    channel_axis = 3

    branch_tag = '{0}_{1}_branch_'.format(str(stage), block)
    conv_prefix = 'res_' + branch_tag
    bn_prefix = 'bn_' + branch_tag

    # 1x1 conv with default strides and padding
    out = tf.layers.conv2d(input_tensor, n_reduce, (1,1), name=conv_prefix + '2a')
    out = tf.layers.batch_normalization(out, axis=channel_axis, name=bn_prefix + '2a')
    out = tf.nn.relu(out)

    out = tf.layers.conv2d(out, n_middle, kernel_size=kernel_size, padding='same', name=conv_prefix + '2b')
    out = tf.layers.batch_normalization(out, axis=channel_axis, name=bn_prefix + '2b')
    out = tf.nn.relu(out)

    out = tf.layers.conv2d(out, n_expand, (1,1), name=conv_prefix + '2c')
    out = tf.layers.batch_normalization(out, axis=channel_axis, name=bn_prefix + '2c')

    # short cut connection
    return tf.nn.relu(tf.add(out, input_tensor))

def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2)):
    """
    builds a residual block whose shortcut goes through its own conv layer
    The main branch is 1x1 conv (strided) -> kernel_size conv -> 1x1 conv, each
    followed by batch normalization; the shortcut is a strided 1x1 conv plus
    batch normalization on the input, and both results are added.
    : input_tensor: input tensor
    : kernel_size: kernel size of the middle conv layer
    : filters: list of integers, filter sizes of three conv layers
    : stage: current stage, integer, used for creating names
    : block: current block, character, used for creating names
    : strides: strides of the first main branch conv and of the shortcut conv
    : returns: output tensor of the block
    """
    # NOTE(review): batch_normalization is called without `training=`, so it
    # runs with the API default - confirm that this is intended while training
    n_reduce, n_middle, n_expand = filters
    channel_axis = 3

    branch_tag = '{0}_{1}_branch_'.format(str(stage), block)
    conv_prefix = 'res_' + branch_tag
    bn_prefix = 'bn_' + branch_tag

    # main branch; only its first conv applies the strides
    out = tf.layers.conv2d(input_tensor, n_reduce, (1,1), strides=strides, name=conv_prefix + '2a')
    out = tf.layers.batch_normalization(out, axis=channel_axis, name=bn_prefix + '2a')
    out = tf.nn.relu(out)

    out = tf.layers.conv2d(out, n_middle, kernel_size, padding='same', name=conv_prefix + '2b')
    out = tf.layers.batch_normalization(out, axis=channel_axis, name=bn_prefix + '2b')
    out = tf.nn.relu(out)

    out = tf.layers.conv2d(out, n_expand, (1,1), padding='same', name=conv_prefix + '2c')
    out = tf.layers.batch_normalization(out, axis=channel_axis, name=bn_prefix + '2c')

    # shortcut branch, projected to the same filter count and strides
    projected = tf.layers.conv2d(input_tensor, n_expand, (1,1), strides=strides, name=conv_prefix + '1')
    projected = tf.layers.batch_normalization(projected, axis=channel_axis, name=bn_prefix + '1')

    # short cut connection
    return tf.nn.relu(tf.add(out, projected))

    

In [44]:
def build_inputs_labels(image_shape, n_classes):
    """
    creates the placeholders that feed the model
    : image_shape: shape of a single image, e.g. [180, 180, 3]
    : n_classes: width of the one hot labels
    : returns: the 'inputs' and 'labels' placeholders
    """
    image_batch = tf.placeholder(tf.float32, shape=[None] + list(image_shape), name='inputs')
    label_batch = tf.placeholder(tf.float32, shape=[None, n_classes], name='labels')
    # the 'keep_prob' placeholder is added to the graph but not handed back to the caller
    tf.placeholder(tf.float32, name='keep_prob')

    return image_batch, label_batch
    
#inspired by resnet50 (infact, trying to recreate resnet50)
#inspired by resnet50 (infact, trying to recreate resnet50)
def build_model(inputs, labels, n_classes):
    """
    builds the resnet50 style network and returns its logits
    Layout: 7x7 conv stem with max pooling, four residual stages (each one
    conv_block followed by identity blocks), average pooling, flatten and one
    dense layer.
    : inputs: image batch tensor (the 'inputs' placeholder)
    : labels: not used inside the model; kept so existing callers keep working
    : n_classes: number of output units of the final dense layer
    : returns: logits tensor of shape [batch, n_classes] without any activation
    """
    bn_axis = 3

    # stem
    x = tf.layers.conv2d(inputs, 64, (7,7), strides=(2,2), name='conv1')
    x = tf.layers.batch_normalization(x, axis=bn_axis, name='bn_conv1')
    x = tf.nn.relu(x)
    x = tf.layers.max_pooling2d(x, (3,3), strides=(2,2))

    # (stage, filters, block letters, strides of the leading conv block)
    stage_specs = [
        (2, [64, 64, 256], 'abc', (1, 1)),
        (3, [128, 128, 512], 'abcd', (2, 2)),
        (4, [256, 256, 1024], 'abcdef', (2, 2)),
        (5, [512, 512, 2048], 'abc', (2, 2)),
    ]
    for stage, filters, block_letters, strides in stage_specs:
        x = conv_block(x, 3, filters, stage=stage, block=block_letters[0], strides=strides)
        for block in block_letters[1:]:
            x = identity_block(x, 3, filters, stage=stage, block=block)

    x = tf.layers.average_pooling2d(x, (5,5), strides=(5, 5), name='avg_pool') #changed from (7,7) from original res50

    #flattening
    image_size = x.get_shape()[1:].num_elements()
    x = tf.reshape(x, [-1, image_size])

    # The logits feed softmax cross entropy and argmax, so they must stay
    # unbounded: a relu here clamps every negative logit to 0.
    # The layer name is kept as 'fc5270' so variable names do not change.
    logits = tf.layers.dense(x, n_classes, activation=None, name='fc5270')

    return logits

In [19]:
def get_batches(x, y, batch_size):
    """
    cuts x and y into consecutive batches of batch_size items
    : x: features set
    : y: labels set
    : batch_size: size of each batch
    : returns: list of [x_batch, y_batch] pairs; the last pair holds the remainder
    """
    n_batches = int(np.ceil(len(x) / batch_size))
    last_batch_no = n_batches - 1
    batches = []
    for batch_no in range(n_batches):
        start = batch_no * batch_size
        # the final batch takes everything that is left
        stop = None if batch_no == last_batch_no else start + batch_size
        batches.append([x[start:stop], y[start:stop]])
    return batches

In [20]:
def get_image_data_for_batch(x_paths):
    """
    loads the image behind every path with load_image and stacks the results
    : x_paths: list of paths to actual image files
    : returns: np array of image data, one entry per path
    """
    return np.array([load_image(image_path) for image_path in x_paths])

In [21]:
# Unfinished get_model_output stub kept as a string literal: nothing is defined
# here, the cell only echoes the string. The model is assembled inside train().
'''def get_model_output(inputs, labels, learning_rate):
    
    
    return logits, cost, optimizer, accuracy'''

'def get_model_output(inputs, labels, learning_rate):\n    \n    \n    return logits, cost, optimizer, accuracy'

In [22]:
# get training sets, validation sets and test sets
save_model_path = './saved_model'
[train_x, train_y], [val_x, val_y], [test_x, test_y] = get_training_val_test_sets(inputs_array, labels_array) 

In [57]:
def train(train_x, train_y, val_x, val_y):
    """
    builds the model, trains it and saves the session to save_model_path
    After every training batch the loss is printed together with the accuracy
    on a fixed validation batch (the first batch_size validation samples).
    : train_x: array of training image file paths
    : train_y: array of training labels (category ids)
    : val_x: array of validation image file paths
    : val_y: array of validation labels (category ids)
    : returns: void
    """
    epochs = 4
    batch_size  = 64
    learning_rate = 0.0001

    tf.reset_default_graph()

    with tf.Session() as sess:

        image_shape = [180, 180, 3]
        n_classes =  len(categ_to_int) #n_classes_considered #3970 #len(categ_to_int) #5270

        inputs, labels = build_inputs_labels(image_shape=image_shape, n_classes=n_classes)

        logits = build_model(inputs, labels, n_classes)
        logits = tf.identity(logits, name='logits') #assigning a name

        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))
        correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))

        # Optimizer, Accuracy
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32), name='accuracy')

        # get training batches
        batches = get_batches(train_x, train_y, batch_size=batch_size)

        # The validation batch never changes, so its images are read from disk
        # and one hot encoded once instead of on every training step.
        val_x_batch = get_image_data_for_batch(val_x[:batch_size])
        val_y_batch = one_hot_encode(data_batch=val_y[:batch_size], n_classes=n_classes)
        val_feed = {inputs: val_x_batch, labels: val_y_batch}

        sess.run(tf.global_variables_initializer())
        for epoch in range(epochs):
            for batch_i, (x_batch_paths, y_batch_labels) in enumerate(batches, start=1):
                print('Epoch: {}, Batch: {},'.format(epoch + 1, batch_i), end='')

                x_batch = get_image_data_for_batch(x_batch_paths)
                y_batch = one_hot_encode(data_batch=y_batch_labels, n_classes=n_classes)

                print('Executing.')
                # the optimizer op is run for its side effect only
                loss, _ = sess.run([cost, optimizer], feed_dict={inputs:x_batch, labels:y_batch})

                print('Loss : {}, '.format(loss))

                accuracy_out = sess.run(accuracy, feed_dict=val_feed)

                print('accuracy : {}, '.format(accuracy_out))

        saver = tf.train.Saver()
        saver.save(sess, save_model_path)

    print('training finished.')

In [49]:
def test(test_x, test_y):
    """
    restores the saved model and prints its accuracy on the given test data
    The accuracy of every batch is printed, followed by the overall accuracy,
    which weights each batch by its number of samples.
    : test_x: array of test image file paths
    : test_y: array of test labels (category ids)
    : returns: void
    """
    batch_size  = 32

    # get test batches
    batches = get_batches(test_x, test_y, batch_size=batch_size)

    loaded_graph = tf.Graph()
    with tf.Session(graph=loaded_graph) as sess:

        n_classes =  len(categ_to_int) #n_classes_considered #3970 #len(categ_to_int) #5270
        # Load model
        loader = tf.train.import_meta_graph(save_model_path + '.meta')
        loader.restore(sess, save_model_path)

        # Get Tensors from loaded model
        loaded_x = loaded_graph.get_tensor_by_name('inputs:0')
        loaded_y = loaded_graph.get_tensor_by_name('labels:0')
        loaded_acc = loaded_graph.get_tensor_by_name('accuracy:0')

        correct_total = 0.0
        sample_total = 0

        for batch_i, (x_batch_paths, y_batch_labels) in enumerate(batches, start=1):
            print('Batch: {},'.format(batch_i), end='')
            x_batch = get_image_data_for_batch(x_batch_paths)
            y_batch = one_hot_encode(data_batch=y_batch_labels, n_classes=n_classes)

            print('Executing.')

            # fetch the tensor itself (not a one element list) so that the
            # result is a scalar that can be accumulated
            accuracy_out = sess.run(loaded_acc, feed_dict={loaded_x:x_batch, loaded_y:y_batch})
            print('Batch Test Accuracy:{}'.format(accuracy_out))

            # weight by batch size: the last batch may be smaller than the others
            batch_len = len(x_batch)
            correct_total += accuracy_out * batch_len
            sample_total += batch_len

        # nan instead of a ZeroDivisionError when there is nothing to test
        final_accuracy = correct_total / sample_total if sample_total else float('nan')
        print('final accuracy:{}'.format(final_accuracy))

In [26]:
# pickle file that stores the list of image paths used for the final prediction
test_file_path_mapping = 'test_file_path_mapping.p'
# '{0}' parent folder, '{1}' intermediate folder, '{2}' image file name
test_file_path_format = '{0}/{1}/{2}'
test_intermediate_folder_format = '{0}/{1}'

test_data_dir = 'testing_batches/'

def fetch_filenames_predict_test(folder_path):
    """
    fetches all the filenames for final prediction test and dumps them into a pickle file
    Every entry of every sub-folder of folder_path is recorded; the resulting
    list of paths is written to test_file_path_mapping.
    : folder_path: path of the parent folder
    : returns: void
    """
    test_inputs = list()
    for temp_folder in os.listdir(folder_path):
        sub_folder_path = test_intermediate_folder_format.format(folder_path, temp_folder)
        if not os.path.isdir(sub_folder_path):
            continue
        test_inputs.extend(
            test_file_path_format.format(folder_path, temp_folder, each)
            for each in os.listdir(sub_folder_path)
        )
    # the context manager closes (and thereby flushes) the pickle file
    with open(test_file_path_mapping, 'wb') as f:
        pickle.dump(test_inputs, f)

In [27]:
# scan the test folders and write the list of image paths to the pickle file
fetch_filenames_predict_test(test_data_dir)

In [27]:
def restore_filenames_predict_test(file_path):
    """
    reads back the pickled object written for the final prediction test
    : file_path: pickle file path
    : returns: the unpickled object (the list of image file paths), or None when file_path does not exist
    """
    if not os.path.exists(file_path):
        return None

    with open(file_path, 'rb') as mapping_file:
        restored_inputs = pickle.load(mapping_file)
    return restored_inputs

In [28]:
# image paths for the final prediction; None if the mapping file is missing
test_inputs = restore_filenames_predict_test(test_file_path_mapping)

In [29]:
def get_test_batches(x, batch_size):
    """
    cuts x into consecutive batches of batch_size items
    : x: test features set
    : batch_size: size of each batch
    : returns: list of batches; the last one holds the remainder
    """
    n_batches = int(np.ceil(len(x) / batch_size))
    last_batch_no = n_batches - 1
    batches = []
    for batch_no in range(n_batches):
        start = batch_no * batch_size
        # the final batch takes everything that is left
        stop = None if batch_no == last_batch_no else start + batch_size
        batches.append(x[start:stop])
    return batches

In [30]:
def predict_final_test(test_x):
    """
    restores the saved model and predicts a category for every test image
    : test_x: list of test image file paths
    : returns: list with the predicted category of every path, in input order
    """
    batch_size  = 1024

    # get test batches
    batches = get_test_batches(test_x, batch_size=batch_size)

    predicted_categories = []

    loaded_graph = tf.Graph()
    with tf.Session(graph=loaded_graph) as sess:
        # Load model
        loader = tf.train.import_meta_graph(save_model_path + '.meta')
        loader.restore(sess, save_model_path)

        # Get Tensors from loaded model
        # the input placeholder is created with name='inputs' in build_inputs_labels
        loaded_x = loaded_graph.get_tensor_by_name('inputs:0')
        loaded_logits = loaded_graph.get_tensor_by_name('logits:0')

        # every batch is already a list of paths (get_test_batches has no label part)
        for batch_i, x_batch_paths in enumerate(batches, start=1):
            x_batch = get_image_data_for_batch(x_batch_paths)

            print('Batch: {},'.format(batch_i), end='')
            # fetch the tensor itself so the result has shape [batch, n_classes]
            # and argmax over axis 1 picks one class per image
            logits_out = sess.run(loaded_logits, feed_dict={loaded_x:x_batch})
            pred = np.argmax(logits_out, axis=1)
            # int_to_categ is a dictionary; presumably keyed by the plain int
            # class index - TODO confirm against the code that pickled it
            pred_to_categ = [int_to_categ[int(val)] for val in pred]
            predicted_categories.extend(pred_to_categ)
            #TODO: need to fetch ids here and join it with pred and write to a file

        print('Done')

    return predicted_categories

In [58]:
# run the training; the recorded output below ends in a manual KeyboardInterrupt
train(train_x, train_y, val_x, val_y)

Epoch: 1, Batch: 1,Executing.
Loss : 8.568848609924316, 
accuracy : 0.0, 
Epoch: 1, Batch: 2,Executing.
Loss : 8.565624237060547, 
accuracy : 0.0, 
Epoch: 1, Batch: 3,Executing.
Loss : 8.539942741394043, 


KeyboardInterrupt: 

In [61]:
# number of training samples produced by the split
len(train_x)

644382

In [75]:
# spot check: categ_to_int is keyed by the integer category id
categ_to_int[1000018402]

2849

In [117]:
# re-check of the number of known categories (same expression as in cell 7)
len(categ_to_int)

5270