In [1]:
import matplotlib.pylab as plt
import numpy as np
import tensorflow as tf
from PIL import Image
import os
import copy

%matplotlib inline

# Default (width, height), in inches, for every matplotlib figure drawn below.
plt.rcParams['figure.figsize'] = (10, 10)

# Opening the Images

Since we cannot open all of the images at one time, we will need a quick method to help us open the images in 'batches'. Below, I have written a function to easily open images and resize them to the specified shape. The function below will also return the label of the image - whether it is a cat (1) or a dog (0).

In [2]:
def open_images(image_list, path='', shape=(50, 50), return_labels=False):
    """
    Open a list of images and resize them all to a specified shape.

    Optionally also returns a one-hot label for each image, derived from
    its file name: a name containing 'cat' is labelled as a cat, every
    other name is labelled as a dog.

    Parameters
    ----------
    image_list: list
        File names of the images to open.
    path: string
        Directory that is joined in front of every file name.
    shape: tuple
        Target size, handed to PIL's ``resize`` as ``(shape[0], shape[1])``.
        PIL reads a size as (width, height), so for a non-square shape the
        returned arrays are ``(shape[1], shape[0], 3)``.
    return_labels: bool, default is False
        If True, the labels are returned as well.

    Returns
    -------
    images: np.array
        The resized RGB images, stacked along the first axis.
    labels: np.array (optional)
        Integer array of shape (n_images, 2). Column 0 is 1 for cats and
        column 1 is 1 for dogs. Only returned when return_labels is True.
    """
    images = []
    for fname in image_list:
        # The context manager closes the underlying file once the pixels
        # have been copied out, instead of leaking one handle per image.
        with Image.open(os.path.join(path, fname)) as img:
            # Force three channels so grayscale / RGBA / palette files
            # stack with the rest. Image.LANCZOS is the same filter as
            # Image.ANTIALIAS, which was removed in Pillow 10.
            resized = img.convert('RGB').resize(
                (shape[0], shape[1]), Image.LANCZOS)
            images.append(np.asarray(resized))
    images = np.array(images)

    if not return_labels:
        return images

    # One-hot encode: [1, 0] for a cat, [0, 1] for a dog.
    is_cat = np.array(['cat' in fname for fname in image_list], dtype=bool)
    labels = np.array(np.stack((is_cat, ~is_cat), axis=1), dtype=int)
    return images, labels

# Batching

We need a method for getting images from the list in 'batches' since we cannot open all of the images at one time. Below I have created a simple method to help us with that. It takes the full image list and returns a uniformly chosen random set of image names from the list. You can specify the size of the batch in the call.

In [3]:
# class Batch:
#     """
#     Get a uniformly random selection of your list, with a specified size.
#     """
#     def __init__(self, image_list, batch_size=128):
#         self.i = 0
#         self.n = len(image_list)
#         self.image_list = image_list
#         self.batch_size = batch_size

#     def __iter__(self):
#         return self
    
#     def next(self):
#         idxs = np.random.randint(0, len(self.image_list), size=self.batch_size)
#         return self.image_list[idxs]
    
class Batch:
    """
    Draw random batches from a list without repeating items within a pass.

    Every item starts with the same probability of being drawn. Items that
    have been returned get probability zero, so they cannot be drawn again
    until fewer than ``batch_size`` unused items remain. At that point the
    probabilities are reset and a new pass over the whole list begins (items
    left unused in the old pass get no priority).

    The object is its own iterator and never raises StopIteration, so a
    ``for`` loop over it runs until the caller breaks out.

    Parameters
    ----------
    image_list: sequence
        Items to sample from. Converted to a numpy array so that a batch can
        be selected by index.
    batch_size: int
        Number of items returned by each call to ``next``.

    Raises
    ------
    ValueError
        If batch_size is not between 1 and the number of items.
    """
    def __init__(self, image_list, batch_size=128):
        self.image_list = np.asarray(image_list)
        self.n = len(self.image_list)
        if not 0 < batch_size <= self.n:
            # Sampling without replacement cannot return more items than exist.
            raise ValueError(
                "batch_size must be between 1 and the number of items "
                "({}), got {}".format(self.n, batch_size))
        self.batch_size = batch_size
        self.indexes = np.arange(self.n)
        self.reset_probabilities()

    def __iter__(self):
        return self

    def reset_probabilities(self):
        """Make every item equally likely to be drawn again."""
        # 1.0 keeps this a true division on Python 2 as well.
        self.probs = np.full(self.n, 1.0 / self.n)

    def update_probabilities(self, idx):
        """
        Mark the items at ``idx`` as used and spread the probability mass
        evenly over the items that are still unused.
        """
        self.probs[idx] = 0.0
        unused = self.probs > 0.0
        n_unused = np.count_nonzero(unused)
        self.probs = np.zeros(self.n)
        # When nothing is left there is nothing to renormalise; ``next``
        # resets the probabilities before the following draw.
        if n_unused:
            self.probs[unused] = 1.0 / n_unused
        return

    def next(self):
        """Return ``batch_size`` randomly chosen, not-yet-used items."""
        if np.count_nonzero(self.probs > 0.0) < self.batch_size:
            # Not enough unused items for a full batch: start a new pass.
            self.reset_probabilities()
        idxs = np.random.choice(
            self.indexes,
            size=self.batch_size,
            replace=False,
            p=self.probs)
        self.update_probabilities(idxs)
        return self.image_list[idxs]

    # Python 3 iterator protocol; ``next`` is kept for existing callers.
    __next__ = next

# Permuting (flipping) Images

To artificially increase the size of the training image set, we can permute the images in the dataset by flipping or rotating the images, increasing their contrast or brightness. For this I will write a function that randomly flips random images in the training data. This will be done a few times for each batch. The flipped images will be chosen uniformly at random from the current training set. 

In [4]:
def flip_images(inputs, flip_sample_size=0.5):
    """
    Mirror a uniformly random fraction of the images left-to-right.

    Parameters
    ----------
    inputs: np.array
        vector of images, indexed as (image, row, column, ...).
    flip_sample_size: float
        fraction of images to flip - (0., 1.].

    Returns
    -------
    inputs: np.array
        copy of the images with a random sample flipped; the array passed
        in is left untouched.
    """
    flipped = np.array(inputs, copy=True)
    n_images = flipped.shape[0]
    flip_size = int(np.round(flip_sample_size*n_images))
    if flip_size == 0:
        # Nothing to flip (empty batch or a very small fraction).
        return flipped
    idx = np.random.choice(n_images, size=flip_size, replace=False)
    # Axis 0 is the image index and axis 1 the rows, so a left-right mirror
    # reverses axis 2 (the columns). np.fliplr would reverse axis 1 and turn
    # every selected image upside-down instead.
    flipped[idx] = flipped[idx][:, :, ::-1]
    return flipped


# Tensorflow: A Convolutional Neural Network

The majority of this code is referenced from the tutorial on the TensorFlow website about <a href='https://www.tensorflow.org/get_started/mnist/pros'> convolutional neural networks </a>.

Convolutional neural networks are nice for images as they allow you to reduce the number of coefficients for each layer. In a fully connected neural network the weight layers can be huge, whereas in a convolutional neural network the weight layers are much smaller. Arguably, this gives you fewer places to get lost because there are far fewer coefficients.


## 2D Convolution

We need a basic function to perform the 2 dimensional convolution. This convolves the image with the weights and adds the bias. There is also a stride option, which changes how far the convolutional filter moves at each step; here we use 1 pixel. Finally, the result is put through the activation function (relu).

In [5]:
def conv2d(x, W, b, strides=1):
    """
    One convolutional layer: 2d convolution, then bias, then relu.

    Parameters
    ----------
    x: tf.placeholder
        input to the layer
    W: tf.variable
        convolution filter
    b: tf.variable
        bias added to the convolution output
    strides: int
        step of the filter along both spatial axes

    Returns
    -------
    x: tf.placeholder
        relu of the convolution with the bias added
    """
    # The filter moves by `strides` along the two middle axes only.
    stride_spec = [1, strides, strides, 1]
    convolved = tf.nn.conv2d(x, W, strides=stride_spec, padding="SAME")
    biased = convolved + b
    return tf.nn.relu(biased)

## 2D Maxpool

After convolution we need to do some pooling. Pooling can reduce the size of the convolution.

In [6]:
def maxpool2d(x, k=2):
    """
    Max-pool a convolved layer with a k-by-k window moved k pixels at a
    time, which downsamples it by k (k=2 turns a 28x28 into a 14x14).

    Parameters
    ----------
    x: tf.placeholder
        output of a convolution layer
    k: int
        window size and stride along both spatial axes

    Returns
    -------
    x: tf.placeholder
        the pooled layer
    """
    # Window and step are the same size, so the windows do not overlap.
    window = [1, k, k, 1]
    step = [1, k, k, 1]
    pooled = tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')
    return pooled

## Convolutional Neural Network

Finally we need to put the pieces above together to create our CNN. For this classification we will have 4 convolutional layers that feed into a fully connected layer, which is used to get the prediction.

In [7]:
def cnn_model_fn(x, W, b, dropout):
    """
    The convolutional neural network: four convolution + max-pool stages,
    one fully connected layer with dropout, and a linear output layer.

    The weights and biases for every layer are passed in dictionaries so
    the function does not need one argument per layer. Both dictionaries
    use the keys

    '0', '1', '2', '3' - the four convolutional layers, in order
    'fc' - fully connected layer
    'out' - output layer

    Parameters
    ----------
    x : tf.placeholder
        input images
    W : dictionary of tf.variables (keys above)
    b : dictionary of tf.variables (keys above)
    dropout: tf.placeholder
        passed straight to tf.nn.dropout as its second argument

    Returns
    -------
    dictionary with the keys
        'pred' - output of the final layer, one score per class
        'conv1' ... 'conv4' - pooled output of each convolutional stage
    """
    ## run the four conv stages; each one convolves and then pools,
    ## downsampling by 2
    pooled_layers = {}
    layer_input = x
    for layer_idx in range(4):
        key = str(layer_idx)
        convolved = conv2d(layer_input, W[key], b[key])
        layer_input = maxpool2d(convolved, k=2)
        pooled_layers["conv{}".format(layer_idx + 1)] = layer_input

    ## flatten the last conv stage so that each row matches the number of
    ## inputs of the fully connected weights
    n_fc_inputs = W['fc'].get_shape().as_list()[0]
    flat = tf.reshape(layer_input, [-1, n_fc_inputs])
    hidden = tf.nn.relu(tf.add(tf.matmul(flat, W['fc']), b['fc']))

    ## apply the dropout
    hidden = tf.nn.dropout(hidden, dropout)

    ## evaluate the final layer that gives the prediction for the class
    pred = tf.matmul(hidden, W['out']) + b['out']

    outputs = {'pred': pred}
    outputs.update(pooled_layers)
    return outputs

## Parameters

### Learning Rate

This is very important and will most likely need to be tuned. I have found 0.1 to work well with other data sets; the cell below uses 0.01.

### Training Iterations

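Specify the number of training iterations; the more we do, the more the training accuracy will increase. Note that the training loop runs until the step number times the batch size reaches this value, so it counts training images drawn rather than optimizer steps.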

### Second Training Iterations

I will train multiple times on the same batch, but will also do random permutations (flips) of the images in the second training session. This parameter specifies the number of times you want to re-train, with permutations, on the same batch.

### Batch Size

This can be messed with but typically I have found around 100 to be pretty good.

### Image Shape

The size of the images when resizing them to a constant size (upon opening).

### Number of Labels

The number of classes you will be labeling. In this case it is binary as we are labeling an image as either a cat or a dog.

### Dropout

Prevents overfitting.

In [8]:
# Step size handed to the Adam optimizer.
learning_rate = 0.01
# Training stops once step * batch_size reaches this value, so it is a budget
# of images drawn rather than a number of optimizer steps.
train_iter = 10000
# Extra optimizer steps taken on every batch, each on a freshly flipped copy.
second_train_iter = 5
# Number of images drawn for each batch.
batch_size = 100

# Size every image is resized to when opened; also the input placeholder size.
img_shape = (64, 64)
# img_shape = (50, 50)
# Number of classes: cat and dog.
n_labels = 2
# Fed to the keep_prob placeholder, i.e. the probability of *keeping* a unit.
dropout = 0.5

## Placeholders

We need TensorFlow placeholders for the images and the labels. These have a specified type and shape. For instance, we know the shape of the images but we don't know how many there will be, so the shape is [None (number of images), image height, image width, number of channels].

Then we create a placeholder for the labels, whose shape is the number of images by the number of labels (2).

The keep probability is the probability that a unit is kept (not dropped) by the dropout layer.

In [9]:
## placeholders
# Batch of 3-channel images; the leading None leaves the batch size open.
x = tf.placeholder(tf.float32, [None, img_shape[0], img_shape[1], 3])
# One row of n_labels values per image.
y = tf.placeholder(tf.float32, [None, n_labels])
# Keep probability for the dropout layer, fed at run time.
keep_prob = tf.placeholder(tf.float32)

## Variables

Finally we need some variables for the items in the layers, the weights and biases. Here I have some dictionaries of the layers with keys corresponding to the layers. The numbers correspond to the layer, 'fc' is the fully connected layer and 'out' is the layer that will contain the labels.

Then the biases have the shapes of the last dimension in each layer.

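When the image size is 120 you need a fully connected layer whose input size is 8\*8\*256.

In general, each spatial dimension entering the fully connected layer should be $\left\lceil \frac{\text{image\_shape}[0]}{2^{\text{number of layers}}} \right\rceil$ (rounded up because the pooling uses 'SAME' padding); the input size is the product of the two spatial dimensions and the 256 output channels of the last convolutional layer.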

In [10]:
# weights = {
#     # 3x3 conv, 3 (rgb) input, 32 'channels' output
#     '0': tf.Variable(tf.random_normal([3, 3, 3, 32])),
#     '1': tf.Variable(tf.random_normal([3, 3, 32, 64])),
#     '2': tf.Variable(tf.random_normal([3, 3, 64, 128])),
#     '3': tf.Variable(tf.random_normal([3, 3, 128, 256])),
#     'fc': tf.Variable(tf.random_normal([4*4*256, 1024])),
#     'out': tf.Variable(tf.random_normal([1024, n_labels]))
# }

# Spatial size of the output of the last convolutional stage. Each of the
# four stages ends in a 'SAME'-padded max-pool with stride 2, which maps a
# side of length s to ceil(s / 2). Deriving the size from img_shape keeps the
# fully connected layer correct when img_shape is changed; a hard-coded size
# would make the reshape in cnn_model_fn silently produce the wrong number
# of rows.
n_pool_layers = 4
fc_height, fc_width = img_shape
for _pool_layer in range(n_pool_layers):
    fc_height = (fc_height + 1) // 2
    fc_width = (fc_width + 1) // 2

# NOTE(review): every variable is drawn from a unit-variance normal; a
# smaller stddev may train better -- left unchanged here.
weights = {
    # 5x5 conv, 3 (rgb) input, 32 'channels' output
    '0': tf.Variable(tf.random_normal([5, 5, 3, 32])),
    '1': tf.Variable(tf.random_normal([5, 5, 32, 64])),
    '2': tf.Variable(tf.random_normal([5, 5, 64, 128])),
    '3': tf.Variable(tf.random_normal([5, 5, 128, 256])),
    # flattened last conv stage (height * width * 256 channels) -> 1024 units
    'fc': tf.Variable(tf.random_normal([fc_height*fc_width*256, 1024])),
    'out': tf.Variable(tf.random_normal([1024, n_labels]))
}

# One bias per output channel / unit of the matching weight layer.
biases = {
    '0': tf.Variable(tf.random_normal([32])),
    '1': tf.Variable(tf.random_normal([64])),
    '2': tf.Variable(tf.random_normal([128])),
    '3': tf.Variable(tf.random_normal([256])),
    'fc': tf.Variable(tf.random_normal([1024])),
    'out': tf.Variable(tf.random_normal([n_labels])),
}

## Convolutional Neural Network Model

Set up the 'cnn' function with your placeholders and variables. Then define the cost function and pick the appropriate optimizer, I found the Adam Optimizer to be popular in many tutorials. Then define a method for looking at your training accuracy.

Finally, we need a function to get the labels out. `pred` returns an unnormalized score (a logit) for each label rather than a probability (e.g. [0.3, 0.7], although the numbers are not necessarily between 0 and 1), so we take the argmax of each row to determine whether the image is a cat or a dog.

In [11]:
## Construct model
# Build the graph; 'pred' holds the raw (un-normalised) score for each class.
cnn_model = cnn_model_fn(x, weights, biases, keep_prob)
pred = cnn_model['pred']

## Define loss and optimizer
# pred is passed as logits: the loss op applies the softmax itself.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Evaluate model
# A prediction is correct when the highest-scoring class matches the label.
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

## functions to get the labels out
# Index of the highest-scoring class for each image.
ypred = tf.argmax(pred, 1)

## Training!!!

In the second training session on the batch, randomly flip 40% of the images.


Calling the optimizer is where the training happens. It knows about the operations upstream of it in the graph and updates all of the weights and biases; we only need to pass in values for the placeholders, in this case the training images, their labels, and the keep probability for dropout.

sess.run(optimizer, feed_dict={x: batch_x, y: batch_y, keep_prob: dropout})


Try: training on the same batch multiple times...

Set learning rate lower...

In [12]:
train_path = 'data/train/'

# os.listdir returns names in arbitrary order. Sort first so the split is
# the same on every run and machine, then shuffle with a fixed seed so the
# held-out images are a random sample rather than whatever happens to be
# listed last (which could be a run of a single class).
image_list = np.sort(np.array(os.listdir(train_path)))
image_list = np.random.RandomState(0).permutation(image_list)

# Hold out the last 500 shuffled names for testing.
train_image_list = image_list[:-500]
test_image_list = image_list[-500:]

In [None]:
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

batch = Batch(train_image_list, batch_size)

step = 1
while step*batch_size < train_iter:
    ## get a batch of data -- use built in next function
    batch_img_names = batch.next()

    batch_x, batch_y = open_images(batch_img_names, path=train_path, shape=img_shape, return_labels=True)
    ## one optimizer step on the batch as loaded
    sess.run(optimizer, feed_dict={x: batch_x, y: batch_y, keep_prob: dropout})

    ## extra optimizer steps on the same batch, each with a fresh random
    ## selection of flipped images
    for i in range(second_train_iter):
        inputs = flip_images(batch_x, flip_sample_size=0.4)
        sess.run(optimizer, feed_dict={x: inputs, y: batch_y, keep_prob: dropout})

    ## print the accuracy mod 10 steps
    if step % 10 == 0:
        ## keep_prob is 1.0 here: dropout is only for training, and leaving
        ## it on would randomly disable units while measuring accuracy
        acc = sess.run(accuracy, feed_dict={x: batch_x, y: batch_y, keep_prob: 1.0})
        print("Iteration : {}, Training Accuracy: {}".format(step*batch_size, acc))

    step += 1

## accuracy on the last training batch, again with dropout switched off
acc = sess.run(accuracy, feed_dict={x: batch_x, y: batch_y, keep_prob: 1.0})
print("Iteration : {}, Training Accuracy: {}".format(step*batch_size, acc))

In [None]:
# sess.close()

In [None]:
# The held-out file names were listed from train_path, so they are opened
# from there (test_path is never defined in this notebook).
test_x, test_y = open_images(test_image_list, path=train_path, shape=img_shape, return_labels=True)

# keep_prob is 1.0 so that no units are dropped while evaluating.
acc = sess.run(accuracy, feed_dict={x: test_x, y: test_y, keep_prob: 1.0})
print("Testing Accuracy: {}".format(acc))