In [1]:
# import libraries
import tensorflow as tf
import os
import numpy as np
from scipy import misc
import sys
from random import shuffle
from random import uniform
import zipfile
from collections import OrderedDict
import glob
import time
from PIL import Image
from moviepy.editor import VideoFileClip
from tqdm import tqdm

%matplotlib inline
import matplotlib.pyplot as plt

## Prepare input data

In [2]:
# Root directory of the Cityscapes dataset. Set the CITYSCAPES_ROOT environment
# variable to point somewhere else; the default is the original location.
CITYSCAPES_ROOT = os.environ.get('CITYSCAPES_ROOT', '/data2/cityscapes_dataset').rstrip('/')

# Ground-truth (gtFine) and image (leftImg8bit) directories for each split.
# Built with '/' so the default values are exactly the original strings.
gt_train_path = CITYSCAPES_ROOT + '/gtFine/train'
imgs_train_path = CITYSCAPES_ROOT + '/leftImg8bit/train'
gt_val_path = CITYSCAPES_ROOT + '/gtFine/val'
imgs_val_path = CITYSCAPES_ROOT + '/leftImg8bit/val'
gt_test_path = CITYSCAPES_ROOT + '/gtFine/test'
imgs_test_path = CITYSCAPES_ROOT + '/leftImg8bit/test'

# Get filenames of training data and gt, specific for cityscapes dataset
def get_files(imgs_dir, gt_dir):
    """Collect sorted image and label-id file paths from per-city sub-folders.

    Args:
        imgs_dir: Directory whose entries are the city folders holding
            the ``*.png`` input images.
        gt_dir: Directory with the same city folder names holding the
            ``*labelIds.png`` ground-truth files.

    Returns:
        Tuple ``(imgs, gt)`` of two lists of paths, each sorted
        lexicographically so that entry ``k`` of one is expected to
        correspond to entry ``k`` of the other.
    """
    image_paths = []
    label_paths = []

    # The city names are taken from the image directory only
    for city_name in os.listdir(imgs_dir):
        label_paths.extend(glob.glob(os.path.join(gt_dir, city_name, "*labelIds.png")))
        image_paths.extend(glob.glob(os.path.join(imgs_dir, city_name, "*.png")))

    return sorted(image_paths), sorted(label_paths)

# Get filenames of training data and gt
# One (images, labels) pair of sorted path lists per split: train, val, test
(train_imgs, train_gt), (val_imgs, val_gt), (test_imgs, test_gt) = [
    get_files(split_imgs_dir, split_gt_dir)
    for split_imgs_dir, split_gt_dir in (
        (imgs_train_path, gt_train_path),
        (imgs_val_path, gt_val_path),
        (imgs_test_path, gt_test_path),
    )
]

## Utility Functions

In [3]:
def plot_image(image_path=None, img=None, from_path=True):
    """Show an image with matplotlib, read from disk or given as an array.

    Args:
        image_path: File to read with ``misc.imread``; only used when
            ``from_path`` equals True.
        img: Image array to show when ``from_path`` is not True.
        from_path: When equal to True, ``img`` is replaced by the contents
            of ``image_path``.
    """
    if from_path == True:
        img = misc.imread(image_path)

    # 4-D input has its singleton axes removed before display
    pixels = np.squeeze(img) if len(img.shape) == 4 else img

    # Anything that is not already uint8 is cast to uint8 for imshow
    if pixels.dtype != np.uint8:
        pixels = pixels.astype(np.uint8)

    plt.imshow(pixels)
    plt.show()

In [4]:
# prepare_ground_truth for cityscape data
def prepare_ground_truth(img):
    """Turn a label-id image into a 5-channel one-hot float32 array.

    Channel layout of the result:
        0 road, 1 sidewalk, 2 pedestrians, 3 vehicles, 4 everything else.

    Args:
        img: 2-D array of original label ids.

    Returns:
        float32 array of shape ``(rows, cols, 5)`` holding 0.0 / 1.0.
    """
    # Five classes: road, side_walk, pedestrian, vehicles, others
    NUM_CLASSES = 5

    def matches_any(label_ids):
        # True wherever the pixel equals at least one of the given ids
        return np.logical_or.reduce([img == label_id for label_id in label_ids])

    # Original label ids belonging to each merged class
    road_mask = img == 7
    side_mask = img == 8
    # person, rider
    ped_mask = matches_any((24, 25))
    # car, truck, bus, caravan, trailer, train, motorcycle, bicycle, license plate
    car_mask = matches_any((26, 27, 28, 29, 30, 31, 32, 33, -1))
    # whatever none of the four masks above claimed
    else_mask = np.logical_not(
        np.logical_or.reduce((road_mask, side_mask, ped_mask, car_mask)))

    one_hot = np.zeros((img.shape[0], img.shape[1], NUM_CLASSES))
    channel_masks = (road_mask, side_mask, ped_mask, car_mask, else_mask)
    for channel, class_mask in enumerate(channel_masks):
        one_hot[:, :, channel] = class_mask

    return one_hot.astype(np.float32)

In [7]:
# Disabled alternative with num_classes = 20: 19 object classes plus one background class.
# The definition below sits inside a triple-quoted string literal, so it is never executed.
'''
def prepare_ground_truth_normal(img):
    
    # Five classes: road, side_walk, pedestrian, vehicles, others
    NUM_CLASSES = 20
    new_image = np.zeros((img.shape[0], img.shape[1], NUM_CLASSES))
    
    # (original_id)
    # road
    road_mask = img == 7
    # sidewalk 
    sidewalk_mask = img == 8
    
    building_mask = img == 11
    wall_mask = img == 12
    fence_mask = img == 13
    pole_mask = img == 17
    traffic_light_mask = img == 19
    traffic_sign_mask = img == 20
    vegetation_mask = img == 21
    terrain_mask = img == 22
    sky_mask = img == 23
    person_mask = img == 24
    rider_mask = img == 25
    car_mask = img == 26
    truck_mask = img == 27
    bus_mask = img == 28
    train_mask = img == 31
    motorcycle_mask = img == 32
    bicycle_mask = img == 33
    
    # everything else
    else_mask = np.logical_not(np.logical_or.reduce((road_mask, sidewalk_mask, building_mask, wall_mask,
                                                     fence_mask, pole_mask, traffic_light_mask, traffic_sign_mask,
                                                     vegetation_mask, terrain_mask, sky_mask, person_mask,
                                                     rider_mask, car_mask, truck_mask, bus_mask,
                                                     train_mask, motorcycle_mask, bicycle_mask)))


    new_image[:,:,0] = road_mask
    new_image[:,:,1] = sidewalk_mask
    new_image[:,:,2] = building_mask
    new_image[:,:,3] = wall_mask
    new_image[:,:,4] = fence_mask
    new_image[:,:,5] = pole_mask
    new_image[:,:,6] = traffic_light_mask
    new_image[:,:,7] = traffic_sign_mask
    new_image[:,:,8] = vegetation_mask
    new_image[:,:,9] = terrain_mask
    new_image[:,:,10] = sky_mask
    new_image[:,:,11] = person_mask
    new_image[:,:,12] = rider_mask
    new_image[:,:,13] = car_mask
    new_image[:,:,14] = truck_mask
    new_image[:,:,15] = bus_mask
    new_image[:,:,16] = train_mask
    new_image[:,:,17] = motorcycle_mask
    new_image[:,:,18] = bicycle_mask
    new_image[:,:,19] = else_mask
    
    return new_image.astype(np.float32)
'''

SyntaxError: invalid syntax (<ipython-input-7-f87b9657006a>, line 19)

In [5]:
def get_data(batch_size=1, num_classes=5, mode='train', imgs=train_imgs, gt=train_gt, im_size=500):
    """Endlessly yield ``(images, labels)`` batches of random square crops.

    Image/label pairs are shuffled once, then cycled through forever. For
    every pair the same random ``im_size`` x ``im_size`` window is cut out
    of the image and of its label map.

    Args:
        batch_size: Number of crops per batch. The final batch of each pass
            is smaller when the dataset size is not a multiple of it.
        num_classes: Channel count of the label array; must match what
            ``prepare_ground_truth`` returns (5).
        mode: ``'val'`` or ``'test'`` replaces ``imgs``/``gt`` with the
            module-level validation/test lists; any other value uses the
            lists passed in.
        imgs: Sorted list of image paths (default bound to ``train_imgs``
            when this function is defined).
        gt: Sorted list of label-id image paths aligned with ``imgs``.
        im_size: Side length of the square crop.

    Yields:
        Tuple ``(images, labels)`` with shapes
        ``(n, im_size, im_size, 3)`` and ``(n, im_size, im_size, num_classes)``.

    Raises:
        ValueError: If there are no image/label pairs, or (from
            ``np.random.randint``) if an image is smaller than ``im_size``.
    """
    if mode == 'val':
        imgs, gt = val_imgs, val_gt
    elif mode == 'test':
        imgs, gt = test_imgs, test_gt

    # Shuffle a private list of pairs: keeps images and labels aligned without
    # reordering the caller's (or the module-level default) lists in place.
    pairs = list(zip(imgs, gt))
    if not pairs:
        # An empty dataset would otherwise spin forever without yielding.
        raise ValueError("get_data needs at least one image/label pair")
    shuffle(pairs)

    while True:
        for start in range(0, len(pairs), batch_size):
            batch = pairs[start:start + batch_size]
            # Size the arrays by the real batch length so a short final batch
            # never carries uninitialised rows from np.empty.
            images = np.empty((len(batch), im_size, im_size, 3))
            labels = np.empty((len(batch), im_size, im_size, num_classes))

            for j, (img_path, gt_path) in enumerate(batch):
                train_im = misc.imread(img_path).astype(np.float32)
                gt_im = misc.imread(gt_path)

                # Random crop origin (minor data augmentation). Bounds come from
                # the actual image; +1 because randint's upper bound is exclusive,
                # which also makes a crop equal to the full image size valid.
                row0 = np.random.randint(0, train_im.shape[0] - im_size + 1)
                col0 = np.random.randint(0, train_im.shape[1] - im_size + 1)

                images[j, :, :, :] = train_im[row0:row0 + im_size, col0:col0 + im_size]
                labels[j, :, :, :] = prepare_ground_truth(
                    gt_im[row0:row0 + im_size, col0:col0 + im_size])

            yield (images, labels)

In [6]:
# visualize_prediction for cityscape data
def visualize_prediction(original_image, prediction):
    """Overlay the predicted class colours on the input image and show it.

    Args:
        original_image: Image array; squeezed and cast to uint8.
        prediction: Per-class score array; squeezed, then the arg-max over
            axis 2 selects the class of each pixel.
    """
    # Colour per class index; any other class keeps the original pixels
    class_colours = (
        (0, [0, 255, 0]),    # road = green
        (1, [0, 0, 255]),    # sidewalk = blue
        (2, [255, 255, 0]),  # pedestrians = yellow
        (3, [255, 0, 0]),    # vehicles = red
    )

    base = np.squeeze(original_image).astype(np.uint8)
    class_map = np.argmax(np.squeeze(prediction), axis=2)

    painted = np.copy(base)
    for class_index, rgb in class_colours:
        painted[class_map == class_index, :] = rgb

    # Equal-weight blend of the untouched image and the painted copy
    base_rgba = Image.fromarray(base, mode='RGB').convert('RGBA')
    painted_rgba = Image.fromarray(painted, mode='RGB').convert('RGBA')
    overlay = Image.blend(base_rgba, painted_rgba, alpha=0.5)

    plt.imshow(overlay, interpolation='nearest')
    plt.show()

## Define FCN model

In [7]:
def vgg_conv(layer_input, vgg_dict, bn, name):
    """Build one conv + bias + ReLU + batch-norm layer from stored VGG weights.

    Args:
        layer_input: Input tensor of the convolution.
        vgg_dict: Mapping from layer name to a sequence whose item 0 is the
            filter array and item 1 the bias array.
        bn: Accepted for signature compatibility; not read here (batch norm
            is always applied with ``is_training=True``).
        name: Key into ``vgg_dict``; also the variable scope and the name of
            the ReLU op.

    Returns:
        The batch-normalised activation tensor.
    """
    pretrained_filter = vgg_dict[name][0]
    pretrained_bias = vgg_dict[name][1]

    with tf.variable_scope(name):
        # Filter variable initialised from the stored weights
        kernel = tf.get_variable(
            name="filter",
            initializer=tf.constant_initializer(value=pretrained_filter, dtype=tf.float32),
            shape=pretrained_filter.shape)

        # Bias variable initialised from the stored weights
        offset = tf.get_variable(
            name="biases",
            initializer=tf.constant_initializer(value=pretrained_bias, dtype=tf.float32),
            shape=pretrained_bias.shape)

        # Stride-1, same-size convolution followed by bias and ReLU
        convolved = tf.nn.conv2d(layer_input, kernel, [1, 1, 1, 1], padding='SAME')
        activated = tf.nn.relu(tf.nn.bias_add(convolved, offset), name=name)

        return tf.contrib.layers.batch_norm(activated, center=True, scale=True, is_training=True)
    
# Helper function for deconvolutional layers
# Note that this function is different than the Unet deconv functions
def deconv2d(layer_input, output_shape, input_fsize, output_fsize, filter_size, stride, bn, name):
    """Transposed convolution + bias + ELU + batch norm.

    Args:
        layer_input: Tensor to upsample.
        output_shape: Shape tensor; items 0-2 give the batch, row and column
            extent of the result (its channel entry is ignored).
        input_fsize: Channel count of ``layer_input``.
        output_fsize: Channel count of the result.
        filter_size: Side length of the square kernel.
        stride: Upsampling stride along rows and columns.
        bn: Accepted for signature compatibility; not read here (batch norm
            is always applied with ``is_training=True``).
        name: Suffix of the filter variable name (``"filter_" + name``).

    Returns:
        The batch-normalised activation tensor.
    """
    # Xavier-initialised, L2-regularised kernel laid out as
    # [height, width, out_channels, in_channels]
    kernel = tf.get_variable(
        "filter_" + name,
        shape=[filter_size, filter_size, output_fsize, input_fsize],
        initializer=tf.contrib.layers.xavier_initializer_conv2d(),
        regularizer=tf.contrib.layers.l2_regularizer(.001))
    offset = tf.Variable(tf.constant(0.1, shape=[output_fsize]))

    # Take the spatial extent from output_shape, the depth from output_fsize
    target_shape = tf.stack([output_shape[0], output_shape[1], output_shape[2], output_fsize])
    upsampled = tf.nn.conv2d_transpose(layer_input, kernel, target_shape,
                                       strides=[1, stride, stride, 1], padding='SAME')

    activated = tf.nn.elu(tf.add(upsampled, offset))
    return tf.contrib.layers.batch_norm(activated, center=True, scale=True, is_training=True)

def conv_layer(layer_input, input_fsize, output_fsize, filter_size, bn, name):
    """Stride-1 convolution + bias + ELU + batch norm.

    Args:
        layer_input: Input tensor.
        input_fsize: Channel count of ``layer_input``.
        output_fsize: Channel count of the result.
        filter_size: Side length of the square kernel.
        bn: Accepted for signature compatibility; not read here (batch norm
            is always applied with ``is_training=True``).
        name: Suffix of the filter variable name (``"filter_" + name``).

    Returns:
        The batch-normalised activation tensor.
    """
    # Xavier-initialised, L2-regularised kernel laid out as
    # [height, width, in_channels, out_channels]
    kernel = tf.get_variable(
        "filter_" + name,
        shape=[filter_size, filter_size, input_fsize, output_fsize],
        initializer=tf.contrib.layers.xavier_initializer_conv2d(),
        regularizer=tf.contrib.layers.l2_regularizer(.001))
    offset = tf.Variable(tf.constant(0.1, shape=[output_fsize]))

    convolved = tf.nn.conv2d(layer_input, kernel, strides=[1, 1, 1, 1], padding='SAME')
    activated = tf.nn.elu(tf.add(convolved, offset))
    return tf.contrib.layers.batch_norm(activated, center=True, scale=True, is_training=True)


In [8]:
def create_FCN_VGG(imgs, vgg_dict, dropout=0.5, channels=3, num_classes=5, filter_size=1, bn=True):
    """Build the FCN graph: a VGG16-style encoder followed by a skip-connected decoder.

    Args:
        imgs: Input image tensor; reshaped below to [-1, rows, cols, channels].
        vgg_dict: Mapping from layer name ("conv1_1" ... "conv5_3") to the
            stored filter/bias pair consumed by vgg_conv.
        dropout: Accepted but not referenced anywhere in this function.
        channels: Channel count used in the reshape (the pre-processing split
            below assumes 3).
        num_classes: Channel count of every decoder layer and of the result.
        filter_size: Kernel size of the decoder conv_layer calls.
        bn: Forwarded unchanged to vgg_conv, conv_layer and deconv2d.

    Returns:
        The output of the last transposed convolution: a num_classes-channel
        tensor with the batch/row/column extent of the input.
    """
    
    # Spatial size is read dynamically, so the graph accepts any image size
    im_rows = tf.shape(imgs)[1]
    im_cols = tf.shape(imgs)[2]
    
    x_image = tf.reshape(imgs, tf.stack([-1,im_rows,im_cols,channels]))
    # NOTE(review): layer_input and batch_size are assigned but never used below
    layer_input = x_image
    batch_size = tf.shape(x_image)[0]
    
    # VGG pre-processing
    # Split along the channel axis; the names assume the input is in RGB order
    red, green, blue = tf.split(x_image, 3, 3)

    # From the VGG paper
    # Means are listed in blue, green, red order to match the concat below
    vgg_means = [103.939, 116.779, 123.68]
    x_image = tf.concat([
        blue - vgg_means[0],
        green - vgg_means[1],
        red - vgg_means[2]], axis=3)
    
    # Extra rescaling applied after mean subtraction
    x_image = x_image / 122.5
    
    # Encoder
    # Five blocks of vgg_conv layers, each followed by a 2x2 / stride-2 max pool
    conv1_1 = vgg_conv(x_image, vgg_dict, bn, "conv1_1")
    conv1_2 = vgg_conv(conv1_1, vgg_dict, bn, "conv1_2")
    pool1 = tf.nn.max_pool(conv1_2, ksize=[1, 2, 2, 1],
                            strides=[1, 2, 2, 1],
                            padding='SAME', name="pool1")
    conv2_1 = vgg_conv(pool1, vgg_dict, bn, "conv2_1")
    conv2_2 = vgg_conv(conv2_1, vgg_dict, bn, "conv2_2")
    pool2 = tf.nn.max_pool(conv2_2, ksize=[1, 2, 2, 1],
                            strides=[1, 2, 2, 1],
                            padding='SAME', name="pool2")
    conv3_1 = vgg_conv(pool2, vgg_dict, bn, "conv3_1")
    conv3_2 = vgg_conv(conv3_1, vgg_dict, bn, "conv3_2")
    conv3_3 = vgg_conv(conv3_2, vgg_dict, bn, "conv3_3")
    pool3 = tf.nn.max_pool(conv3_3, ksize=[1, 2, 2, 1],
                            strides=[1, 2, 2, 1],
                            padding='SAME', name="pool3")
    conv4_1 = vgg_conv(pool3, vgg_dict, bn, "conv4_1")
    conv4_2 = vgg_conv(conv4_1, vgg_dict, bn, "conv4_2")
    conv4_3 = vgg_conv(conv4_2, vgg_dict, bn, "conv4_3")
    pool4 = tf.nn.max_pool(conv4_3, ksize=[1, 2, 2, 1],
                            strides=[1, 2, 2, 1],
                            padding='SAME', name="pool4")
    conv5_1 = vgg_conv(pool4, vgg_dict, bn, "conv5_1")
    conv5_2 = vgg_conv(conv5_1, vgg_dict, bn, "conv5_2")
    conv5_3 = vgg_conv(conv5_2, vgg_dict, bn, "conv5_3")
    pool5 = tf.nn.max_pool(conv5_3, ksize=[1, 2, 2, 1],
                            strides=[1, 2, 2, 1],
                            padding='SAME', name="pool5")

    # VGG depth at the fifth layer (comes from imported weights so
    # it will be constant no matter the size of input image)
    features = 512
        
    # Decoder
    # Combo layer 1
    # Reduce pool5 to num_classes channels, then upsample x2 to pool4's size
    d_conv1 = conv_layer(pool5, features, num_classes, filter_size, bn, "decoder_c1")
    d_dconv1 = deconv2d(d_conv1, tf.shape(pool4), num_classes, num_classes, 4, 2, bn, "decoder_d1")
    
    # Project pool4 to num_classes channels so it can be added to d_dconv1
    d_conv2 = conv_layer(pool4, features, num_classes, filter_size, bn, "decoder_c2")
    
    # Skip connection
    d_sum1 = tf.add(d_dconv1, d_conv2)
    
    # Combo layer 2
    # Upsample x2 to pool3's size
    d_dconv2 = deconv2d(d_sum1, tf.shape(pool3), num_classes, num_classes, 4, 2, bn, "decoder_d2")
    
    # pool3 is passed with half the depth (features//2) of pool4/pool5
    d_conv3 = conv_layer(pool3, features//2, num_classes, filter_size, bn, "decoder_c3")
    
    # Skip connection
    d_sum2 = tf.add(d_dconv2, d_conv3)
    
    # Combo layer 3
    # Final x8 upsampling back to the spatial size of the input image
    d_dconv3 = deconv2d(d_sum2, tf.shape(x_image), num_classes, num_classes, 16, 8, bn, "decoder_d3")

    output = d_dconv3
        
    return output

In [9]:
class FCN_VGG:
    """FCN segmentation model on a VGG16 encoder, with train/predict/save helpers.

    Building an instance resets the default TensorFlow graph, loads the
    VGG16 weights from ``vgg16.npy`` in the current directory and constructs
    the network, its softmax output (op name ``"output"``), loss and accuracy.

    NOTE(review): the ``bn`` and ``do`` placeholders are created and fed, but
    the ``bn`` one is not passed into ``create_FCN_VGG`` and the graph builder
    does not use its ``dropout`` argument, so neither currently changes what
    the network computes.
    """

    def __init__(self, channels=3, num_classes=5, output_path='./output'):
        """Create the output directories and build the graph.

        Args:
            channels: Number of input image channels.
            num_classes: Number of output classes (depth of ``y``).
            output_path: Directory where checkpoints are written and read.
        """
        # prediction path is to store validation prediction images in if necessary
        self.prediction_path = "./predictions"

        # output path for trained model
        self.output_path = output_path

        # makedirs (rather than mkdir) so a nested output_path also works
        for directory in (self.prediction_path, self.output_path):
            if not os.path.exists(directory):
                os.makedirs(directory)

        tf.reset_default_graph()

        self.num_classes = num_classes

        # load the VGG weights : vgg16.npy
        # downloaded into current directory from ftp://mi.eng.cam.ac.uk/pub/mttt2/models/vgg16.npy
        vgg_path = 'vgg16.npy'
        # The file stores a pickled dict, which newer NumPy refuses to load
        # unless allow_pickle=True is given.
        # SECURITY: unpickling executes code from the file - only load a
        # vgg16.npy obtained from a trusted source.
        self.vgg_dict = np.load(vgg_path, encoding='latin1', allow_pickle=True).item()

        # images as input
        self.x = tf.placeholder("float", shape=[None, None, None, channels], name="x")

        # one-hot ground truth
        self.y = tf.placeholder("float", shape=[None, None, None, num_classes], name="y")

        # dropout prob.
        self.drop_pl = tf.placeholder("float", name="do")

        # batch norm
        self.bn = tf.placeholder("bool", name="bn")

        # output logit
        logits = create_FCN_VGG(self.x, self.vgg_dict, dropout=self.drop_pl, channels=channels, num_classes=num_classes)
        self.logits = logits

        # convert to prob. using softmax
        self.soft = self.pixel_wise_softmax(logits)
        self.soft = tf.identity(self.soft, name="output")
        self.loss = self.get_loss(logits)

        # These two are for validation
        self.correct_pred = tf.equal(tf.argmax(tf.squeeze(self.soft), -1), tf.argmax(tf.squeeze(self.y), -1))
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, tf.float32))

    def pixel_wise_softmax(self, output):
        """Softmax over the last (class) dimension of the logits.

        Uses ``tf.nn.softmax``, which gives the same result as
        ``exp(x) / sum(exp(x))`` but does not overflow for large logits.

        Args:
            output: Logits tensor with the classes on the last axis.

        Returns:
            Tensor of the same shape holding per-pixel class probabilities.
        """
        return tf.nn.softmax(output)

    def get_loss(self, logits):
        """Summed per-pixel softmax cross-entropy plus the regularisation losses.

        Args:
            logits: Unnormalised class scores, same shape as ``self.y``.

        Returns:
            Scalar loss tensor.
        """
        cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(labels=self.y, logits=logits))

        # L2 terms registered by the layers' regularizers
        reg_loss = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))

        cost += reg_loss

        return cost

    def predict(self, model_path, img, gt=None, restore=True):
        """Run the network on ``img`` in a fresh session.

        Args:
            model_path: Checkpoint file name inside ``self.output_path``
                (e.g. 'fcn_vgg_city2.ckpt'); only used when ``restore``.
            img: Batch of input images fed to ``x``.
            gt: Optional one-hot ground truth fed to ``y``.
            restore: Whether to load the checkpoint after initialisation.

        Returns:
            ``soft`` when ``gt`` is None, otherwise ``(soft, accuracy)``.
        """
        init = tf.global_variables_initializer()
        with tf.Session() as sess:

            sess.run(init)
            if restore:
                self.restore(sess, os.path.join(self.output_path, model_path))

            # Identity test: `gt != None` on an ndarray compares elementwise and
            # the resulting array has no single truth value.
            if gt is not None:
                soft, acc = sess.run([self.soft, self.accuracy], feed_dict={self.x: img, self.y: gt,
                                                                            self.drop_pl: 1.0, self.bn: False})
                return soft, acc
            else:
                soft = sess.run(self.soft, feed_dict={self.x: img, self.drop_pl: 1.0, self.bn: False})
                return soft

    def save(self, sess, model_path):
        """Write every variable of ``sess`` to the checkpoint ``model_path``.

        Returns:
            The path reported by ``tf.train.Saver.save``.
        """
        saver = tf.train.Saver()
        save_path = saver.save(sess, model_path)
        return save_path

    def restore(self, sess, model_path):
        """Load the variables stored in checkpoint ``model_path`` into ``sess``."""
        saver = tf.train.Saver()
        saver.restore(sess, model_path)

    def train(self, data_generator, dropout=0.5, training_iters=10, learning_rate=0.0001, epochs=10, display_step=10, restore=False, model_path=None):
        """Train with Adam and checkpoint the model.

        Args:
            data_generator: Iterator yielding ``(x_batch, y_batch)``.
            dropout: Value fed to the dropout placeholder.
            training_iters: Optimisation steps per epoch.
            learning_rate: Adam learning rate.
            epochs: Number of epochs.
            display_step: Report (and possibly checkpoint) every this many steps.
            restore: Load ``model_path`` before training.
            model_path: Checkpoint file name, joined onto ``self.output_path``
                (output_path './output' and model_path 'fcn_vgg_city2.ckpt'
                give './output/fcn_vgg_city2.ckpt'). Required.

        Returns:
            Path of the checkpoint written after the last epoch.

        Raises:
            TypeError: If ``model_path`` is None.
        """
        if model_path is None:
            raise TypeError("model_path must be a checkpoint file name, not None")
        model_path = os.path.join(self.output_path, model_path)

        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss)

        init = tf.global_variables_initializer()

        with tf.Session() as sess:

            print("Session begun")

            sess.run(init)

            if restore:
                self.restore(sess, model_path)

            # Accuracy a display window must beat before an intermediate
            # checkpoint is written. Kept across epochs so a later, worse
            # window cannot overwrite a better checkpoint.
            best_acc = .93

            for epoch in range(epochs):

                print("Starting new epoch")

                total_loss = 0.0
                display_loss = 0.0
                display_acc = 0.0
                for step in range((epoch*training_iters), ((epoch+1)*training_iters)):

                    x_batch, y_batch = next(data_generator)

                    _, loss, acc = sess.run((self.optimizer, self.loss, self.accuracy),
                                            feed_dict={self.x: x_batch,
                                                       self.y: y_batch,
                                                       self.drop_pl: dropout,
                                                       self.bn: True})

                    # Accumulated every step so the epoch average stays correct
                    # when training_iters is not a multiple of display_step.
                    total_loss += loss
                    display_loss += loss
                    display_acc += acc

                    if ((step+1) % display_step == 0):
                        mean_loss = display_loss/display_step
                        mean_acc = display_acc/display_step
                        print("At iteration {} loss equals {} and accuracy equals {}".format(step+1, mean_loss, mean_acc))

                        if (mean_acc > best_acc):
                            # model_path already includes output_path; joining
                            # it again would point into a missing directory.
                            self.save(sess, model_path)
                            best_acc = mean_acc
                            print("model saved")
                        display_loss = 0.0
                        display_acc = 0.0

                print("###############################")
                print("Epoch", epoch + 1, "average loss =", total_loss/training_iters)
                print("###############################")

            # Final state is written to the same checkpoint path
            save_path = self.save(sess, model_path)

        return save_path

## Perform training

In [10]:
# Checkpoint file name handed to train() in the next cell
model_path = 'fcn_vgg_city3.ckpt'

# Build the network on a fresh default graph
tf.reset_default_graph()
my_fcn = FCN_VGG()

# Batch size is kept at 1: not enough RAM for a larger batch
# given the size of these images
batch_generator = get_data(batch_size=1, mode='train', im_size=500)
# Alternative: batch_generator = get_data(mode='val', batch_size=10, im_size=500)

In [None]:
# One epoch of 200 steps from scratch, reporting every 100 steps;
# train() returns the path of the checkpoint it wrote last
output_path = my_fcn.train(
    batch_generator,
    model_path=model_path,
    restore=False,
    epochs=1,
    training_iters=200,
    display_step=100,
    learning_rate=8e-4,
)