In [1]:
from tensorflow.contrib import layers
from tensorflow.contrib.framework.python.ops import arg_scope
from tensorflow.contrib.layers.python.layers import layers as layers_lib
from tensorflow.contrib.layers.python.layers import regularizers
from tensorflow.contrib.layers.python.layers import utils
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import variable_scope
import tensorflow.contrib.slim as slim
import tensorflow as tf
import os

from scipy.misc import imresize

from math import floor,exp
import pprint

import matplotlib.image as mpimg
import numpy as np
import matplotlib.pylab as plt

In [2]:

anchor_box_scales = [128, 256, 512]
anchor_box_ratio = [[1,1],[1,2],[2,1]]


TEST_FULL_IMG = np.array([mpimg.imread("./test1.jpg")])

In [3]:
print(TEST_FULL_IMG.shape)
print(TEST_FULL_IMG.shape[1]/4, TEST_FULL_IMG.shape[2]/4)

(1, 720, 1280, 3)
180.0 320.0


In [31]:
def vgg_16(inputs,  scope='vgg_16'):
    with tf.variable_scope(scope, 'vgg_16', [inputs]) as sc:
        end_points_collection = sc.name + '_end_points'

        # Collect outputs for conv2d, fully_connected and max_pool2d.
        with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d], outputs_collections=end_points_collection):
            net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
            net = slim.max_pool2d(net, [2, 2], scope='pool1')
            net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
            net = slim.max_pool2d(net, [2, 2], scope='pool2')
            net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
#             net = slim.max_pool2d(net, [2, 2], scope='pool3')
            net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
#             net = slim.max_pool2d(net, [2, 2], scope='pool4')
            net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')

            # Convert end_points_collection into a end_point dict.
            end_points = slim.utils.convert_collection_to_dict(end_points_collection)
        
    return net, end_points


def rpn(net, num_anchors=9, scope="rpn"):
    with tf.variable_scope(scope, 'rpn', [net]) as sc:
        end_points_collection = sc.name + '_end_points'
        
        with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d], 
                            outputs_collections=end_points_collection, 
                            activation_fn=tf.nn.relu,
                            weights_initializer=tf.truncated_normal_initializer(0.0, 0.01)):
            
            net = slim.conv2d(net, 512, [3, 1], scope='rpn_conv_3x3', padding='SAME')
            
            rpn_class = slim.conv2d(net, num_anchors, [1, 1], scope='rpn_class')
            
            rpn_regr = slim.conv2d(net, num_anchors*4, [1, 1], scope='rpn_regr')   

            # Convert end_points_collection into a end_point dict.
            end_points = slim.utils.convert_collection_to_dict(end_points_collection)
        
    return rpn_class, rpn_regr, end_points

def mapAnchorToBoxs(rpn_class, rpn_regr, feature_map_ratio, ratio=[(1,1),(2,1),(1,2)], pixel=[64,128,512]):
    
    class_res  = [];
    regr_res = [];
    
    
    for row in range(rpn_class.shape[0]):
        for col in range(rpn_class.shape[1]):
            i = 0
            for r in ratio:
                j = 0
                for p in pixel:
                    anchor_width = r[0]*p / feature_map_ratio
                    anchor_height = r[1]*p / feature_map_ratio
                    
                    p_anchor = rpn_regr[row][col][i+j]
                    
                    cx = row + p_anchor[0]*anchor_width
                    cy = col + p_anchor[1]*anchor_height
                    pred_w = exp(p_anchor[2]) * anchor_width
                    pred_h = exp(p_anchor[3]) * anchor_height
                    
                    x1 = int(cx - 0.5 * pred_w)
                    y1 = int(cy - 0.5 * pred_h)
                    x2 = int(cx + 0.5 * pred_w)
                    y2 = int(cy + 0.5 * pred_h)
                    
                    regr_res.append([x1,y1,x2,y2])
                    class_res.append(rpn_class[row][col][i+j])
#                     print("rpn_class[row][col][i+j]",rpn_class[row][col][i+j])
                    
                    
                    j+=1
                i+=1
    
    return np.array(class_res), np.array(regr_res)


# def bbox_transform_inv(boxes, deltas):
#     if boxes.shape[0] == 0:
#         return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)

#     boxes = boxes.astype(deltas.dtype, copy=False)

#     widths = boxes[:, 2] - boxes[:, 0] + 1.0
#     heights = boxes[:, 3] - boxes[:, 1] + 1.0
#     ctr_x = boxes[:, 0] + 0.5 * widths
#     ctr_y = boxes[:, 1] + 0.5 * heights

#     dx = deltas[:, 0::4]
#     dy = deltas[:, 1::4]
#     dw = deltas[:, 2::4]
#     dh = deltas[:, 3::4]

#     pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
#     pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
#     pred_w = np.exp(dw) * widths[:, np.newaxis]
#     pred_h = np.exp(dh) * heights[:, np.newaxis]

#     pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
#     # x1
#     pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w
#     # y1
#     pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h
#     # x2
#     pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w
#     # y2
#     pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h

#     return pred_boxes


def getLargest(a,b):
    if (a>b):
        return a
    else :
        return b

def getSmallest(a,b):
    if(a<b):
        return a
    else:
        return b
    
    
def clip_boxes(boxes, im_shape_col, im_shape_row):
    """
    Clip boxes to image boundaries.
    """
    for box in boxes:
        box[0] = getLargest( getSmallest(box[0], im_shape_col) ,0)
        box[1] = getLargest( getSmallest(box[1], im_shape_row) ,0)
        box[2] = getLargest( getSmallest(box[2], im_shape_col) ,0)
        box[3] = getLargest( getSmallest(box[3], im_shape_row) ,0)
            
    return boxes

def filter_boxs_by_size(boxes, threshold=30):
    
    ws = boxes[:, 2] - boxes[:, 0] + 1
    hs = boxes[:, 3] - boxes[:, 1] + 1
    keep = np.where((ws >= threshold) & (hs >= threshold))[0]
    return keep

def nms(dets, thresh):
    """Pure Python NMS baseline."""
    x1 = dets[:, 0]
    y1 = dets[:, 1]
    x2 = dets[:, 2]
    y2 = dets[:, 3]
    scores = dets[:, 4]

    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])

        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[order[1:]] - inter)

        inds = np.where(ovr <= thresh)[0]
        order = order[inds + 1]

    return keep

def rpn_proposal_layer(rpn_class, rpn_regr, img_input):
    ''' 
    This function map anchor to the position, which return x,y,w,h with globel coor. 
    rather than anchor inner position.
    img_input is use for get the imgSize ref.
    '''
    
    print("rpn_class.shape", rpn_class.shape)
    print("rpn_regr.shape", rpn_regr.shape)
    print("img_input.shape", img_input.shape)
    
    img_width = img_input.shape[2]
    img_height = img_input.shape[1]
    
    feature_map_width = rpn_regr.shape[2]
    feature_map_height = rpn_regr.shape[1]
    
    img_to_feature_map_ratio = img_width/feature_map_width
    
    nb_anchors = int(rpn_class.shape[3])
    
#     reshape the array by anchors 
    rpn_class_reshaped = np.reshape(rpn_class, (int(rpn_class.shape[1]), int(rpn_class.shape[2]),nb_anchors, 1))
    rpn_regr_reshaped = np.reshape(rpn_regr, (int(rpn_regr.shape[1]), int(rpn_regr.shape[2]),nb_anchors, 4)) 
    
    class_res, regr_res = mapAnchorToBoxs(rpn_class_reshaped, rpn_regr_reshaped, img_to_feature_map_ratio)
    
    # 2. clip predicted boxes to image
    regr_res = clip_boxes(regr_res, int(rpn_class.shape[2]), int(rpn_class.shape[1]))
    print("class_res.shape",class_res.shape)
    print("regr_res.shape",regr_res.shape)
    
    # filter by size
    id_keep = filter_boxs_by_size(regr_res, 30)
    print("id_keep.shape",id_keep.shape)
    class_res_keep = class_res[id_keep]
    regr_res_keep = regr_res[id_keep, :]
    
    # sort 
    # getTop N 
    pre_nms_topN = 6000
    
    order = class_res_keep.ravel().argsort()[::-1]
    if pre_nms_topN > 0:
        order = order[:pre_nms_topN]
    proposals = regr_res_keep[order, :]
    scores = class_res_keep[order]
    
    print("proposals.shape",proposals.shape)    
    print("scores.shape",scores.shape)
    
    # 6. apply nms (e.g. threshold = 0.7)
    # 7. take after_nms_topN (e.g. 300)
    # 8. return the top proposals (-> RoIs top)
    post_nms_topN = 300
    nms_thresh = 0.6
    keep = nms(np.hstack((proposals, scores)), nms_thresh)
    if post_nms_topN > 0:
        keep = keep[:post_nms_topN]
    proposals = proposals[keep, :]
    scores = scores[keep]
    print("proposals.shape",proposals.shape)    
    print("scores.shape",scores.shape)
    
    return proposals.astype(np.float32) 

In [32]:
def getBoxIds(poposal_res):
    print("poposal_res.shape",poposal_res.shape)
    return np.zeros(len(poposal_res)).astype(np.int32)

In [34]:
nb_anchors = len(anchor_box_scales) * len(anchor_box_ratio)

img_input = tf.placeholder(tf.float32, [1, None, None, 3])

crop_size = tf.constant([14,14])

conv_layer,conv_end_points = vgg_16(img_input)
conv_restore_names = [ item for item in conv_end_points] 

rpn_class, rpn_regr, rpn_end_points = rpn(conv_layer,nb_anchors)

# poposal layer
roi_proposal = tf.py_func(rpn_proposal_layer,[rpn_class, rpn_regr, img_input], tf.float32, name="roi_proposal")

# pooling layer
get_box_ids = tf.py_func(getBoxIds, [roi_proposal],  tf.int32 );

roi_pooling = tf.image.crop_and_resize(conv_layer,roi_proposal, get_box_ids, crop_size)
roi_pooling = tf.nn.max_pool(roi_pooling, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],  padding='SAME')

flatten = slim.flatten(roi_pooling, scope="flatten")
fc6 = slim.fully_connected(flatten, 4096)
fc7 = slim.fully_connected(fc6, 4096)

cls_score = slim.fully_connected(fc7, 21)  # 20 class + bg 
cls_score = tf.nn.softmax(cls_score)

bbox_pred = slim.fully_connected(fc7, 84) # each class have it's own bbox

# restore weights
variables_to_restore = slim.get_variables_to_restore(include=conv_restore_names)
vgg_checkpoint_path = os.path.join("./", 'vgg_16.ckpt')
restorer = tf.train.Saver(variables_to_restore)


init_op = tf.global_variables_initializer()

with tf.Session() as sess:
    # Restore variables from disk.
    restorer.restore(sess, "./vgg_16.ckpt")
    
    sess.run(init_op)
    print("restore conv layers")
    
    res, rpn_class_res, rpn_regr_res, roi_proposal_res, roi_pooling_res = sess.run([conv_layer, rpn_class, rpn_regr, roi_proposal, roi_pooling], feed_dict={img_input:TEST_FULL_IMG})
#     plt.imshow(res[0], cmap='gray')
    

restore conv layers
rpn_class.shape (1, 180, 320, 9)
rpn_regr.shape (1, 180, 320, 36)
img_input.shape (1, 720, 1280, 3)
class_res.shape (518400, 1)
regr_res.shape (518400, 4)
id_keep.shape (213659,)
proposals.shape (6000, 4)
scores.shape (6000, 1)
proposals.shape (15, 4)
scores.shape (15, 1)
poposal_res.shape (15, 4)


In [35]:
print(roi_pooling_res.shape)

(15, 7, 7, 512)


In [None]:
roi_proposal_res[0].shape

In [None]:
res.shape

In [9]:
rpn_class_res.shape

(1, 180, 320, 9)

In [10]:
rpn_regr_res.shape

(1, 180, 320, 36)

In [11]:
# print(rpn_end_points)

In [12]:
res[0].shape

(180, 320, 512)