In [None]:
import os
from xml.etree import cElementTree as ElementTree

import numpy as np
import tensorflow as tf
from matplotlib import *
from matplotlib import patches, pyplot
from PIL import Image,ImageDraw
from tqdm import tqdm

In [None]:
!wget 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar'

In [None]:
!tar -xvf VOCtrainval_11-May-2012.tar

In [None]:
# Module-level cursor into the image listing; data_generator() advances it on every call.
offset = 0
def data_generator(noi=64, voc_dir='/content/VOCdevkit/VOC2012/', img_size=(224, 224)):
    """Load the next `noi` VOC images together with their bounding-box labels.

    The function reads the global `offset`, takes the file names
    `[offset, offset + noi)` from `<voc_dir>/JPEGImages`, and then advances
    `offset` by `noi` (even if fewer files were left).

    Args:
        noi: number of images to load in this call.
        voc_dir: root of the extracted VOC2012 folder; must contain
            `JPEGImages/` and `Annotations/`.
        img_size: `(width, height)` every image is resized to.

    Returns:
        A tuple `(all_imgs, all_labels)`:
        - `all_imgs`: list of arrays with pixel values divided by 255.0.
        - `all_labels`: one list per image holding a dict per annotated object
          with keys `name`, `x_max`, `x_min`, `y_max`, `y_min`; coordinates are
          divided by the image width/height stored in the annotation file.
    """
    global offset

    image_dir = os.path.join(voc_dir, 'JPEGImages')
    label_dir = os.path.join(voc_dir, 'Annotations')

    # os.listdir() gives no ordering guarantee; sorting makes every batch
    # reproducible across runs and machines.
    img_names = sorted(os.listdir(image_dir))[offset:offset + noi]

    all_imgs = []
    for each_img_file in img_names:
        # The context manager closes the file handle once the pixels are read.
        with Image.open(os.path.join(image_dir, each_img_file)) as img:
            # Force three channels so every array has the same shape.
            resized = img.convert('RGB').resize(img_size)
        all_imgs.append(np.array(resized) / 255.0)

    all_labels = []
    for each_img in img_names:
        # Swap the extension properly instead of assuming it is three characters long.
        label_xml = os.path.splitext(each_img)[0] + '.xml'
        root = ElementTree.parse(os.path.join(label_dir, label_xml)).getroot()

        size = root.find('size')
        h = int(size.find('height').text)
        w = int(size.find('width').text)

        img_objs = []
        for obj in root.iter('object'):
            box = obj.find('bndbox')
            # float() rather than int(): coordinates written with a decimal
            # part would make int() raise ValueError.
            img_objs.append({
                'name': obj.find('name').text,
                'x_max': float(box.find('xmax').text) / w,
                'x_min': float(box.find('xmin').text) / w,
                'y_max': float(box.find('ymax').text) / h,
                'y_min': float(box.find('ymin').text) / h,
            })

        all_labels.append(img_objs)

    offset = offset + noi

    return all_imgs, all_labels

In [None]:
# Load a batch of 64 images and their labels; this also advances the
# module-level `offset` used by data_generator by 64.
imgs,labels = data_generator(64)
# print( labels[1] )

In [None]:
# Batch size first, then the shape of a single image array, one per line.
print(len(imgs), imgs[0].shape, sep='\n')

64
(224, 224, 3)


In [None]:
# View sample images: draw every annotated box on the first ten images.
# Box coordinates are stored as fractions of the image size, so they are
# scaled by 224 (the side length the images were resized to).
for img, img_objs in zip(imgs[:10], labels):

    img_arr = np.array( img )
    figure, ax = pyplot.subplots(1)

    ax.imshow(img_arr)

    for ol in img_objs:

        start_x,start_y = ol['x_min']*224,ol['y_min']*224
        w = (ol['x_max']*224) - (ol['x_min']*224)
        h = (ol['y_max']*224) - (ol['y_min']*224)
        rect =  patches.Rectangle((start_x,start_y),w,h, edgecolor='r', facecolor="none")
        ax.add_patch(rect)

    # Set the title once, listing every object; setting it inside the loop
    # left only the last object's name visible.
    if img_objs:
        ax.title.set_text( ', '.join(ol['name'] for ol in img_objs) )


In [None]:
# Class name -> integer id for the 20 PASCAL VOC categories.
class_to_int = {'aeroplane':0,'bicycle':1,'bird':2,'boat':3,'bottle':4,'bus':5,'car':6 ,'cat':7,'chair':8, 'cow':9, 'diningtable':10, 'dog':11, "horse":12, 'motorbike':13, 'person':14 , 'pottedplant':15, 'sheep':16 , 'sofa':17 , 'train':18 , 'tvmonitor':19 }

def get_feed_label(labels,S=7,noa=1,classes=20):
    """Encode per-image object lists into a grid-based training target.

    Args:
        labels: one list per image of dicts with keys `name`, `x_max`,
            `x_min`, `y_max`, `y_min` (coordinates as fractions of the image).
        S: the image is divided into an S x S grid.
        noa: number of 5-value box slots reserved per cell; only the first
            slot is filled.
        classes: number of class slots per cell.

    Returns:
        Array of shape `(len(labels), S*S, classes + noa*5)`. For the cell that
        contains an object's centre: index 0 is set to 1, indices 1-4 hold
        `x_max, x_min, y_max, y_min`, and the object's class slot (after the
        box slots) is set to 3. If several objects fall into the same cell the
        last one wins.

    Raises:
        KeyError: if an object name is not a key of `class_to_int`.
    """
    box_slots = noa * 5
    feed_label = np.zeros( (len(labels), S*S, classes + box_slots) )

    for i,each_label in enumerate(labels):

        for each_obj in each_label:

            cx = (each_obj['x_max']+each_obj['x_min'])/2
            cy = (each_obj['y_max']+each_obj['y_min'])/2

            # Clamp so a centre lying exactly on the right/bottom border
            # (value 1.0) still maps to the last row/column.
            row = min(int(cy * S), S - 1)
            col = min(int(cx * S), S - 1)

            # Row-major flat index. The former `- 1` shifted every object one
            # cell to the left and wrapped cell (0, 0) around to the last cell.
            j = row * S + col

            cell = feed_label[i, j]
            cell[0] = 1
            cell[1] = each_obj['x_max']
            cell[2] = each_obj['x_min']
            cell[3] = each_obj['y_max']
            cell[4] = each_obj['y_min']

            # Clear class slots left by an earlier object in the same cell so
            # the class always matches the stored box.
            cell[box_slots:] = 0
            int_class = class_to_int[ each_obj['name'] ]
            # NOTE(review): the class slot is set to 3 rather than 1 -
            # presumably a deliberate weighting of the class target; confirm.
            cell[box_slots + int_class] = 3

    return feed_label

In [None]:
# Encode the loaded annotations with get_feed_label's defaults (S=7, noa=1, classes=20).
y_batch = get_feed_label(labels)

In [None]:
# Display the shape of the encoded label array.
y_batch.shape

In [None]:
# Re-draw the boxes from the encoded labels to sanity-check get_feed_label.
# Per-cell layout used below: [objectness, x_max, x_min, y_max, y_min, class slots...].
# Slicing (instead of range(0, 10)) avoids an IndexError with fewer than ten images.
for img, cells in zip(imgs[:10], y_batch[:10]):

    img_arr = np.array( img )
    figure, ax = pyplot.subplots(1)

    ax.imshow(img_arr)

    class_ids = []
    for ol in cells:

        # print(ol)
        if ol[0]==1:
            start_x,start_y = ol[2]*224,ol[4]*224
            w = (ol[1]*224) - (ol[2]*224)
            h = (ol[3]*224) - (ol[4]*224)
            rect =  patches.Rectangle((start_x,start_y),w,h, edgecolor='r', facecolor="none")
            ax.add_patch(rect)
            class_ids.append( int(np.argmax(ol[5:])) )

    # Show the class id of every drawn box; setting the title inside the loop
    # kept only the last one.
    if class_ids:
        ax.title.set_text( ', '.join(str(c) for c in class_ids) )


In [None]:
# ImageNet-pretrained ResNet50 without its classification head.
# NOTE(review): the only use of `resnet` visible in this notebook is a
# commented-out line in the model cell - confirm it is still needed.
resnet = tf.keras.applications.ResNet50(
    include_top=False,
    weights="imagenet",
    input_shape=(224, 224, 3),
)
resnet.trainable = True
resnet.summary()

In [None]:
# Plain convolutional network: 3x3 'same' ReLU convolutions with 2x2 max-pooling
# in between, then global average pooling and two dense layers.
inputs = tf.keras.layers.Input((224,224,3))
# res_out = resnet(inputs)

# Layer stack in creation order: an int is the filter count of a conv layer,
# 'pool' is a 2x2 max-pool.
_layer_plan = (128, 128, 'pool',
               512, 'pool',
               512, 512, 'pool',
               1024, 'pool',
               1024)

out = inputs
for _spec in _layer_plan:
    if _spec == 'pool':
        out = tf.keras.layers.MaxPool2D((2,2))(out)
    else:
        out = tf.keras.layers.Conv2D(_spec,(3,3),padding='same',activation='relu')(out)

flat_out = tf.keras.layers.GlobalAveragePooling2D()(out)
dense_out = tf.keras.layers.Dense(1024,activation='relu')(flat_out)
# 1225 = 7 * 7 * 25, the flattened size of one encoded label (S*S cells x 25 values).
outputs = tf.keras.layers.Dense(7 * 7 * 25)(dense_out)

model = tf.keras.Model(inputs=inputs,outputs=outputs)
model.summary()

In [None]:
# Print the layer-by-layer summary of `model`.
model.summary()

In [None]:
# Flatten each image's (cells, values) label matrix into one 1-D vector.
stack_y = [np.hstack(per_image) for per_image in y_batch]

In [None]:
# Number of flattened label vectors, then the shape of the first one.
print(len(stack_y), stack_y[0].shape, sep='\n')

In [None]:
# Shape of the first image array in the current batch.
print(imgs[0].shape)

In [None]:
# Scratch check: `*` between two tensors of equal shape multiplies element-wise.
print( tf.constant([1,2,3])*tf.constant([3,2,1]) )

In [None]:
# hyperparams
epochs, l_rate = 20, 5e-6
batch_size, num_ex = 1, 10
# Whole batches that fit into the `num_ex` training examples.
num_of_batches = num_ex // batch_size
print(num_of_batches)
# ******************

# cost_fn = tf.keras.losses.MeanSquaredError()

def cost_fn(y_batch, pred_1d, verbose=True):
    """Sum-of-squares detection loss over a batch of flattened grid predictions.

    Both inputs hold one flat vector per image that is reshaped to
    `(cells, 25)`: index 0 is objectness, 1-4 the box values, 5+ the classes.
    Per cell the loss is

        (y0 - p0)^2 + 4 * y0 * sum((y[1:5] - p[1:5])^2) + sum((y[5:] - p[5:])^2)

    so the box term only counts in cells whose target objectness is non-zero.

    Args:
        y_batch: targets, at least as many rows as `pred_1d`.
        pred_1d: model output, one row per image.
        verbose: when True (default) print the loss of every image.

    Returns:
        Scalar float64 tensor with the loss summed over all images, or `0.0`
        for an empty batch.
    """
    num_images = pred_1d.shape[0]
    if num_images == 0:
        return 0.0

    # One vectorised pass over all cells instead of a Python loop that issues
    # several scalar ops per cell.
    p = tf.cast( tf.reshape( pred_1d, (num_images, -1, 25) ), dtype=tf.float64 )
    y = tf.cast( tf.reshape( y_batch[:num_images], (num_images, -1, 25) ), dtype=tf.float64 )

    sq_err = tf.math.square( y - p )

    obj_loss = sq_err[..., 0]
    box_loss = tf.reduce_sum( sq_err[..., 1:5], axis=-1 )
    class_loss = tf.reduce_sum( sq_err[..., 5:], axis=-1 )
    # class_loss =  tf.keras.losses.SparseCategoricalCrossentropy( tf.y[grid_num][5:] , p[grid_num][5:] )

    cell_loss = obj_loss + 4.0 * y[..., 0] * box_loss + class_loss
    image_loss = tf.reduce_sum( cell_loss, axis=1 )

    if verbose:
        for grid_loss in tf.unstack( image_loss ):
            print("Image Loss ==={}".format(grid_loss) )

    return tf.reduce_sum( image_loss )

optimizer = tf.keras.optimizers.RMSprop(l_rate)
mean = tf.keras.metrics.Mean()
ctr = 0
# ******************

with tf.device('/device:GPU:0'):

    for e in range(epochs):

        # Rewind the module-level cursor read by data_generator so every epoch
        # iterates over the same examples.
        offset = 0
        ctr = 0

        # A fresh metric per epoch resets the running mean without relying on
        # reset_states()/reset_state(), whose name differs between Keras versions.
        mean = tf.keras.metrics.Mean()

        for n in range(num_of_batches):

            # Use the configured batch size (num_of_batches is derived from it)
            # and batch-local names so the notebook-level imgs/labels/y_batch
            # from earlier cells are not overwritten.
            batch_imgs, batch_labels = data_generator( batch_size )
            if not batch_imgs:
                # Data set exhausted before num_of_batches was reached.
                break

            y_feed_label = get_feed_label(batch_labels)

            x_train = tf.convert_to_tensor( np.asarray(batch_imgs, dtype=np.float32) )
            # One flat label vector per image, matching the model's flat output.
            y_train = tf.convert_to_tensor( y_feed_label.reshape(len(y_feed_label), -1) )

            with tf.GradientTape() as tape:
                pred = model(x_train)
                cost = cost_fn( y_train, pred )

            # Metric bookkeeping does not need to be recorded by the tape.
            mean.update_state(cost)

            grads = tape.gradient( cost ,model.trainable_variables)
            optimizer.apply_gradients( zip(grads,model.trainable_variables) )

            if ctr%10==0:
                print("Loss  ====>  {}".format(mean.result()))

            ctr = ctr+1

        print("Epoch {}  ====>  {}".format(e,mean.result()))


In [None]:
# List each trainable variable by name and shape instead of dumping the full
# weight arrays into the cell output.
[(v.name, tuple(v.shape)) for v in model.trainable_variables]

In [None]:
# Persist the model to `yolo.h5` (path relative to the working directory).
model.save('yolo.h5')

In [None]:
def pred_one(one_img, threshold=0.5):
    """Run the model on one image and draw every predicted box above `threshold`.

    Args:
        one_img: batch holding a single image (4 dims); `one_img[0]` is the
            image that gets displayed.
        threshold: minimum value of a cell's first output (objectness) for its
            box to be drawn. Defaults to 0.5.

    Returns:
        None. The raw prediction, the reshaped prediction's shape and the
        details of every accepted cell are printed; boxes are drawn on a new
        matplotlib figure.
    """
    # img of 4 dims
    pred = model( one_img )
    print( pred )
    # One row per grid cell: [objectness, x_max, x_min, y_max, y_min, class scores...].
    pred_mat = np.reshape( pred , (-1, 25))

    img_arr = np.array( one_img[0] )
    figure, ax = pyplot.subplots(1)
    ax.imshow(img_arr)

    print( pred_mat.shape )

    for each_grid in pred_mat:

        if each_grid[0]>threshold:

            x_max = each_grid[1]
            x_min = each_grid[2]
            y_max = each_grid[3]
            y_min = each_grid[4]

            print((x_max,x_min,y_max,y_min))
            print((x_max*224,x_min*224,y_max*224,y_min*224))

            print(each_grid)
            class_int = np.argmax( each_grid[5:] )
            print( "class is === >  {}".format(class_int) )

            # Coordinates are fractions of the image; scale to the 224-pixel canvas.
            start_x,start_y = x_min*224,y_min*224
            w = (x_max*224) - (x_min*224)
            h = (y_max*224) - (y_min*224)
            ax.add_patch( patches.Rectangle((start_x,start_y),w,h, edgecolor='r', facecolor="none") )

    return

In [None]:
# Rewind the data_generator cursor, load ten images and run the model on the
# ninth one (index 8), adding a leading batch axis first.
offset = 0
imgs, labels = data_generator(noi=10)
single_img = imgs[8][np.newaxis, ...]
pred_one(single_img)