In [2]:
import torch
import pickle
import numpy as np
import torchvision.models as models
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from generate_proposals import GenerateProposals
from roi_align import RoIAlignFunction, preprocess_rois
import utils.vis as vis_utils
import utils.result_utils as result_utils
import skimage.io as io
from utils.blob import prep_im_for_blob
import utils.dummy_datasets as dummy_datasets
from PIL import Image

In [3]:
R_101_FPN_PATH = "/home/at3577/driveME/src/fair-maskrcnn/models/35861858-R-101-FPN/model_final.pkl"
X_152_32x8d_FPN_IN5k_PATH = "/home/at3577/driveME/src/fair-maskrcnn/models/37129812-X-152-32x8d-FPN-IN5k/model_final.pkl"
R_101_PATH = "/home/at3577/driveME/src/fair-maskrcnn/models/R-101/R-101.pkl"
X_152_32x8d_IN5k_PATH = "/home/at3577/driveME/src/fair-maskrcnn/models/X-152-32x8d-IN5k/X-152-32x8d-IN5k.pkl"

R_50_C4_PATH = "/home/at3577/driveME/src/fair-maskrcnn/models/tmp/model_final.pkl"
R_50_PATH = "/home/at3577/driveME/src/fair-maskrcnn/models/tmp/R-50.pkl"

In [4]:
class ResnetModel(nn.Module):
    def __init__(self, backbone_architecture, pretrained_model_file, resnet_feature_extraction_layers=['conv1','bn1','relu','maxpool','layer1','layer2','layer3']):
        super(ResnetModel, self).__init__()
        
        
        self.resnet_model = eval('models.' + backbone_architecture + '()') # construct ResNet model (maybe not very safe :) 

        # swap stride (2,2) and (1,1) in first layers (PyTorch ResNet is slightly different to caffe2 ResNet)
        # this is required for compatibility with caffe2 models
        self.resnet_model.layer2[0].conv1.stride=(2,2)
        self.resnet_model.layer2[0].conv2.stride=(1,1)
        self.resnet_model.layer3[0].conv1.stride=(2,2)
        self.resnet_model.layer3[0].conv2.stride=(1,1)
        self.resnet_model.layer4[0].conv1.stride=(2,2)
        self.resnet_model.layer4[0].conv2.stride=(1,1)
        
        self.init_weights(pretrained_model_file)
        
        # All except the last layer are used as feature extractor... Last layer is for ROI pooling
        self.model = torch.nn.Sequential(*[getattr(self.resnet_model, layer) for layer in resnet_feature_extraction_layers])
        self.model.eval()
        
    def forward(self, image):
        return self.model(image)
    
    def init_weights(self, pretrained_model_file):
        with open(pretrained_model_file, 'rb') as model_pickle_file:
            fb_model = pickle.load(model_pickle_file)
            # Model has two keys- config and blobs
            fb_model = fb_model['blobs']
        
        model_dict = self.resnet_model.state_dict()
        
        for key in model_dict.keys():
            # skip running mean/std and fc weights
            # I am not sure what running is but fc is the last fuly connected layer of resnet.. so fb model doesnt have it
            if 'running' in key or 'fc' in key:
                continue
            
            fb_key = self.convert_key_to_fb_format(key.split('.'))
           
            assert model_dict[key].size()==torch.FloatTensor(fb_model[fb_key]).size()
            
            if key=='conv1.weight': # convert from BGR to RGB                
                model_dict[key]=torch.FloatTensor(fb_model[fb_key][:,(2, 1, 0),:,:])
            else:
                model_dict[key]=torch.FloatTensor(fb_model[fb_key])
        
        # update model
        self.resnet_model.load_state_dict(model_dict)

    def convert_key_to_fb_format(self, terms, i=0, parsed=''):
        # Convert PyTorch ResNet weight names to caffe2 weight names
        if i==0:
            if terms[i]=='conv1':
                parsed='conv1'
            elif terms[i]=='bn1':
                parsed='res_conv1'
            elif terms[i].startswith('layer'):
                parsed='res'+str(int(terms[i][-1])+1)
        else:
            if terms[i]=='weight' and (terms[i-1].startswith('conv') or terms[i-1]=='0'):
                parsed+='_w'
            elif terms[i]=='weight' and (terms[i-1].startswith('bn') or terms[i-1]=='1'):
                parsed+='_bn_s'
            elif terms[i]=='bias' and (terms[i-1].startswith('bn') or terms[i-1]=='1'):
                parsed+='_bn_b'
            elif terms[i-1].startswith('layer'):
                parsed+='_'+terms[i]
            elif terms[i].startswith('conv') or terms[i].startswith('bn'):
                parsed+='_branch2'+chr(96+int(terms[i][-1]))
            elif terms[i]=='downsample':
                parsed+='_branch1'
        # increase counter
        i+=1
        # do recursion
        if i==len(terms):
            return parsed
        return self.convert_key_to_fb_format(terms,i,parsed)

In [5]:
class RegionProposalNetwork(nn.Module):
    def __init__(self, pretrained_model_file, feature_extractor_output_channels, rpn_conv_output_channels, number_of_anchors):
        super(RegionProposalNetwork, self).__init__()
        
        #RPN is used propose regions with probability of foreground/background.. i.e just tell if object is present
        # It has 3 parts:
        # 1) 3x3 conv with 512/1024 channels
        # 2) 1x1 conv with 2k channels (for each anchor box we predict foreground/background)
        # 3) 1x1 conv with 4k channels (for each anchor box we predict delta of boxes)
        
        self.conv_rpn = torch.nn.Conv2d(in_channels=feature_extractor_output_channels,
                                        out_channels=rpn_conv_output_channels,
                                        kernel_size=3,
                                        stride=1,
                                        padding=1)
        self.rpn_cls_prob = torch.nn.Conv2d(in_channels=rpn_conv_output_channels,
                                            out_channels=number_of_anchors,
                                            kernel_size=1,
                                            stride=1,
                                            padding=0)
        self.rpn_bbox_pred = torch.nn.Conv2d(in_channels=rpn_conv_output_channels,
                                             out_channels=4*number_of_anchors,
                                             kernel_size=1,
                                             stride=1,
                                             padding=0)
        
        self.init_weights(pretrained_model_file)
        
    def forward(self, anchor_features):
        # image features shape should be (N,Cin,H,W)  
        conv_anchor_features = F.relu(self.conv_rpn(anchor_features))
        anchor_cls_prob =  F.softmax(self.rpn_cls_prob(conv_anchor_features))
        anchor_box_pred =  self.rpn_bbox_pred(conv_anchor_features)
        return anchor_cls_prob, anchor_box_pred

    def init_weights(self, pretrained_model_file):
        with open(pretrained_model_file, 'rb') as model_pickle_file:
            fb_model = pickle.load(model_pickle_file)
            # Model has two keys- config and blobs
            fb_model = fb_model['blobs']
        
            self.conv_rpn.weight.data = torch.FloatTensor(fb_model['conv_rpn_w'])
            self.conv_rpn.bias.data = torch.FloatTensor(fb_model['conv_rpn_b'])
            self.rpn_cls_prob.weight.data = torch.FloatTensor(fb_model['rpn_cls_logits_w'])
            self.rpn_cls_prob.bias.data = torch.FloatTensor(fb_model['rpn_cls_logits_b'])
            self.rpn_bbox_pred.weight.data = torch.FloatTensor(fb_model['rpn_bbox_pred_w'])
            self.rpn_bbox_pred.bias.data = torch.FloatTensor(fb_model['rpn_bbox_pred_b'])
        

In [6]:
#ROI_POOLING
class ROI_Pooling(nn.Module):
    def __init__(self, roi_height, roi_width, roi_spatial_scale, roi_sampling_ratio, conv_head_layers, resnet_model):
        super(ROI_Pooling, self).__init__()
        self.roi_height = roi_height
        self.roi_width  = roi_width
        self.roi_spatial_scale = roi_spatial_scale
        self.roi_sampling_ratio = roi_sampling_ratio
        self.conv_head = nn.Sequential(*[getattr(resnet_model, layer) for layer in conv_head_layers]) 
        
    def forward(self, img_features, rois):
        roi_features = RoIAlignFunction.apply(img_features, preprocess_rois(rois), self.roi_height, self.roi_width, self.roi_spatial_scale, self.roi_sampling_ratio)
        
        # compute 1x1 roi features
        roi_features = self.conv_head(roi_features) # 1x1 feature per proposal
        roi_features = roi_features.view(roi_features.size(0),-1)
        
        return roi_features

In [7]:
class RCNN(nn.Module):
    def __init__(self, pretrained_model_file, roi_feature_size, N_classes):
        super(RCNN, self).__init__()
        # What will be the size of roi_feature_channels??
        self.bbox_head=torch.nn.Linear(roi_feature_size, 4*N_classes)
        self.class_prob_head=torch.nn.Linear(roi_feature_size, N_classes)
        
        self.init_weights(pretrained_model_file)
    
    def forward(self, roi_features):
        # compute classification probabilities
        cls_score =  F.softmax(self.class_prob_head(roi_features))

        # compute bounding box parameters 
        bbox_pred = self.bbox_head(roi_features)
        
        return (cls_score,bbox_pred)
    
    def init_weights(self, pretrained_model_file):
        with open(pretrained_model_file, 'rb') as model_pickle_file:
            fb_model = pickle.load(model_pickle_file)
            # Model has two keys- config and blobs
            fb_model = fb_model['blobs']        
 
            self.class_prob_head.weight.data = torch.FloatTensor(fb_model['cls_score_w'])
            self.class_prob_head.bias.data = torch.FloatTensor(fb_model['cls_score_b'])

            self.bbox_head.weight.data = torch.FloatTensor(fb_model['bbox_pred_w'])
            self.bbox_head.bias.data = torch.FloatTensor(fb_model['bbox_pred_b'])

In [8]:
class Detector(nn.Module):
    def __init__(self,
                 backbone_architecture,
                 pretrained_model_file, 
                 resnet_feature_extraction_layers, 
                 feature_extractor_output_channels,
                 rpn_conv_output_channels,
                 number_of_anchors,
                 conv_head_layers,
                 roi_height,
                 roi_width,
                 roi_spatial_scale,
                 roi_sampling_ratio,
                 roi_feature_size,
                 N_classes):
        super(Detector, self).__init__() 
        
        self.backbone_architecture               = backbone_architecture
        self.pretrained_model_file               = pretrained_model_file     
        self.resnet_feature_extraction_layers    = resnet_feature_extraction_layers     
        self.feature_extractor_output_channels   = feature_extractor_output_channels    
        self.rpn_conv_output_channels            = rpn_conv_output_channels    
        self.number_of_anchors                   = number_of_anchors    
        self.conv_head_layers                    = conv_head_layers    
        self.roi_height                          = roi_height    
        self.roi_width                           = roi_width    
        self.roi_spatial_scale                   = roi_spatial_scale    
        self.roi_sampling_ratio                  = roi_sampling_ratio    
        self.roi_feature_size                    = roi_feature_size    
        self.N_classes                           = N_classes

        self.resnet_model = ResnetModel(backbone_architecture = backbone_architecture,
                                        pretrained_model_file= pretrained_model_file,
                                        resnet_feature_extraction_layers= resnet_feature_extraction_layers)
        
        self.rpn = RegionProposalNetwork(pretrained_model_file= pretrained_model_file,
                                         feature_extractor_output_channels= feature_extractor_output_channels,
                                         rpn_conv_output_channels= rpn_conv_output_channels,
                                         number_of_anchors= number_of_anchors)
        self.proposal_generator = GenerateProposals(train=False)
        
        self.roi_pooling = ROI_Pooling(roi_height, roi_width, roi_spatial_scale, roi_sampling_ratio, conv_head_layers, self.resnet_model.resnet_model)
        
        self.rcnn = RCNN(pretrained_model_file, roi_feature_size, N_classes)
       
    def forward(self, image, scaling_factor=None):
        h,w = image.size(2), image.size(3)

        img_features = self.resnet_model(image)
        
        print 'Image features size ' + str(img_features.shape)
        
        print 'Number of channels should match ' + str(self.feature_extractor_output_channels)
        
        rpn_cls_prob, rpn_bbox_pred = self.rpn(img_features)
        
        print 'RPNs class ' + str(rpn_cls_prob.shape)
        print 'RPNs box ' + str(rpn_bbox_pred.shape) 
        
        rois, rpn_roi_probs = self.proposal_generator(rpn_cls_prob, rpn_bbox_pred, h, w, scaling_factor)
        
        print 'ROIS ' + str(rois.shape)
        
        roi_features = self.roi_pooling(img_features, rois)
        
        print 'After ROI Pooing ' + str(roi_features.shape)
        print 'This should match ' + str(self.roi_feature_size) 
        
        cls_score,bbox_pred = self.rcnn(roi_features)
        
        print 'Final class ' + str(cls_score.shape) 
        print 'Final box ' + str(bbox_pred.shape) 
        
        return (cls_score,bbox_pred,rois,img_features,rpn_cls_prob)

In [9]:
model =  Detector(backbone_architecture='resnet50',
                 pretrained_model_file = R_50_C4_PATH, 
                 resnet_feature_extraction_layers = ['conv1','bn1','relu','maxpool','layer1','layer2','layer3'], 
                 feature_extractor_output_channels = 1024,
                 rpn_conv_output_channels = 1024,
                 number_of_anchors = 15,
                 conv_head_layers = ['layer4','avgpool'],
                 roi_height = 14,
                 roi_width = 14,
                 roi_spatial_scale = 0.0625,
                 roi_sampling_ratio = 0,
                 roi_feature_size = 2048,
                 N_classes = 81)

In [10]:
def eval_model(sample):
    class_scores, bbox_deltas, rois, img_features,rpn_cls_prob = model(sample['image'],
                                                         scaling_factor=sample['scaling_factors'].cpu().data.numpy().item())   
    return class_scores,bbox_deltas,rois,img_features,rpn_cls_prob

In [11]:
image_filename = 'demo/33823288584_1d21cf0a26_k.jpg'

# Load image
image = io.imread(image_filename)
orig_im_size = image.shape

# Preprocess image
im_list, im_scales = prep_im_for_blob(image)

# Build sample
sample = {}
sample['image'] = Variable(torch.FloatTensor(im_list[0]).permute(2,0,1).unsqueeze(0))
sample['scaling_factors'] = Variable(torch.FloatTensor([im_scales[0]]))
sample['original_im_size'] = Variable(torch.FloatTensor(orig_im_size))

class_scores,bbox_deltas,rois,img_features,rpn_cls_prob = eval_model(sample)

Image features size torch.Size([1, 1024, 50, 80])
Number of channels should match 1024




RPNs class torch.Size([1, 15, 50, 80])
RPNs box torch.Size([1, 60, 50, 80])
ROIS torch.Size([526, 4])
After ROI Pooing torch.Size([526, 2048])
This should match 2048
Final class torch.Size([526, 81])
Final box torch.Size([526, 324])




In [57]:
# postprocess output:
# - convert coordinates back to original image size, 
# - treshold proposals based on score,
# - do NMS.
scores_final, boxes_final, boxes_per_class = result_utils.postprocess_output(rois,
                                                                            sample['scaling_factors'],
                                                                            sample['original_im_size'],
                                                                            class_scores,
                                                                            bbox_deltas)

In [71]:
sample['image'].shape

torch.Size([1, 3, 800, 1275])

In [69]:
orig_im_size

(600, 956, 3)

In [73]:
sample['scaling_factors']

Variable containing:
 1.3333
[torch.FloatTensor of size 1]

In [12]:
rpn_cls_prob

Variable containing:
(0 ,0 ,.,.) = 
  6.8313e-05  1.1266e-04  2.6658e-05  ...   1.1223e-04  2.2220e-04  1.0581e-04
  2.8226e-05  3.0563e-05  1.3027e-05  ...   1.0622e-04  3.2047e-04  9.7242e-05
  2.2221e-05  4.2975e-05  1.0456e-05  ...   6.9281e-05  1.4257e-04  1.4779e-04
                 ...                   ⋱                   ...                
  1.0765e-05  5.7360e-05  2.5384e-05  ...   2.4575e-05  1.4540e-04  1.8993e-05
  7.0114e-06  9.8308e-05  2.0899e-05  ...   3.3102e-05  8.8451e-05  2.1193e-05
  4.4377e-05  3.3877e-04  3.7163e-05  ...   9.2786e-05  1.6923e-04  4.2703e-05

(0 ,1 ,.,.) = 
  1.6783e-04  6.8259e-05  4.7707e-05  ...   2.8033e-04  2.5687e-04  3.4398e-04
  1.9633e-04  9.8460e-05  1.3850e-04  ...   2.9458e-04  2.7348e-04  3.9299e-04
  4.6621e-05  1.3461e-05  2.5303e-05  ...   2.1738e-04  8.5451e-05  2.3713e-04
                 ...                   ⋱                   ...                
  2.3794e-04  9.4107e-04  2.8001e-03  ...   1.8474e-04  1.3092e-04  9.2709e-05


In [None]:
rois