<a href="https://colab.research.google.com/github/anirudhs001/3D-ObjectDetection/blob/main/stereohrnet_f2019.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cp -r drive/MyDrive/playground/training . 

cp: cannot stat 'drive/MyDrive/playground/training': No such file or directory


In [None]:
# pip uninstall opencv-contrib-python opencv-python -y
# !pip install opencv-contrib-python
# !pip install fastai==1.0.61
!pip freeze | grep fast*
# !pip3 install 'pillow < 7.0.0'

fastai==1.0.61
fastdtw==0.3.4
fastprogress==1.0.0
fastrlock==0.6


In [None]:
!pwd
!mkdir -p StereoHRnet/models
!cd ../input/pretrained-model-epoch-25/; cp model_epoch_36.pth /kaggle/working/StereoHRnet/models/

/content
/bin/bash: line 0: cd: ../input/pretrained-model-epoch-25/: No such file or directory
cp: cannot stat 'model_epoch_36.pth': No such file or directory



### **TARGET_VECTOR** = [\<one-hot vector for the 9 classes (nine 0/1 values)\>, \<2D bbox\>, \<dimensions\>, \<location\>, \<rotation_y\>, \<alpha\>]   
21 values in total per vector (9 + 4 + 3 + 3 + 1 + 1)    
each target matrix holds 32x32 such vectors

where,

**bbox 2D** = (0-based) bounding box of the object: Left, top, right, bottom image coordinates

**dimensions** = 3D object dimensions: height, width, length [m]

**location** = 3D object location x,y,z in camera coords. [m]

**rotation_y** = Rotation around Y-axis in camera coords. [-Pi; Pi]




# Imports and pre-steps

In [None]:
import torch
from torch import nn, optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
# from torchvision.transforms.functional import InterpolationMode
import cv2
from cv2 import cvtColor
import imageio as io
import pandas as pd
import numpy as np

In [None]:
import os
def getFiles(filepath):
    """Collect index-aligned left-image, right-image and label paths.

    Sample ids are taken from the ``.txt`` files found in
    ``<filepath>training/label_2/``. For every id the matching
    ``training/image_2/<id>.png`` (left) and ``training/image_3/<id>.png``
    (right) paths are derived; the image files are not checked for existence.

    Args:
        filepath: dataset root prefix. It is concatenated as-is, so it must be
            empty or end with a path separator.

    Returns:
        Tuple ``(left_train, right_train, labels_train)`` of three lists of
        equal length, sorted by sample id.
    """
    left_fold  = 'training/image_2/'
    right_fold = 'training/image_3/'
    labels = 'training/label_2/'

    # os.listdir returns entries in arbitrary order; sort so that index-based
    # train/validation splits are reproducible, and skip anything that is not
    # a label file (e.g. notebook checkpoint folders).
    filenames = sorted(
        os.path.splitext(f)[0]
        for f in os.listdir(filepath + labels)
        if f.endswith(".txt")
    )

    left_train  = [filepath + left_fold + f"{f}.png" for f in filenames]
    right_train = [filepath + right_fold + f"{f}.png" for f in filenames]
    labels_train = [filepath + labels + f"{f}.txt" for f in filenames]

    return left_train, right_train, labels_train

In [None]:

class customDataset(Dataset):
    """Stereo image-pair dataset that yields images, a depth map and detection targets.

    File paths come from ``getFiles(source_dir)``. For every sample the left
    and right images are read, a disparity map is computed with OpenCV's
    semi-global block matcher and converted into a clipped depth map.

    Relies on the notebook-level globals ``classes``, ``org_w`` and ``org_h``
    (defined in other cells) when ``train`` is True.

    Args:
        source_dir: dataset root prefix handed to ``getFiles``.
        transform: callable applied to the left and the right image.
        transform_disp: callable applied to the depth map; required whenever
            ``transform`` is given and ``train`` is True.
        train: when True, ``__getitem__`` also builds the 21 x 32 x 32 target
            tensor; when False the raw label table is returned instead.
    """

    def __init__(self, source_dir, transform=None, transform_disp=None, train=False):
        super().__init__()
        self.source_dir = source_dir
        self.left, self.right, self.labels = getFiles(self.source_dir)
        assert len(self.left) == len(self.right) == len(self.labels)
        self.tfms = transform
        self.tfms_disp = transform_disp
        self.train = train

        # for disparity calculation
        self.left_matcher = cv2.StereoSGBM_create(
            minDisparity = 0,
            numDisparities= 128,
            blockSize=15,
            speckleRange=2
        )

        # Stereo rig constants used in depth = baseline * focal_length / disparity
        # (presumably metres and pixels respectively -- confirm against the calibration files).
        self.baseline = 0.54
        self.focal_length = 721.5377

    def __len__(self):
        """Number of samples (one per left image path)."""
        return len(self.left)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            * ``train`` False: ``(left_img, right_img, labels)`` with the raw
              images and the label table as a pandas DataFrame.
            * ``train`` True and ``transform`` set: ``(img_tensor, targ)``
              where ``img_tensor`` is the transformed left image, right image
              and depth map concatenated along dim 0.
            * ``train`` True without ``transform``:
              ``(left_img, right_img, depth, targ)``.

            ``targ`` is a float tensor of shape (21, 32, 32).

        Raises:
            ValueError: if ``transform`` is set but ``transform_disp`` is not
                (training mode only).
        """
        left_img = io.imread(self.left[idx])
        left_img = cv2.cvtColor(left_img, cv2.COLOR_BGRA2BGR)
        right_img = io.imread(self.right[idx])
        right_img = cv2.cvtColor(right_img, cv2.COLOR_BGRA2BGR)

        ## RGB TO single channel for disparity computation
        monoL = cv2.cvtColor(left_img, cv2.COLOR_BGR2GRAY)
        monoR = cv2.cvtColor(right_img, cv2.COLOR_BGR2GRAY)

        ## Disparity computation
        left_disp = self.left_matcher.compute(monoL, monoR)

        ## Depth
        # Zero disparities divide to +inf and negative ones to negative depth;
        # both are absorbed by the clip below, so only the warning is silenced.
        # NOTE(review): OpenCV documents StereoSGBM.compute as returning 16-bit
        # fixed-point disparities (scaled by 16); the raw values are used here.
        # Left unchanged so inputs stay consistent with existing checkpoints --
        # confirm whether the scale is intended.
        with np.errstate(divide='ignore'):
            depth = (self.baseline * self.focal_length) / (left_disp)
        depth = np.clip(depth, 0, 10)
        depth = np.array(depth, dtype=np.float32)

        labels = pd.read_csv(
            self.labels[idx],
            names=["class", "trunc", "occ", "alpha", "left", "top", "right", "bottom", "height", "width", "depth", "locx", "locy", "locz", "rot", "score"],
            sep=" "
        )

        if self.train is not True:
            return left_img, right_img, labels

        # .copy() so the inserts below act on an independent frame, not a slice view.
        labels = labels[['class',  'left',  'top',  'right',  'bottom',  'height',  'width',  'depth',  'locx',  'locy',  'locz',  'rot', "alpha"]].copy()
        # One-hot encode the class. Each column is inserted at position 0, so
        # the final column order is the reverse of `classes`.
        for c in classes:
            labels.insert(0, c, (labels["class"] == c).astype(float))
        # Row layout: 9 class flags, left, top, right, bottom, height, width,
        # depth, locx, locy, locz, rot, alpha -> 21 values.
        labels = labels.drop('class', axis=1).to_numpy()

        targ = torch.zeros((32,32,21))
        # Box centres, rescaled from original pixels to the 256 x 256 input.
        x_center = ((labels[:,9] + labels[:,11])/2)*256/org_w
        y_center = ((labels[:,10] + labels[:,12])/2)*256/org_h

        grid_size = 256 / 32  # pixels per grid cell
        anchor_x = np.clip(np.array(x_center/grid_size).astype(int), 0, 31)  # horizontal cell index
        anchor_y = np.clip(np.array(y_center/grid_size).astype(int), 0, 31)  # vertical cell index

        # Objects that fall into the same cell overwrite each other (last one wins).
        for i, (y, x) in enumerate(zip(anchor_y, anchor_x)):
            targ[y,x] = torch.from_numpy(labels[i])

        targ = targ.permute(2,0,1)

        if self.tfms is not None:
            if self.tfms_disp is None:
                raise ValueError(
                    "transform_disp must be provided together with transform")
            left_img = self.tfms(left_img)
            right_img = self.tfms(right_img)
            depth = self.tfms_disp(depth)
            img_tensor = torch.cat((left_img, right_img, depth), 0)
            return img_tensor, targ
        return left_img, right_img, depth, targ

In [None]:
# Object categories matched against the "class" column of the label files.
# customDataset one-hot encodes them by inserting one column per entry at
# position 0, so the encoded columns end up in the reverse of this order.
classes = ['Car', 'Van',  'Truck', 'Pedestrian',  'Person_sitting', 'Cyclist',  'Tram', 'Misc', 'DontCare']
print(len(classes))

9


In [None]:
# IMAGE SIZE
# Original (pre-resize) image size in pixels: height, width.
# customDataset and the plotting cells use it to rescale 2D box coordinates
# onto the 256 x 256 network input.
org_h, org_w = 375, 1242
# Alternative size, currently disabled:
# org_h, org_w = 370, 1224

In [None]:

# Spatial size every input is resized to before entering the network.
load_height = 256
load_width = 256

# Colour-image pipeline: array -> PIL image -> resize -> tensor.
tfms_train = transforms.Compose([    
    transforms.ToPILImage(),
    transforms.Resize((load_height, load_width)),
    transforms.ToTensor(),
    # Normalisation is currently disabled:
    # transforms.Normalize(mean = [0.5, 0.5, 0.5], std = [0.5, 0.5, 0.5])
])

# Depth-map pipeline: converted to a tensor first and resized as a tensor
# (no PIL round-trip), unlike the colour pipeline above.
tfms_depth_train = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((load_height, load_width)),
])


# Models

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

## HRnet 

### HRnet base

In [None]:
class Bottleneck(nn.Module):
    """Residual bottleneck unit: 1x1 conv -> 3x3 conv -> 1x1 conv, plus a skip path.

    The last convolution widens the features to ``planes * expansion``
    channels. When ``downsample`` is given it is applied to the input before
    it is added back, otherwise the input is added unchanged.
    """
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        widened = planes * self.expansion

        # Attribute names determine the state_dict keys; keep them stable.
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes, momentum=0.1)
        # Only the 3x3 convolution carries the stride.
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes, momentum=0.1)
        self.conv3 = nn.Conv2d(planes, widened, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(widened, momentum=0.1)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(main_path(x) + skip(x))``."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        shortcut = x if self.downsample is None else self.downsample(x)
        out += shortcut
        return self.relu(out)


class BasicBlock(nn.Module):
    """Residual unit made of two 3x3 convolutions plus a skip path.

    The first convolution maps ``inplanes`` to ``planes`` channels (and
    carries the stride); the second keeps ``planes`` channels. When
    ``downsample`` is given it is applied to the input before the addition,
    otherwise the input is added unchanged.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes, momentum=0.1)
        self.relu = nn.ReLU(inplace=True)
        # conv2 consumes the output of conv1, i.e. `planes` channels (not
        # `inplanes`). Weight shapes are identical whenever inplanes == planes.
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes, momentum=0.1)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(main_path(x) + skip(x))``."""
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out
    
class HighResolutionModule(nn.Module):
    """Parallel multi-resolution branches followed by cross-branch fusion.

    Each branch is a stack of ``blocks`` units. After the branches have run,
    every output resolution is formed as the sum of all branch outputs
    brought to that resolution (1x1 conv + bilinear upsampling for
    lower-resolution sources, strided 3x3 convs for higher-resolution ones).

    Args:
        num_branches: number of parallel branches.
        blocks: block class with an ``expansion`` attribute and the signature
            ``block(inplanes, planes, stride=1, downsample=None)``.
        num_blocks: units per branch (one entry per branch).
        num_inchannels: input channels per branch (one entry per branch).
        num_channels: base channels per branch (one entry per branch).
        fuse_method: stored on the module; not consulted by the code below.
        multi_scale_output: when False only the first (highest-resolution)
            fused output is produced.

    Raises:
        ValueError: if a per-branch list does not have ``num_branches`` entries.
    """

    def __init__(self, num_branches, blocks, num_blocks, num_inchannels,
                 num_channels, fuse_method, multi_scale_output=True):
        super().__init__()
        self._check_branches(
            num_branches, blocks, num_blocks, num_inchannels, num_channels)

        # Private copy: _make_one_branch rewrites entries and must not alter
        # the list owned by the caller.
        self.num_inchannels = list(num_inchannels)
        self.fuse_method = fuse_method
        self.num_branches = num_branches

        self.multi_scale_output = multi_scale_output

        self.branches = self._make_branches(
            num_branches, blocks, num_blocks, num_channels)
        self.fuse_layers = self._make_fuse_layers()
        self.relu = nn.ReLU(inplace=False)

    def _check_branches(self, num_branches, blocks, num_blocks,
                        num_inchannels, num_channels):
        """Validate that every per-branch list has ``num_branches`` entries."""
        # No logger exists in this notebook; the exception carries the message.
        if num_branches != len(num_blocks):
            raise ValueError('NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format(
                num_branches, len(num_blocks)))

        if num_branches != len(num_channels):
            raise ValueError('NUM_BRANCHES({}) <> NUM_CHANNELS({})'.format(
                num_branches, len(num_channels)))

        if num_branches != len(num_inchannels):
            raise ValueError('NUM_BRANCHES({}) <> NUM_INCHANNELS({})'.format(
                num_branches, len(num_inchannels)))

    def _make_one_branch(self, branch_index, block, num_blocks, num_channels,
                         stride=1):
        """Build the block stack of one branch and record its output width."""
        out_channels = num_channels[branch_index] * block.expansion

        # A projection on the skip path is needed only when the first block
        # changes the resolution or the channel count.
        downsample = None
        if stride != 1 or self.num_inchannels[branch_index] != out_channels:
            downsample = nn.Sequential(
                nn.Conv2d(self.num_inchannels[branch_index],
                          out_channels,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels, momentum=0.1),
            )

        layers = []
        layers.append(block(self.num_inchannels[branch_index],
                            num_channels[branch_index], stride, downsample))
        # From here on the branch carries `out_channels` channels.
        self.num_inchannels[branch_index] = out_channels
        for _ in range(1, num_blocks[branch_index]):
            layers.append(block(self.num_inchannels[branch_index],
                                num_channels[branch_index]))

        return nn.Sequential(*layers)

    def _make_branches(self, num_branches, block, num_blocks, num_channels):
        """Build all branches as an ``nn.ModuleList``."""
        branches = []

        for i in range(num_branches):
            branches.append(
                self._make_one_branch(i, block, num_blocks, num_channels))

        return nn.ModuleList(branches)

    def _make_fuse_layers(self):
        """Build ``fuse_layers[i][j]``, which maps branch ``j`` to branch ``i``.

        Returns ``None`` for a single branch. Entry ``[i][i]`` is ``None``
        (identity); ``j > i`` is a 1x1 conv + BN (upsampling happens in
        ``forward``); ``j < i`` is a chain of ``i - j`` stride-2 3x3 convs.
        """
        if self.num_branches == 1:
            return None

        num_branches = self.num_branches
        num_inchannels = self.num_inchannels
        fuse_layers = []
        for i in range(num_branches if self.multi_scale_output else 1):
            fuse_layer = []
            for j in range(num_branches):
                if j > i:
                    fuse_layer.append(nn.Sequential(
                        nn.Conv2d(num_inchannels[j],
                                  num_inchannels[i],
                                  1,
                                  1,
                                  0,
                                  bias=False),
                        nn.BatchNorm2d(num_inchannels[i], momentum=0.1)))
                elif j == i:
                    fuse_layer.append(None)
                else:
                    conv3x3s = []
                    for k in range(i-j):
                        if k == i - j - 1:
                            # last step: switch to the target channel count, no ReLU
                            num_outchannels_conv3x3 = num_inchannels[i]
                            conv3x3s.append(nn.Sequential(
                                nn.Conv2d(num_inchannels[j],
                                          num_outchannels_conv3x3,
                                          3, 2, 1, bias=False),
                                nn.BatchNorm2d(num_outchannels_conv3x3,
                                            momentum=0.1)))
                        else:
                            # intermediate step: keep the source channel count
                            num_outchannels_conv3x3 = num_inchannels[j]
                            conv3x3s.append(nn.Sequential(
                                nn.Conv2d(num_inchannels[j],
                                          num_outchannels_conv3x3,
                                          3, 2, 1, bias=False),
                                nn.BatchNorm2d(num_outchannels_conv3x3,
                                            momentum=0.1),
                                nn.ReLU(inplace=False)))
                    fuse_layer.append(nn.Sequential(*conv3x3s))
            fuse_layers.append(nn.ModuleList(fuse_layer))

        return nn.ModuleList(fuse_layers)

    def get_num_inchannels(self):
        """Return the per-branch output channel counts of this module."""
        return self.num_inchannels

    def forward(self, x):
        """Run the branches and fuse them.

        Args:
            x: list with one tensor per branch. The list is not modified.

        Returns:
            List of fused tensors: one per branch, or only the first when
            ``multi_scale_output`` is False. A single-branch module returns a
            one-element list without fusion.
        """
        if self.num_branches == 1:
            return [self.branches[0](x[0])]

        # Build a new list instead of overwriting the caller's entries.
        x = [branch(x_i) for branch, x_i in zip(self.branches, x)]

        x_fuse = []
        for i in range(len(self.fuse_layers)):
            y = x[0] if i == 0 else self.fuse_layers[i][0](x[0])
            for j in range(1, self.num_branches):
                if i == j:
                    y = y + x[j]
                elif j > i:
                    # lower-resolution source: project, then upsample to branch i
                    width_output = x[i].shape[-1]
                    height_output = x[i].shape[-2]
                    y = y + F.interpolate(
                        self.fuse_layers[i][j](x[j]),
                        size=[height_output, width_output],
                        mode='bilinear',
                        align_corners=True)
                else:
                    y = y + self.fuse_layers[i][j](x[j])
            x_fuse.append(self.relu(y))

        return x_fuse



### Regular HRnet

In [None]:
# Maps the 'BLOCK' entry of a stage config to the block class it names.
blocks_dict = {
    'BASIC': BasicBlock,
    'BOTTLENECK': Bottleneck
}
class HighResolutionNet(nn.Module):
    """Four-stage multi-resolution network with a per-cell prediction head.

    A stride-1 stem and a bottleneck stage feed three multi-branch stages
    (2, 3 and 4 branches). The four final branch outputs are upsampled to the
    resolution of the first branch, concatenated, reduced by a factor of 8 by
    the two max-pool layers of the head, and mapped per cell to
    ``out_channels`` values. A sigmoid is applied to the first
    ``num_classes`` output channels only.

    Args:
        in_channels: channels of the input tensor (default 7).
        out_channels: values predicted per grid cell (default 21).
        num_classes: leading output channels passed through the sigmoid
            (default 9).
    """

    def __init__(self, in_channels=7, out_channels=21, num_classes=9):
        super().__init__()
        self.num_classes = num_classes

        # stem net
        self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=5, stride=1, padding=2,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64, momentum=0.1)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1,
                               bias=False)
        self.bn2 = nn.BatchNorm2d(64, momentum=0.1)
        self.relu = nn.ReLU(inplace=False)

        self.stage1_cfg = {
            'NUM_MODULES' : 1,
            'NUM_BRANCHES' : 1,
            'NUM_BLOCKS' : [4],
            'NUM_CHANNELS' : [16],
            'BLOCK' : 'BOTTLENECK',
            'FUSE_METHOD' : 'SUM',

        }
        num_channels = self.stage1_cfg['NUM_CHANNELS'][0]
        block = blocks_dict[self.stage1_cfg['BLOCK']]
        num_blocks = self.stage1_cfg['NUM_BLOCKS'][0]
        self.layer1 = self._make_layer(block, 64, num_channels, num_blocks)
        stage1_out_channel = block.expansion*num_channels

        self.stage2_cfg = {
            'NUM_MODULES' : 1,
            'NUM_BRANCHES' : 2,
            'NUM_BLOCKS' : [4, 4],
            'NUM_CHANNELS' : [16, 32],
            'BLOCK' : 'BASIC',
            'FUSE_METHOD' : 'SUM',

        }
        num_channels = self.stage2_cfg['NUM_CHANNELS']
        block = blocks_dict[self.stage2_cfg['BLOCK']]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))]
        self.transition1 = self._make_transition_layer(
            [stage1_out_channel], num_channels)
        self.stage2, pre_stage_channels = self._make_stage(
            self.stage2_cfg, num_channels)

        self.stage3_cfg = {
            'NUM_MODULES' : 1,
            'NUM_BRANCHES' : 3,
            'NUM_BLOCKS' : [4, 4, 4],
            'NUM_CHANNELS' : [16, 32, 64],
            'BLOCK' : 'BASIC',
            'FUSE_METHOD' : 'SUM',

        }
        num_channels = self.stage3_cfg['NUM_CHANNELS']
        block = blocks_dict[self.stage3_cfg['BLOCK']]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))]
        self.transition2 = self._make_transition_layer(
            pre_stage_channels, num_channels)
        self.stage3, pre_stage_channels = self._make_stage(
            self.stage3_cfg, num_channels)
        self.stage4_cfg = {
            'NUM_MODULES' : 1,
            'NUM_BRANCHES' : 4,
            'NUM_BLOCKS' : [4, 4, 4, 4],
            'NUM_CHANNELS' : [16, 32, 64, 128],
            'BLOCK' : 'BASIC',
            'FUSE_METHOD' : 'SUM',

        }
        num_channels = self.stage4_cfg['NUM_CHANNELS']
        block = blocks_dict[self.stage4_cfg['BLOCK']]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))]
        self.transition3 = self._make_transition_layer(
            pre_stage_channels, num_channels)
        self.stage4, pre_stage_channels = self._make_stage(
            self.stage4_cfg, num_channels, multi_scale_output=True)

        # The head consumes the concatenation of all stage-4 branch outputs.
        last_inp_channels = int(sum(pre_stage_channels))
        last_out_channels = out_channels
        self.second_last_layer = nn.Sequential(
            nn.MaxPool2d(kernel_size=4),
            nn.Conv2d(
                in_channels=last_inp_channels,
                out_channels=last_inp_channels,
                kernel_size=3,
                stride=1,
                padding=1),
            nn.BatchNorm2d(last_inp_channels, momentum=0.1),
            nn.ReLU(inplace=False),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(
                in_channels=last_inp_channels,
                out_channels=last_inp_channels,
                kernel_size=1)

        )
        # Applied on the channel axis of every grid cell (see forward).
        self.last_layer = nn.Sequential(
            nn.Linear(last_inp_channels, 1024),
            nn.ReLU(),
            nn.Linear(1024, last_out_channels),
        )
        self.prob_layer = nn.Sigmoid()

    def _make_transition_layer(
            self, num_channels_pre_layer, num_channels_cur_layer):
        """Build the adapters between the branches of two consecutive stages.

        Existing branches get a 3x3 conv only when their channel count changes
        (``None`` otherwise); every new branch is derived from the last
        previous branch through stride-2 3x3 convs.
        """
        num_branches_cur = len(num_channels_cur_layer)
        num_branches_pre = len(num_channels_pre_layer)

        transition_layers = []
        for i in range(num_branches_cur):
            if i < num_branches_pre:
                if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
                    transition_layers.append(nn.Sequential(
                        nn.Conv2d(num_channels_pre_layer[i],
                                  num_channels_cur_layer[i],
                                  3,
                                  1,
                                  1,
                                  bias=False),
                        nn.BatchNorm2d(
                            num_channels_cur_layer[i], momentum=0.1),
                        nn.ReLU(inplace=False)))
                else:
                    transition_layers.append(None)
            else:
                conv3x3s = []
                for j in range(i+1-num_branches_pre):
                    inchannels = num_channels_pre_layer[-1]
                    # only the last conv of the chain changes the channel count
                    outchannels = num_channels_cur_layer[i] \
                        if j == i-num_branches_pre else inchannels
                    conv3x3s.append(nn.Sequential(
                        nn.Conv2d(
                            inchannels, outchannels, 3, 2, 1, bias=False),
                        nn.BatchNorm2d(outchannels, momentum=0.1),
                        nn.ReLU(inplace=False)))
                transition_layers.append(nn.Sequential(*conv3x3s))

        return nn.ModuleList(transition_layers)

    def _make_layer(self, block, inplanes, planes, blocks, stride=1):
        """Stack ``blocks`` units of ``block``; the first one may project the skip path."""
        downsample = None
        if stride != 1 or inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion, momentum=0.1),
            )

        layers = []
        layers.append(block(inplanes, planes, stride, downsample))
        inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(inplanes, planes))

        return nn.Sequential(*layers)

    def _make_stage(self, layer_config, num_inchannels,
                    multi_scale_output=True):
        """Build one stage from its config.

        Returns:
            ``(stage, num_inchannels)``: the stage as ``nn.Sequential`` of
            ``HighResolutionModule`` instances and the per-branch channel
            counts reported by the last module.
        """
        num_modules = layer_config['NUM_MODULES']
        num_branches = layer_config['NUM_BRANCHES']
        num_blocks = layer_config['NUM_BLOCKS']
        num_channels = layer_config['NUM_CHANNELS']
        block = blocks_dict[layer_config['BLOCK']]
        fuse_method = layer_config['FUSE_METHOD']

        modules = []
        for i in range(num_modules):
            # multi_scale_output is only used last module
            if not multi_scale_output and i == num_modules - 1:
                reset_multi_scale_output = False
            else:
                reset_multi_scale_output = True
            modules.append(
                HighResolutionModule(num_branches,
                                      block,
                                      num_blocks,
                                      num_inchannels,
                                      num_channels,
                                      fuse_method,
                                      reset_multi_scale_output)
            )
            num_inchannels = modules[-1].get_num_inchannels()

        return nn.Sequential(*modules), num_inchannels

    @staticmethod
    def _apply_transition(transition, prev_outputs):
        """Feed the previous stage's outputs through a transition layer list.

        Branch ``i`` reads previous output ``i`` when it exists and the last
        previous output otherwise; a ``None`` adapter passes its input through.
        """
        outputs = []
        for i, layer in enumerate(transition):
            source = prev_outputs[i] if i < len(prev_outputs) else prev_outputs[-1]
            outputs.append(source if layer is None else layer(source))
        return outputs

    def forward(self, x ):
        """Map a batch of inputs to the per-cell prediction tensor.

        Args:
            x: tensor of shape (N, in_channels, H, W).

        Returns:
            Tensor of shape (N, out_channels, H', W'), where H' and W' are the
            input size after the two max-pool layers (kernel 4, then 2). The
            first ``num_classes`` channels are sigmoid probabilities, the
            remaining channels are raw outputs.
        """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.layer1(x)

        y_list = self.stage2(self._apply_transition(self.transition1, [x]))
        y_list = self.stage3(self._apply_transition(self.transition2, y_list))
        x = self.stage4(self._apply_transition(self.transition3, y_list))

        # Upsample every branch to the resolution of the first one and concatenate.
        x0_h, x0_w = x[0].size(2), x[0].size(3)
        upsampled = [x[0]] + [
            F.interpolate(branch, size=(x0_h, x0_w), mode='bilinear', align_corners=True)
            for branch in x[1:]
        ]
        x = torch.cat(upsampled, 1)

        x = self.second_last_layer(x)
        # Move channels last so the linear head acts on each grid cell, then restore.
        x = x.permute(0,2,3,1)
        x = self.last_layer(x)
        x = x.permute(0,3,1,2)
        n_cls = self.num_classes
        x = torch.cat((self.prob_layer(x[:,:n_cls,:,:]), x[:,n_cls:,:,:]), dim=1)
        return x


In [None]:
## sanity check for model output
def test():
    """Smoke test: push one random 7 x 256 x 256 input through a fresh network.

    Prints the output type, the output shape and the prediction vector of the
    top-left grid cell.
    """
    net = HighResolutionNet()
    dummy_batch = torch.rand((1,7,256,256)) * 1e4
    result = net(dummy_batch)
    for item in (type(result), result.shape, result[0,:,0,0]):
        print(item)
    del net
test()

<class 'torch.Tensor'>
torch.Size([1, 21, 32, 32])
tensor([ 0.5213,  0.4495,  0.4881,  0.4778,  0.5833,  0.4631,  0.4816,  0.5363,
         0.5750, -0.0445, -0.1609,  0.1102,  0.0632,  0.0185, -0.0597, -0.1239,
        -0.1185, -0.2164, -0.3708,  0.3068, -0.2041], grad_fn=<SelectBackward>)


# Loss

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
print(dev)

cuda:0


In [None]:
# Shared criterion instances, both with their default settings:
# mean squared error for regression values and binary cross-entropy for
# probabilities (BCELoss expects inputs that are already in [0, 1]).
MSE_loss = nn.MSELoss()
BCE_loss = nn.BCELoss()

In [None]:

def loss_func(pred, targ):
    """Detection loss: scaled class BCE over all cells plus box MSE on occupied cells.

    Args:
        pred: predictions of shape (B, C, H, W); the first 9 channels must be
            probabilities in [0, 1].
        targ: targets of the same shape; the first 9 channels hold the class
            flags and a cell counts as occupied when they sum to more than 1e-2.

    Returns:
        Scalar tensor
        ``B * 1e6 * BCE(pred[:, :9], targ[:, :9]) + sum(MSE per occupied cell)``,
        where the per-cell MSE is the mean squared error over channels 9 onward.
    """
    n_cls = 9
    batch_size = targ.shape[0]

    # 1. loss for all probabilities
    # BCE is very small compared to MSE, so it is scaled up; otherwise the
    # network could minimise the total by predicting all probabilities as 0.
    # The batch-mean BCE is weighted by the batch size, which equals adding
    # it once per sample.
    class_term = batch_size * 1e6 * F.binary_cross_entropy(
        pred[:, :n_cls], targ[:, :n_cls])

    # 2. box regression, only for cells that contain an object
    occupied = targ[:, :n_cls].sum(dim=1) > 1e-2                      # (B, H, W)
    per_cell_mse = ((pred[:, n_cls:] - targ[:, n_cls:]) ** 2).mean(dim=1)  # (B, H, W)
    box_term = per_cell_mse[occupied].sum()

    return class_term + box_term

## test loss func
a = torch.rand((16,21,32,32))
b = torch.rand((16,21,32,32))
print(loss_func(a,b).item())

15974813.0


# TRAIN with Fastai 

In [None]:
# Fastai instead
from fastai import *
from fastai.vision import *
from fastai.vision.data import *
# from fastai.vision.core import *
from fastai.vision.data import DataLoader as FastLoader

In [None]:
!nvidia-smi

Wed Jun 16 06:57:22 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   76C    P8    21W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# new dataset with train and validation split for fastai
class FastDataset(customDataset):
    """``customDataset`` restricted to a leading (train) or trailing (validation) slice.

    The parent is always built in training mode, so every item carries a
    target tensor.

    Args:
        train: True keeps the first ``split_pct`` share of the samples,
            False keeps the last ``split_pct`` share.
        split_pct: fraction of the samples to keep.
        source_dir: dataset root prefix passed to ``customDataset``.
        tfms: image transform passed on as ``transform``.
        tfms_disp: depth transform passed on as ``transform_disp``.
    """

    def __init__(self, train=False, split_pct=0.8, source_dir=None, tfms=None, tfms_disp=None):
        super().__init__(source_dir, transform=tfms, transform_disp=tfms_disp, train=True)

        total = len(self)
        split = int(total * split_pct)
        if train:
            keep = slice(0, split)
        else:
            # Explicit start index: a plain `[-split:]` would return the whole
            # list when split == 0.
            keep = slice(max(total - split, 0), total)

        self.left = self.left[keep]
        self.right = self.right[keep]
        self.labels = self.labels[keep]

In [None]:
# Dataset root prefix; empty means "training/" is looked up in the working directory.
PATH = ""

# First 95% of the samples for training, last 5% for validation.
train_ds = FastDataset(
    train=True,
    split_pct=0.95,
    source_dir=PATH,
    tfms=tfms_train,
    tfms_disp=tfms_depth_train
)
valid_ds = FastDataset(
    train=False,
    split_pct=0.05,
    source_dir=PATH,
    tfms=tfms_train,
    tfms_disp=tfms_depth_train
)
train_loader = FastLoader(train_ds, batch_size=4, shuffle=True, num_workers = 2, pin_memory=False)
# Validation data is not shuffled, so the inspection cells below show the
# same samples on every run.
valid_loader = FastLoader(valid_ds, batch_size=4, shuffle=False, num_workers = 2, pin_memory=False)


db = DataBunch(train_dl=train_loader, valid_dl=valid_loader)


In [None]:
import imageio as io

In [None]:
# Pull one validation batch and keep it around; later cells index into `batch`
# to visualise individual samples.
batch = next(iter(db.valid_dl))
inp, targ = batch
# Sanity checks: input/target shapes, number of items in a batch tuple,
# and the sizes of the two dataset splits.
print(inp.shape)
print(targ.shape)
print(len(batch))
print(len(db.valid_ds))
print(len(db.train_ds))



torch.Size([16, 7, 256, 256])
torch.Size([16, 21, 32, 32])
2
374
7106



 TARGET_VECTOR = [\<one-hot-vector for 9 classes == 9 0/1s\>, left, top, right, bottom, ...]   
21 values in total per vector    
each target matrix has 32x32 such vectors

In [None]:

# Show the stereo pair of one validation sample with its ground-truth 2D boxes.
# `plt` and `patches` are not imported explicitly in this notebook; they are
# presumably provided by the fastai star imports -- confirm.
inp = batch[0][1]
targ = batch[1][1].cpu()  # on the CPU so the box values can be handed to matplotlib
print(targ.shape)
left = inp[:3].cpu()
right = inp[3:6].cpu()  # channels 3-5 only; channel 6 is the depth map
fig, ax = plt.subplots(1, 2, figsize=(10,10) )
ax[0].imshow(left.permute(1,2,0))
ax[1].imshow(right.permute(1,2,0))

## check target
for i in range(targ.shape[1]):
    for j in range(targ.shape[2]):
        if torch.sum(targ[:9,i,j])!=0:
            print(targ[:9,i,j])
            # Channels 9-12 hold left, top, right, bottom in original-image
            # pixels; rescale them to the 256 x 256 image being displayed.
            x_targ = float(targ[9,i,j]) * 256/org_w
            y_targ = float(targ[10,i,j]) * 256/org_h
            w_targ = float(targ[11,i,j] - targ[9,i,j]) * 256/org_w
            h_targ = float(targ[12,i,j] - targ[10,i,j]) * 256/org_h
            rect_targ = patches.Rectangle((x_targ,y_targ), w_targ, h_targ, linewidth=1, edgecolor='r', facecolor='none')
            ax[0].add_patch(rect_targ)

In [None]:

path_to_save = "StereoHRnet/"
# !mkdir {path_to_save}
! cd {path_to_save}; ls

models


In [None]:
from fastai import callbacks
# fastai Learner wrapping the data bunch, a freshly initialised network, Adam
# and the custom detection loss; `path` is where fastai stores its artefacts.
learn = Learner(data=db, model=HighResolutionNet(),opt_func=optim.Adam, loss_func=loss_func, path=path_to_save)
print(learn.data.device)
# Save a checkpoint after every epoch under the name prefix "model_epoch"
# (presumably written as <path>/models/model_epoch_<n>.pth -- confirm).
Callbacks = [callbacks.SaveModelCallback(learn, every="epoch", name="model_epoch")]

# Replaces the learner's callback list with the one defined above.
learn.callbacks = Callbacks

cuda


In [None]:
learn.load("model_epoch_15");

In [None]:
learn.lr_find(start_lr = 1e-10, end_lr = 1e2, num_it=50)

In [None]:
learn.recorder.plot()

In [None]:

# Evaluate the loss of the current model on the stored validation batch,
# without building an autograd graph.
inp, targ = batch
with torch.no_grad():
    pred = learn.model(inp)
    loss = learn.loss_func(pred, targ)
    print(loss)

tensor(1.1479e+08, device='cuda:0')


In [None]:
# learn.fit(4, 1e-5)
with np.errstate(divide='ignore'):
    learn.fit_one_cycle(40, max_lr = 1e-2, start_epoch=16)

Loaded model_epoch_15


epoch,train_loss,valid_loss,time


RuntimeError: ignored

In [None]:
"hello"

# inference

In [None]:
# Stand-alone copy of the network for inference. `Net` is not defined in this
# notebook; the model class is HighResolutionNet.
net = HighResolutionNet().to(dev)


In [None]:

# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime too.
params = torch.load(f"{path_to_save}models/bestmodel.pth", map_location=dev)
# params['model']


In [None]:

net.load_state_dict(params['model'])

In [None]:

# Compare ground-truth and predicted 2D boxes for one validation sample.
# `plt` and `patches` are not imported explicitly in this notebook; they are
# presumably provided by the fastai star imports -- confirm.
inp = batch[0][2]
targ = batch[1][2].cpu()  # on the CPU so the box values can be handed to matplotlib
print(targ.shape)
left = inp[:3].cpu()
right = inp[3:6].cpu()
depth = inp[6:].cpu()
fig, ax = plt.subplots(2, 2, figsize=(10,10))
ax[0,0].imshow(left.permute(1,2,0))
ax[0,1].imshow(right.permute(1,2,0))
# depth has shape (1, H, W); imshow needs a 2-D array, so drop the channel axis.
ax[1,0].imshow(depth.squeeze(0))

# Inference on a single sample: use eval mode so batch-norm layers rely on
# their running statistics instead of the statistics of this one image.
learn.model.eval()
with torch.no_grad():
    pred = learn.model(inp.unsqueeze(0)).squeeze().cpu()

## check target
for i in range(targ.shape[1]):
    for j in range(targ.shape[2]):
        if torch.sum(targ[:9,i,j])!=0:
            # Channels 9-12 hold left, top, right, bottom in original-image
            # pixels; rescale them to the 256 x 256 image being displayed.
            x_targ = float(targ[9,i,j]) * 256/org_w
            y_targ = float(targ[10,i,j]) * 256/org_h
            w_targ = float(targ[11,i,j] - targ[9,i,j]) * 256/org_w
            h_targ = float(targ[12,i,j] - targ[10,i,j]) * 256/org_h
            rect_targ = patches.Rectangle((x_targ,y_targ), w_targ, h_targ, linewidth=1, edgecolor='r', facecolor='none')
            ax[0,0].add_patch(rect_targ)

            # PREDICTIONS (same cell as the ground-truth object, drawn on the depth panel)
            x_pred = float(pred[9,i,j]) * 256/org_w
            y_pred = float(pred[10,i,j]) * 256/org_h
            w_pred = float(pred[11,i,j] - pred[9,i,j]) * 256/org_w
            h_pred = float(pred[12,i,j] - pred[10,i,j]) * 256/org_h
            rect_pred = patches.Rectangle((x_pred,y_pred), w_pred, h_pred, linewidth=1, edgecolor='r', facecolor='none')
            ax[1,0].add_patch(rect_pred)

In [None]:
   
# Draw every predicted box whose cell has at least one class probability above 1e-2.
fig, ax = plt.subplots(1,2, figsize=(10,10) )
for panel, image in zip(ax, (left, right)):
    panel.imshow(image.permute(1,2,0))

n_rows, n_cols = pred.shape[1], pred.shape[2]
for i in range(n_rows):
    for j in range(n_cols):
        # skip cells in which no class clears the threshold
        if not bool((pred[:9,i,j] > 1e-2).any()):
            continue

        # PREDICTIONS: channels 9-12 are left, top, right, bottom in
        # original-image pixels, rescaled here to the 256 x 256 display.
        box_left, box_top, box_right, box_bottom = (pred[k,i,j] for k in (9, 10, 11, 12))
        x_pred = box_left *256/org_w
        y_pred = box_top *256/org_h
        w_pred = (box_right - box_left) *256/org_w
        h_pred = (box_bottom - box_top) *256/org_h
        rect_pred = patches.Rectangle((x_pred,y_pred), w_pred, h_pred, linewidth=1, edgecolor='r', facecolor='none')
        ax[0].add_patch(rect_pred)

# Train some more

In [None]:
# learn.fit(4, 1e-5)
learn.fit(2, lr = 1e-4)

# MISC

In [None]:
!nvidia-smi