In [2]:
import torch 
import torch.nn as nn

class SkipBlock(nn.Module):
    """
    Residual-style building block used by the networks in this notebook.

    The main path is conv3x3 -> BatchNorm -> ReLU, followed by a second
    conv3x3 -> BatchNorm -> ReLU stage that runs only when in_ch == out_ch.
    Optionally the result is downsampled and the block input is added back
    onto the output (skip connection).

    Args:
        in_ch: number of channels of the input tensor.
        out_ch: number of channels produced by the block.
        downsample: if True, a 1x1 stride-2 convolution is applied to both the
            main path and the identity path.
        skip_connections: if True, the (possibly downsampled) input is added
            to the output; if False the main-path output is returned as is.

    Notes:
        * When in_ch != out_ch the input is added separately to the first
          in_ch channels and to the remaining channels of the output, which
          only lines up when out_ch == 2 * in_ch.
        * The downsampler is built as Conv2d(in_ch, out_ch) but is applied to
          the main-path output (out_ch channels), so downsample=True only
          works when in_ch == out_ch.
    """
    def __init__(self, in_ch, out_ch, downsample=False, skip_connections=True):
        super(SkipBlock, self).__init__()
        self.downsample = downsample
        self.skip_connections = skip_connections

        self.in_ch = in_ch
        self.out_ch = out_ch

        self.convo1 = nn.Conv2d(in_ch, out_ch, 3, stride=1, padding=1)
        # convo2 is only ever applied when in_ch == out_ch (see forward), so
        # its declared input width matches the tensor it receives.
        self.convo2 = nn.Conv2d(in_ch, out_ch, 3, stride=1, padding=1)

        self.bn1 = nn.BatchNorm2d(out_ch)
        self.bn2 = nn.BatchNorm2d(out_ch)
        if downsample:
            self.downsampler = nn.Conv2d(in_ch, out_ch, 1, stride=2)

    def forward(self, x):
        """Run the block on x (N, in_ch, H, W) and return the resulting feature map."""
        identity = x
        out = torch.nn.functional.relu(self.bn1(self.convo1(x)))
        if self.in_ch == self.out_ch:
            out = torch.nn.functional.relu(self.bn2(self.convo2(out)))
        if self.downsample:
            # The same 1x1 stride-2 convolution shrinks both paths so they can be added.
            out = self.downsampler(out)
            identity = self.downsampler(identity)
        if self.skip_connections:
            if self.in_ch == self.out_ch:
                out = out + identity
            else:
                out = torch.cat(
                    (out[:, :self.in_ch, :, :] + identity, out[:, self.in_ch:, :, :] + identity),
                    dim=1,
                )
        # Always return the features; previously the block returned None when
        # skip_connections was False.
        return out

class NetForYolo(nn.Module):
    """
    Convolutional backbone plus fully connected head that predicts a flattened
    YOLO tensor for one image.

    Each YOLO vector has 5 + C elements, where C is the number of classes.
    With C = 3 that is 8 elements; one extra element is appended to the class
    part so that nn.CrossEntropyLoss can represent "no object in this anchor
    box", giving 9 elements per vector.  The head below emits 64 * 5 * 9 = 2880
    values per image, which the rest of this notebook reshapes to
    (batch, 64, 5, 9): 64 grid cells, 5 anchor boxes per cell, 9 elements per
    anchor box.

    The flattening step expects a 128 x 16 x 16 feature map.  The two max-pool
    layers and the two stride-2 SkipBlocks each halve the spatial size, so the
    input must be 3 x 256 x 256.

    Args:
        skip_connections: forwarded to every SkipBlock.
        depth: controls how many SkipBlocks are created; depth // 2 blocks are
            built for the 64-channel stage and for the 128-channel stage.
            Upstream notes say only 8, 10, 12, 14 and 16 were tested; this
            notebook uses depth=2.
    """
    def __init__(self, skip_connections=True, depth=8):
        super(NetForYolo, self).__init__()
        self.depth = depth // 2
        self.conv1 = nn.Conv2d(3, 64, 3, padding=1)
        self.conv2 = nn.Conv2d(64, 64, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.bn1  = nn.BatchNorm2d(64)
        self.bn2  = nn.BatchNorm2d(128)

        self.skip64_arr = nn.ModuleList(
            SkipBlock(64, 64, skip_connections=skip_connections) for _ in range(self.depth)
        )
        self.skip64ds = SkipBlock(64, 64, downsample=True, skip_connections=skip_connections)
        self.skip64to128 = SkipBlock(64, 128, skip_connections=skip_connections)
        self.skip128_arr = nn.ModuleList(
            SkipBlock(128, 128, skip_connections=skip_connections) for _ in range(self.depth)
        )
        self.skip128ds = SkipBlock(128, 128, downsample=True, skip_connections=skip_connections)
        self.fc_seqn = nn.Sequential(
            nn.Linear(128*16*16, 4096),
            nn.ReLU(inplace=True),
            # 64 cells x 5 anchor boxes x 9-element augmented YOLO vector.
            nn.Linear(4096, 64*5*9),
        )

    def forward(self, x):
        """Map a batch of images (N, 3, 256, 256) to flattened YOLO tensors (N, 2880)."""
        split = self.depth // 4
        x = self.pool(torch.nn.functional.relu(self.conv1(x)))
        # Reuse the stateless pooling layer instead of constructing a new one per call.
        x = self.pool(torch.nn.functional.relu(self.conv2(x)))
        for skip64 in self.skip64_arr[:split]:
            x = skip64(x)
        x = self.skip64ds(x)
        for skip64 in self.skip64_arr[split:]:
            x = skip64(x)
        x = self.bn1(x)
        x = self.skip64to128(x)
        # Only the first `split` 128-channel blocks are applied; the remaining
        # blocks in skip128_arr are created but never used in this forward pass.
        for skip128 in self.skip128_arr[:split]:
            x = skip128(x)
        x = self.bn2(x)
        x = self.skip128ds(x)
        # Flatten per sample.  Keeping the batch dimension explicit makes a
        # wrongly sized input fail in the first Linear layer instead of being
        # silently folded into extra batch rows.
        x = x.view(x.size(0), -1)
        x = self.fc_seqn(x)
        return x
    
# Build the detector used by the cells below.  With depth=2 the constructor
# creates one 64-channel and one 128-channel SkipBlock in the repeated stages
# (depth // 2 == 1), in addition to the fixed downsampling/widening blocks.
ynet = NetForYolo(depth=2)

# torchsummary.summary(ynet, (3, 256, 256))
# len(ynet.parameters())
# print(len(list(ynet.parameters())))

# test = torch.randn(4, 3, 256, 256)
# out = ynet(test)
# out.shape

In [70]:
import pickle
import os
import cv2

def clip_file_names(a, location='mac', path='/Users/akshita/Documents/Acads/data/coco_custom_HW6'):
    """
    Translate a manifest file path to the path used on the current machine.

    Args:
        a: file path as stored in the manifest, using '/' separators.
        location: 'vm' returns the path untouched; any other value re-roots it.
        path: directory under which the re-rooted file is expected.

    Returns:
        `a` itself when location == 'vm'; otherwise `path` joined with the last
        three '/'-separated components of `a` (fewer if `a` has fewer).
    """
    if location != 'vm':
        trailing_parts = a.split('/')[-3:]
        a = os.path.join(path, *trailing_parts)
    return a

def plot_img(file, bboxes, category, name="test"):
    """
    Draw one marker and one text label per bounding-box entry and save the image.

    Args:
        file: image path as stored in the manifest; it is mapped to a local
            path with clip_file_names() before being read.
        bboxes: iterable of 4-element entries.  For each entry a small circle
            and a label are drawn at the pixel position given by its third and
            fourth elements; the label text is "<first>_<second>".
        category: currently unused; kept so existing calls keep working.
        name: stem of the output file, written as "<name>.png" in the current
            working directory.  The default used to be the bare name `test`,
            which is not defined on a fresh kernel; it is now the string "test".

    Raises:
        FileNotFoundError: if the image cannot be read (cv2.imread returns None).
    """
    img_path = clip_file_names(file)
    img = cv2.imread(img_path)
    if img is None:
        raise FileNotFoundError(f"could not read image: {img_path}")

    for bbox in bboxes:
        x, y, w, h = bbox
        position = (w, h)
        img = cv2.circle(img, position, radius=1, color=(0, 225, 0), thickness=1)
        img = cv2.putText(
            img,
            org=position,
            text=str(x) + '_' + str(y),
            color=(255, 0, 0),
            fontFace=cv2.FONT_HERSHEY_SIMPLEX,
            fontScale=0.2,
            thickness=1,
        )

    cv2.imwrite(f"{name}.png", img)

categories = ["bus", "cat", "pizza"]

# Per-category manifests, stored in the same order as `categories`.
total_train = []
total_val = []
for category in categories:
    # NOTE: pickle.load can execute arbitrary code; only load manifests that
    # were produced locally by a trusted script.
    # The `with` blocks close the files, so no explicit close() is needed.
    with open(f'./manifests/train_manifest_{category}.pkl', 'rb') as handle:
        train_manifest = pickle.load(handle)
    total_train.append(train_manifest)

    with open(f'./manifests/val_manifest_{category}.pkl', 'rb') as handle:
        val_manifest = pickle.load(handle)
    total_val.append(val_manifest)

# For every category: (index of the training entry with the most bounding
# boxes, that number of boxes).  Ties keep the earliest entry, and an empty
# manifest yields (0, 0), matching a strictly-greater linear scan.
bbox_count = [
    max(
        ((entry_idx, len(manifest[entry_idx]['bboxes'])) for entry_idx in range(len(manifest))),
        key=lambda pair: pair[1],
        default=(0, 0),
    )
    for manifest in total_train
]

# Boxes and file name of the busiest training image of each category.
bboxes = [
    total_train[cat_idx][entry_idx]['bboxes']
    for cat_idx, (entry_idx, _) in enumerate(bbox_count)
]
files = [
    total_train[cat_idx][entry_idx]['file_name']
    for cat_idx, (entry_idx, _) in enumerate(bbox_count)
]

In [153]:
a = torch.rand((2, 2, 8))
b = torch.zeros((*a.shape[:-1], 1))
c = torch.cat((a, b), dim=-1)

# c[c[:, :, 0] < 1][:, -1]
for i in range(c.shape[0]):
    for j in range(c.shape[1]):
        if c[i, j, 0] < 0.5:
            c[i, j, -1] = 1
c

tensor([[[0.0327, 0.7552, 0.5780, 0.2596, 0.7384, 0.1397, 0.0993, 0.6724,
          1.0000],
         [0.9705, 0.5033, 0.0844, 0.3572, 0.6856, 0.2572, 0.1948, 0.2532,
          0.0000]],

        [[0.0178, 0.5296, 0.5119, 0.3904, 0.4527, 0.3145, 0.5785, 0.9956,
          1.0000],
         [0.1696, 0.0898, 0.7177, 0.1993, 0.3338, 0.9441, 0.7565, 0.8864,
          1.0000]]])

In [175]:
from dataset import MyDataset
from torch.utils.data import DataLoader

# Categories handed to the dataset.
categories = ["bus", "cat", "pizza"]

# Number of images per batch produced by the loader below.
batch_size = 4
# NOTE(review): the training dataset/loader are commented out here, yet the
# training cell further down iterates over `train_loader`; on a fresh kernel
# that name is undefined unless these two lines are re-enabled.
# train_data = MyDataset(categories=categories, split='train', manifest_path='./manifests', mac=True)
val_data = MyDataset(categories=categories, split='val', manifest_path='./manifests', mac=True)

# train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
# NOTE(review): the validation loader shuffles, so batch order differs between runs.
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)

def validation(val_loader, ynet, yolo_interval=32):
    """
    Run the network over a loader and decode its output into bounding boxes.

    For every image the output is reshaped to (64 cells, 5 anchor boxes,
    9 elements).  In each cell the anchor box with the highest first element
    (objectness) is kept, cells are ordered by that score (highest first), and
    each kept vector is decoded into a box and a class.  Boxes whose width or
    height falls outside [64, 256] pixels are dropped.

    NOTE(review): all 64 cells are decoded, not only the five highest scoring
    ones; the size filter is the only selection step.

    Args:
        val_loader: iterable yielding (images, labels, true_bboxes) batches.
        ynet: network returning 64 * 5 * 9 values per image.
        yolo_interval: side length in pixels of one grid cell.

    Returns:
        A 4-tuple:
          * predicted_bboxes: tensor of shape (4, N) whose rows are top-left x,
            top-left y, width and height of the N kept boxes, or an empty list
            if no box passed the filter;
          * total_true_bboxes: the `true_bboxes` of every batch that produced
            at least one kept box;
          * total_val_classes: for each such batch, a list with the predicted
            class index (0-d tensor) of every kept box;
          * true_val_classes: the labels of every such batch.
    """
    # Layout of the network output and of the cell grid (8 x 8 cells).
    num_cells, num_anchors, vec_len = 64, 5, 9
    grid_cols = 8
    # Accepted range for predicted box sides, in pixels.
    min_side, max_side = 64, 256

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(device)
    ynet = ynet.to(device)
    ynet.eval()

    predicted_bboxes = []
    total_true_bboxes = []
    total_val_classes = []
    true_val_classes = []
    with torch.no_grad():
        for inp, t_l, true_bboxes in val_loader:
            # Decode on the CPU so every intermediate tensor lives on one
            # device regardless of where the model runs.
            out = ynet(inp.to(device)).view(-1, num_cells, num_anchors, vec_len).cpu()
            n_img = out.shape[0]

            # Best anchor box per cell, then cells ordered by that best score.
            cell_scores, best_anchor = out[..., 0].max(dim=-1)
            sorted_cells = torch.argsort(cell_scores, descending=True, dim=-1)
            img_idx = torch.arange(n_img).unsqueeze(1)
            best_vecs = out[img_idx, sorted_cells, best_anchor.gather(1, sorted_cells)]

            # Class scores exclude the trailing "no object" element; softmax
            # is taken over the class dimension.
            pred_classes = torch.softmax(best_vecs[..., 5:-1], dim=-1).argmax(dim=-1)

            del_x, del_y = best_vecs[..., 1], best_vecs[..., 2]
            h = best_vecs[..., 3] * yolo_interval
            w = best_vecs[..., 4] * yolo_interval

            cell_row_index = torch.div(sorted_cells, grid_cols, rounding_mode="floor")
            cell_col_index = sorted_cells % grid_cols
            half_cell = yolo_interval / 2
            # Offsets are relative to the cell centre, in units of one cell.
            bb_center_x = cell_col_index * yolo_interval + half_cell + del_x * yolo_interval
            bb_center_y = cell_row_index * yolo_interval + half_cell + del_y * yolo_interval

            # Top-left corners, clamped to the image origin.
            bb_top_left_x = (bb_center_x - torch.div(w, 2, rounding_mode="floor")).clamp(min=0)
            bb_top_left_y = (bb_center_y - torch.div(h, 2, rounding_mode="floor")).clamp(min=0)

            # Per-box size filter.  The previous version tested the whole batch
            # tensor for small sides, so a single small box discarded every
            # prediction.
            rejected = (h > max_side) | (w > max_side) | (w < min_side) | (h < min_side)
            keep = ~rejected

            if keep.any():
                boxes = torch.stack((bb_top_left_x, bb_top_left_y, w, h), dim=-1)
                predicted_bboxes.append(boxes[keep].t().contiguous())
                total_true_bboxes.append(true_bboxes)
                total_val_classes.append(list(pred_classes[keep]))
                true_val_classes.append(t_l)

    if predicted_bboxes:
        predicted_bboxes = torch.cat(predicted_bboxes, dim=-1)

    return predicted_bboxes, total_true_bboxes, total_val_classes, true_val_classes

# NOTE: validation() returns a 4-tuple (predicted boxes, true boxes, predicted
# classes, true labels), so despite its name this variable holds all four.
predicted_bboxes = validation(val_loader, ynet)
predicted_bboxes

cpu
torch.Size([4, 64])


In [133]:
# Scratch check of the "pick the best anchor box per cell" indexing on random data.
a = torch.rand(4, 5, 5, 9)
b = a[..., 0]
print(b.shape)
c = b.argmax(dim=-1)
# Rows of the first sample, selected by that sample's argmax indices.
d = a[0][c[0]]
for anchors in d:
    # Within each selected row, print the vector whose first element is largest.
    best = anchors[:, 0].argmax(dim=-1)
    print(anchors[best])

torch.Size([4, 5, 5])
tensor([0.9274, 0.3575, 0.8599, 0.5772, 0.3570, 0.7029, 0.2207, 0.2920, 0.0407])
tensor([0.9274, 0.3575, 0.8599, 0.5772, 0.3570, 0.7029, 0.2207, 0.2920, 0.0407])
tensor([0.8180, 0.5276, 0.2137, 0.0457, 0.5645, 0.6096, 0.5926, 0.7152, 0.2464])
tensor([0.8182, 0.0585, 0.9665, 0.9702, 0.4519, 0.2962, 0.7903, 0.0746, 0.4805])
tensor([0.8182, 0.0585, 0.9665, 0.9702, 0.4519, 0.2962, 0.7903, 0.0746, 0.4805])


In [4]:
import torch.optim as optim
import csv 
from tqdm import tqdm

# Training loop for `ynet`.
# Relies on `ynet` and `train_loader` being defined by earlier cells.
device = "cuda" if torch.cuda.is_available() else "cpu"
criterion1 = nn.BCELoss()           # objectness: first element of each YOLO vector
criterion2 = nn.MSELoss()           # box regression: elements 1..4 of each YOLO vector
criterion3 = nn.CrossEntropyLoss()  # class scores: elements 5..-2 (trailing "no object" slot excluded)
sigmoid = nn.Sigmoid()
softmax = nn.functional.softmax     # no longer needed by the loss; kept for interactive use

epochs = 2
log_every = 100                     # batches per logged/checkpointed window
loss_flag = 1e32                    # lowest windowed loss seen so far

ynet = ynet.to(device)
ynet.train()                        # an earlier validation run may have left the model in eval mode
optimizer = optim.Adam(ynet.parameters(), lr=1e-4)

# Keep the CSV open for the whole run; wrapping each write in `with logger:`
# closed the file after the first row.
with open('./solutions/test.csv', 'a', newline='') as logger:
    writer = csv.writer(logger)
    for epoch in range(epochs):
        running_loss = 0.0
        batches_in_window = 0
        for step, (inp, _, true_yolo_aug) in enumerate(tqdm(train_loader)):
            inp = inp.to(device)
            true_yolo_aug = true_yolo_aug.to(device)

            optimizer.zero_grad()
            out = ynet(inp)
            # Use the actual batch size so a smaller final batch still reshapes.
            out = out.view(inp.shape[0], 64, 5, -1)

            # (image, cell, anchor) indices of every anchor box that holds an object.
            present_obj = torch.nonzero(true_yolo_aug[:, :, :, 0])
            if present_obj.shape[0] == 0:
                # No positives: every loss below would be a mean over nothing (NaN).
                continue
            img_idx, cell_idx, anchor_idx = present_obj.unbind(dim=1)

            # Gathering with index tensors keeps everything on `device` and
            # differentiable with respect to the network output.
            pred_vecs = out[img_idx, cell_idx, anchor_idx]
            true_vecs = true_yolo_aug[img_idx, cell_idx, anchor_idx].to(pred_vecs.dtype)

            # NOTE(review): objectness is only trained on positive anchor
            # boxes (target 1); empty anchor boxes are never penalised.
            pred_objectness = pred_vecs[:, 0]
            true_objectness = torch.ones_like(pred_objectness)
            loss_BCE = criterion1(sigmoid(pred_objectness), true_objectness)

            loss_MSE = criterion2(pred_vecs[:, 1:5], true_vecs[:, 1:5])

            # CrossEntropyLoss needs raw class scores (N, C) and integer class
            # targets (N,); passing argmax indices gave no gradient.
            pred_logits = pred_vecs[:, 5:-1]
            true_labels = true_vecs[:, 5:-1].argmax(dim=-1)
            loss_CE = criterion3(pred_logits, true_labels)

            total_loss = loss_BCE + loss_MSE + loss_CE
            total_loss.backward()
            optimizer.step()

            running_loss += total_loss.item()
            batches_in_window += 1

            # Log and checkpoint once per full window so that consecutive
            # running losses are sums over the same number of batches.
            if batches_in_window == log_every:
                writer.writerow([
                    epoch + 1,
                    step + 1,
                    loss_BCE.item(),
                    loss_MSE.item(),
                    loss_CE.item(),
                    running_loss / log_every,
                ])
                logger.flush()

                if running_loss < loss_flag:
                    loss_flag = running_loss
                    torch.save(ynet.state_dict(), "./solutions/" + 'net_name' + ".pt")
                running_loss = 0.0
                batches_in_window = 0

431it [16:49,  2.34s/it]


KeyboardInterrupt: 

In [45]:
import torch 
# Scratch check: rank cells by their best first-element score and keep the top five.
a = torch.randn(3, 6, 2, 2)
b = a[..., 0]
print(b.shape)
vals, args = b.max(dim=-1)
top5 = vals.argsort(dim=-1, descending=True)[:, :5]
print(top5)
a[0, top5[0]].shape

torch.Size([3, 6, 2])
tensor([[2, 3, 5, 0, 4],
        [5, 2, 3, 0, 1],
        [1, 3, 0, 2, 5]])


torch.Size([5, 2, 2])

In [21]:
# Shape probe: relies on an `out` tensor left in the kernel by an earlier cell
# and assumes a batch of 4 images.
out.view(4, 64, 5, -1).shape

torch.Size([4, 64, 5, 9])

In [6]:
# from dataset import MyDataset

# from torch.utils.data import DataLoader

# categories = ["bus", "cat", "pizza"]
# batch_size=4

# train_dataset = MyDataset(categories=categories, split="train")
# train_dataloader = DataLoader(train_dataset, batch_size=batch_size)