In [None]:
import json
import os
from pathlib import Path
import time
import copy

import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models
from fastai.dataset import open_image
import json
from PIL import ImageDraw, ImageFont
import matplotlib.pyplot as plt
from matplotlib import patches, patheffects
import cv2
from tqdm import tqdm

In [None]:
# Tunable run parameters.
# NOTE(review): BATCH_SIZE, NUM_WORKERS and EPOCHS are reassigned in later cells
# (the DataLoader cell and the training cell), so the values below are not the
# ones that end up being used by training.
SIZE = 224
EPOCHS = 5
BATCH_SIZE = 32
NUM_WORKERS = 4
SHOW_IMAGES = False

# String keys used to index the annotation JSON dicts, plus phase names.
# TRAIN and VAL are assigned again (to the same values) just before training.
IMAGES = 'images'
ANNOTATIONS = 'annotations'
CATEGORIES = 'categories'
ID = 'id'
NAME = 'name'
IMAGE_ID = 'image_id'
BBOX = 'bbox'
CATEGORY_ID = 'category_id'
FILE_NAME = 'file_name'
IMAGE = 'image'
CATEGORY = 'category'
TRAIN = 'train'
VAL = 'val'
TEST = 'test'

In [None]:
# Use the first GPU when one is available, otherwise fall back to the CPU.
# (To force CPU for debugging, pick "cpu" unconditionally.)
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print('device:', device)

In [None]:
!ls ../input/pascal/pascal

In [None]:
PATH = Path('../input/pascal/pascal')
list(PATH.iterdir())

In [None]:
# Parse the three annotation files. `Path.read_text` opens, reads and closes
# the file, so no file handle is left open (the previous `.open()` call never
# closed the handles it created).
train_data = json.loads((PATH/'pascal_train2007.json').read_text())
val_data = json.loads((PATH/'pascal_val2007.json').read_text())
test_data = json.loads((PATH/'pascal_test2007.json').read_text())

print('train:', train_data.keys())
print('val:', val_data.keys())
print('test:', test_data.keys())

In [None]:
train_data[ANNOTATIONS][:1]

In [None]:
train_data[IMAGES][:2]

In [None]:
len(train_data[CATEGORIES])

In [None]:
next(iter(train_data[CATEGORIES]))

In [None]:
# Build an {id: name} lookup per split; the three splits are expected to share
# exactly the same category table, which the assert verifies.
categories, val_categories, test_categories = (
    {category[ID]: category[NAME] for category in split[CATEGORIES]}
    for split in (train_data, val_data, test_data)
)
assert categories == val_categories == test_categories

print('category count:', len(categories))
print(categories)

In [None]:
IMAGE_PATH = Path(PATH/'JPEGImages/')
list(IMAGE_PATH.iterdir())[:2]

Helper functions that build the lookup dicts and lists fed to the torch `Dataset`

In [None]:
def get_filenames(data):
    """
    Returns a dict mapping image id -> image file name

    Also prints the number of entries and the first entry as a sanity check.

    Args:
        data (dict): parsed annotation JSON; `data[IMAGES]` is a list of
            dicts that each carry the `ID` and `FILE_NAME` keys
    """
    filenames = {}
    for image in data[IMAGES]:
        filenames[image[ID]] = image[FILE_NAME]
    print('get_id_filename_dict')
    first_item = next(iter(filenames.items()))
    print('length:', len(filenames), 'next item:', first_item)
    return filenames

In [None]:
def get_image_ids(data):
    """
    Returns the list of image ids, in the order they appear in `data[IMAGES]`

    Also prints the number of ids and the first id as a sanity check.

    Args:
        data (dict): parsed annotation JSON; `data[IMAGES]` is a list of
            dicts that each carry the `ID` key
    """
    image_ids = []
    for image in data[IMAGES]:
        image_ids.append(image[ID])
    print('get_image_ids')
    print('length:', len(image_ids), 'next item:', image_ids[0])
    return image_ids

In [None]:
def pascal_bb_hw(bb):
    """
    Returns `[height, width]` of an annotation bbox

    The rest of the notebook treats annotation boxes as `[x, y, width, height]`
    (`draw_rect` passes the last two values to `Rectangle` as width, height),
    so the previous `bb[2:]` returned `[width, height]` even though the
    function name and every caller (`h, w = pascal_bb_hw(...)`) expect
    height first.

    Args:
        bb (sequence): bbox as [x, y, width, height]
    """
    return [bb[3], bb[2]]

bbox = train_data[ANNOTATIONS][0][BBOX]
pascal_bb_hw(bbox)

In [None]:
def get_image_w_area(data, image_ids):
    """
    Returns a dict {image_id: (category_id, area)} for each image's largest bbox

    Images that have no annotation keep the value `None`. When two boxes of
    an image have the same area, the one seen first is kept. Also prints the
    number of entries and the first entry as a sanity check.

    Args:
        data (dict): parsed annotation JSON; `data[ANNOTATIONS]` is a list of
            dicts with the `BBOX`, `CATEGORY_ID` and `IMAGE_ID` keys
        image_ids (list): every image id that may appear in the annotations
    """
    image_w_area = dict.fromkeys(image_ids)
    for annotation in data[ANNOTATIONS]:
        bbox = annotation[BBOX]
        candidate_category = annotation[CATEGORY_ID]
        image_id = annotation[IMAGE_ID]
        # only the product of the two sides is used below
        side_a, side_b = pascal_bb_hw(bbox)
        candidate_area = side_a * side_b
        best = image_w_area[image_id]
        if best is None or candidate_area > best[1]:
            image_w_area[image_id] = (candidate_category, candidate_area)
    print('get_image_w_area')
    first_item = next(iter(image_w_area.items()))
    print('length:', len(image_w_area), 'next item:', first_item)
    return image_w_area

train data structs

In [None]:
train_filenames = get_filenames(train_data)
train_image_ids = get_image_ids(train_data)
train_image_w_area = get_image_w_area(train_data, train_image_ids)

## Locate the largest object's bbox

In [None]:
train_data[ANNOTATIONS][0]

In [None]:
def get_image_w_bbox(data, image_ids):
    """
    Returns a dict {image_id: (area, bbox, category_id)} for each image's largest bbox

    Images that have no annotation keep the value `None`. When two boxes of
    an image have the same area, the one seen first is kept.

    Args:
        data (dict): parsed annotation JSON; `data[ANNOTATIONS]` is a list of
            dicts with the `BBOX`, `CATEGORY_ID` and `IMAGE_ID` keys
        image_ids (list): every image id that may appear in the annotations
    """
    image_w_bbox = {i: None for i in image_ids}
    for x in data[ANNOTATIONS]:
        new_bbox = x[BBOX]
        new_category_id = x[CATEGORY_ID]
        image_id = x[IMAGE_ID]
        # only the product is used, so the order of the two sides is irrelevant
        h, w = pascal_bb_hw(new_bbox)
        new_area = h * w
        existing = image_w_bbox[image_id]
        # Compare against the stored AREA (first tuple element). The previous
        # `area, *_, area = existing` rebound `area` to the last element, i.e.
        # the category id, so smaller boxes could replace larger ones.
        if existing is None or new_area > existing[0]:
            image_w_bbox[image_id] = (new_area, new_bbox, new_category_id)
    return image_w_bbox

image_w_bbox = get_image_w_bbox(train_data, train_image_ids)

## Image functions

In [None]:
def show_img(im, figsize=None, ax=None):
    """
    Draws `im` on an axes with both axis rulers hidden and returns the axes

    Args:
        im: image accepted by `ax.imshow`
        figsize: figure size, only used when a new figure has to be created
        ax: axes to draw on; a new figure/axes pair is created when falsy
    """
    if not ax:
        ax = plt.subplots(figsize=figsize)[1]
    ax.imshow(im)
    for get_axis in (ax.get_xaxis, ax.get_yaxis):
        get_axis().set_visible(False)
    return ax

def draw_rect(ax, b):
    """
    Adds an unfilled white rectangle with a black outline to `ax`

    Args:
        ax: matplotlib axes to draw on
        b (sequence): box whose first two values are the rectangle anchor and
            whose last two values are passed on as width and height
    """
    rectangle = patches.Rectangle(b[:2], *b[-2:], fill=False, edgecolor='white', lw=2)
    draw_outline(ax.add_patch(rectangle), 4)
    
def draw_outline(o, lw):
    """
    Gives the artist `o` a black stroke of line width `lw` drawn under its normal rendering
    """
    stroke = patheffects.Stroke(linewidth=lw, foreground='black')
    o.set_path_effects([stroke, patheffects.Normal()])
    
def draw_text(ax, xy, txt, sz=14):
    """
    Writes bold white, black-outlined text on `ax`

    Args:
        ax: matplotlib axes to draw on
        xy (sequence): text position, unpacked into `ax.text`
        txt (str): the text to write
        sz (int): font size
    """
    text_style = dict(verticalalignment='top', color='white', fontsize=sz, weight='bold')
    label = ax.text(*xy, txt, **text_style)
    draw_outline(label, 1)

show image with bbox - using the above image functions

In [None]:
# image_id, (area, bbox, cat) = next(iter(image_w_bbox.items()))
# fn = f'{IMAGE_PATH}/{train_filenames[image_id]}'
# im = open_image(fn)
# ax = show_img(im)
# b = bbox
# print(b)
# draw_rect(ax, b)
# draw_text(ax, b[:2], categories[cat])

In [None]:
def scale_pascal_bb(bbox, image, size):
    """
    Returns `bbox` rescaled from the image's own resolution to a `size` x `size` image

    Values 0 and 2 of the bbox are divided by the image width, values 1 and 3
    by the image height, and everything is then multiplied by `size`.

    Args:
        bbox (1d array): 4 values alternating horizontal / vertical extent;
            elsewhere in this notebook boxes are [x, y, width, height]
        image (3d array): HWC, only `image.shape[:2]` is read
        size (scalar): target image size that bbox should be scaled to
    """
    im_h, im_w = image.shape[0], image.shape[1]
    fractions = np.asarray(bbox) / np.array([im_w, im_h, im_w, im_h])
    return fractions * size

In [None]:
# image_id, (area, bbox, cat) = next(iter(image_w_bbox.items()))
# fn = f'{IMAGE_PATH}/{train_filenames[image_id]}'
# im = open_image(fn)
# resized_image = cv2.resize(im, (SIZE, SIZE)) # HW
# ax = show_img(resized_image)
# size=224
# b = scale_pascal_bb(bbox, im, size)
# print(b)
# draw_rect(ax, b)
# draw_text(ax, b[:2], categories[cat])

# Dataset

Scales each image to (3, 224, 224) for ResNet18

Also scales the bbox

In [None]:
next(iter(train_filenames.items()))

In [None]:
def get_full_filenames(id_filename_dict):
    """
    Returns a list of 2 item tuples (image_id, image_full_path)

    The full path is `IMAGE_PATH` joined to the file name with a '/'.

    Args:
        id_filename_dict (dict): {image_id: filename}
    """
    full_filenames = []
    for image_id, filename in id_filename_dict.items():
        full_filenames.append((image_id, f'{IMAGE_PATH}/{filename}'))
    return full_filenames

train_full_filenames = get_full_filenames(train_filenames)

In [None]:
tff = next(iter(train_full_filenames))
tff

In [None]:
open_image(tff[1]).shape

In [None]:
class BboxDataset(Dataset):
    """
    Dataset of (image_id, resized image, (scaled bbox, category id)) samples

    Every image is resized to SIZE x SIZE and its bbox is rescaled to match.
    """

    def __init__(self, full_filenames, data_bbox):
        """
        Args:
            full_filenames (list): [(image_id, image_full_path), ...]
            data_bbox (dict): {image_id: (area, bbox, cat), ...}
        """
        self.data_bbox = data_bbox
        self.full_filenames = full_filenames

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.full_filenames)

    def __getitem__(self, idx):
        """
        Returns (image_id, CHW image array, (bbox scaled to SIZE, category id))
        """
        image_id, image_path = self.full_filenames[idx]
        # presumably an HWC array: the transpose below needs 3 dims - TODO confirm
        original = open_image(image_path)
        image = cv2.resize(original, (SIZE, SIZE)).transpose(2, 0, 1)  # HWC -> CHW

        # the stored area is not needed by the model
        _, bbox, cat = self.data_bbox[image_id]
        # scale with the ORIGINAL image's shape, not the resized one
        scaled_bbox = scale_pascal_bb(bbox, original, SIZE)

        return image_id, image, (scaled_bbox, cat)
    
# Training dataset, plus one sample pulled out for inspection in the next cells.
dataset = BboxDataset(train_full_filenames, image_w_bbox)
idx = 0
# index with `idx` (it was assigned but a literal 0 was used instead)
image_id, inputs, label = dataset[idx]

In [None]:
inputs.shape

In [None]:
label

### Val Dataset

val data structs

In [None]:
val_filenames = get_filenames(val_data)
val_image_ids = get_image_ids(val_data)
val_image_w_area = get_image_w_area(val_data, val_image_ids)

In [None]:
val_full_filenames = get_full_filenames(val_filenames)
next(iter(val_full_filenames))

In [None]:
val_image_w_bbox = get_image_w_bbox(val_data, val_image_ids)

In [None]:
val_dataset = BboxDataset(val_full_filenames, val_image_w_bbox)

In [None]:
def preview_data(data):
    """
    Prints the type of `data` and returns its first element

    Returns:
        the first item for a list or tuple, the first (key, value) pair for a dict

    Raises:
        TypeError: if `data` is not a list, tuple or dict
    """
    print(f'type: {type(data)}')
    if isinstance(data, dict):
        return next(iter(data.items()))
    if isinstance(data, (list, tuple)):
        return data[0]
    raise TypeError(f"Unsupported type: {type(data)}")

In [None]:
preview_data(val_filenames)

In [None]:
val_data[ANNOTATIONS][0]

In [None]:
val_image_w_bbox = get_image_w_bbox(val_data, val_image_ids)
next(iter(val_image_w_bbox.items()))

In [None]:
print(preview_data(val_full_filenames))
print(preview_data(val_image_w_bbox))

In [None]:
val_dataset = BboxDataset(val_full_filenames, val_image_w_bbox)
idx = 2
image_id, inputs, label = val_dataset[idx]
image_id, inputs.shape, label

In [None]:
label

In [None]:
# (bbox, cat) = label
# ax = show_img(np.transpose(inputs, (1, 2, 0)))
# b = bbox
# print(b)
# draw_rect(ax, b)
# draw_text(ax, b[:2], categories[cat])

Show the original image and its ground-truth annotation to check that the sample above is correct

In [None]:
image_id

In [None]:
idx = 2
image_id, filename = val_full_filenames[idx]
image_id, filename

In [None]:
area, bbox, cat = val_image_w_bbox[image_id]
area, bbox, cat

In [None]:
categories[cat]

In [None]:
im = open_image(filename)
im.shape

In [None]:
def fastai_bb(bb):
    """
    Converts a bbox from [x, y, width, height] to corner form [y, x, y2, x2]

    The far corner is inclusive: y2 = height + y - 1 and x2 = width + x - 1.

    Args:
        bb (sequence): bbox, read as [x, y, width, height]
    """
    x, y = bb[0], bb[1]
    far_y = bb[3] + y - 1
    far_x = bb[2] + x - 1
    return np.array([y, x, far_y, far_x])

print(bbox)
print(fastai_bb(bbox))

In [None]:
fbb = fastai_bb(bbox)
fbb

In [None]:
def fastai_bb_hw(bb):
    """
    Returns `[height, width]` of a corner-form bbox as produced by `fastai_bb`

    `fastai_bb` lays a box out as [y, x, y2, x2] with inclusive far corners,
    so the height comes from the y pair (indices 0 and 2) and the width from
    the x pair (indices 1 and 3). The previous version had the two pairs
    swapped and therefore returned [width, height].

    Args:
        bb (sequence): bbox as [y, x, y2, x2]
    """
    h = bb[2] - bb[0] + 1
    w = bb[3] - bb[1] + 1
    return [h, w]

fastai_bb_hw(fbb)

In [None]:
# `pascal_bb_hw` is already defined with the other helper functions near the
# top of the notebook; the second, identical definition that used to live here
# was removed so there is a single source of truth.
bbox = train_data[ANNOTATIONS][0][BBOX]
pascal_bb_hw(bbox)

# DataLoader

In [None]:
# NOTE(review): these two assignments override the BATCH_SIZE / NUM_WORKERS
# values set in the params cell at the top of the notebook; every DataLoader
# built from here on uses the values below.
BATCH_SIZE = 64
NUM_WORKERS = 0

# Shuffled loader over the training dataset.
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE,
                        shuffle=True, num_workers=NUM_WORKERS)

# Pull a single batch for inspection; the three names mirror the
# (image_id, image, (bbox, cat)) tuple returned by BboxDataset.__getitem__.
batch_image_ids, batch_inputs, batch_labels = next(iter(dataloader))

In [None]:
batch_inputs.size()

In [None]:
# batch_labels is a list, the first item is the "batch bbox's", 2nd item is "batch categories"

In [None]:
len(batch_labels)

In [None]:
batch_labels[0].size()

In [None]:
batch_labels[1].size()

In [None]:
# train the model

In [None]:
NUM_CATEGORIES = len(categories)
NUM_CATEGORIES

Val DataLoader

In [None]:
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                            shuffle=True, num_workers=NUM_WORKERS)

# Device check

In [None]:
print('DEVICE:', device)

In [None]:
# ImageNet-pretrained ResNet18 used as a fixed feature extractor.
model_ft = models.resnet18(pretrained=True)

# Freeze every existing weight; only modules attached later will be trained.
for param in model_ft.parameters():
    param.requires_grad = False

# Width of the features entering the final fully connected layer.
IN_FEATURES = num_ftrs = model_ft.fc.in_features

print(IN_FEATURES, NUM_CATEGORIES)

# model_ft.fc = nn.Linear(num_ftrs, NUM_CATEGORIES)

# model_ft = model_ft.to(device)

# criterion = nn.CrossEntropyLoss()

# # Observe that all parameters are being optimized
# optimizer = optim.SGD(model_ft.parameters(), lr=0.01, momentum=0.9)

# Custom head: a single layer that forks into 2 outputs

`[bbox preds, category preds]`

In [None]:
class BboxAndCatLayer(nn.Module):
    """
    Head with two parallel linear layers applied to the same features

    Args:
        in_features (int, optional): size of the incoming feature vector;
            defaults to the notebook-level `IN_FEATURES`
        num_categories (int, optional): number of category logits to emit;
            defaults to the notebook-level `NUM_CATEGORIES`
    """

    def __init__(self, in_features=None, num_categories=None):
        super().__init__()
        # Fall back to the notebook globals so `BboxAndCatLayer()` keeps working.
        if in_features is None:
            in_features = IN_FEATURES
        if num_categories is None:
            num_categories = NUM_CATEGORIES
        self.cat_layer = nn.Linear(in_features, num_categories)
        self.bbox_layer = nn.Linear(in_features, 4)

    def forward(self, x):
        """Returns the tuple (bbox output with 4 values, category logits)."""
        return (self.bbox_layer(x), self.cat_layer(x), )
    
model_ft.fc = BboxAndCatLayer()

model_ft = model_ft.to(device)

In [None]:
image_ids, inputs, labels = next(iter(dataloader))

In [None]:
print(len(labels))
print(type(labels))
print(labels[0].size())
print(labels[0].dtype)
print(labels[1].size())
print(labels[1].dtype)

In [None]:
(bbox_labels, cat_labels) = labels

In [None]:
inputs.shape

In [None]:
bbox_labels.size()

In [None]:
cat_labels.size()

In [None]:
inputs = inputs.to(device)
outputs = model_ft(inputs)

In [None]:
type(outputs)

In [None]:
len(outputs)

In [None]:
type(outputs[0]), type(outputs[1])

In [None]:
outputs[0].size()

In [None]:
outputs[1].size()

In [None]:
bbox_outputs, cat_outputs = outputs
bbox_outputs.size(), cat_outputs.size()

In [None]:
_, cat_preds = torch.max(cat_outputs, dim=1)
cat_preds.size()

In [None]:
preds = [bbox_outputs, cat_outputs]

print(preds[0].size())
print(preds[0].dtype)
print(preds[1].size())
print(preds[1].dtype)

# Loss Function - for custom head

In [None]:
class BboxAndCatLoss(nn.Module):
    """
    Sum of an L1 loss on the bbox output and a cross entropy loss on the category output

    Category targets are shifted down by one before the cross entropy, i.e.
    they are expected to be 1-indexed ids.
    """

    def __init__(self):
        super().__init__()
        self.bbox_loss = nn.L1Loss()
        self.cat_loss = nn.CrossEntropyLoss()

    def forward(self, preds, targets):
        """
        Args:
            preds (sequence): (bbox predictions, category logits)
            targets (sequence): (bbox targets, 1-indexed category ids)

        Returns:
            scalar float64 tensor: bbox L1 loss + category cross entropy
        """
        bbox_preds, cat_preds = preds
        bbox_targets, cat_targets = targets

        # Put the predictions next to the targets in float64. Using the
        # targets' own device (rather than the notebook-level `device`) keeps
        # the module usable wherever the tensors happen to live.
        bbox_preds = bbox_preds.to(device=bbox_targets.device, dtype=torch.float64)
        cat_targets_0_indexed = cat_targets - 1

        # per type of loss
        bbox_loss = self.bbox_loss(bbox_preds, bbox_targets)
        cat_loss = self.cat_loss(cat_preds, cat_targets_0_indexed)

        # cast so both terms of the sum are float64
        return bbox_loss + cat_loss.to(dtype=torch.float64)
    
criterion = BboxAndCatLoss()

preds = [p.to(device) for p in preds]
labels = [x.to(device) for x in labels]

loss = criterion(preds, labels)
loss

# Train the model

In [None]:
# The backbone was frozen above (requires_grad = False), so only the parameters
# that are still trainable - the new head - are handed to the optimizer.
trainable_params = [p for p in model_ft.parameters() if p.requires_grad]
optimizer = optim.SGD(trainable_params, lr=0.01, momentum=0.9)

In [None]:
TRAIN = 'train'
VAL = 'val'

dataloaders = {
    TRAIN: dataloader,
    VAL: val_dataloader
}

dataset_sizes = {
    TRAIN: len(dataset),
    VAL: len(val_dataset)
}

In [None]:
# NOTE: overrides the EPOCHS value from the params cell.
EPOCHS = 1

# Per-phase history: one loss / accuracy entry per epoch.
epoch_losses = {TRAIN: [], VAL: []}
epoch_accuracies = {TRAIN: [], VAL: []}

for epoch in tqdm(range(EPOCHS)):
    print('epoch:', epoch)

    for phase in [TRAIN, VAL]:
        is_train = phase == TRAIN
        # train mode for the train phase, eval mode (fixed BatchNorm
        # statistics) for validation
        model_ft.train(is_train)

        running_loss = 0.0
        running_correct = 0

        # iterate the loader that belongs to this phase (previously the
        # train loader was used for both phases)
        for image_ids, inputs, labels in dataloaders[phase]:
            inputs = inputs.to(device)

            # labels is a list [bboxes, categories]; move each tensor
            labels = [x.to(device) for x in labels]
            (bbox_labels, cat_labels) = labels

            # clear gradients
            optimizer.zero_grad()

            # forward pass; gradients are only tracked while training
            with torch.set_grad_enabled(is_train):
                outputs = model_ft(inputs)
                bbox_outputs, cat_outputs = outputs
                _, preds = torch.max(cat_outputs, dim=1)
                loss = criterion(outputs, labels)

                # backwards pass
                if is_train:
                    loss.backward()
                    optimizer.step()

            # loss is a batch mean, so weight it by the batch size
            running_loss += loss.item() * inputs.size(0)
            # predictions are 0-indexed, category ids are 1-indexed
            labels_0_indexed = cat_labels - 1
            running_correct += torch.sum(preds == labels_0_indexed)

        # per epoch/phase - normalise by the size of THIS phase's dataset
        epoch_loss = running_loss / dataset_sizes[phase]
        epoch_acc = running_correct.double().item() / dataset_sizes[phase]
        epoch_losses[phase].append(epoch_loss)
        epoch_accuracies[phase].append(epoch_acc)
    print('train loss:', epoch_losses[TRAIN], 'train acc:', epoch_accuracies[TRAIN], 'val loss:', epoch_losses[VAL], 'val acc:', epoch_accuracies[VAL])

In [None]:
# Report the metrics recorded for the most recent phase of the final epoch.
# `epoch_losses` / `epoch_accuracies` are dicts keyed by phase, so they are
# indexed here rather than appended to (a dict has no `.append`), and nothing
# is re-appended because the training loop already stored these values.
epoch_loss = epoch_losses[phase][-1]
epoch_acc = epoch_accuracies[phase][-1]
print('loss:', epoch_loss, 'acc:', epoch_acc)

In [None]:
# NEXT - epoch - w/ train and val

Graph loss and accuracy

In [None]:
epoch_losses

In [None]:
epoch_accuracies

In [None]:
# check predictions

In [None]:
plt.plot(epoch_losses['train'])
plt.plot(epoch_losses['val'])

In [None]:
plt.plot(epoch_accuracies['train'])
plt.plot(epoch_accuracies['val'])

In [None]:
# Show every image of the last batch with its predicted category.
preds_count = len(preds)
# squeeze=False always returns a 2D array of axes, so `.flat` also works when
# the batch holds a single image
fig, axes = plt.subplots(1, preds_count, figsize=(16, 16), squeeze=False)
for i, ax in enumerate(axes.flat):
    # bring the tensor back to host memory as a numpy array before handing it
    # to matplotlib, then go from CHW to HWC
    im = inputs[i].detach().cpu().numpy().transpose(1, 2, 0)
    ax = show_img(im, ax=ax)
    # predictions are 0-indexed, category ids are 1-indexed
    draw_text(ax, (0,0), categories[preds[i].item()+1])

In [None]:
# Single-batch run of one training step (debugging aid).
# NOTE(review): rebinding these two names to lists discards the
# {TRAIN: [...], VAL: [...]} dicts filled by the train/val loop above.
epoch_losses = []
epoch_accuracies = []

for epoch in tqdm(range(EPOCHS)):
    print('epoch:', epoch)
    running_loss = 0.0
    running_correct = 0

    for image_ids, inputs, labels in dataloader:
        inputs = inputs.to(device)
        
        # labels
        (bbox_labels, cat_labels) = labels
        bbox_labels = bbox_labels.to(device)
        cat_labels = cat_labels.to(device)

        # clear gradients
        optimizer.zero_grad()
        
        # forward pass
        outputs = model_ft(inputs)
        bbox_outputs, cat_outputs = outputs
        _, preds = torch.max(cat_outputs, dim=1)
        # `labels` is rebound to a plain Python list of tensors here
        labels = [x.to(device) for x in labels]
        loss = criterion(outputs, labels)
        
        # backwards pass
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * inputs.size(0)
        labels_0_indexed = cat_labels - 1
        running_correct += torch.sum(preds == labels_0_indexed)
        break  # stop after the first batch
    break  # stop after the first epoch
        
    # NOTE(review): unreachable - the unconditional `break` above always leaves
    # the epoch loop before these lines, so both lists stay empty.
    epoch_loss = running_loss / len(dataset)
    epoch_acc = running_correct.double().item() / len(dataset)
    epoch_losses.append(epoch_loss)
    epoch_accuracies.append(epoch_acc)
    print('loss:', epoch_loss, 'acc:', epoch_acc)

In [None]:
inputs = inputs.to(device)
# NOTE(review): the loops above rebind `labels` to a Python list of tensors,
# and a list has no `.to` - this line presumably dates from when `labels` was a
# single category tensor. Confirm and update or remove.
labels = labels.to(device)

In [None]:
# NOTE(review): with the BboxAndCatLayer head, `model_ft` returns a tuple
# (bbox output, category logits), so `torch.max(outputs, dim=1)` and
# `labels - 1` look like leftovers from the classification-only version of
# this notebook - confirm before relying on this cell.
outputs = model_ft(inputs)
_, preds = torch.max(outputs, dim=1)
labels_0_indexed = labels - 1

In [None]:
torch.sum(labels_0_indexed == preds)

In [None]:
labels_0_indexed[0]

In [None]:
preds[0]

In [None]:
categories[2]

In [None]:
inputs[0].shape

In [None]:
# plt.imshow(np.transpose(inputs[0], (1, 2, 0)))

In [None]:
labels

In [None]:
labels-1

In [None]:
criterion(outputs, labels-1)

In [None]:
torch.sum(preds == labels)

In [None]:
a = torch.Tensor([1, 2])
b = torch.Tensor([1, 3])
torch.sum(a == b)

In [None]:
t = torch.randn(2, 3)
torch.max(t, dim=1)

In [None]:
inputs.shape

In [None]:
# NOTE(review): `to_nchw` is not defined in any visible cell of this notebook;
# unless it comes from elsewhere this fails on a fresh kernel.
to_nchw(inputs).shape

In [None]:
train_df.iloc[0]

In [None]:
image1_filename = train_df.iloc[0][IMAGE]
image1_filename

In [None]:
im = cv2.imread(image1_filename)

In [None]:
im.shape

In [None]:
im = cv2.imread(image1_filename, 0)
im.shape

In [None]:
np.array([im]).shape

In [None]:
im.shape

In [None]:
type(im)

In [None]:
# NOTE(review): called without any arguments - presumably left over from
# looking up the signature; the following cells show the intended call
# `cv2.resize(im, (SIZE, SIZE))`.
cv2.resize()

In [None]:
import cv2
SIZE = 224
resized_im = cv2.resize(im, (SIZE, SIZE))
plt.imshow(resized_im)

In [None]:
resized_im = cv2.resize(im, (SIZE, SIZE))
plt.imshow(resized_im)

In [None]:
# NOTE(review): neither `scale` nor `item` is defined in any visible cell of
# this notebook - confirm where they come from or remove this cell.
scaled_item = scale(item[IMAGE], item[CATEGORY])
len(scaled_item) 

In [None]:
scaled_x, scaled_y = scaled_item
scaled_x.shape

In [None]:
scaled_y

In [None]:
train_df.iloc[0]

In [None]:
def show_random_image():
    """
    Prints the category name of a random `dataset` sample and displays its image

    `dataset` items are tuples (image_id, image, (bbox, category_id)) rather
    than dicts, so the sample is unpacked positionally instead of being
    indexed with string keys. The index is drawn from the whole dataset rather
    than a hard-coded range of 100.
    """
    _, image, (_, cat) = dataset[np.random.randint(0, len(dataset))]
    print(categories[cat])
    # the dataset returns CHW arrays; imshow expects HWC
    plt.imshow(np.transpose(image, (1, 2, 0)))

show_random_image()

In [None]:
dataloader = DataLoader(dataset, batch_size=4,
                        shuffle=True, num_workers=4)

next(iter(dataloader))

In [None]:
import torchvision

def imshow(inp, title=None):
    """
    Imshow for Tensor.

    Converts a CHW tensor to an HWC array, applies `std * x + mean` with the
    per-channel constants below, clips to [0, 1] and displays the result.
    NOTE(review): this undoes a mean/std normalisation; the BboxDataset in
    this notebook does not appear to apply one - confirm before use.

    Args:
        inp: CHW tensor exposing `.numpy()`
        title: optional plot title
    """
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    hwc = inp.numpy().transpose((1, 2, 0))
    plt.imshow(np.clip(std * hwc + mean, 0, 1))
    if title is not None:
        plt.title(title)
    plt.pause(0.001)  # pause a bit so that plots are updated

# Get a batch of training data
# Each batch is (image ids, images, [bboxes, categories]), so all three items
# have to be unpacked; only the images and the category ids are kept.
_, inputs, (_, classes) = next(iter(dataloader))

In [None]:
# # Make a grid from batch
# out = torchvision.utils.make_grid(inputs)

# imshow(out, title=[class_names[x] for x in classes])

In [None]:
classes

In [None]:
train_filenames

In [None]:
train_image_w_area

In [None]:
# object classification

In [None]:
model = models.resnet18(pretrained=True)

In [None]:
for layer in model.parameters():
    layer.requires_grad = False

In [None]:
fc_in_features = model.fc.in_features
fc_in_features

In [None]:
model.fc = nn.Linear(fc_in_features, len(categories))

In [None]:
loss_func = nn.CrossEntropyLoss()

In [None]:
optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [None]:
# need to read images with "skimage" or "cv2" and do DataSet / DataLoader config first before running model

In [None]:
from skimage import io

In [None]:
image0 = io.imread(list(IMAGE_PATH.iterdir())[0])
image0.shape