In [1]:
# header files needed
import numpy as np
import torch
import torch.nn as nn
import torchvision
import random
from PIL import Image
import glob

In [2]:
# fix every random number generator to one seed so reruns start from the same state
SEED = 1234
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(SEED)

In [None]:
from google.colab import drive

# mount Google Drive at the path the dataset cell reads from
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# transforms
# input images: resize, convert to a float tensor in [0, 1], then normalize per channel
input_transform = torchvision.transforms.Compose([
    torchvision.transforms.Resize((520, 480)),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])


def mask_to_label_tensor(mask):
    """Convert a PIL segmentation mask into a (1, H, W) int64 tensor of class ids.

    ToTensor() must not be used for masks: it rescales 8-bit pixel values to
    [0, 1], so a later cast to an integer type maps every class id below 255
    to 0 and the value 255 to 1. Reading the pixel values through numpy keeps
    the ids exactly as stored in the image.

    The leading channel axis is kept so that batches have shape (B, 1, H, W).
    """
    return torch.from_numpy(np.array(mask, dtype=np.int64)).unsqueeze(0)


# targets: nearest-neighbour resize so that no new (blended) label values appear,
# then convert to integer class ids without any rescaling
target_transform = torchvision.transforms.Compose([
    torchvision.transforms.Resize((520, 480), interpolation=Image.NEAREST),
    torchvision.transforms.Lambda(mask_to_label_tensor)
])

In [None]:
# dataset
VOC_ROOT = "/content/drive/My Drive/"


def _voc_split(image_set):
    """Build the VOC 2012 segmentation dataset for one split ('train' or 'val')."""
    return torchvision.datasets.VOCSegmentation(
        VOC_ROOT,
        year='2012',
        image_set=image_set,
        download=False,
        transform=input_transform,
        target_transform=target_transform,
    )


train_dataset = _voc_split('train')
val_dataset = _voc_split('val')

In [None]:
# data loader
BATCH_SIZE = 4
NUM_WORKERS = 8

# training batches are reshuffled every epoch
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
# validation only aggregates metrics over the whole split, so a fixed order is used
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

In [None]:
# loss
# run on the GPU when one is visible, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# pixels labelled 255 contribute nothing to the loss
criterion = nn.CrossEntropyLoss(ignore_index=255)

In [None]:
# model
class FCN8(torch.nn.Module):
  """Segmentation network that fuses class scores from three depths of a backbone.

  The layers in `pretrained_net.features` are split by position into three
  consecutive stages: everything except the last 20 layers, the next 10 layers,
  and the final 10 layers. The 1x1 scoring convolutions expect the first stage
  to emit 256 channels and the second and third stages to emit 512 channels.

  Args:
    pretrained_net: module exposing a `features` container of layers.
    num_classes: number of output score channels per pixel.
  """

  # init function
  def __init__(self, pretrained_net, num_classes=21):
    super().__init__()

    # apply pretrained net, split into three stages counted from the end
    backbone_layers = list(pretrained_net.features.children())
    self.pretrained_features_1 = torch.nn.Sequential(*backbone_layers[:-20])
    self.pretrained_features_2 = torch.nn.Sequential(*backbone_layers[-20:-10])
    self.pretrained_features_3 = torch.nn.Sequential(*backbone_layers[-10:])

    # 1x1 scoring layer for the second-stage features
    self.encoder_fc_block_4 = torch.nn.Sequential(
        torch.nn.Conv2d(512, num_classes, kernel_size=1)
    )

    # 1x1 scoring layer for the first-stage features
    self.encoder_fc_block_3 = torch.nn.Sequential(
        torch.nn.Conv2d(256, num_classes, kernel_size=1)
    )

    self.encoder_fc = torch.nn.Sequential(

        # fc layers expressed as convolutions; the 7x7 convolution has no
        # padding, so it shrinks each spatial dimension by 6
        torch.nn.Conv2d(512, 4096, kernel_size=7),
        torch.nn.ReLU(inplace=True),
        torch.nn.Dropout(),
        torch.nn.Conv2d(4096, 4096, kernel_size=1),
        torch.nn.ReLU(inplace=True),
        torch.nn.Dropout(),
        torch.nn.Conv2d(4096, num_classes, kernel_size=1)
    )


  @staticmethod
  def _upsample(tensor, size):
    """Bilinearly resize `tensor` to spatial `size`.

    Uses align_corners=True, which is what the deprecated
    torch.nn.functional.upsample_bilinear did.
    """
    return torch.nn.functional.interpolate(tensor, size=size, mode='bilinear', align_corners=True)


  # forward func
  def forward(self, x):
    """Return per-pixel class scores with the same spatial size as `x`.

    Args:
      x: image batch; dimensions 2 and 3 are treated as height and width.

    Returns:
      Score tensor whose channel count is num_classes and whose last two
      dimensions equal those of `x`.
    """
    # remember the input resolution instead of hard-coding 520 x 480
    input_size = x.size()[2:]

    # get pretrained features from the three backbone stages
    features_1 = self.pretrained_features_1(x)
    features_2 = self.pretrained_features_2(features_1)
    features_3 = self.pretrained_features_3(features_2)

    # fc
    enc_fc = self.encoder_fc(features_3)

    # fuse coarse scores with second-stage scores at the second-stage resolution
    output_1 = self.encoder_fc_block_4(features_2)
    output = self._upsample(enc_fc, output_1.size()[2:])
    output = output + output_1

    # fuse again with first-stage scores at the first-stage resolution
    output_2 = self.encoder_fc_block_3(features_1)
    output = self._upsample(output, output_2.size()[2:])
    output = output + output_2

    # upsample to match image spatial size
    output = self._upsample(output, input_size)

    # return the predicted label image
    return output

In [None]:
# wrap an ImageNet-pretrained VGG16 (batch-norm variant) in the segmentation head
model = FCN8(
    pretrained_net=torchvision.models.vgg16_bn(pretrained=True),
    num_classes=21,
)
model.to(device)

In [None]:
# create optimizer: SGD with momentum and L2 weight decay over every model parameter
optimizer = torch.optim.SGD(
    params=model.parameters(),
    lr=5e-4,
    momentum=0.95,
    weight_decay=1e-3,
)

In [None]:
# per-class intersection and union pixel counts for one batch
def batch_intersection_union(predict, target, num_class, labeled):
    """Count intersection and union pixels per class.

    Args:
        predict: integer tensor of predicted labels.
        target: integer tensor of ground-truth labels, same shape as `predict`.
        num_class: number of histogram bins; values are binned over [1, num_class].
        labeled: mask tensor; predictions where it is zero are set to 0 and
            therefore fall outside the histogram range.

    Returns:
        Tuple (area_inter, area_union) of numpy arrays with `num_class` entries.
    """
    def class_histogram(values):
        # values outside [1, num_class] are not counted by torch.histc
        return torch.histc(values.float(), bins=num_class, min=1, max=num_class)

    masked_pred = predict * labeled.long()
    # keep the label where prediction and target agree, 0 elsewhere
    agreed = masked_pred * (masked_pred == target).long()

    inter_hist = class_histogram(agreed)
    union_hist = class_histogram(masked_pred) + class_histogram(target) - inter_hist
    return inter_hist.cpu().numpy(), union_hist.cpu().numpy()

In [None]:
# per-epoch history, one entry appended per epoch
loss_train = []
loss_valid = []
acc_train = []
acc_valid = []

NUM_EPOCHS = 250
NUM_CLASSES = 21
IGNORE_INDEX = 255  # same value the loss is configured to ignore


def run_epoch(loader, train):
  """Run one full pass over `loader` and return (mean loss, accuracy, mIoU).

  Assumes labels carry raw class ids (0 .. NUM_CLASSES - 1) with IGNORE_INDEX
  marking pixels to skip; verify that the target transform does not rescale them.

  Args:
    loader: iterable yielding (images, labels) batches.
    train: when True the model is put in training mode and the optimizer is
      stepped after every batch; when False the model is put in eval mode and
      no gradients are tracked.

  Returns:
    mean_loss: average of the per-batch losses.
    accuracy: fraction of correctly predicted pixels among pixels whose label
      is neither background (0) nor IGNORE_INDEX, rounded to 4 decimals.
    miou: mean over the non-background classes seen in this pass of
      intersection / union, rounded to 3 decimals (0.0 if none was seen).
  """
  model.train(train)
  loss_sum = 0.0
  inter_sum = np.zeros(NUM_CLASSES)
  union_sum = np.zeros(NUM_CLASSES)
  acc_correct = 0.0
  acc_total = 0.0

  for images, labels in loader:

    # if cuda
    images = images.to(device)
    labels = labels.type(torch.LongTensor)
    if labels.dim() == 4:
      # drop the singleton channel axis: (B, 1, H, W) -> (B, H, W)
      labels = labels.reshape(labels.shape[0], labels.shape[2], labels.shape[3])
    labels = labels.to(device)

    # get predicted outputs (and update the weights when training)
    if train:
      optimizer.zero_grad()
    with torch.set_grad_enabled(train):
      outputs = model(images)
      loss = criterion(outputs, labels)
    if train:
      loss.backward()
      optimizer.step()
    loss_sum = loss_sum + loss.item()

    predict = outputs.detach().argmax(1)

    # IoU bookkeeping: shift labels by one so that class k lands in bin k and
    # anything above NUM_CLASSES (the shifted ignore label) is left out
    shifted_predict = predict.int() + 1
    shifted_target = labels.int() + 1
    labeled = (shifted_target > 0) * (shifted_target <= NUM_CLASSES)
    inter, union = batch_intersection_union(shifted_predict, shifted_target, NUM_CLASSES, labeled)
    inter_sum += inter
    union_sum += union

    # pixel accuracy over foreground pixels only; ignored pixels are excluded
    # from both the numerator and the denominator
    foreground = (labels > 0) & (labels != IGNORE_INDEX)
    acc_total += foreground.sum().item()
    acc_correct += ((predict == labels) & foreground).sum().item()

  mean_loss = loss_sum / float(max(len(loader), 1))
  accuracy = np.round((acc_correct / (acc_total + 0.00005)), 4)

  # mean IoU: average the per-class ratios (index 0 is background and is skipped)
  fg_inter = inter_sum[1:]
  fg_union = union_sum[1:]
  present = fg_union > 0
  if present.any():
    miou = np.round((fg_inter[present] / fg_union[present]).mean(), 3)
  else:
    miou = 0.0

  return mean_loss, accuracy, miou


# training and val loop
for epoch in range(0, NUM_EPOCHS):

  # train
  training_loss, training_acc, training_iou = run_epoch(train_loader, train=True)
  loss_train.append(training_loss)
  acc_train.append(training_acc)

  # validate
  valid_loss, val_acc, val_iou = run_epoch(val_loader, train=False)
  loss_valid.append(valid_loss)
  acc_valid.append(val_acc)

  print()
  print("Epoch: " + str(epoch))
  print("Training Loss: " + str(training_loss) + "    Validation Loss: " + str(valid_loss))
  print("Training Accuracy: " + str(training_acc) + "    Validation Accuracy: " + str(val_acc))
  print("Training mIoU: " + str(training_iou) + "    Validation mIoU: " + str(val_iou))
  print()