# Download and unzip the PascalVoc dataset

In [13]:
import torchvision
from torchvision import models
from torchvision.datasets import VOCSegmentation

data_dir = './voc'


train_dataset = VOCSegmentation(root=data_dir, year='2007', download=False, image_set='train')
val_dataset = VOCSegmentation(root=data_dir, year='2007', download=False, image_set='val')
test_dataset = VOCSegmentation(root=data_dir, year='2007', download=False, image_set='test')


In [2]:
print(f"Train set size: {len(train_dataset)}")
print(f"Val set size: {len(val_dataset)}")
print(f"Test set size: {len(test_dataset)}")


Train set size: 209
Val set size: 213
Test set size: 210


# Data Preparation


run the code below to get thre dataloader objects, namely: train_loader, val_loader and test_loader

In [3]:
import os
from PIL import Image
from torch.utils import data
import torchvision.transforms as transforms
import random

num_classes = 21
ignore_label = 255
root = data_dir

'''
color map
0=background, 1=aeroplane, 2=bicycle, 3=bird, 4=boat, 5=bottle # 6=bus, 7=car, 8=cat, 9=chair, 10=cow, 11=diningtable,
12=dog, 13=horse, 14=motorbike, 15=person # 16=potted plant, 17=sheep, 18=sofa, 19=train, 20=tv/monitor
'''


#Feel free to convert this palette to a map
palette = [0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128, 0, 128, 128,
           128, 128, 128, 64, 0, 0, 192, 0, 0, 64, 128, 0, 192, 128, 0, 64, 0, 128, 192, 0, 128,
           64, 128, 128, 192, 128, 128, 0, 64, 0, 128, 64, 0, 0, 192, 0, 128, 192, 0, 0, 64, 128]  #3 values- R,G,B for every class. First 3 values for class 0, next 3 for
#class 1 and so on......

'''
Depending on the mode, train or val or test, the function reads the train.txt, val.txt and test.txt files and returns a list of tuples of the form
(image_path, mask_path) for each image in the dataset, where image_path is the path to the image and mask_path is the path to the mask for that image. 
'''
def make_dataset(mode):
    
    assert mode in ['train', 'val', 'test', 'trainval']
    
    items = []
    img_path = os.path.join(root, 'VOCdevkit', 'VOC2007', 'JPEGImages')
    mask_path = os.path.join(root, 'VOCdevkit', 'VOC2007', 'SegmentationClass')
    data_list = [l.strip('\n') for l in open(os.path.join(
        root, 'VOCdevkit', 'VOC2007', 'ImageSets', 'Segmentation', f'{mode}.txt')).readlines()]
    for it in data_list:
        item = (os.path.join(img_path, it + '.jpg'), os.path.join(mask_path, it + '.png'))
        items.append(item)

        
    return items


'''
The class VOC is a subclass of the class torch.utils.data.Dataset. It overrides the __len__ and __getitem__ methods.
The __len__ method returns the length of the dataset, i.e. the number of images in the dataset.
The __getitem__ method returns the image and the mask for the given index.
'''

class VOC(data.Dataset):
    def __init__(self, mode, transform=None, target_transform=None, common_transform=None):
        self.imgs = make_dataset(mode)        
        if len(self.imgs) == 0:
            raise RuntimeError('Found 0 images, please check the data set')
        self.mode = mode
        self.transform = transform
        self.target_transform = target_transform
        self.common_transform = common_transform
        self.width = 256
        self.height = 256

    def __getitem__(self, index):
        
        img_path, mask_path = self.imgs[index]
        img = Image.open(img_path).convert('RGB').resize((self.width, self.height))
        mask = Image.open(mask_path).resize((self.width, self.height))

        if self.common_transform is not None:
            img, mask = self.common_transform((img,mask)) 

        if self.transform is not None:
            img = self.transform(img)
        if self.target_transform is not None:
            mask = self.target_transform(mask)

        mask[mask==ignore_label]=0

        return img, mask

    def __len__(self):
        
        return len(self.imgs)

In [4]:
import numpy as np 
import torch
class MaskToTensor(object):
    def __call__(self, img):
        return torch.from_numpy(np.array(img, dtype=np.int32)).long()


mean_std = ([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

input_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(*mean_std)
])

target_transform = transforms.Compose([
    MaskToTensor()
])

original_train_dataset = VOC('trainval', transform=input_transform, target_transform=target_transform)
# original_val_dataset = VOC('val', transform=input_transform, target_transform=target_transform)
original_test_dataset = VOC('test', transform=input_transform, target_transform=target_transform)


In [5]:
from torch.utils.data import DataLoader, ConcatDataset

NUM_WORKERS = 2
PREFETCH_FACTOR = 2 # improves data transfer speed between GPU and CPU and reduces GPU wait time
train_loader = DataLoader(dataset=original_train_dataset, batch_size=16, shuffle=True, num_workers=NUM_WORKERS, prefetch_factor=PREFETCH_FACTOR, pin_memory=True)

# val_loader = DataLoader(dataset=original_val_dataset, batch_size=16, shuffle=False, num_workers=NUM_WORKERS, prefetch_factor=PREFETCH_FACTOR, pin_memory=True)

test_loader = DataLoader(dataset=original_test_dataset, batch_size= 16, shuffle=False, num_workers=NUM_WORKERS, prefetch_factor=PREFETCH_FACTOR, pin_memory=True)


In [6]:
# from torchvision.datasets import Cityscapes

# data_dir = "./data/cityscapes"

# train_dataset = Cityscapes(root=data_dir, split="train", mode="fine", target_type="semantic", transform=input_transform, target_transform=target_transform)

# utils 

In [7]:
from copy import deepcopy
def iou(pred, target, n_classes = 21):
    target[target==255] = 0

    ious = []

    for cls in range(n_classes):
        intersection = torch.sum((pred == cls) & (target == cls)).item()
        union = torch.sum(pred == cls) + torch.sum(target == cls) - intersection
        union = union.item()
        if union!=0:
            ious.append(intersection/union)

    ious = np.array(ious)
    return ious

'''
returns pixel accuracy for the batch
'''
def pixel_acc(pred, target):
    target[target==255] = 0
    
    correct = torch.sum(pred==target).item()
    total_predictions = target.shape[0]*target.shape[1]*target.shape[2]
    return correct/total_predictions

In [8]:
import time
import torch.nn.functional as F

criterion = torch.nn.CrossEntropyLoss()

def train(model=None):

    model_ = model 
    torch.autograd.set_detect_anomaly(True)
    
    best_iou_score = 0.0

    trainEpochLoss = []
    trainEpochAccuracy = []
    trainEpochIOU = []
    valEpochLoss = []
    valEpochAccuracy = []
    valEpochIOU = []

    for epoch in range(epochs):

        # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
        train_loss = []
        train_acc = []
        train_iou = []

        ts = time.time()
        for itr, (inputs, labels) in enumerate(train_loader):
            #   reset optimizer gradients
            optimizer.zero_grad()

            inputs =  inputs.to(device)
            labels =   labels.to(device)

            trainOutputs =  model_(inputs)
#             trainOutputs = F.softmax(trainOutputs, dim=1)
            loss =  criterion(trainOutputs,labels)  #  calculate loss
            loss.backward()

            with torch.no_grad():
                # To compute accuracy and IOU
                # outputs = F.log_softmax(model_(inputs), dim=1)
                _, pred = torch.max(trainOutputs, dim=1)
                
                train_iou.append(np.mean(iou(pred, labels)))
                train_acc.append(pixel_acc(pred, labels))
                train_loss.append(loss.item())

            optimizer.step()

            if itr % 10 == 0:
                print(f"==> epoch{epoch}, iter{itr+1}, Train set=> loss: {np.mean(train_loss)}, IOU: {np.mean(train_iou)}, Acc: {np.mean(train_acc)}")

        # print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

        print(f"Finish epoch {epoch}, time elapsed {time.time() - ts}")

        val_loss, val_iou, val_acc = test(epoch,model_)
        model_.train()


In [9]:
def test(epoch, model=None):
    model_ = model
    model_.eval() # Put in eval mode (disables batchnorm/dropout) !
    
    losses = []
    mean_iou_scores = []
    accuracy = []

    with torch.no_grad(): # we don't need to calculate the gradient in the validation/testing
        num_iter = 0
        for iter, (inputs, labels) in enumerate(test_loader):
            
            # both inputs and labels have to reside in the same device as the model's
            inputs =  inputs.to(device)#  transfer the input to the same device as the model's
            labels =   labels.to(device)#  transfer the labels to the same device as the model's


            outputs = model_(inputs)
            outputs = F.softmax(outputs, dim=1)
#             valoutputs = model_(inputs)
            valloss = criterion(outputs, labels)
            
            num_iter += 1
            _, pred = torch.max(outputs, dim=1)
            
            mean_iou_scores.append(np.mean(iou(pred, labels)))
            accuracy.append(pixel_acc(pred, labels))
            losses.append(valloss.item())

    # print(mean_iou_scores, accuracy)
    print(f"=========> Loss at epoch {epoch} is {np.mean(losses)}")
    print(f"=========> IoU at epoch {epoch} is {np.mean(mean_iou_scores)}")
    print(f"=========> Pixel acc at epoch {epoch} is {np.mean(accuracy)}")

    
    return np.mean(losses), np.mean(mean_iou_scores), np.mean(accuracy)

# SSL models

In [16]:
import torch.nn as nn


class Backbone(nn.Module):
    def __init__(self, in_chan, out_dim):
        super(Backbone, self).__init__()
        
        
        modules = list(models.resnet18(weights=None).children())[:-1]
        self.model = nn.Sequential(*modules)
        self.linear = nn.Linear(512, 200)
    
    def forward(self, x):
        x = self.model(x)
        
        return x

In [28]:
import torch
import torch.nn as nn

class Segmentor(nn.Module):
    def __init__(self, n_class=21, n_dim=512):
        super(Segmentor, self).__init__()

        # Encoder (Based on the provided SSL architecture)
        self.encoder = Backbone(in_chan=3, out_dim=n_dim)
        
#         self.reducer = nn.Conv2d(256, 8, kernel_size=1)
        
        # do some drastic upsampling from (1,1) to (16, 16) to reduce overall parameters
        self.drastic = nn.ConvTranspose2d(512, 256, kernel_size=(16,16), stride = 1)
        
        # Decoder
        self.decoder4 = self.expanding_block(256, 64)
#         self.decoder3 = self.expanding_block(128, 64)
        self.decoder2 = self.expanding_block(64, 32)
#         self.decoder1 = self.expanding_block(32, 16)
        
        # Output layer
        self.output = nn.Conv2d(32, n_class, kernel_size=1)
        
    def expanding_block(self, in_channels, out_channels):
        block = nn.Sequential(
            nn.ConvTranspose2d(in_channels, out_channels, kernel_size=2, stride=2),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(out_channels, out_channels, kernel_size=2, stride=2),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )
        return block
    
    def forward(self, x):
        # Encoder (SSL model)
        features = self.encoder(x)
        
#         features = self.reducer(features)
        features = self.drastic(features)
        
        # Decoder
        decode4 = self.decoder4(features)
#         decode3 = self.decoder3(decode4)
        decode2 = self.decoder2(decode4)
    
#         decode1 = self.decoder1(decode2)
        
        # Output
        output = self.output(decode2)
        output = nn.ReLU(inplace=True)(output)
        
        return output


In [29]:
from torch import optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Assuming 21 classes for segmentation
num_classes = 21


# Create the overall segmentor model
model = Segmentor(n_class=num_classes, n_dim=512)

In [30]:
# Load the SSL model weights
backbone_weights = torch.load('pet_resnet_dc_e5_c200_s8000.pth', map_location=torch.device(device))
model.encoder.load_state_dict(backbone_weights)

optimizer = optim.Adam(model.parameters(), lr=0.005)

model = model.to(device)

In [31]:
epochs = 20
train(model)
  

==> epoch0, iter1, Train set=> loss: 3.0943729877471924, IOU: 0.007042501509026087, Acc: 0.027179718017578125
==> epoch0, iter11, Train set=> loss: 2.5892494808543813, IOU: 0.024503151442388273, Acc: 0.3325041857632724
==> epoch0, iter21, Train set=> loss: 2.3988965352376304, IOU: 0.02929003619477605, Acc: 0.3806997480846587
Finish epoch 0, time elapsed 4.3575849533081055
==> epoch1, iter1, Train set=> loss: 1.9831830263137817, IOU: 0.03719551302506282, Acc: 0.46772003173828125
==> epoch1, iter11, Train set=> loss: 2.0511144291270864, IOU: 0.037313662248386954, Acc: 0.4480621164495295
==> epoch1, iter21, Train set=> loss: 1.9763928311211723, IOU: 0.03912434078903059, Acc: 0.47707044510614305
Finish epoch 1, time elapsed 4.325148820877075
==> epoch2, iter1, Train set=> loss: 1.688730239868164, IOU: 0.03843193564115774, Acc: 0.6223211288452148
==> epoch2, iter11, Train set=> loss: 1.5164590640501543, IOU: 0.04887088178658623, Acc: 0.687730529091575
==> epoch2, iter21, Train set=> loss: 1

In [None]:
# # unfreeze the backbone
# for param in model.encoder.parameters():
#     param.requires_grad = True

    
# # train further
# epochs = 20

# train(model)
