In [1]:
# model.py -> resnet.py (use v2) -> fcn-12.3.1.py
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.optim as optim

import numpy as np
import PIL.Image

import os
import os.path
from pathlib import Path
import time

## Model

### `resnet_layer` (Basic Block)

In [2]:
class resnet_layer(nn.Module):
    def __init__(self,
                 in_channels,
                 num_filters=16,
                 kernel_size=3,
                 strides=1,
                 activation='relu',
                 batch_normalization=True,
                 conv_first=True):
        super(resnet_layer, self).__init__()

        self.batch_normalization = batch_normalization
        self.conv_first = conv_first

        out_channels = num_filters
        padding = kernel_size // 2      # 'same' padding

        self.conv = nn.Conv2d(in_channels,
                              out_channels,
                              kernel_size,
                              stride=strides,
                              padding=padding)

        # 'he_normal' initialization
        nn.init.kaiming_normal_(self.conv.weight)

        # l2 regularization
        # not implemented

        # batch normalization
        self.batchnorm_first = nn.BatchNorm2d(in_channels)
        self.batchnorm_last = nn.BatchNorm2d(num_filters)

        # activation (only 'relu' is implemented)
        if activation:
            self.activation = nn.ReLU()
        else:
            self.activation = None


    def forward(self, x):
        if self.conv_first:
            x = self.conv(x)
            if self.batch_normalization:
                x = self.batchnorm_last(x)
            if self.activation is not None:
                x = self.activation(x)
        else:
            if self.batch_normalization:
                x = self.batchnorm_first(x)
            if self.activation is not None:
                x = self.activation(x)
            x = self.conv(x)
        
        return x

### `conv_layer`

In [3]:
class conv_layer(nn.Module):
    def __init__(self,
                 in_channels,#input_shape,
                 filters=32,
                 kernel_size=3,
                 strides=1,
                 use_maxpool=True,
                 postfix=None,          # not implemented
                 activation=None):
        super(conv_layer, self).__init__()

        padding = kernel_size // 2      # 'same' padding
        self.conv = nn.Conv2d(in_channels,
                              filters,
                              kernel_size,
                              stride=strides,
                              padding=padding)
        # 'he_normal' initialization
        nn.init.kaiming_normal_(self.conv.weight)

        self.batchnorm = nn.BatchNorm2d(filters)
        self.activation = nn.ReLU()
        self.maxpool = nn.MaxPool2d(filters)

        self.use_maxpool = use_maxpool


    def forward(self, x):
        x = self.conv(x)
        x = self.batchnorm(x)
        x = self.activation(x)
        if self.use_maxpool:
            x = self.maxpool(x)
        
        return x

### `tconv_layer`

In [4]:
class tconv_layer(nn.Module):
    def __init__(self,
                 in_channels,
                 filters=32,
                 kernel_size=3,
                 strides=2,
                 postfix=None):
        super(tconv_layer, self).__init__()

        padding = kernel_size // 2                  # 'same' padding
        output_padding = kernel_size - 2*padding    # for odd paddings
        self.conv_transpose = nn.ConvTranspose2d(in_channels,
                                                 filters,
                                                 kernel_size,
                                                 stride=strides,
                                                 padding=padding,
                                                 output_padding=output_padding)
        # 'he_normal' initialization
        nn.init.kaiming_normal_(self.conv_transpose.weight)

        self.batchnorm = nn.BatchNorm2d(filters)
        self.activation = nn.ReLU()
    
    def forward(self, x):
        x = self.conv_transpose(x)
        x = self.batchnorm(x)
        x = self.activation(x)

        return x

### `features_pyramid`

In [5]:
class features_pyramid(nn.Module):
    def __init__(self,
                 in_channels,
                 n_layers):
        super(features_pyramid, self).__init__()

        pool_size = 2
        self.avg_pool = nn.AvgPool2d(pool_size)
        
        n_layers = n_layers
        n_filters = 512

        self.conv_layers = nn.ModuleList()
        for i in range(n_layers - 1):
            postfix = "_layer" + str(i+2)
            layer = conv_layer(in_channels,
                              filters=n_filters,
                              kernel_size=3,
                              strides=2,
                              use_maxpool=False,
                              postfix=postfix)
            self.conv_layers.append(layer)
            in_channels = n_filters


    def forward(self, x):
        outputs = [x]
        conv = self.avg_pool(x)
        outputs.append(conv)
        prev_conv = conv

        # additional feature map layers
        for convlayer in self.conv_layers:
            conv = convlayer(prev_conv)
            outputs.append(conv)
            prev_conv = conv            

        return outputs

### `resnet_v2`

In [6]:
# backbone
class resnet_v2(nn.Module):
    """
    # Arguments
        input_shape (tensor): Shape of the input image tensor, assumed to be
                              (N, C, H, W) following PyTorch tensor convention
        depth (int): Number of convolutional layers
        num_classes (int): Number of classes
    """
    def __init__(self, input_shape, depth, n_layers=4):
        super(resnet_v2, self).__init__()

        # name = 'ResNet%dv2' % (depth)
        self.n_layers = n_layers

        # copied
        if (depth - 2) % 9 != 0:
            raise ValueError('depth should be 9n+2')

        num_filters_in = 16
        self.resnet1 = resnet_layer(input_shape[1],
                                    num_filters=num_filters_in,
                                    conv_first=True)
        
        self.resnet_blocks = nn.ModuleList()
        self.num_res_blocks = (depth - 2) // 9
        in_channels = num_filters_in
        for stage in range(3):
            initial_in_channels = in_channels
            for res_block in range(self.num_res_blocks):
                activation = 'relu'
                batch_normalization = True
                strides = 1
                if stage == 0:
                    num_filters_out = num_filters_in * 4
                    if res_block == 0:  # first layer and first stage
                        activation = None
                        batch_normalization = False
                else:
                    num_filters_out = num_filters_in * 2
                    if res_block == 0:  # first layer but not first stage
                        strides = 2 # downsample
                
                # bottleneck residual unit
                self.resnet_blocks.append(
                    resnet_layer(in_channels,
                                 num_filters=num_filters_in,
                                 kernel_size=1,
                                 strides=strides,
                                 activation=activation,
                                 batch_normalization=batch_normalization,
                                 conv_first=False))
                in_channels = num_filters_in

                self.resnet_blocks.append(
                    resnet_layer(in_channels,
                                 num_filters=num_filters_in,
                                 conv_first=False))
                in_channels = num_filters_in

                self.resnet_blocks.append(
                    resnet_layer(in_channels,
                                 num_filters=num_filters_out,
                                 kernel_size=1,
                                 conv_first=False))
                in_channels = num_filters_out

                if res_block == 0:
                    # linear projection residual shortcut connection to match
                    # changed dims
                    self.resnet_blocks.append(
                        resnet_layer(initial_in_channels,
                                     num_filters=num_filters_out,
                                     kernel_size=1,
                                     strides=strides,
                                     activation=None,
                                     batch_normalization=False))
                    in_channels = num_filters_out
            
            num_filters_in = num_filters_out    # don't use self.num_filters_in
                                                # since it persists between calls
                    # num_filters_out = initial_num_filters_in * 2 * 2 * 2
                    #                 = initial_num_filters_in * 8
                    #                 = 16 * 8 = 128
        
        self.batchnorm = nn.BatchNorm2d(num_filters_out)
        self.activation = nn.ReLU()

        self.features_pyramid = features_pyramid(num_filters_out,
                                                 n_layers)


    def forward(self, x):
        x = self.resnet1(x)
        # Instantiate the stack of residual units
        i = 0
        for stage in range(3):
            for res_block in range(self.num_res_blocks):
                y = self.resnet_blocks[i](x)
                i += 1
                y = self.resnet_blocks[i](y)
                i += 1
                y = self.resnet_blocks[i](y)
                i += 1
                if res_block == 0:
                    # linear projection residual shortcut connection to match
                    # changed dims
                    x = self.resnet_blocks[i](x)
                    i += 1
                x = x + y

        # v2 has BN-ReLU before Pooling
        x = self.batchnorm(x)
        x = self.activation(x)
        # 1st feature map layer

        # main feature maps (160, 120)
        # succeeding feature maps scaled down by
        # 2, 4, 8
        outputs = self.features_pyramid(x)
        
        return outputs

### `build_resnet`

In [7]:
def build_resnet(input_shape=(480, 640, 3),
                 n_layers=4,
                 n=6):
    depth = n * 9 + 2
    
    return resnet_v2(input_shape,
                     depth=depth,
                     n_layers=n_layers)

### `build_fcn`

In [8]:
# analog of `build_fcn`
class fcn(nn.Module):
    def __init__(self,
                 input_shape,
                 backbone,
                 n_layers=4,
                 n_classes=4):
        super(fcn, self).__init__()

        self.backbone = backbone

        size = (input_shape[-2] // 4, input_shape[-1] // 4)
        feature_size = 8
        scale_factor = 2
        filters = 256
        self.upsamplers = nn.ModuleList()
        self.conv_layers = nn.ModuleList()
        in_channels = 16 * 16
        total_channels = in_channels
        for _ in range(n_layers):
            postfix = "fcn_" + str(feature_size)
            self.conv_layers.append(
                conv_layer(in_channels,
                           filters=filters,
                           use_maxpool=False,
                           postfix=postfix)
            )
            in_channels = 512
            total_channels += filters
            
            postfix = postfix + "_up2d"
            # self.upsamplers.append(
            #     nn.Upsample(scale_factor=scale_factor,
            #                           mode='bilinear')
            # )
            self.upsamplers.append(
                nn.Upsample(size=size,
                            mode='bilinear')
            )
            
            # scale_factor *= 2
            feature_size *= 2
        
        in_channels = total_channels
        filters = 256
        self.tconv_layer1 = tconv_layer(in_channels,
                                        filters=filters,
                                        postfix="up_x2")
        
        in_channels = filters
        self.tconv_layer2 = tconv_layer(in_channels,
                                        filters=filters,
                                        postfix="up_x4")
        
        in_channels = filters
        kernel_size = 1
        padding = kernel_size // 2      # 'same' padding
        self.conv_transpose = nn.ConvTranspose2d(in_channels,
                                                 n_classes,
                                                 kernel_size,
                                                 stride=1,
                                                 padding=padding)
        # 'he_normal' initialization
        nn.init.kaiming_normal_(self.conv_transpose.weight)

        self.logsoftmax = nn.LogSoftmax()
    

    def forward(self, x):
        features = self.backbone(x)

        main_feature = features[0]
        features = features[1:]
        out_features = [main_feature]

        # other half of the features pyramid
        # including upsampling to restore the
        # feature maps to the dimensions
        # equal to 1/4 the image size
        for i, feature in enumerate(features):
            feature = self.conv_layers[i](feature)
            feature = self.upsamplers[i](feature)
            out_features.append(feature)

        # concatenate all upsampled features
        x = torch.cat(out_features, dim=1)          # merge at channel dimension
        # perform 2 additional feature extraction
        # and upsampling
        x = self.tconv_layer1(x)
        x = self.tconv_layer2(x)

        # generate the pixel-wise classifier
        x = self.conv_transpose(x)
        x = self.logsoftmax(x)

        return x

## Dataset

In [0]:
# download dataset
###!mkdir 'drive/My Drive/coe197f/dataset'
###!tar -xf 'drive/My Drive/coe197f/drinks.tar.gz' -C 'drive/My Drive/coe197f/dataset'

# !tar -xf 'drive/My Drive/coe197f/drinks.tar.gz'
# !ls

# !cp -r 'drive/My Drive/coe197f/dataset/drinks' .
# !ls

drinks	drive  sample_data


### Define dataset

In [9]:
class SemanticSegmentationDataset(Dataset):
    def __init__(self, data_dir, gt_fname, cuda=True):
        self.data_dir = data_dir
        gt_path = os.path.join(data_dir, gt_fname)

        self.gt_dict = np.load(gt_path,
                               allow_pickle=True).flat[0]
        self.img_names = list(self.gt_dict.keys())

        self.cuda = cuda

    def __len__(self):
        return len(self.img_names)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = self.img_names[idx]
        img_path = os.path.join(self.data_dir, img_name)
        img = PIL.Image.open(img_path)
        img = transforms.ToTensor()(img)    # C, H, W

        gt = self.gt_dict[img_name].transpose(2,0,1)    # C, H, W
        gt = torch.from_numpy(gt).long()

        if cuda:
            img = img.cuda()
            gt = gt.cuda()

        return img, gt  # torch (cuda) CHW
    
    def get_img_names(self, sorted=True):
        """Returns the list of image files in `self.data_dir`,
            but only in that directory (no recursive search)
        """
        img_exts = {'.jpg', '.jpeg', '.png', '.bmp'}

        img_names = []
        _, _, files = next(os.walk(self.img_names))
        for fname in sorted(files):
            if os.path.splitext(fname)[1].lower() in img_exts:
                img_names.append(fname)
        
        return img_names

### Define parameters

In [10]:
batch_size = 1
shuffle = True
epochs = 100
T_save = 10
T_print = 200
num_classes = 4
cuda = True

pretrained_weight_fname = None
weights_dir = 'weights'
data_dir = 'drinks/'
train_gt_fname = 'segmentation_train.npy'
test_gt_fname = 'segmentation_test.npy'
Path(weights_dir).mkdir(parents=True, exist_ok=True)    # make path if not exist

### Initialize dataloaders

In [11]:
trainset = SemanticSegmentationDataset(data_dir, train_gt_fname, cuda=cuda)
trainloader = DataLoader(trainset,
                         batch_size=batch_size,
                         shuffle=shuffle)
# testset = SemanticSegmentationDataset(data_dir, test_gt_fname, cuda=cuda)
# testloader = DataLoader(testset,
#                         batch_size=batch_size,
#                         shuffle=shuffle)

## Training

### Initialize model

In [12]:
channels, height, width = 3, 480, 640
input_shape = (batch_size, channels, height, width)

if cuda:
    backbone = build_resnet(input_shape=input_shape).cuda()
    model = fcn(input_shape, backbone).cuda()
else:
    backbone = build_resnet(input_shape=input_shape)
    model = fcn(input_shape, backbone)


if pretrained_weight_fname:
    restore_weights(model, weight_dir, pretrained_weight_name)

### Functions needed for train loop

In [13]:
# learning rate scheduler
def lr_scheduler(epoch):
    lr = 1e-3
    if epoch > 80:
        lr *= 5e-2
    elif epoch > 60:
        lr *= 1e-1
    elif epoch > 40:
        lr *= 5e-1
    
    print('Learning rate:', lr)

    return lr

In [14]:
def save_model(model, weights_dir, weight_fname):
    weight_path = os.path.join(weights_dir, weight_fname)
    print('Saving weights to', weight_path)
    torch.save(model.state_dict(), weight_path)

In [15]:
def restore_weights(model, weights_dir, weight_fname):
    weight_path = os.path.join(weights_dir, weight_fname)
    print('Restoring weights from', weight_path)
    model.load_state_dict(torch.load(weight_path))
    model.eval()

### Train loop

In [None]:
# based on https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
criterion = nn.CrossEntropyLoss()                   # categorical crossentropy
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_scheduler)

# train loop
T_start = time.time()
for epoch in range(epochs):
    print()
    running_loss = 0.0
    running_loss_mini = 0.0
    for i, (inputs, targets) in enumerate(trainloader):
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)

        # remove the channel dimension
        targets = torch.argmax(targets, dim=1) # convert to (N, H, W)

        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_loss_mini += loss.item()

        if i % T_print == T_print-1:
            T_end = time.time()
            print('%d-th minibatch\tloss: %f' % (i+1, running_loss/T_print))
            print(T_end-T_start, 'secs elapsed')
            running_loss_mini = 0.0

    # call scheduler every epoch
    scheduler.step()

    # print statistics
    T_end = time.time()
    print('epoch %3d\tloss: %f' % (epoch + 1, running_loss))
    print(T_end-T_start, 'secs elapsed')

    # save
    if True:#epoch % T_save == T_save-1:
        weight_fname = f'resnet_v2-{epoch+1}epoch.pth'
        save_model(model, weights_dir, weight_fname)

print('Done!')

Learning rate: 0.001



  "See the documentation of nn.Upsample for details.".format(mode))


200-th minibatch	loss: 4.581914
85.96934747695923 secs elapsed
400-th minibatch	loss: 8.212995
171.13247966766357 secs elapsed
600-th minibatch	loss: 11.014925
256.33518290519714 secs elapsed
800-th minibatch	loss: 13.149916
341.4132831096649 secs elapsed
1000-th minibatch	loss: 14.704585
426.43683218955994 secs elapsed
Learning rate: 0.001
epoch   1	loss: 2940.917053
426.4369378089905 secs elapsed
Saving weights to weights/resnet_v2-1epoch.pth

200-th minibatch	loss: 1.198146
510.89235186576843 secs elapsed
400-th minibatch	loss: 2.177213
595.6450202465057 secs elapsed
600-th minibatch	loss: 2.979715
679.6575660705566 secs elapsed
800-th minibatch	loss: 3.721677
763.6524167060852 secs elapsed
1000-th minibatch	loss: 4.373766
848.0936834812164 secs elapsed
Learning rate: 0.001
epoch   2	loss: 874.753138
848.0937876701355 secs elapsed
Saving weights to weights/resnet_v2-2epoch.pth

200-th minibatch	loss: 0.616056
932.5380051136017 secs elapsed
400-th minibatch	loss: 1.221732
1016.937416

800-th minibatch	loss: 0.651412
7934.737657546997 secs elapsed
1000-th minibatch	loss: 0.822142
8019.1044244766235 secs elapsed
Learning rate: 0.001
epoch  19	loss: 164.428481
8019.104849815369 secs elapsed
Saving weights to weights/resnet_v2-19epoch.pth

200-th minibatch	loss: 0.146913
8103.510803222656 secs elapsed
400-th minibatch	loss: 0.307431
8187.867605686188 secs elapsed
600-th minibatch	loss: 0.465821
8272.181189775467 secs elapsed
800-th minibatch	loss: 0.621843
8356.51589512825 secs elapsed
1000-th minibatch	loss: 0.785738
8440.878652095795 secs elapsed
Learning rate: 0.001
epoch  20	loss: 157.147622
8440.878759860992 secs elapsed
Saving weights to weights/resnet_v2-20epoch.pth

200-th minibatch	loss: 0.151436
8525.328181505203 secs elapsed
400-th minibatch	loss: 0.298677
8609.737678050995 secs elapsed
600-th minibatch	loss: 0.449831
8694.12981390953 secs elapsed
800-th minibatch	loss: 0.591007
8778.481040716171 secs elapsed
1000-th minibatch	loss: 0.752329
8862.850555181503

200-th minibatch	loss: 0.074677
15696.986911773682 secs elapsed
400-th minibatch	loss: 0.154544
15781.312609434128 secs elapsed
600-th minibatch	loss: 0.232511
15865.730207443237 secs elapsed
800-th minibatch	loss: 0.305579
15950.112293481827 secs elapsed
1000-th minibatch	loss: 0.389460
16034.49278640747 secs elapsed
Learning rate: 0.001
epoch  38	loss: 77.891923
16034.492938756943 secs elapsed
Saving weights to weights/resnet_v2-38epoch.pth

200-th minibatch	loss: 0.072700
16118.942483901978 secs elapsed
400-th minibatch	loss: 0.149137
16203.38045668602 secs elapsed
600-th minibatch	loss: 0.224256
16287.765153169632 secs elapsed
800-th minibatch	loss: 0.302450
16372.22155880928 secs elapsed
1000-th minibatch	loss: 0.376041
16456.679702043533 secs elapsed
Learning rate: 0.001
epoch  39	loss: 75.208233
16456.679806232452 secs elapsed
Saving weights to weights/resnet_v2-39epoch.pth

200-th minibatch	loss: 0.071803
16541.132654190063 secs elapsed
400-th minibatch	loss: 0.145720
16625.448

800-th minibatch	loss: 0.211088
23545.906116485596 secs elapsed
1000-th minibatch	loss: 0.265667
23630.32190132141 secs elapsed
Learning rate: 0.0005
epoch  56	loss: 53.133361
23630.322004318237 secs elapsed
Saving weights to weights/resnet_v2-56epoch.pth

200-th minibatch	loss: 0.051077
23714.76241159439 secs elapsed
400-th minibatch	loss: 0.105874
23799.14624261856 secs elapsed
600-th minibatch	loss: 0.156253
23883.488463163376 secs elapsed
800-th minibatch	loss: 0.208023
23967.85254764557 secs elapsed
1000-th minibatch	loss: 0.261174
24052.23149752617 secs elapsed
Learning rate: 0.0005
epoch  57	loss: 52.234721
24052.23162150383 secs elapsed
Saving weights to weights/resnet_v2-57epoch.pth

200-th minibatch	loss: 0.050435
24136.650026082993 secs elapsed
400-th minibatch	loss: 0.102245
24221.02795982361 secs elapsed
600-th minibatch	loss: 0.152212
24305.286283493042 secs elapsed
800-th minibatch	loss: 0.204559
24389.654420375824 secs elapsed
1000-th minibatch	loss: 0.256772
24474.0657

200-th minibatch	loss: 0.045781
31309.98774266243 secs elapsed
400-th minibatch	loss: 0.090508
31394.32215833664 secs elapsed
600-th minibatch	loss: 0.135126
31478.72291326523 secs elapsed
800-th minibatch	loss: 0.183593
31563.16175341606 secs elapsed
1000-th minibatch	loss: 0.229630
31647.579665660858 secs elapsed
Learning rate: 0.0001
epoch  75	loss: 45.925988
31647.579767227173 secs elapsed
Saving weights to weights/resnet_v2-75epoch.pth

200-th minibatch	loss: 0.045970
31732.019272089005 secs elapsed
400-th minibatch	loss: 0.091682
31816.464767456055 secs elapsed
600-th minibatch	loss: 0.137188
31900.890671491623 secs elapsed
800-th minibatch	loss: 0.185044
31985.282541036606 secs elapsed
1000-th minibatch	loss: 0.228837
32069.682838201523 secs elapsed
Learning rate: 0.0001
epoch  76	loss: 45.767320
32069.682941436768 secs elapsed
Saving weights to weights/resnet_v2-76epoch.pth

200-th minibatch	loss: 0.044655
32154.15345954895 secs elapsed
400-th minibatch	loss: 0.092136
32238.571

In [0]:
epoch = 3
weight_fname = f'resnet_v2-{epoch+1}th-epoch.pth'
save_model(model, weights_dir, weight_fname)

Saving weights to drive/My Drive/coe197f/weights/resnet_v2-4th-epoch.pth


In [0]:
!ls -lht 'drive/My Drive/coe197f/weights'

total 59M
-rw------- 1 root root 59M May 29 09:14 resnet_v2-4th-epoch.pth


## Testing

In [0]:
# N,C,H,W = 1, 3, 480, 640
N,C,H,W = 2,3,128,128
backbone = build_resnet(input_shape=(N,C,H,W))

256


In [0]:
r_in = torch.zeros(N,C,H,W)
b_out = backbone(r_in)
for b in b_out:
    print(b.shape)

------x.shape = torch.Size([2, 3, 128, 128])
torch.Size([2, 256, 32, 32])
torch.Size([2, 256, 16, 16])
torch.Size([2, 512, 8, 8])
torch.Size([2, 512, 4, 4])
torch.Size([2, 512, 2, 2])


In [0]:
model = fcn(r_in.shape, backbone)

In [0]:
out = model(r_in)
print(out.shape)

------x.shape = torch.Size([2, 3, 128, 128])


  "See the documentation of nn.Upsample for details.".format(mode))


torch.Size([2, 4, 128, 128])


