In [1]:
from torchvision import utils
from dataloader import *
from utils import *
import torchvision
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
from torch.autograd import Variable
import torch
import time
from basic_fcn import FCN
from datetime import datetime

In [2]:
import matplotlib.pyplot as plt
import numpy as np

# functions to show an image


def imshow(img):
    img = img / 2 + 0.5     # unnormalize
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()

In [3]:
# Change name to FCN to use this model instead I think

class FCN_bak(torch.nn.Module):

    def __init__(self, n_class):
        super(FCN_bak, self).__init__()
        self.n_class = n_class
        self.conv1   = nn.Conv2d(3, 32, kernel_size=(3,5), stride=(2,4), padding=1, dilation=1)
        self.bnd1    = nn.BatchNorm2d(32)
        self.conv2   = nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1, dilation=1)
        self.bnd2    = nn.BatchNorm2d(64)
        self.conv3   = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1, dilation=1)
        self.bnd3    = nn.BatchNorm2d(128)
        self.conv4   = nn.Conv2d(128,256, kernel_size=3, stride=2, padding=1, dilation=1)
        self.bnd4    = nn.BatchNorm2d(256)
        self.conv5   = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, dilation=1)
        self.bnd5    = nn.BatchNorm2d(512)
        self.relu    = nn.ReLU(inplace=True)
        
        self.deconv1 = nn.ConvTranspose2d(512, 256, kernel_size=3, stride=2, padding=1, dilation=1, output_padding=1)
        self.bn1     = nn.BatchNorm2d(256)
        self.deconv2 = nn.ConvTranspose2d(256, 128, kernel_size=3, stride=2, padding=1, dilation=1, output_padding=1)
        self.bn2     = nn.BatchNorm2d(128)
        self.deconv3 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, dilation=1, output_padding=1)
        self.bn3     = nn.BatchNorm2d(64)
        self.deconv4 = nn.ConvTranspose2d(64, 32, kernel_size=3, stride=2, padding=1, dilation=1, output_padding=1)
        self.bn4     = nn.BatchNorm2d(32)
        self.deconv5  = nn.ConvTranspose2d(32, 3, kernel_size=(3, 5), stride=(2,4), padding=1, dilation=1, output_padding=1)
        self.bn5= nn.BatchNorm2d(3)
        self.classifier = nn.Conv2d(3,n_class, kernel_size=1, stride=1, padding=0, dilation=1)
        
    def forward(self, x):
        pool = nn.MaxPool2d(2, stride=2,return_indices = True)
        unpool = nn.MaxUnpool2d(2, stride=2)
        
        x1, indice1 = pool(self.relu(self.conv1(x)))
        x2, indice2 = pool(self.relu(self.conv2(self.bnd1(x1))))
        x3, indice3 = pool(self.relu(self.conv3(self.bnd2(x2))))
        x4, indice4 = pool(self.relu(self.conv4(self.bnd3(x3))))
        x5, indice5 = pool(self.relu(self.conv5(self.bnd4(x4))))
        
        z1 = self.deconv1(self.bnd5(self.relu(unpool((x5), indice5))))
        z2 = self.deconv2(self.bn1(self.relu(unpool((z1), indice4))))
        z3 = self.deconv3(self.bn2(self.relu(unpool((z2), indice3))))
        z4 = self.deconv4(self.bn3(self.relu(unpool((z3), indice2))))
        z5 = self.deconv5(self.bn4(self.relu(unpool((z4), indice1))))
        
        out_decoder = self.classifier(self.bn5(z5))                  

        return out_decoder  # size=(N, n_class, x.H/1, x.W/1)

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
def print_GPU_stats():
    print("total GPU Mem: ", torch.cuda.get_device_properties(device).total_memory)
    print("total GPU Cached: ", torch.cuda.memory_cached(device))
    print("total GPU Allocated: ", torch.cuda.memory_allocated(device))
    print("Available GB: ", (torch.cuda.get_device_properties(device).total_memory - torch.cuda.memory_allocated(device))/(10**9))
print_GPU_stats()

cuda:0
total GPU Mem:  11721506816
total GPU Cached:  0
total GPU Allocated:  0
Available GB:  11.721506816


In [5]:
batch_size = 5
train_dataset = CityScapesDataset(csv_file='train.csv')
val_dataset = CityScapesDataset(csv_file='val.csv')
test_dataset = CityScapesDataset(csv_file='test.csv')
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          num_workers=6,
                          shuffle=True)
val_loader = DataLoader(dataset=val_dataset,
                          batch_size=batch_size,
                          num_workers=6,
                          shuffle=True)
test_loader = DataLoader(dataset=test_dataset,
                          batch_size=batch_size,
                          num_workers=6,
                          shuffle=True)

In [6]:
def init_weights(m):
    if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
        torch.nn.init.xavier_uniform_(m.weight.data)
#         torch.nn.init.xavier_uniform(m.bias.data)
        m.bias.data.zero_()
epochs     = 100
start_epoch = 9
#criterion = torch.nn.MSELoss()
criterion = torch.nn.CrossEntropyLoss()
fcn_model = FCN_bak(n_class=34)
# fcn_model.apply(init_weights)
fcn_model.load_state_dict(torch.load('best_model_02_10_22_04.pt'))
optimizer = optim.Adam(fcn_model.parameters(), lr=5e-3)

In [7]:
dt = datetime.now().strftime("%m_%d_%H_%M")
output_fn = "model_output_" + dt + ".txt"
best_model_fn = "best_model_" + dt + ".pt"

def print_info(out_str):
    f = open(output_fn,"a")
    print(out_str)
    f.write(out_str)
    f.close()

print_info("Started: %s\nFrom a previously trained model which left off on start of epoch 9.\n" % datetime.now())

Started: 2020-02-11 03:50:05.235928
From a previously trained model which left off on start of epoch 9.



In [None]:
use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu:
    fcn_model = fcn_model.to(device)
    
best_loss = float('inf')
prev_loss = float('inf')
loss_inc_cnt = 0
stop_early = False

def train():
    softmax = nn.Softmax(dim=1)
    print("Starting Training")

    for epoch in range(start_epoch, epochs):
        
        ts = time.time()
        for iter, (X, tar, Y) in enumerate(train_loader):
            optimizer.zero_grad()

            if use_gpu:
                inputs = X.to(device)
                labels_cat = Y.to(device)
            else:
                inputs, labels_cat, labels_enc = X, Y, tar

            outputs = softmax(fcn_model(inputs))
            loss = criterion(outputs, labels_cat)
            loss.backward()
            optimizer.step()

            if iter % 10 == 0:
                print_info("epoch{}, iter{}, loss: {} \n".format(epoch, iter, loss.item()))
                
        
        print_info("Finish epoch {}, time elapsed {} \n".format(epoch, time.time() - ts))
    
        loss, acc, IoU = evaluate(train_loader)

        print_info("Training Check:\tLoss: %f\tAccuracy: %f\tIoU: %f \n" % (loss, acc * 100, IoU))
        
        val(epoch)
        if stop_early: return
  
#         fcn_model.train()

def evaluate(data_loader, validation=False, verbose=False):

    global best_loss
    global prev_loss
    global loss_inc_cnt
    global stop_early
    
    with torch.no_grad():
        losses = []
        accs = []
        ious = []
        softmax = nn.Softmax(dim=1)
        ts = time.time()
        print("Starting Evaluation")
        
        for iter, (X, tar, Y) in enumerate(data_loader):

            if use_gpu:
                inputs = X.to(device)
                labels_cat = Y.to(device)
            else:
                inputs, labels_cat, labels_enc = X, Y, tar

            outputs = fcn_model(inputs)
            outputs = softmax(outputs)

            output_labels = outputs.argmax(dim=1)

            losses.append(criterion(outputs, labels_cat).item())

            accs.append(pixel_acc(output_labels, labels_cat))

            ious.append(np.nanmean(iou(output_labels, labels_cat)))

            loss = np.mean(losses)
            acc = np.mean(accs)
            IoU = np.mean(ious)

            if verbose: print("Batch %d:\tLoss: %f\tAccuracy: %f\tIoU: %f" % (iter, loss, acc * 100, IoU))
            

        print("Finished evaluation. Time elapsed %f" % (time.time() - ts))

        # This probably should not be a straight average, but just doing this for now
        loss = np.mean(losses)
        acc = np.mean(accs)
        IoU = np.mean(ious)
        
        if validation:
            if best_loss > loss:
                best_loss = loss
                print_info("Best Loss: " + str(best_loss) + "\n")
                torch.save(fcn_model.state_dict(), best_model_fn)
            loss_inc_cnt = loss_inc_cnt + 1 if prev_loss < loss else 0
            if loss_inc_cnt > 3: stop_early = True
        
        return loss, acc, IoU

def val(epoch):
    # fcn_model.eval()
    # Complete this function - Calculate loss, accuracy and IoU for every epoch
    # Make sure to include a softmax after the output from your model
    loss, acc, IoU = evaluate(val_loader, validation=True)
    print_info("Validation Results: Loss: %f\tAccuracy: %f\tIoU: %f \n" % (loss, acc * 100, IoU))
    if stop_early: print_info("Epoch %d:\tStopping Early" % (epoch))
    
def test():
    print(' ')
    # Complete this function - Calculate accuracy and IoU 
    # Make sure to include a softmax after the output from your model
    loss, acc, IoU = evaluate(test_loader)
    print_info("Test Results:\tLoss: %f\tAccuracy: %f\tIoU: %f \n" % (loss, acc * 100, IoU))
    
if __name__ == "__main__":
#     val(0)  # show the accuracy before training
#     print_info("---------Above is accuracy before training.---------\n")
    train()
    test()

Starting Training
epoch9, iter0, loss: 2.9501595497131348 

epoch9, iter10, loss: 2.941798686981201 

epoch9, iter20, loss: 2.965298652648926 

epoch9, iter30, loss: 3.005303382873535 

epoch9, iter40, loss: 3.0456883907318115 

epoch9, iter50, loss: 2.896493673324585 

epoch9, iter60, loss: 2.9909722805023193 

epoch9, iter70, loss: 2.9460575580596924 

epoch9, iter80, loss: 3.0054445266723633 

epoch9, iter90, loss: 3.000957727432251 

epoch9, iter100, loss: 2.9923863410949707 

epoch9, iter110, loss: 3.0024466514587402 

epoch9, iter120, loss: 3.05564546585083 

epoch9, iter130, loss: 3.0212368965148926 

epoch9, iter140, loss: 2.936124801635742 

epoch9, iter150, loss: 2.988154172897339 

epoch9, iter160, loss: 2.955958843231201 

epoch9, iter170, loss: 2.9196255207061768 

epoch9, iter180, loss: 2.9580237865448 

epoch9, iter190, loss: 3.0395750999450684 

epoch9, iter200, loss: 3.017932891845703 

epoch9, iter210, loss: 3.0255093574523926 

epoch9, iter220, loss: 2.97843766212463

epoch11, iter500, loss: 2.9576776027679443 

epoch11, iter510, loss: 2.9894256591796875 

epoch11, iter520, loss: 2.933023452758789 

epoch11, iter530, loss: 2.8976430892944336 

epoch11, iter540, loss: 2.9031898975372314 

epoch11, iter550, loss: 2.935774564743042 

epoch11, iter560, loss: 2.885760545730591 

epoch11, iter570, loss: 2.9440040588378906 

epoch11, iter580, loss: 2.9593465328216553 

epoch11, iter590, loss: 3.0178024768829346 

Finish epoch 11, time elapsed 1366.168613910675 

Starting Evaluation
Finished evaluation. Time elapsed 1436.008109
Training Check:	Loss: 2.965637	Accuracy: 61.045176	IoU: 0.069857 

Starting Evaluation
Finished evaluation. Time elapsed 247.090788
Best Loss: 2.971919884681702

Validation Results: Loss: 2.971920	Accuracy: 60.417435	IoU: 0.067947 

epoch12, iter0, loss: 3.0201528072357178 

epoch12, iter10, loss: 2.9419302940368652 

epoch12, iter20, loss: 3.0145716667175293 

epoch12, iter30, loss: 2.897193670272827 

epoch12, iter40, loss: 2.95154

epoch14, iter330, loss: 2.9778034687042236 

epoch14, iter340, loss: 2.9488208293914795 

epoch14, iter350, loss: 3.063551425933838 

epoch14, iter360, loss: 2.9400269985198975 

epoch14, iter370, loss: 2.9524025917053223 

epoch14, iter380, loss: 2.9253649711608887 

epoch14, iter390, loss: 3.0345895290374756 

epoch14, iter400, loss: 2.9916625022888184 

epoch14, iter410, loss: 2.9111666679382324 

epoch14, iter420, loss: 2.9593634605407715 

epoch14, iter430, loss: 2.930403470993042 

epoch14, iter440, loss: 2.944467782974243 

epoch14, iter450, loss: 2.9555258750915527 

epoch14, iter460, loss: 3.0133092403411865 

epoch14, iter470, loss: 2.996133804321289 

epoch14, iter480, loss: 2.936609983444214 

epoch14, iter490, loss: 3.074889659881592 

epoch14, iter500, loss: 3.0003089904785156 

epoch14, iter510, loss: 3.00288724899292 

epoch14, iter520, loss: 2.9109764099121094 

epoch14, iter530, loss: 2.987257242202759 

epoch14, iter540, loss: 2.9396862983703613 

epoch14, iter550, l

epoch17, iter170, loss: 2.9125142097473145 

epoch17, iter180, loss: 2.960763454437256 

epoch17, iter190, loss: 2.907597780227661 

epoch17, iter200, loss: 2.986754894256592 

epoch17, iter210, loss: 2.93454647064209 

epoch17, iter220, loss: 2.9923348426818848 

epoch17, iter230, loss: 2.969949245452881 

epoch17, iter240, loss: 2.997894048690796 

epoch17, iter250, loss: 2.991730213165283 

epoch17, iter260, loss: 3.026193141937256 

epoch17, iter270, loss: 2.9891841411590576 

epoch17, iter280, loss: 3.006556272506714 

epoch17, iter290, loss: 2.99703049659729 

epoch17, iter300, loss: 2.950599431991577 

epoch17, iter310, loss: 2.933583974838257 

epoch17, iter320, loss: 2.9980721473693848 

epoch17, iter330, loss: 3.0574707984924316 

epoch17, iter340, loss: 2.9793028831481934 

epoch17, iter350, loss: 3.0018606185913086 

epoch17, iter360, loss: 2.9870638847351074 

epoch17, iter370, loss: 2.999814748764038 

epoch17, iter430, loss: 2.933197259902954 

epoch17, iter440, loss: 2.