Loss exploding after few steps #5

Open
learnermaxRL opened this issue Jul 17, 2019 · 22 comments

@learnermaxRL

```python
from functools import reduce
from operator import mul
from typing import Tuple
import numpy as np
import torch
import torchvision
import torch.nn as nn
from models.loss_functions.lsaloss import LSALoss
from models.base import BaseModule
from models.blocks_2d import DownsampleBlock
from models.blocks_2d import ResidualBlock
from models.blocks_2d import UpsampleBlock
from models.estimator_1D import Estimator1D
import cv2


class Encoder(BaseModule):
    """
    CIFAR10 model encoder.
    """
    def __init__(self, input_shape, code_length):
        # type: (Tuple[int, int, int], int) -> None
        """
        Class constructor.

        :param input_shape: the shape of CIFAR10 samples.
        :param code_length: the dimensionality of latent vectors.
        """
        super(Encoder, self).__init__()

        self.input_shape = input_shape
        self.code_length = code_length

        c, h, w = input_shape

        print(c, h, w)

        activation_fn = nn.LeakyReLU()

        # Convolutional network
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels=c, out_channels=32, kernel_size=3, bias=False),
            activation_fn,
            ResidualBlock(channel_in=32, channel_out=32, activation_fn=activation_fn),
            DownsampleBlock(channel_in=32, channel_out=64, activation_fn=activation_fn),
            DownsampleBlock(channel_in=64, channel_out=128, activation_fn=activation_fn),
            DownsampleBlock(channel_in=128, channel_out=256, activation_fn=activation_fn),
        )
        self.deepest_shape = (256, h // 8, w // 8)

        # FC network
        self.fc = nn.Sequential(
            nn.Linear(in_features=reduce(mul, self.deepest_shape), out_features=256),
            nn.BatchNorm1d(num_features=256),
            activation_fn,
            nn.Linear(in_features=256, out_features=code_length),
            nn.Sigmoid()
        )

    def forward(self, x):
        # type: (torch.Tensor) -> torch.Tensor
        """
        Forward propagation.

        :param x: the input batch of images.
        :return: the batch of latent vectors.
        """
        h = x
        print(type(h))
        h = self.conv(h)
        h = h.view(len(h), -1)
        o = self.fc(h)

        return o


class Decoder(BaseModule):
    """
    CIFAR10 model decoder.
    """
    def __init__(self, code_length, deepest_shape, output_shape):
        # type: (int, Tuple[int, int, int], Tuple[int, int, int]) -> None
        """
        Class constructor.

        :param code_length: the dimensionality of latent vectors.
        :param deepest_shape: the dimensionality of the encoder's deepest convolutional map.
        :param output_shape: the shape of CIFAR10 samples.
        """
        super(Decoder, self).__init__()

        self.code_length = code_length
        self.deepest_shape = deepest_shape
        self.output_shape = output_shape

        print(self.output_shape, "--")

        activation_fn = nn.LeakyReLU()

        # FC network
        self.fc = nn.Sequential(
            nn.Linear(in_features=code_length, out_features=256),
            nn.BatchNorm1d(num_features=256),
            activation_fn,
            nn.Linear(in_features=256, out_features=reduce(mul, deepest_shape)),
            nn.BatchNorm1d(num_features=reduce(mul, deepest_shape)),
            activation_fn
        )

        # Convolutional network
        self.conv = nn.Sequential(
            UpsampleBlock(channel_in=256, channel_out=128, activation_fn=activation_fn),
            UpsampleBlock(channel_in=128, channel_out=64, activation_fn=activation_fn),
            UpsampleBlock(channel_in=64, channel_out=32, activation_fn=activation_fn),
            ResidualBlock(channel_in=32, channel_out=32, activation_fn=activation_fn),
            nn.Conv2d(in_channels=32, out_channels=3, kernel_size=1, bias=False)
        )

    def forward(self, x):
        # type: (torch.Tensor) -> torch.Tensor
        """
        Forward propagation.

        :param x: the batch of latent vectors.
        :return: the batch of reconstructions.
        """
        h = x
        h = self.fc(h)
        h = h.view(len(h), *self.deepest_shape)
        h = self.conv(h)
        o = h

        return o


class LSACIFAR10(BaseModule):
    """
    LSA model for CIFAR10 one-class classification.
    """
    def __init__(self, input_shape, code_length, cpd_channels):
        # type: (Tuple[int, int, int], int, int) -> None
        """
        Class constructor.

        :param input_shape: the shape of CIFAR10 samples.
        :param code_length: the dimensionality of latent vectors.
        :param cpd_channels: number of bins in which the multinomial works.
        """
        super(LSACIFAR10, self).__init__()

        self.input_shape = input_shape
        self.code_length = code_length

        # Build encoder
        self.encoder = Encoder(
            input_shape=input_shape,
            code_length=code_length
        )

        # Build decoder
        self.decoder = Decoder(
            code_length=code_length,
            deepest_shape=self.encoder.deepest_shape,
            output_shape=input_shape
        )

        # Build estimator
        self.estimator = Estimator1D(
            code_length=code_length,
            fm_list=[32, 32, 32, 32],
            cpd_channels=cpd_channels
        )

    def forward(self, x):
        # type: (torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
        """
        Forward propagation.

        :param x: the input batch of images.
        :return: a tuple of torch.Tensors holding reconstructions, latent vectors and CPD estimates.
        """
        h = x

        # Produce representations
        z = self.encoder(h)

        # Estimate CPDs with autoregression
        z_dist = self.estimator(z)

        # Reconstruct x
        x_r = self.decoder(z)
        x_r = x_r.view(-1, *self.input_shape)

        return x_r, z, z_dist


def load_dataset(data_path="/home/jbmai/Downloads/Defect Images-20190705T133320Z-001"):
    # Note: this transform is instantiated but never added to the Compose
    # pipeline below, so it has no effect.
    torchvision.transforms.Grayscale(num_output_channels=1)

    trainTransform = torchvision.transforms.Compose([
        torchvision.transforms.Resize(size=(128, 128), interpolation=2),
        torchvision.transforms.ToTensor(),
    ])

    train_dataset = torchvision.datasets.ImageFolder(
        root=data_path,
        transform=trainTransform
    )

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=64,
        num_workers=0,
        shuffle=True
    )
    return train_loader


net = LSACIFAR10(input_shape=[3, 128, 128], code_length=32, cpd_channels=100)
lossFunction = LSALoss(cpd_channels=100)
optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# Resume from checkpoints if they exist
try:
    checkpoint = torch.load("savedWeights/enc.pth")
    net.encoder.load_state_dict(checkpoint)

    checkpoint = torch.load("savedWeights/est.pth")
    net.estimator.load_state_dict(checkpoint)

    checkpoint = torch.load("savedWeights/dec.pth")
    net.decoder.load_state_dict(checkpoint)
except Exception as e:
    print(e)

for epoch in range(1000):  # loop over the dataset multiple times

    running_loss = 0.0
    # note: rebuilding the DataLoader every epoch is unnecessary
    d = load_dataset()
    for i, (data, l) in enumerate(d):
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        x_r, z, z_dist = net.forward(data)
        loss = lossFunction(data, x_r, z, z_dist)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 5 == 0:  # print every 5 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 5))
            running_loss = 0.0

    if (epoch % 5) == 0:
        # Periodically dump reconstructions for visual inspection (disabled):
        # net.encoder.eval(); net.estimator.eval(); net.decoder.eval()
        # z = net.encoder(data)
        # z_dist = net.estimator(z)
        # x_r = net.decoder(z).permute(0, 2, 3, 1).detach().numpy()
        # for i in range(x_r.shape[0]):
        #     cv2.imwrite("constructedImages/outDec{}_{}.jpg".format(epoch, i), x_r[i, :, :, :] * 255)
        # net.encoder.train(); net.estimator.train(); net.decoder.train()
        torch.save(net.encoder.state_dict(), "savedWeights/enc.pth")
        torch.save(net.estimator.state_dict(), "savedWeights/est.pth")
        torch.save(net.decoder.state_dict(), "savedWeights/dec.pth")

print('Finished Training')
```

Output:

```
<class 'torch.Tensor'>
[1, 1] loss: 727109273.600
<class 'torch.Tensor'>
[2, 1] loss: 2495627954514337382531072.000
```

Hi, can you help me rectify this issue?

@learnermaxRL
Author

Managed to correct the above by changing hyperparameters, but after a few steps I get:

```
[1, 71] loss: 3293.943
[1, 76] loss: 3096.135
[1, 81] loss: 3161.412
[1, 86] loss: 3176.714
[1, 91] loss: 2969.099
[1, 96] loss: 3247.497
[1, 101] loss: 3068.246
[1, 106] loss: 2769.233
[1, 111] loss: 2766.316
[1, 116] loss: 2718.537
[1, 121] loss: 3055.004
[1, 126] loss: 2576.473
[1, 131] loss: 2947.091
[1, 136] loss: 2869.674
[1, 141] loss: 2717.064
Traceback (most recent call last):
  File "modelTrainer.py", line 261, in <module>
    x_r,z,z_dist = net.forward(data)
  File "modelTrainer.py", line 188, in forward
    z = self.encoder(h)
  File "/home/jbmai/DefectsDetection/NoveltyDetection/models/base.py", line 33, in __call__
    return super(BaseModule, self).__call__(*args, **kwargs)
  File "/home/jbmai/anaconda3/envs/torchenv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "modelTrainer.py", line 73, in forward
    o = self.fc(h)
  File "/home/jbmai/anaconda3/envs/torchenv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/jbmai/anaconda3/envs/torchenv/lib/python3.7/site-packages/torch/nn/modules/container.py", line 92, in forward
    input = module(input)
  File "/home/jbmai/anaconda3/envs/torchenv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/jbmai/anaconda3/envs/torchenv/lib/python3.7/site-packages/torch/nn/modules/batchnorm.py", line 83, in forward
    exponential_average_factor, self.eps)
  File "/home/jbmai/anaconda3/envs/torchenv/lib/python3.7/site-packages/torch/nn/functional.py", line 1693, in batch_norm
    raise ValueError('Expected more than 1 value per channel when training, got input size {}'.format(size))
ValueError: Expected more than 1 value per channel when training, got input size torch.Size([1, 256])
```

@DavideA
Contributor

DavideA commented Jul 18, 2019

Hi @learnermaxRL

I would advise using Adam for optimization.

As for the latter error you reported, it is likely due to a singleton batch (i.e., a batch with only one sample), which BatchNorm cannot normalize in training mode. It may come from the DataLoader; try setting the drop_last flag to True.

D
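For reference, the two suggestions applied to the training script above look roughly like this (a sketch; the Adam learning rate is an assumption, not a value from this thread):

```python
# Swap SGD for Adam (lr=1e-4 is an assumed starting point)
optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)

# Drop the final incomplete batch so BatchNorm1d never sees a single-sample batch
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=64,
    num_workers=0,
    shuffle=True,
    drop_last=True
)
```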

@learnermaxRL
Author

learnermaxRL commented Jul 18, 2019

Thanks, that corrected it. However, I can see that the reconstructed images have negative values in the tensor; is that desirable?

A sample slice of x_r * 255:

```
[ 19.986425 33.786083 109.08704 ]]

[[ 49.809772 -32.651962 -1.5576267]
 [ 53.66301 -72.02914 48.711018 ]
 [ 39.252117 -81.27754 75.4854 ]
 ...
 [ 21.410696 -71.10042 68.18309 ]
 [ -8.615957 -179.66095 8.810505 ]
 [ 44.986786 29.80011 93.024506 ]]

[[ -68.59759 64.74513 51.421898 ]
 [ -18.552599 29.491028 69.56346 ]
 [ -49.379646 45.368095 29.86158 ]
 ...
 [ -84.07668 41.966274 100.4433 ]
 [ 1.8273218 58.350666 60.632793 ]
```
If so, how do I get back an RGB image from them?

@DavideA
Contributor

DavideA commented Jul 18, 2019

Negative values are not undesirable per se, as long as it is a conscious choice.
What is the range of your input images?

D

@learnermaxRL
Author

The input images are RGB images in the standard 0-255 range, fed through the PyTorch DataLoader:

```python
def load_dataset(data_path="/home/ji/Downloads/aug/"):
    # instantiated but never added to the Compose pipeline, so it has no effect
    torchvision.transforms.Grayscale(num_output_channels=1)

    trainTransform = torchvision.transforms.Compose([
        torchvision.transforms.Resize(size=(128, 128), interpolation=2),
        torchvision.transforms.ToTensor(),
    ])

    train_dataset = torchvision.datasets.ImageFolder(
        root=data_path,
        transform=trainTransform
    )

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=16,
        num_workers=0,
        shuffle=True,
        drop_last=True
        # pin_memory=True
    )
    return train_loader
```

@DavideA
Contributor

DavideA commented Jul 18, 2019

I would advise standardizing input images.
E.g., try making each channel zero-mean and unit std.

D
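Concretely, one way to do this with torchvision is a per-channel Normalize after ToTensor (a sketch; the mean/std values shown are the commonly cited CIFAR-10 statistics, and should ideally be replaced with values computed from your own training set):

```python
trainTransform = torchvision.transforms.Compose([
    torchvision.transforms.Resize(size=(128, 128), interpolation=2),
    torchvision.transforms.ToTensor(),
    # Per-channel standardization: output = (input - mean) / std
    torchvision.transforms.Normalize(mean=(0.4914, 0.4822, 0.4465),
                                     std=(0.2470, 0.2435, 0.2616)),
])
```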

@learnermaxRL
Author

That makes sense. However, standardization wouldn't guarantee a non-negative output, right?

@learnermaxRL
Author

Also, I checked: my input data is already between 0 and 1.

PyTorch's default image backend is Pillow, and when you use the ToTensor() transform, PyTorch automatically converts all images into [0, 1].

So shouldn't the reconstruction output be standardized before calculating the reconstruction loss, or should I perhaps use another activation function?
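A quick way to verify the claim about ToTensor (a self-contained check with a synthetic image; the array shape is arbitrary):

```python
import numpy as np
from PIL import Image
import torchvision.transforms as T

# ToTensor maps a uint8 PIL image (0-255) to a float tensor in [0, 1]
img = Image.fromarray(np.random.randint(0, 256, (8, 8, 3), dtype=np.uint8))
t = T.ToTensor()(img)
print(t.min().item(), t.max().item())  # both fall within [0.0, 1.0]
```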

@DavideA
Contributor

DavideA commented Jul 18, 2019

I would advise using a linear activation function for the reconstruction and providing the ground-truth image to the loss in the same range as the input.

@learnermaxRL
Author

Are you talking about the x_r from the decoder? Should I use a sigmoid on top of the decoder's last layer? I am trying to train the model to learn the reconstruction of the image itself, so the ground truth will be the same image:

```python
x_r, z, z_dist = net.forward(data)
loss = lossFunction.forward(data, x_r, z, z_dist)
loss.backward()
optimizer.step()
```

Is there anything else I need to do?

@learnermaxRL
Author

```python
net = LSACIFAR10(input_shape=[3, 128, 128], code_length=64, cpd_channels=100)
lossFunction = LSALoss(cpd_channels=100)
```

@DavideA
Contributor

DavideA commented Jul 18, 2019

The sigmoid is not mandatory. I would advise not to use it.

The rest of the code seems fine. Would a [0, 1] input deliver those reconstructions?

D
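In code terms, that means the decoder head stays a bare convolution with no activation after it, as in the decoder pasted at the top of this thread (shown again for reference; nothing here is new relative to that code):

```python
# Decoder head: no Sigmoid after the final 1x1 convolution, so the
# reconstruction is left in a linear range and compared to [0, 1] inputs.
self.conv = nn.Sequential(
    UpsampleBlock(channel_in=256, channel_out=128, activation_fn=activation_fn),
    UpsampleBlock(channel_in=128, channel_out=64, activation_fn=activation_fn),
    UpsampleBlock(channel_in=64, channel_out=32, activation_fn=activation_fn),
    ResidualBlock(channel_in=32, channel_out=32, activation_fn=activation_fn),
    nn.Conv2d(in_channels=32, out_channels=3, kernel_size=1, bias=False)  # linear output
)
```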

@learnermaxRL
Author

Sorry, I didn't understand that part. My input is already in [0, 1]; it's the reconstruction that has negative values.

@DavideA
Contributor

DavideA commented Jul 18, 2019

Does the reconstruction loss go down?

@learnermaxRL
Author

How did you do the reconstruction of the image? For instance, what was your input range for the CIFAR images, what was the decoder's activation function, and what was the range of the reconstructed output?

@learnermaxRL
Author

The loss stays high (~2500). Since I am using a sigmoid, learning is pretty slow; I suspect the gradients are very small because the pre-sigmoid outputs are large. After 100 epochs on a dataset of size 1400 with batch size 64, the loss only decreased by approximately 50 units.

Can you guide me as to what you have done?

@DavideA
Contributor

DavideA commented Jul 18, 2019

  1. range of CIFAR-10 images: [0-1]
  2. decoder activation function: None
  3. range of the reconstruction: approximately [0-1]

@learnermaxRL
Author

I see, but how come the values are negative in my case? I mean, the weights aren't negative, my input is in [0, 1], and I applied no nonlinearity on top of your code. What is going wrong here?

@DavideA
Contributor

DavideA commented Jul 18, 2019

I saw you are using 128x128 images. The number of downsampling stages in the model is tuned for 32x32 images. You might then have a huge linear layer before the bottleneck of the autoencoder, with many parameters slowing down learning.

As for the negative values, it is really weird. I would try optimizing the reconstruction loss only (a plain autoencoder) and see if the problem fades.
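The size of that layer follows directly from the encoder definition above: with three downsampling stages, a 128x128 input yields deepest_shape = (256, 16, 16), so the first fully connected layer alone holds roughly 16.8M weights:

```python
from functools import reduce
from operator import mul

deepest_shape = (256, 128 // 8, 128 // 8)  # (256, 16, 16) for 128x128 inputs
fc_in = reduce(mul, deepest_shape)         # 65536 flattened features
print(fc_in, fc_in * 256)                  # 65536 -> 16,777,216 weights in nn.Linear(65536, 256)
```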

@learnermaxRL
Author

Thanks, I'll let you know the progress; let me lighten up the model a bit. :)

@DavideA
Contributor

DavideA commented Jul 18, 2019

Keep in mind that, if I am right, the best way to lighten up the model would be to add downsample and upsample blocks.
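A sketch of what that could look like for 128x128 inputs (an assumption about how to adapt the architecture, not code from the repository): one extra DownsampleBlock in the encoder, a matching UpsampleBlock in the decoder, and deepest_shape updated accordingly:

```python
# Encoder: one extra downsampling stage, so 128x128 -> 8x8 at the bottleneck
self.conv = nn.Sequential(
    nn.Conv2d(in_channels=c, out_channels=32, kernel_size=3, bias=False),
    activation_fn,
    ResidualBlock(channel_in=32, channel_out=32, activation_fn=activation_fn),
    DownsampleBlock(channel_in=32, channel_out=64, activation_fn=activation_fn),
    DownsampleBlock(channel_in=64, channel_out=128, activation_fn=activation_fn),
    DownsampleBlock(channel_in=128, channel_out=256, activation_fn=activation_fn),
    DownsampleBlock(channel_in=256, channel_out=256, activation_fn=activation_fn),  # extra stage
)
self.deepest_shape = (256, h // 16, w // 16)  # (256, 8, 8): shrinks the first Linear by 4x

# Decoder: a matching extra upsampling stage at the front
self.conv = nn.Sequential(
    UpsampleBlock(channel_in=256, channel_out=256, activation_fn=activation_fn),  # extra stage
    UpsampleBlock(channel_in=256, channel_out=128, activation_fn=activation_fn),
    UpsampleBlock(channel_in=128, channel_out=64, activation_fn=activation_fn),
    UpsampleBlock(channel_in=64, channel_out=32, activation_fn=activation_fn),
    ResidualBlock(channel_in=32, channel_out=32, activation_fn=activation_fn),
    nn.Conv2d(in_channels=32, out_channels=3, kernel_size=1, bias=False)
)
```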

@learnermaxRL
Author

learnermaxRL commented Jul 19, 2019

Yeah, sure, will do. But how do I deal with reconstructions that have negative values? Using a sigmoid is the last option I would prefer. Any help?

Did you encounter negative values in your reconstructions? If so, how did you deal with them (apart from using an activation)? Is standardizing x_r at inference/validation time the right approach?
