Loss exploding after few steps #5

Open
learnermaxRL opened this issue Jul 17, 2019 · 22 comments

@learnermaxRL

```python
from functools import reduce
from operator import mul
from typing import Tuple
import numpy as np
import torch
import torchvision
import torch.nn as nn
from models.loss_functions.lsaloss import LSALoss
from models.base import BaseModule
from models.blocks_2d import DownsampleBlock
from models.blocks_2d import ResidualBlock
from models.blocks_2d import UpsampleBlock
from models.estimator_1D import Estimator1D
import cv2


class Encoder(BaseModule):
    """
    CIFAR10 model encoder.
    """
    def __init__(self, input_shape, code_length):
        # type: (Tuple[int, int, int], int) -> None
        """
        Class constructor.

        :param input_shape: the shape of CIFAR10 samples.
        :param code_length: the dimensionality of latent vectors.
        """
        super(Encoder, self).__init__()

        self.input_shape = input_shape
        self.code_length = code_length

        c, h, w = input_shape

        print(c, h, w)

        activation_fn = nn.LeakyReLU()

        # Convolutional network
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels=c, out_channels=32, kernel_size=3, bias=False),
            activation_fn,
            ResidualBlock(channel_in=32, channel_out=32, activation_fn=activation_fn),
            DownsampleBlock(channel_in=32, channel_out=64, activation_fn=activation_fn),
            DownsampleBlock(channel_in=64, channel_out=128, activation_fn=activation_fn),
            DownsampleBlock(channel_in=128, channel_out=256, activation_fn=activation_fn),
        )
        self.deepest_shape = (256, h // 8, w // 8)

        # FC network
        self.fc = nn.Sequential(
            nn.Linear(in_features=reduce(mul, self.deepest_shape), out_features=256),
            nn.BatchNorm1d(num_features=256),
            activation_fn,
            nn.Linear(in_features=256, out_features=code_length),
            nn.Sigmoid()
        )

    def forward(self, x):
        # type: (torch.Tensor) -> torch.Tensor
        """
        Forward propagation.

        :param x: the input batch of images.
        :return: the batch of latent vectors.
        """
        h = x
        print(type(h))
        h = self.conv(h)
        h = h.view(len(h), -1)
        o = self.fc(h)

        return o


class Decoder(BaseModule):
    """
    CIFAR10 model decoder.
    """
    def __init__(self, code_length, deepest_shape, output_shape):
        # type: (int, Tuple[int, int, int], Tuple[int, int, int]) -> None
        """
        Class constructor.

        :param code_length: the dimensionality of latent vectors.
        :param deepest_shape: the dimensionality of the encoder's deepest convolutional map.
        :param output_shape: the shape of CIFAR10 samples.
        """
        super(Decoder, self).__init__()

        self.code_length = code_length
        self.deepest_shape = deepest_shape
        self.output_shape = output_shape

        print(self.output_shape, "--")

        activation_fn = nn.LeakyReLU()

        # FC network
        self.fc = nn.Sequential(
            nn.Linear(in_features=code_length, out_features=256),
            nn.BatchNorm1d(num_features=256),
            activation_fn,
            nn.Linear(in_features=256, out_features=reduce(mul, deepest_shape)),
            nn.BatchNorm1d(num_features=reduce(mul, deepest_shape)),
            activation_fn
        )

        # Convolutional network
        self.conv = nn.Sequential(
            UpsampleBlock(channel_in=256, channel_out=128, activation_fn=activation_fn),
            UpsampleBlock(channel_in=128, channel_out=64, activation_fn=activation_fn),
            UpsampleBlock(channel_in=64, channel_out=32, activation_fn=activation_fn),
            ResidualBlock(channel_in=32, channel_out=32, activation_fn=activation_fn),
            nn.Conv2d(in_channels=32, out_channels=3, kernel_size=1, bias=False)
        )

    def forward(self, x):
        # type: (torch.Tensor) -> torch.Tensor
        """
        Forward propagation.

        :param x: the batch of latent vectors.
        :return: the batch of reconstructions.
        """
        h = x
        h = self.fc(h)
        h = h.view(len(h), *self.deepest_shape)
        h = self.conv(h)
        o = h

        return o


class LSACIFAR10(BaseModule):
    """
    LSA model for CIFAR10 one-class classification.
    """
    def __init__(self, input_shape, code_length, cpd_channels):
        # type: (Tuple[int, int, int], int, int) -> None
        """
        Class constructor.

        :param input_shape: the shape of CIFAR10 samples.
        :param code_length: the dimensionality of latent vectors.
        :param cpd_channels: number of bins in which the multinomial works.
        """
        super(LSACIFAR10, self).__init__()

        self.input_shape = input_shape
        self.code_length = code_length

        # Build encoder
        self.encoder = Encoder(
            input_shape=input_shape,
            code_length=code_length
        )

        # Build decoder
        self.decoder = Decoder(
            code_length=code_length,
            deepest_shape=self.encoder.deepest_shape,
            output_shape=input_shape
        )

        # Build estimator
        self.estimator = Estimator1D(
            code_length=code_length,
            fm_list=[32, 32, 32, 32],
            cpd_channels=cpd_channels
        )

    def forward(self, x):
        # type: (torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
        """
        Forward propagation.

        :param x: the input batch of images.
        :return: a tuple of torch.Tensors holding reconstructions, latent vectors and CPD estimates.
        """
        h = x

        # Produce representations
        z = self.encoder(h)

        # Estimate CPDs with autoregression
        z_dist = self.estimator(z)

        # Reconstruct x
        x_r = self.decoder(z)
        x_r = x_r.view(-1, *self.input_shape)

        return x_r, z, z_dist


def load_dataset(data_path="/home/jbmai/Downloads/Defect Images-20190705T133320Z-001"):
    # Note: this transform is instantiated but never added to the Compose
    # pipeline below, so it has no effect.
    torchvision.transforms.Grayscale(num_output_channels=1)

    trainTransform = torchvision.transforms.Compose([
        torchvision.transforms.Resize(size=(128, 128), interpolation=2),
        torchvision.transforms.ToTensor(),
    ])

    train_dataset = torchvision.datasets.ImageFolder(
        root=data_path,
        transform=trainTransform
    )

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=64,
        num_workers=0,
        shuffle=True
    )
    return train_loader


net = LSACIFAR10(input_shape=[3, 128, 128], code_length=32, cpd_channels=100)
lossFunction = LSALoss(cpd_channels=100)
optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# Resume from checkpoints if they exist
try:
    checkpoint = torch.load("savedWeights/enc.pth")
    net.encoder.load_state_dict(checkpoint)

    checkpoint = torch.load("savedWeights/est.pth")
    net.estimator.load_state_dict(checkpoint)

    checkpoint = torch.load("savedWeights/dec.pth")
    net.decoder.load_state_dict(checkpoint)
except Exception as e:
    print(e)

for epoch in range(1000):  # loop over the dataset multiple times

    running_loss = 0.0
    # note: rebuilding the DataLoader every epoch is unnecessary
    d = load_dataset()
    for i, (data, l) in enumerate(d):
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        x_r, z, z_dist = net.forward(data)
        loss = lossFunction(data, x_r, z, z_dist)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 5 == 0:  # print every 5 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 5))
            running_loss = 0.0

    if (epoch % 5) == 0:
        # Periodically dump reconstructions for visual inspection (disabled):
        # net.encoder.eval(); net.estimator.eval(); net.decoder.eval()
        # z = net.encoder(data)
        # z_dist = net.estimator(z)
        # x_r = net.decoder(z).permute(0, 2, 3, 1).detach().numpy()
        # for i in range(x_r.shape[0]):
        #     cv2.imwrite("constructedImages/outDec{}_{}.jpg".format(epoch, i), x_r[i, :, :, :] * 255)
        # net.encoder.train(); net.estimator.train(); net.decoder.train()
        torch.save(net.encoder.state_dict(), "savedWeights/enc.pth")
        torch.save(net.estimator.state_dict(), "savedWeights/est.pth")
        torch.save(net.decoder.state_dict(), "savedWeights/dec.pth")

print('Finished Training')
```

Output:

```
<class 'torch.Tensor'>
[1, 1] loss: 727109273.600
<class 'torch.Tensor'>
[2, 1] loss: 2495627954514337382531072.000
```

Hi, can you help me rectify this issue?

@learnermaxRL
Author

Managed to correct the above by changing hyperparameters, but after a few steps I get:

```
[1, 71] loss: 3293.943
[1, 76] loss: 3096.135
[1, 81] loss: 3161.412
[1, 86] loss: 3176.714
[1, 91] loss: 2969.099
[1, 96] loss: 3247.497
[1, 101] loss: 3068.246
[1, 106] loss: 2769.233
[1, 111] loss: 2766.316
[1, 116] loss: 2718.537
[1, 121] loss: 3055.004
[1, 126] loss: 2576.473
[1, 131] loss: 2947.091
[1, 136] loss: 2869.674
[1, 141] loss: 2717.064
Traceback (most recent call last):
  File "modelTrainer.py", line 261, in <module>
    x_r,z,z_dist = net.forward(data)
  File "modelTrainer.py", line 188, in forward
    z = self.encoder(h)
  File "/home/jbmai/DefectsDetection/NoveltyDetection/models/base.py", line 33, in __call__
    return super(BaseModule, self).__call__(*args, **kwargs)
  File "/home/jbmai/anaconda3/envs/torchenv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "modelTrainer.py", line 73, in forward
    o = self.fc(h)
  File "/home/jbmai/anaconda3/envs/torchenv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/jbmai/anaconda3/envs/torchenv/lib/python3.7/site-packages/torch/nn/modules/container.py", line 92, in forward
    input = module(input)
  File "/home/jbmai/anaconda3/envs/torchenv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/jbmai/anaconda3/envs/torchenv/lib/python3.7/site-packages/torch/nn/modules/batchnorm.py", line 83, in forward
    exponential_average_factor, self.eps)
  File "/home/jbmai/anaconda3/envs/torchenv/lib/python3.7/site-packages/torch/nn/functional.py", line 1693, in batch_norm
    raise ValueError('Expected more than 1 value per channel when training, got input size {}'.format(size))
ValueError: Expected more than 1 value per channel when training, got input size torch.Size([1, 256])
```

@DavideA
Contributor

DavideA commented Jul 18, 2019

Hi @learnermaxRL

I would advise using Adam for optimization.

As for the latter error you reported, it is likely due to a singleton batch (i.e., a batch with only one sample), which BatchNorm cannot normalize in training mode. It may come from the DataLoader; try setting the drop_last flag to True.

D
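For reference, the two suggestions applied to the training script above look roughly like this (a sketch; the Adam learning rate is an assumption, not a value from this thread):

```python
# Swap SGD for Adam (lr=1e-4 is an assumed starting point)
optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)

# Drop the final incomplete batch so BatchNorm1d never sees a single-sample batch
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=64,
    num_workers=0,
    shuffle=True,
    drop_last=True
)
```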

@learnermaxRL
Author

learnermaxRL commented Jul 18, 2019

Thanks, that corrected it. However, I can see that the reconstructed images have negative values in the tensor; is that desirable?

A sample slice of x_r * 255:

```
[ 19.986425 33.786083 109.08704 ]]

[[ 49.809772 -32.651962 -1.5576267]
 [ 53.66301 -72.02914 48.711018 ]
 [ 39.252117 -81.27754 75.4854 ]
 ...
 [ 21.410696 -71.10042 68.18309 ]
 [ -8.615957 -179.66095 8.810505 ]
 [ 44.986786 29.80011 93.024506 ]]

[[ -68.59759 64.74513 51.421898 ]
 [ -18.552599 29.491028 69.56346 ]
 [ -49.379646 45.368095 29.86158 ]
 ...
 [ -84.07668 41.966274 100.4433 ]
 [ 1.8273218 58.350666 60.632793 ]
```
If so, how do I get back an RGB image from them?

@DavideA
Contributor

DavideA commented Jul 18, 2019

Negative values are not undesirable per se, as long as it is a conscious choice.
What is the range of your input images?

D

@learnermaxRL
Author

The input images are RGB images in the standard 0-255 range, fed through the PyTorch DataLoader:

```python
def load_dataset(data_path="/home/ji/Downloads/aug/"):
    # instantiated but never added to the Compose pipeline, so it has no effect
    torchvision.transforms.Grayscale(num_output_channels=1)

    trainTransform = torchvision.transforms.Compose([
        torchvision.transforms.Resize(size=(128, 128), interpolation=2),
        torchvision.transforms.ToTensor(),
    ])

    train_dataset = torchvision.datasets.ImageFolder(
        root=data_path,
        transform=trainTransform
    )

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=16,
        num_workers=0,
        shuffle=True,
        drop_last=True
        # pin_memory=True
    )
    return train_loader
```

@DavideA
Contributor

DavideA commented Jul 18, 2019

I would advise standardizing input images.
E.g., try making each channel zero-mean and unit std.

D
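Concretely, one way to do this with torchvision is a per-channel Normalize after ToTensor (a sketch; the mean/std values shown are the commonly cited CIFAR-10 statistics, and should ideally be replaced with values computed from your own training set):

```python
trainTransform = torchvision.transforms.Compose([
    torchvision.transforms.Resize(size=(128, 128), interpolation=2),
    torchvision.transforms.ToTensor(),
    # Per-channel standardization: output = (input - mean) / std
    torchvision.transforms.Normalize(mean=(0.4914, 0.4822, 0.4465),
                                     std=(0.2470, 0.2435, 0.2616)),
])
```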

@learnermaxRL
Author

That makes sense. However, standardization wouldn't guarantee a non-negative output, right?

@learnermaxRL
Author

Also, I checked: my input data is already between 0 and 1.

PyTorch's default image backend is Pillow, and when you use the ToTensor() transform, PyTorch automatically converts all images into [0, 1].

So shouldn't the reconstruction output be standardized before calculating the reconstruction loss, or should I perhaps use another activation function?
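A quick way to verify the claim about ToTensor (a self-contained check with a synthetic image; the array shape is arbitrary):

```python
import numpy as np
from PIL import Image
import torchvision.transforms as T

# ToTensor maps a uint8 PIL image (0-255) to a float tensor in [0, 1]
img = Image.fromarray(np.random.randint(0, 256, (8, 8, 3), dtype=np.uint8))
t = T.ToTensor()(img)
print(t.min().item(), t.max().item())  # both fall within [0.0, 1.0]
```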

@DavideA
Contributor

DavideA commented Jul 18, 2019

I would advise using a linear activation function for the reconstruction and providing the ground-truth image to the loss in the same range as the input.

@learnermaxRL
Author

Are you talking about the x_r from the decoder? Should I use a sigmoid on top of the decoder's last layer? I am trying to train the model to learn the reconstruction of the image itself, so the ground truth will be the same image:

```python
x_r, z, z_dist = net.forward(data)
loss = lossFunction.forward(data, x_r, z, z_dist)
loss.backward()
optimizer.step()
```

Is there anything else I need to do?

@learnermaxRL
Author

```python
net = LSACIFAR10(input_shape=[3, 128, 128], code_length=64, cpd_channels=100)
lossFunction = LSALoss(cpd_channels=100)
```

@DavideA
Contributor

DavideA commented Jul 18, 2019

The sigmoid is not mandatory. I would advise not to use it.

The rest of the code seems fine. Would a [0, 1] input deliver those reconstructions?

D
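In code terms, that means the decoder head stays a bare convolution with no activation after it, as in the decoder pasted at the top of this thread (shown again for reference; nothing here is new relative to that code):

```python
# Decoder head: no Sigmoid after the final 1x1 convolution, so the
# reconstruction is left in a linear range and compared to [0, 1] inputs.
self.conv = nn.Sequential(
    UpsampleBlock(channel_in=256, channel_out=128, activation_fn=activation_fn),
    UpsampleBlock(channel_in=128, channel_out=64, activation_fn=activation_fn),
    UpsampleBlock(channel_in=64, channel_out=32, activation_fn=activation_fn),
    ResidualBlock(channel_in=32, channel_out=32, activation_fn=activation_fn),
    nn.Conv2d(in_channels=32, out_channels=3, kernel_size=1, bias=False)  # linear output
)
```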

@learnermaxRL
Author

Sorry, I didn't understand that part. My input is already in [0, 1]; it's the reconstruction that has negative values.

@DavideA
Contributor

DavideA commented Jul 18, 2019

Does the reconstruction loss go down?

@learnermaxRL
Author

How did you do the reconstruction of the image? For instance, what was your input range for the CIFAR images, what was the decoder's activation function, and what was the range of the reconstructed output?

@learnermaxRL
Author

The loss stays high (~2500). Since I am using a sigmoid, learning is pretty slow; I suspect the gradients are very small because the pre-sigmoid outputs are large. After 100 epochs on a dataset of size 1400 with batch size 64, the loss only decreased by approximately 50 units.

Can you guide me as to what you have done?

@DavideA
Contributor

DavideA commented Jul 18, 2019

  1. range of CIFAR-10 images: [0-1]
  2. decoder activation function: None
  3. range of the reconstruction: approximately [0-1]

@learnermaxRL
Author

I see, but how come the values are negative in my case? I mean, the weights aren't negative, my input is in [0, 1], and I applied no nonlinearity on top of your code. What is going wrong here?

@DavideA
Contributor

DavideA commented Jul 18, 2019

I saw you are using 128x128 images. The number of downsampling stages in the model is tuned for 32x32 images. You might then have a huge linear layer before the bottleneck of the autoencoder, with many parameters slowing down learning.

As for the negative values, it is really weird. I would try optimizing the reconstruction loss only (a plain autoencoder) and see if the problem fades.
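The size of that layer follows directly from the encoder definition above: with three downsampling stages, a 128x128 input yields deepest_shape = (256, 16, 16), so the first fully connected layer alone holds roughly 16.8M weights:

```python
from functools import reduce
from operator import mul

deepest_shape = (256, 128 // 8, 128 // 8)  # (256, 16, 16) for 128x128 inputs
fc_in = reduce(mul, deepest_shape)         # 65536 flattened features
print(fc_in, fc_in * 256)                  # 65536 -> 16,777,216 weights in nn.Linear(65536, 256)
```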

@learnermaxRL
Author

Thanks, I'll let you know the progress; let me lighten up the model a bit. :)

@DavideA
Contributor

DavideA commented Jul 18, 2019

Keep in mind that, if I am right, the best way to lighten up the model would be to add downsample and upsample blocks.
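A sketch of what that could look like for 128x128 inputs (an assumption about how to adapt the architecture, not code from the repository): one extra DownsampleBlock in the encoder, a matching UpsampleBlock in the decoder, and deepest_shape updated accordingly:

```python
# Encoder: one extra downsampling stage, so 128x128 -> 8x8 at the bottleneck
self.conv = nn.Sequential(
    nn.Conv2d(in_channels=c, out_channels=32, kernel_size=3, bias=False),
    activation_fn,
    ResidualBlock(channel_in=32, channel_out=32, activation_fn=activation_fn),
    DownsampleBlock(channel_in=32, channel_out=64, activation_fn=activation_fn),
    DownsampleBlock(channel_in=64, channel_out=128, activation_fn=activation_fn),
    DownsampleBlock(channel_in=128, channel_out=256, activation_fn=activation_fn),
    DownsampleBlock(channel_in=256, channel_out=256, activation_fn=activation_fn),  # extra stage
)
self.deepest_shape = (256, h // 16, w // 16)  # (256, 8, 8): shrinks the first Linear by 4x

# Decoder: a matching extra upsampling stage at the front
self.conv = nn.Sequential(
    UpsampleBlock(channel_in=256, channel_out=256, activation_fn=activation_fn),  # extra stage
    UpsampleBlock(channel_in=256, channel_out=128, activation_fn=activation_fn),
    UpsampleBlock(channel_in=128, channel_out=64, activation_fn=activation_fn),
    UpsampleBlock(channel_in=64, channel_out=32, activation_fn=activation_fn),
    ResidualBlock(channel_in=32, channel_out=32, activation_fn=activation_fn),
    nn.Conv2d(in_channels=32, out_channels=3, kernel_size=1, bias=False)
)
```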

@learnermaxRL
Author

learnermaxRL commented Jul 19, 2019

Yeah, sure, will do. But how do I deal with reconstructions that have negative values? Using a sigmoid is the last option I would prefer. Any help?

Did you encounter negative values in your reconstructions? If so, how did you deal with them (apart from using an activation)? Is standardizing x_r at inference/validation time the right approach?
