In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt

import torch
import torchvision

from torchvision import datasets, transforms
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import time

train_on_gpu = torch.cuda.is_available()
if not train_on_gpu:
    print('CUDA is not available.  Training on CPU ...')
else:
    print('CUDA is available!  Training on GPU ...')

CUDA is available!  Training on GPU ...


### Load the data
Transform the data to the required size of the Xception model and also normalize it

In [3]:
test_datadir="../input/100-bird-species/birds/test"
train_datadir="../input/100-bird-species/birds/train"
valid_datadir="../input/100-bird-species/birds/valid"

transform=transforms.Compose([
    transforms.Resize([299,299]),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_data=datasets.ImageFolder(train_datadir,transform)
test_data=datasets.ImageFolder(test_datadir,transform)
valid_data=datasets.ImageFolder(valid_datadir,transform)

batchsize=24

train_dataloader=torch.utils.data.DataLoader(train_data, batch_size=batchsize, shuffle=True)
test_dataloader=torch.utils.data.DataLoader(test_data, batch_size=batchsize, shuffle=True)
valid_dataloader=torch.utils.data.DataLoader(valid_data, batch_size=batchsize, shuffle=False)

In [None]:
# Example display - 

img,label=next(iter(train_dataloader))
img = img.numpy()
img = img / 2 + 0.5 
plt.imshow(np.transpose(img[1], (2, 1, 0)))
print(label[1])

### Below we have used the Xception model - 

[--> Xception Paper link <--](https://paperswithcode.com/paper/xception-deep-learning-with-depthwise-1)

In [4]:
class EntryFlowLoop(nn.Module):
    def __init__(self,in_channels,out_channels,relu_start):
        super(EntryFlowLoop,self).__init__()
        self.relu_start=relu_start
        
        self.conv=nn.Conv2d(in_channels,out_channels,1,stride=2)
        self.bnc=nn.BatchNorm2d(out_channels)
        
        self.sc1a=nn.Conv2d(in_channels,in_channels,3,groups=in_channels,padding=1,bias=False)
        self.sc1b=nn.Conv2d(in_channels,out_channels,1,bias=False)
        self.bn1=nn.BatchNorm2d(out_channels)
        
        self.sc2a=nn.Conv2d(out_channels,out_channels,3,groups=out_channels,padding=1,bias=False)
        self.sc2b=nn.Conv2d(out_channels,out_channels,1,bias=False)
        self.bn2=nn.BatchNorm2d(out_channels)
        
        self.maxpool=nn.MaxPool2d(kernel_size=(3,3),stride=2,padding=1)
        self.relu=nn.ReLU()
    
    def forward(self,x):
                
        output=self.conv(x)
        output=self.bnc(output)
        
        if self.relu_start==True:
            x=self.relu(x)
        
        x=self.sc1b(self.sc1a(x))
        x=self.bn1(x)
        x=self.relu(x)
        
        x=self.sc2b(self.sc2a(x))
        x=self.bn2(x)
        x=self.maxpool(x)
        
        x+=output
        
        return x


class EntryFlowFinal(nn.Module):
    def __init__(self):
        super(EntryFlowFinal,self).__init__()
        
        self.c1=nn.Conv2d(3,32,3,stride=2)
        self.bn1=nn.BatchNorm2d(32)

        self.c2=nn.Conv2d(32,64,3)
        self.bn2=nn.BatchNorm2d(64)
        
        self.loop1=EntryFlowLoop(64,128,False)
        self.loop2=EntryFlowLoop(128,256,True)
        self.loop3=EntryFlowLoop(256,728,True)
        
        self.relu=nn.ReLU()
    
    def forward(self,x):
        x=self.c1(x)
        x=self.bn1(x)
        x=self.relu(x)
        
        x=self.c2(x)
        x=self.bn2(x)
        x=self.relu(x)
        
        x=self.loop1(x)
        x=self.loop2(x)
        x=self.loop3(x)
        
        return x

In [5]:
class MiddleFlow(nn.Module):
    def __init__(self,channels=728):
        super(MiddleFlow,self).__init__()
        
        self.sc1a=nn.Conv2d(728,728,3,groups=728,padding=1,bias=False)
        self.sc1b=nn.Conv2d(728,728,1,bias=False)
        
        self.sc2a=nn.Conv2d(728,728,3,groups=728,padding=1,bias=False)
        self.sc2b=nn.Conv2d(728,728,1,bias=False)
        
        self.sc3a=nn.Conv2d(728,728,3,groups=728,padding=1,bias=False)
        self.sc3b=nn.Conv2d(728,728,1,bias=False)
        
        
        self.bn1=nn.BatchNorm2d(728)
        self.bn2=nn.BatchNorm2d(728)
        self.bn3=nn.BatchNorm2d(728)
        self.relu=nn.ReLU()
        
    def forward(self,x):
        output=x
        
        x=self.sc1b(self.sc1a(self.relu(x)))
        x=self.bn1(x)
        x=self.sc2b(self.sc2a(self.relu(x)))
        x=self.bn2(x)
        x=self.sc3b(self.sc3a(self.relu(x)))
        x=self.bn3(x)
        
        x=x+output
        
        return x

    
class MiddleFlowFinal(nn.Module):
    def __init__(self):
        super(MiddleFlowFinal,self).__init__()
        self.cycle1=MiddleFlow()
        self.cycle2=MiddleFlow()
        self.cycle3=MiddleFlow()
        self.cycle4=MiddleFlow()
        self.cycle5=MiddleFlow()
        self.cycle6=MiddleFlow()
        self.cycle7=MiddleFlow()
        self.cycle8=MiddleFlow()
    
    
    
    def forward(self,x):
#         for i in range(8):
#             x=self.cycle(x)
        x=self.cycle1(x)
        x=self.cycle2(x)
        x=self.cycle3(x)
        x=self.cycle4(x)
        x=self.cycle5(x)
        x=self.cycle6(x)
        x=self.cycle7(x)
        x=self.cycle8(x)
        
        return x

In [6]:
class ExitFlowFinal(nn.Module):
    def __init__(self):
        super(ExitFlowFinal,self).__init__()
        
        self.c1=nn.Conv2d(728,1024,1,stride=2)
        self.bnc=nn.BatchNorm2d(1024)
        
        self.sc1a=nn.Conv2d(728,728,3,groups=728,padding=1,bias=False)
        self.sc1b=nn.Conv2d(728,728,1,bias=False)
        self.bn1=nn.BatchNorm2d(728)
        
        self.sc2a=nn.Conv2d(728,728,3,groups=728,padding=1,bias=False)
        self.sc2b=nn.Conv2d(728,1024,1,bias=False)
        self.bn2=nn.BatchNorm2d(1024)

        self.sc3a=nn.Conv2d(1024,1024,3,groups=1024,padding=1,bias=False)
        self.sc3b=nn.Conv2d(1024,1536,1,bias=False)
        self.bn3=nn.BatchNorm2d(1536)

        self.sc4a=nn.Conv2d(1536,1536,3,groups=1536,padding=1,bias=False)
        self.sc4b=nn.Conv2d(1536,2048,1,bias=False)
        self.bn4=nn.BatchNorm2d(2048)
        
        self.maxpool=nn.MaxPool2d(kernel_size=(3,3),stride=2,padding=1)
        self.relu=nn.ReLU()
        self.avgpool=nn.AvgPool2d(kernel_size=(10,10))
        
    def forward(self,x):
        output=self.c1(x)
        output=self.bnc(output)
        
        x=self.relu(x)
        x=self.sc1b(self.sc1a(x))
        x=self.bn1(x)
        
        x=self.relu(x)
        x=self.sc2b(self.sc2a(x))
        x=self.bn2(x)
        
        x=self.maxpool(x)
        x+=output
        
        x=self.sc3b(self.sc3a(x))
        x=self.bn3(x)        
        x=self.relu(x)
        
        x=self.sc4b(self.sc4a(x))
        x=self.bn4(x)        
        x=self.relu(x)
        
        x=self.avgpool(x)
        
        return x   

In [7]:
class Xception_implementation(nn.Module):
    def __init__(self):
        super(Xception_implementation,self).__init__()       
        
        self.entry=EntryFlowFinal()
        self.middle=MiddleFlowFinal()
        self.exit=ExitFlowFinal()
        
        self.fc1=nn.Linear(2048,512)
        self.ol=nn.Linear(512,275)
        
        self.relu=nn.ReLU()

    def forward(self,x):
        
        x=self.entry(x)
        x=self.middle(x)
        x=self.exit(x)
        
        x = x.view(x.size(0), -1) 
        x=self.relu(self.fc1(x))
        x=self.relu(self.ol(x))
        return x


model = Xception_implementation()
#print(model)

# move tensors to GPU if CUDA is available
if train_on_gpu:
    model.cuda()
! pip install torch-summary
from torchsummary import summary
summary(model,(3,299,299))

Collecting torch-summary
  Downloading torch_summary-1.4.5-py3-none-any.whl (16 kB)
Installing collected packages: torch-summary
Successfully installed torch-summary-1.4.5
Layer (type:depth-idx)                   Output Shape              Param #
├─EntryFlowFinal: 1-1                    [-1, 728, 19, 19]         --
|    └─Conv2d: 2-1                       [-1, 32, 149, 149]        896
|    └─BatchNorm2d: 2-2                  [-1, 32, 149, 149]        64
|    └─ReLU: 2-3                         [-1, 32, 149, 149]        --
|    └─Conv2d: 2-4                       [-1, 64, 147, 147]        18,496
|    └─BatchNorm2d: 2-5                  [-1, 64, 147, 147]        128
|    └─ReLU: 2-6                         [-1, 64, 147, 147]        --
|    └─EntryFlowLoop: 2-7                [-1, 128, 74, 74]         --
|    |    └─Conv2d: 3-1                  [-1, 128, 74, 74]         8,320
|    |    └─BatchNorm2d: 3-2             [-1, 128, 74, 74]         256
|    |    └─Conv2d: 3-3                  [-

Layer (type:depth-idx)                   Output Shape              Param #
├─EntryFlowFinal: 1-1                    [-1, 728, 19, 19]         --
|    └─Conv2d: 2-1                       [-1, 32, 149, 149]        896
|    └─BatchNorm2d: 2-2                  [-1, 32, 149, 149]        64
|    └─ReLU: 2-3                         [-1, 32, 149, 149]        --
|    └─Conv2d: 2-4                       [-1, 64, 147, 147]        18,496
|    └─BatchNorm2d: 2-5                  [-1, 64, 147, 147]        128
|    └─ReLU: 2-6                         [-1, 64, 147, 147]        --
|    └─EntryFlowLoop: 2-7                [-1, 128, 74, 74]         --
|    |    └─Conv2d: 3-1                  [-1, 128, 74, 74]         8,320
|    |    └─BatchNorm2d: 3-2             [-1, 128, 74, 74]         256
|    |    └─Conv2d: 3-3                  [-1, 64, 147, 147]        576
|    |    └─Conv2d: 3-4                  [-1, 128, 147, 147]       8,192
|    |    └─BatchNorm2d: 3-5             [-1, 128, 147, 147]       256


In [8]:
import torch.optim as optim

# specify loss function
criterion = nn.CrossEntropyLoss()

# specify optimizer
optimizer = optim.SGD(model.parameters(),lr=0.03)
#optimizer = optim.Adam(model.parameters(),lr=0.00005)

In [None]:
n_epochs = 10 
valid_loss_min = np.Inf # track change in validation loss
batch_number=0
for epoch in range(1, n_epochs+1):

    # keep track of training and validation loss
    train_loss = 0.0
    valid_loss = 0.0
    
    ###################
    # train the model #
    ###################
    model.train()
    
    for data, target in train_dataloader:
        batch_number+=1
        # move tensors to GPU if CUDA is available
        if train_on_gpu:
            data, target = data.cuda(), target.cuda()
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the batch loss

        
        loss = criterion(output, target)

        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update training loss
        train_loss += loss.item()*data.size(0)
        
        if batch_number%100==0:
            print(batch_number)
            #print(train_loss/(24*batch_number))
    ######################    
    # validate the model #
    ######################
    model.eval()
    for data, target in valid_dataloader:
        # move tensors to GPU if CUDA is available
        if train_on_gpu:
            data, target = data.cuda(), target.cuda()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the batch loss
        loss = criterion(output, target)
        # update average validation loss 
        valid_loss += loss.item()*data.size(0)
    
    # calculate average losses
    train_loss = train_loss/len(train_dataloader.dataset)
    valid_loss = valid_loss/len(valid_dataloader.dataset)
        
    # print training/validation statistics 
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch, train_loss, valid_loss))
    
    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model.state_dict(), 'model_bird_species.pt')
        valid_loss_min = valid_loss

## Adjusting different parameters - 
We are using Xception paper model and therefore the hyper-parameters (number of layers and their size) is fixed. Therefore we are left with the following parameters- 
1. Optimizer- We will try 2 -  Adam and SGD (with momentum)
2. lr - We will try different ranges of lr to get to most suitable value
3. batch-size - Though I have a little problem of memory issues and theefore I cannot try a large range of Batch-size, but still I try to achieve the sweet spot.
4. Logistic NN after the Xception part of the Neural Network

#### For lr=0.01, optimizer= Adam, batch_size=24 -
100  
5.68693591594696  
200  
5.6518528532981875  
300  
5.640158499081929  
400  
5.634311321973801  
500  
5.630803015708923  

##### Conclusion:  
The values are reaching the limit very early, therefore its possible that our ls is very large due to which its just oscillating around a local minima or its very small that its not moving significantly.  
We will first try to increase the lr.
  

#### For lr=0.03, optimizer= Adam, batch_size=24 -

100  
6.984280848503113  
200  
6.300525319576264  
300  
6.072606809933981  
400  
5.958647555112838  
500  
5.890272002220154  

##### Conclusion:  
Though its a bad performance campared to the previous test, therefore we conclude that our lr is very large for the data. But, to double confirm this, we will run it one more time with larger lr.
  
  
#### For lr=0.09, optimizer= Adam, batch_size=24 -
100  
17.518945121765135  
200  
11.567857458591462  
300  
9.584161569277445   

##### Conclusion:  
Our caonclusion is confirmed and now we will run many different tests, decreasing the lr slowly, till we reach a sweet spot.


#### For lr=0.005, optimizer= Adam, batch_size=24 -

100  
5.628419208526611  
200  
5.622594499588013  
300  
5.620652929941813  
400  
5.619682145118714  
500  
5.619099674224853  
600  
5.618711360295614  
700  
5.6184339932033  
800  
5.618225967884063  
900  
5.618064170413547  
1000  
5.617934732437134  

##### Conclusion:  
We are seeing improvements.  
Lets keep decreasing the lr.
  
#### For lr=0.001, optimizer= Adam, batch_size=24 -

100  
5.596344637870788  
200  
5.5770020365715025  
300  
5.557831103006999  
400  
5.545575157403946  
500  
5.537669966697693  


##### Conclusion:  
We are seeing improvements.  
Lets keep decreasing the lr.

#### For lr=0.0005, optimizer= Adam, batch_size=24 -

100  
5.615420479774475  
200  
5.6130558156967165  
300  
5.613888082504272  
400  
5.613553690910339  
500  
5.612753263473511   

##### Conclusion:  
Its a rough spot. Maybe lr=0.001 was better. But its almost the same results, and we should try decreasing the lr again.

#### For lr=0.0001, optimizer= Adam, batch_size=24 -
100  
5.594290690422058  
200  
5.586528244018555  
300  
5.56579353650411  
400  
5.5533226096630095  
500  
5.540951251029968  
600  
5.531232864061991  

##### Conclusion:  
Definately a big improvement. Also notice that the decrease in loss with 100 batches is also increasing.  
This means we are on right track and lets keep reducing the lr.

#### For lr=0.00005, optimizer= Adam, batch_size=24 -

100  
5.604961247444153  
200  
5.578570282459259  
300  
5.552625319163004  
400  
5.532134653329849  
500  
5.508835887908935  

##### Conclusion:  
We are seeing improvements.  
Lets keep decreasing the lr.

#### For lr=0.00001, optimizer= Adam, batch_size=24 -

100  
5.611733942031861  
200  
5.603807361125946  
300  
5.59537739276886  
400  
5.585271533727646  
500  
5.574601455688477  
600  
5.563503699302673   

##### Conclusion:  
Though not a bad performance, but lr=0.00005 was better than 0.00001 and therefore we conclude that the reducing lr further is not good.

## Therefore Lets fix lr at 0.00005  

#### Now lets focus on Logistic NN part just after the Xception layers. Till now, we were using 2 layers - (2048,512,275)  

#### For lr=0.00005, optimizer= Adam, batch_size=24  and NN ( 1 layer )-
100  
5.596523351669312  
200  
5.574898626804352  
300  
5.565456638336181  
400  
5.554475079774857  
500  
5.537313059806824  

##### Conclusion:  
Performance has definately gone down.  
Instead of decreasing, lets try to increase the number of layers.

#### For lr=0.00005, optimizer= Adam, batch_size=24  and NN ( 3 layers - 2048,1024,512,275 )-

100  
5.606496725082398  
200  
5.5884935927391055  
300  
5.564859512646993  
400  
5.549061695337295  
500  
5.530344123840332  
600  
5.517454077402751  

##### Conclusion:  
Nope.  
The performance has again gone down.  
2 layered network was better.  

## Therefore, lets keep the NN as 2 layered - 2048,512,275

#### Now lets focus on the optimizer and try SGD with different lr.  
#### For lr=0.00005, optimizer= SGD, batch_size=24 -
100
5.618415269851685
200
5.617383131980896
300
5.617286367416382
400
5.617484185695648
500
5.61704674911499  

##### Conclusion:  
Not a good performance compared to Adam.  
Maybe changing the lr will help.  
Lets increase the lr.

#### For lr=0.0001, optimizer= SGD, batch_size=24 -
100  
5.620724911689758  
200  
5.619467494487762  
300  
5.617840498288473  
400  
5.617654368877411  
500  
5.6167002849578855  
600  
5.615881765683492  

##### Conclusion:  
Almost the same performance as the previous one.  
Lets increse the lr more and compare the results.

#### For lr=0.005, optimizer= SGD, batch_size=24 -
100  
5.614914889335632  
200  
5.611899130344391  
300  
5.610926251411438  
400  
5.609055856466293  
500  
5.606855712890625  

##### Conclusion:  
Though its an improvement.  
But we can see that the change in loss in very less and therefore in a few more steps it will change negligibly.  

#### For lr=0.03, optimizer= SGD, batch_size=24 -

100  
5.604961247444153  
200  
5.598570282459259  
300  
5.582625319163004  
400  
5.572134653329849  
500  
5.568835887908935  

##### Conclusion:  
Though its not as good as the Adam 0.00005 but its close.  
Also, this lr is very large compared to 0.00005 this will help us with faster convergence.  
Though I claim that on running a lot of epochs, Adam model will give us better accuracy but, this model will be more praticle.    



## Therefore, its better if we use SGD with lr=0.03.  


NOTE : I tried changing the batch-size but it had almost no effect on values, moreover, increasing it to more than 24 is giving Memory issues and therefore, we will remain with 24 as the batch-size.  
Therefore, the final choosen values are-  
1. lr = 0.00005
2. Logictic NN after Xception - 2 layers (2048,512,275)
3. Optimizer- Adam
4. Batch-size = 24

In [None]:
torch.save(model.state_dict(), 'model_bird_species_2.pt')

### Testing 
We have trained and saved 2 models -
1. model_bird_species - is the model where we are getting the min Validation loss ( but high training loss)
2. model_bird_species_2 - is the model where are fitting the data efficiently to the training data but have high loss in validation data  

Choose 1 saved model in the below code to see the what accuracy we are getting on each model.

In [None]:
# Choose 1 from the below 2-
model.load_state_dict(torch.load('../input/saved-model-bird-species/model_bird_species.pt'))
#model.load_state_dict(torch.load('../input/saved-model-bird-species/model_bird_species_2.pt'))

test_loss = 0.0
class_correct = list(0. for i in range(275))
class_total = list(0. for i in range(275))

model.eval()
# iterate over test data
for data, target in test_dataloader:
    
    if train_on_gpu:
        data, target = data.cuda(), target.cuda()
    
    output = model(data)    
    loss = criterion(output, target)
    
    test_loss += loss.item()*data.size(0)
    
    _, pred = torch.max(output, 1)    
    # compare predictions to true label
    correct_tensor = pred.eq(target.data.view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    # calculate test accuracy for each object class
    batchsize=data.size(0)
    for i in range(batchsize):
        label = target.data[i]
        class_correct[label] += correct[i].item()
        class_total[label] += 1

# average test loss
test_loss = test_loss/len(test_dataloader.dataset)
print('Test Loss: {:.6f}\n'.format(test_loss))

for i in range(275):
    if class_total[i] > 0:
        print('Test Accuracy of class %5s: %2d%% (%2d/%2d)' % (
            i, 100 * class_correct[i] / class_total[i],
            np.sum(class_correct[i]), np.sum(class_total[i])))
    else:
        print('Test Accuracy of %5s: N/A (no training examples)' % (classes[i]))

print('\nTest Accuracy (Overall): %2d%% (%2d/%2d)' % (
    100. * np.sum(class_correct) / np.sum(class_total),
    np.sum(class_correct), np.sum(class_total)))

### Observation -
We saw that we reached the best possible validation loss in the first few epochs and the trainig loss kept on decreasing.  
Which is not very good as we want to reduce the validation loss and not necessarily the training loss.  

Maybe by randomising and making the training data more general, we may get a better validation loss (at the cost of training loss)  

Therefore, we add randomisation to our training data -

In [9]:
transform_2 = transforms.Compose([transforms.RandomRotation(30),
                                       transforms.RandomResizedCrop(299),
                                       transforms.RandomHorizontalFlip(),
                                       transforms.ToTensor(),
                                       transforms.Normalize([0.5, 0.5, 0.5], 
                                                            [0.5, 0.5, 0.5])])

train_data_2=datasets.ImageFolder(train_datadir,transform_2)

train_dataloader_2=torch.utils.data.DataLoader(train_data_2, batch_size=batchsize, shuffle=True)


### Now lets train the new data again

In [None]:
n_epochs = 100 
valid_loss_min = np.Inf # track change in validation loss
batch_number=0
for epoch in range(1, n_epochs+1):

    # keep track of training and validation loss
    train_loss = 0.0
    valid_loss = 0.0
    
    ###################
    # train the model #
    ###################
    model.train()
    
    for data, target in train_dataloader_2:
        batch_number+=1
        
        if train_on_gpu:
            data, target = data.cuda(), target.cuda()
        
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the batch loss

        
        loss = criterion(output, target)

        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update training loss
        train_loss += loss.item()*data.size(0)
        
        if batch_number%100==0:
            print(batch_number)
            #print(train_loss/(24*batch_number))
    ######################    
    # validate the model #
    ######################
    model.eval()
    for data, target in valid_dataloader:
        # move tensors to GPU if CUDA is available
        if train_on_gpu:
            data, target = data.cuda(), target.cuda()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the batch loss
        loss = criterion(output, target)
        # update average validation loss 
        valid_loss += loss.item()*data.size(0)
    
    # calculate average losses
    train_loss = train_loss/len(train_dataloader_2.dataset)
    valid_loss = valid_loss/len(valid_dataloader.dataset)
        
    # print training/validation statistics 
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch, train_loss, valid_loss))
    
    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model.state_dict(), 'model_bird_species_3.pt')
        valid_loss_min = valid_loss

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
Epoch: 1 	Training Loss: 5.324159 	Validation Loss: 5.140066
Validation loss decreased (inf --> 5.140066).  Saving model ...
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
Epoch: 2 	Training Loss: 4.754307 	Validation Loss: 4.985553
Validation loss decreased (5.140066 --> 4.985553).  Saving model ...
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
Epoch: 3 	Training Loss: 4.329358 	Validation Loss: 4.318512
Validation loss decreased (4.985553 --> 4.318512).  Saving model ...
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
Epoch: 4 	Training Loss: 3.881392 	Validation Loss: 3.613247
Validation loss decreased (4.318512 --> 3.613247).  Saving model ...
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
Epoch: 5 	Training Loss: 3.501989 	Validation Loss: 3.935021
8300
8400
8500
8600
870

In [None]:
torch.save(model.state_dict(), 'model_bird_species_4.pt')

### Testing 
We have trained and saved 2 more models -  
1. model_bird_species_3 - is the model where we are getting the min Validation loss ( but high training loss) on new training data.  
2. model_bird_species_4 - is the model where are fitting the data efficiently to the training data but have high loss in validation data. (on new training data)  

Choose 1 saved model in the below code to see the what accuracy we are getting on each model.

In [11]:
# Choose one of the four available saved models- 
#model.load_state_dict(torch.load('../input/saved-model-bird-species/model_bird_species.pt'))
# model.load_state_dict(torch.load('../input/saved-model-bird-species/model_bird_species_2.pt'))
model.load_state_dict(torch.load('../input/saved-model-bird-species/model_bird_species_3.pt'))
# model.load_state_dict(torch.load('../input/saved-model-bird-species/model_bird_species_4.pt'))

test_loss = 0.0
class_correct = list(0. for i in range(275))
class_total = list(0. for i in range(275))

model.eval()
# iterate over test data
for data, target in test_dataloader:
    
    if train_on_gpu:
        data, target = data.cuda(), target.cuda()
    
    output = model(data)    
    loss = criterion(output, target)
    
    test_loss += loss.item()*data.size(0)
    
    _, pred = torch.max(output, 1)    
    # compare predictions to true label
    correct_tensor = pred.eq(target.data.view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    # calculate test accuracy for each object class
    batchsize=data.size(0)
    for i in range(batchsize):
        label = target.data[i]
        class_correct[label] += correct[i].item()
        class_total[label] += 1

# average test loss
test_loss = test_loss/len(test_dataloader.dataset)
print('Test Loss: {:.6f}\n'.format(test_loss))

for i in range(275):
    if class_total[i] > 0:
        print('Test Accuracy of class %5s: %2d%% (%2d/%2d)' % (
            i, 100 * class_correct[i] / class_total[i],
            np.sum(class_correct[i]), np.sum(class_total[i])))
    else:
        print('Test Accuracy of %5s: N/A (no training examples)' % (classes[i]))

print('\nTest Accuracy (Overall): %2d%% (%2d/%2d)' % (
    100. * np.sum(class_correct) / np.sum(class_total),
    np.sum(class_correct), np.sum(class_total)))

Test Loss: 0.587593

Test Accuracy of class     0: 100% ( 5/ 5)
Test Accuracy of class     1: 100% ( 5/ 5)
Test Accuracy of class     2: 100% ( 5/ 5)
Test Accuracy of class     3: 80% ( 4/ 5)
Test Accuracy of class     4: 80% ( 4/ 5)
Test Accuracy of class     5: 80% ( 4/ 5)
Test Accuracy of class     6: 80% ( 4/ 5)
Test Accuracy of class     7: 100% ( 5/ 5)
Test Accuracy of class     8: 100% ( 5/ 5)
Test Accuracy of class     9: 100% ( 5/ 5)
Test Accuracy of class    10: 100% ( 5/ 5)
Test Accuracy of class    11: 100% ( 5/ 5)
Test Accuracy of class    12: 80% ( 4/ 5)
Test Accuracy of class    13: 100% ( 5/ 5)
Test Accuracy of class    14: 100% ( 5/ 5)
Test Accuracy of class    15: 100% ( 5/ 5)
Test Accuracy of class    16: 100% ( 5/ 5)
Test Accuracy of class    17: 100% ( 5/ 5)
Test Accuracy of class    18: 100% ( 5/ 5)
Test Accuracy of class    19: 100% ( 5/ 5)
Test Accuracy of class    20: 100% ( 5/ 5)
Test Accuracy of class    21:  0% ( 0/ 5)
Test Accuracy of class    22: 100% ( 5/

## NICEEEE...
We got a nice accuracy for our model

## NOTE:  
 As mentioned in the analysis, I tried the Adam model (for lr 0.00005) and as pridicted, it was very slow -   
 These Were its epoch results - 

Epoch: 1 	Training Loss: 5.393564 	Validation Loss: 5.140303
Validation loss decreased (inf --> 5.140303).  Saving model ...

Epoch: 2 	Training Loss: 5.118051 	Validation Loss: 4.848226
Validation loss decreased (5.140303 --> 4.848226).  Saving model ...

Epoch: 3 	Training Loss: 4.935542 	Validation Loss: 4.700039
Validation loss decreased (4.848226 --> 4.700039).  Saving model ...

Epoch: 4 	Training Loss: 4.788703 	Validation Loss: 4.513692
Validation loss decreased (4.700039 --> 4.513692).  Saving model ...

Epoch: 5 	Training Loss: 4.682527 	Validation Loss: 4.396898
Validation loss decreased (4.513692 --> 4.396898).  Saving model ...

Epoch: 6 	Training Loss: 4.599165 	Validation Loss: 4.295371
Validation loss decreased (4.396898 --> 4.295371).  Saving model ...

Epoch: 7 	Training Loss: 4.530815 	Validation Loss: 4.229605
Validation loss decreased (4.295371 --> 4.229605).  Saving model ...

Epoch: 8 	Training Loss: 4.476235 	Validation Loss: 4.201569
Validation loss decreased (4.229605 --> 4.201569).  Saving model ...

Epoch: 9 	Training Loss: 4.422969 	Validation Loss: 4.143393
Validation loss decreased (4.201569 --> 4.143393).  Saving model ...

Epoch: 10 	Training Loss: 4.380741 	Validation Loss: 4.109100
Validation loss decreased (4.143393 --> 4.109100).  Saving model ...

## Therefore, I discarded the Adam model and trained the data with SGD which gave us the accuracy of about 90% !!