<a href="https://colab.research.google.com/github/aashishpiitkEigenlytics/lapsrn-document/blob/main/vgg16_fine_tune_last_five_conv_layers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab only). The dataset archive and
# the checkpoints used by later cells are read from / written to paths under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import os, math, sys
import glob, itertools
import argparse, random

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torchvision.models import vgg19
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from torchvision.utils import save_image, make_grid

import plotly
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

from PIL import Image
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split

# Seed every RNG this notebook draws from. Seeding only Python's `random`
# leaves NumPy and PyTorch unseeded, and PyTorch's generator is the one that
# drives weight initialisation and DataLoader shuffling.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
import warnings
warnings.filterwarnings("ignore")

import math
from pathlib import Path
from torchsummary import summary
import torchvision
from PIL import Image, ImageOps

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Training configuration.
# NOTE(review): dataset_path, decay_epoch, hr_height/hr_width, channels, cuda
# and hr_shape are not referenced by any of the visible cells.

# number of training epochs (reassigned again inside the training cells)
n_epochs = 50
# path of the dataset directory
dataset_path = "/content/train/test/"
# number of samples per batch (used by both DataLoaders)
batch_size = 8
# adam: learning rate
lr = 0.00008
# adam: decay rate of the first-moment (mean) estimate of the gradient
b1 = 0.5
# adam: decay rate of the second-moment (uncentered variance) estimate of the gradient
b2 = 0.999
# epoch from which to start lr decay
decay_epoch = 100
# number of DataLoader worker processes used for batch loading
n_cpu = 8
# high res. image height
hr_height = 64
# high res. image width
hr_width = 64
# number of image channels
channels = 1

# os.makedirs("images", exist_ok=True)
# os.makedirs("saved_models", exist_ok=True)

# True when a CUDA device is available (same check as `device` above)
cuda = torch.cuda.is_available()
# (height, width) pair built from the two values above
hr_shape = (hr_height, hr_width)



In [None]:
! unzip -q /content/drive/MyDrive/rvl_cdip_test_dataset/rvl_cdip_test_dataset.zip -d /content/train

In [None]:
# Class-folder name -> integer label. The label of a class is simply its
# position in the tuple below (0 .. 15).
doc2label = {
    class_name: label
    for label, class_name in enumerate((
        'advertisement',
        'budget',
        'email',
        'file_folder',
        'form',
        'handwritten',
        'invoice',
        'letter',
        'memo',
        'news_article',
        'presentation',
        'questionnaire',
        'resume',
        'scientific_publication',
        'scientific_report',
        'specification',
    ))
}

In [None]:
# Collect (image path, integer label) pairs. The label is looked up from the
# name of the directory that directly contains each .tif file.
# The paths are sorted because rglob yields them in filesystem order; without
# a stable order the seeded train_test_split below would produce a different
# split in every session (e.g. when resuming from a checkpoint).
train_path = [
  (path, doc2label[path.parent.name])
  for path in sorted(Path('/content/train/content/train/test').rglob('*.tif'))
]

In [None]:
class ImageDataset(Dataset):
  """Dataset of (document image, class label) pairs.

  Each item is a dict with two entries:
    'targets'  : 0-dim tensor holding the integer class label
    'holistic' : the whole page, grayscale, resized to 780 x 600 (H x W)

  The ``create_*`` helpers cut a fixed region out of the resized page and
  rescale it to 227 x 227. ``ToTensor`` produces a (channels, height, width)
  tensor, so rows are axis 1 and columns are axis 2 in every crop below.
  """

  def __init__(self, files):
    """
    Args:
      files: sequence of (image_path, label) pairs.
    """
    self.files = files
    # Full-page pipeline: single channel, fixed 780 x 600 size, tensor output.
    self.trans = transforms.Compose([
                                transforms.Grayscale(),
                                transforms.Resize((780,600)),
                                transforms.ToTensor()
    ])
    # Applied to every cropped region.
    self.trans2 = transforms.Resize((227,227))

  def __getitem__(self, index):
    """Return the {'targets', 'holistic'} dict for item `index`."""
    entry = self.files[index % len(self.files)]
    target = entry[1]
    # Open inside a context manager so the file handle is released as soon as
    # the pixels have been converted, instead of lingering until garbage
    # collection.
    with Image.open(entry[0]) as img:
      holistic = self.create_holistic(img)

    return {
        'targets' : torch.tensor(target),
        'holistic' : holistic,
    }

  def create_header(self, x):
    """Top 256 rows of the page (full width), rescaled by `trans2`."""
    x = self.trans(x)
    return self.trans2(x[:, :256, :])

  def create_right_half(self, x):
    """Right-most 300 columns, skipping 100 rows at the top and bottom."""
    x = self.trans(x)
    return self.trans2(x[:, 100:-100, -300:])

  def create_left_half(self, x):
    """Left-most 300 columns, skipping 100 rows at the top and bottom."""
    x = self.trans(x)
    return self.trans2(x[:, 100:-100, :300])

  def create_footer(self, x):
    """Bottom 256 rows of the page (full width), rescaled by `trans2`."""
    x = self.trans(x)
    return self.trans2(x[:, -256:, :])

  def create_holistic(self, x):
    """Whole page passed through the full-page pipeline `trans`."""
    return self.trans(x)

  def __len__(self):
    """Number of (path, label) pairs."""
    return len(self.files)


In [None]:
# train_path = train_path[:len(train_path)//5]

In [None]:
len(train_path)

39996

In [None]:
## incorporate the labels somehow when preparing the dataset using ImageDataset

In [None]:
# Hold out 10% of the labelled files, using a fixed seed for the split.
train_paths, test_paths = train_test_split(train_path, test_size=0.1, random_state=42)
# Keep the whole training split; keep only the first quarter of the held-out files.
train_paths = list(train_paths)
test_paths = test_paths[:len(test_paths) // 4]

# Both loaders shuffle and use the same batch size and worker count.
train_dataset = ImageDataset(train_paths)
test_dataset = ImageDataset(test_paths)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=n_cpu)
test_dataloader = DataLoader(test_dataset, batch_size=int(batch_size), shuffle=True, num_workers=n_cpu)

In [None]:
len(train_dataloader)

4500

In [None]:
print(len(test_dataloader))

125


In [None]:
class VGG16_fine_tune_last_layer(nn.Module):
  """Pretrained VGG16 backbone with a new 16-way classification head.

  Trainable parts: every feature-extractor parameter tensor from index 16
  onwards, plus the new `last_layer` head. The first 16 parameter tensors of
  the feature extractor and the original VGG16 classifier are frozen. The
  original classifier is kept as a submodule but is not used in `forward`.
  """

  def __init__(self):
    super().__init__()

    backbone = torchvision.models.vgg16(pretrained = True)
    self.feature_extractor = nn.Sequential(*backbone.features.children())
    self.avg_pool = backbone.avgpool
    self.classifier = nn.Sequential(*backbone.classifier.children())

    # Freeze parameter tensors 0..15 of the feature extractor, train the rest.
    for position, (_, tensor) in enumerate(self.feature_extractor.named_parameters()):
      tensor.requires_grad = position >= 16
    for tensor in self.avg_pool.parameters():
      tensor.requires_grad = True
    for tensor in self.classifier.parameters():
      tensor.requires_grad = False

    # New head: 25088 -> 256 -> 128 -> 16 with ReLU between the linear layers.
    head_layers = [
        nn.Linear(25088, 256, bias=True),
        nn.ReLU(),
        nn.Linear(256,128, bias=True),
        nn.ReLU(),
        nn.Linear(128,16, bias=True),
    ]
    self.last_layer = nn.Sequential(*head_layers)

  def forward(self, x):
    """Return the 16 class scores produced by the new head for input `x`."""
    features = self.feature_extractor(x)
    pooled = self.avg_pool(features)
    flat = torch.flatten(pooled, start_dim=1)
    return self.last_layer(flat)


In [None]:
vgg16_fine_tune_last_layer = VGG16_fine_tune_last_layer().to(device)

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth


HBox(children=(FloatProgress(value=0.0, max=553433881.0), HTML(value='')))




In [None]:
print(vgg16_fine_tune_last_layer)

VGG16_fine_tune_last_layer(
  (feature_extractor): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2,

In [None]:
print(type(vgg16_fine_tune_last_layer.feature_extractor.named_parameters()))

<class 'generator'>


In [None]:
# Print the name of every parameter that still has requires_grad set.
for name, param in vgg16_fine_tune_last_layer.named_parameters():
    if not param.requires_grad:
        continue
    print(name)

feature_extractor.19.weight
feature_extractor.19.bias
feature_extractor.21.weight
feature_extractor.21.bias
feature_extractor.24.weight
feature_extractor.24.bias
feature_extractor.26.weight
feature_extractor.26.bias
feature_extractor.28.weight
feature_extractor.28.bias
last_layer.0.weight
last_layer.0.bias
last_layer.2.weight
last_layer.2.bias
last_layer.4.weight
last_layer.4.bias


In [None]:
# Hand Adam only the parameters that are left trainable; frozen ones are skipped.
trainable_params = [p for p in vgg16_fine_tune_last_layer.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=lr, betas=(b1, b2))

# Multi-class classification loss applied to the raw scores of the model.
criterion = nn.CrossEntropyLoss()

In [None]:
# torch.cuda.empty_cache()

In [None]:
# ## loading the weights
# checkpoint = torch.load("/content/drive/MyDrive/vgg16_last_layer_fine_tune_rvl_cdip_checkpt16.pt",map_location=torch.device(device))

# vgg16_fine_tune_last_layer.load_state_dict(checkpoint['vgg16_fine_tune_last_layer'])
# # vgg16_fine_tune_last_layer.load_state_dict()
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# vgg16_fine_tune_last_layer.train()

In [None]:
# for param_tensor in vgg16_fine_tune_last_layer.state_dict():
#   print(param_tensor, "\t", vgg16_fine_tune_last_layer.state_dict()[param_tensor].size())

In [None]:
# # save only model.state_dict()['last_layer.weight'] and model.state_dict()['last_layer.bias']
# new_state_dict = {}
# layers = ['last_layer.weight', 'last_layer.bias']
# for layer in layers:
#   print(vgg16_fine_tune_last_layer.state_dict()[layer].shape)
#   new_state_dict[layer] = vgg16_fine_tune_last_layer.state_dict()[layer]

In [None]:
# for param_tensor in new_state_dict:
#   print(param_tensor, "\t", new_state_dict[param_tensor].size())

In [None]:
# ## loading the weights
# checkpoint = torch.load("/content/drive/MyDrive/vgg16_with_normalisation_last_layer/checkpt8.pt",map_location=torch.device(device))

# new_state_dict = checkpoint['new_state_dict']
# old_dict = vgg16_fine_tune_last_layer.state_dict()
# layers = ['last_layer.0.weight', 'last_layer.0.bias', 'last_layer.2.weight', 'last_layer.2.bias', 'last_layer.4.weight', 'last_layer.4.bias']
# for layer in layers:
#   old_dict[layer] = new_state_dict[layer] 
# vgg16_fine_tune_last_layer.load_state_dict(old_dict)


# #optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# vgg16_fine_tune_last_layer.train()



In [None]:
## we will save and load this dictionary only

In [None]:
def accuracy(x, targets):
  """Fraction of samples whose highest-scoring class equals the target.

  Args:
    x: per-class scores of shape (batch_size, num_classes). Raw linear-layer
       outputs are fine: softmax is monotonic, so it does not change which
       class has the highest score.
    targets: integer class indices of shape (batch_size,), on the same
       device as `x`.

  Returns:
    A Python float in [0, 1]; 0.0 for an empty batch.
  """
  batch_size = x.shape[0]
  if batch_size == 0:
    # Nothing to score; avoid dividing by zero.
    return 0.0

  predicted = torch.argmax(x, dim=1)
  # One vectorised comparison and a single .item() call, instead of a Python
  # loop that pulls every element back from the device one at a time.
  correct = (predicted == targets).sum().item()
  return correct / batch_size



In [None]:
# temp = torch.randn(2,1,227,227)

In [None]:
# _mean = temp[:,0,:,:].mean(axis=0)
# print(_mean.shape)

In [None]:
# Epochs are 0-indexed. This run covers epochs start_epoch .. n_epochs-1 and
# writes checkpt{epoch+1}.pt after each of them.
start_epoch = 2
n_epochs = 20

# Per-epoch history for this run: one entry is appended per completed epoch.
train_losses, test_losses = [], []
train_acc, test_acc = [], []
# Cumulative number of samples seen when each epoch of this run finishes.
# Built after n_epochs is set so the lengths match the history lists above.
test_counter = [idx*len(test_dataloader.dataset) for idx in range(start_epoch+1, n_epochs+1)]
train_counter = [idx*len(train_dataloader.dataset) for idx in range(start_epoch+1, n_epochs+1)]

# Create the checkpoint folder up front so that a missing directory cannot
# throw away an epoch of training when torch.save is reached.
checkpoint_dir = "/content/drive/MyDrive/vgg16_fine_tune_last_five_conv_layers"
os.makedirs(checkpoint_dir, exist_ok=True)

# Added to the per-pixel std before dividing: the std is exactly 0 wherever
# every image in the batch has the same pixel value, and 0/0 would feed NaNs
# into the network.
norm_eps = 1e-8

for epoch in range(start_epoch, n_epochs):

  # Training loop
  vgg16_fine_tune_last_layer.train()
  loss = 0
  acc = 0
  tqdm_bar = tqdm(train_dataloader, desc=f'Training Epoch {epoch} ', total=int(len(train_dataloader)))

  for batch_idx, imgs in enumerate(tqdm_bar):

    optimizer.zero_grad()

    holistic_img = imgs['holistic'].to(device)
    targets = imgs['targets'].to(device)

    # Standardise channel 0 per pixel position, using statistics taken
    # across the images of the current batch.
    _mean = holistic_img[:,0,:,:].mean(axis=0)
    _std  = holistic_img[:,0,:,:].std(axis=0)
    holistic_img[:,0,:,:] = (holistic_img[:,0,:,:]-_mean)/(_std+norm_eps)

    ## convert the 1 channel image to 3 channel image
    holistic_img_3 = holistic_img.repeat_interleave(3, dim=1)

    prediction = vgg16_fine_tune_last_layer(holistic_img_3)
    loss_calc = criterion(prediction, targets)

    loss_calc.backward()
    optimizer.step()

    loss += loss_calc.item()
    acc += accuracy(prediction, targets)
    tqdm_bar.set_postfix(loss=loss/(batch_idx+1), acc = acc/(batch_idx+1))

  train_losses.append(loss/len(train_dataloader)) ## divided by total number of batches
  train_acc.append(acc/len(train_dataloader))     ## mean of the per-batch accuracies

  ## Testing loop
  vgg16_fine_tune_last_layer.eval()
  loss = 0
  acc = 0
  tqdm_bar = tqdm(test_dataloader, desc=f'Testing Epoch {epoch} ', total=int(len(test_dataloader)))

  # No gradients are needed for evaluation; skipping autograd bookkeeping
  # saves memory and time.
  with torch.no_grad():
    for batch_idx, imgs in enumerate(tqdm_bar):

      holistic_img = imgs['holistic'].to(device)
      targets = imgs['targets'].to(device)

      # Same per-batch standardisation as in the training loop.
      _mean = holistic_img[:,0,:,:].mean(axis=0)
      _std  = holistic_img[:,0,:,:].std(axis=0)
      holistic_img[:,0,:,:] = (holistic_img[:,0,:,:]-_mean)/(_std+norm_eps)

      ## convert the 1 channel image to 3 channel image
      holistic_img_3 = holistic_img.repeat_interleave(3, dim=1)

      prediction = vgg16_fine_tune_last_layer(holistic_img_3)
      loss_calc = criterion(prediction, targets)

      loss += loss_calc.item()
      acc += accuracy(prediction, targets)
      tqdm_bar.set_postfix(loss=loss/(batch_idx+1), acc = acc/(batch_idx+1))

  test_losses.append(loss/len(test_dataloader)) ## divided by total number of batches
  test_acc.append(acc/len(test_dataloader))     ## mean of the per-batch accuracies

  # Save the full model and optimizer state after every epoch; 'epoch' stores
  # the number of completed epochs, i.e. the epoch index to resume from.
  torch.save({
        'vgg16_fine_tune_last_layer' : vgg16_fine_tune_last_layer.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train_loss' : train_losses[-1],
        'test_loss' : test_losses[-1],
        'epoch': epoch+1,
        }, f"{checkpoint_dir}/checkpt{epoch+1}.pt")

HBox(children=(FloatProgress(value=0.0, description='Training Epoch 2 ', max=4500.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Testing Epoch 2 ', max=125.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Training Epoch 3 ', max=4500.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Testing Epoch 3 ', max=125.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Training Epoch 4 ', max=4500.0, style=ProgressStyle(descr…

In [None]:
## load weights
# Restore the model and optimizer state stored in checkpt4.pt.
# map_location remaps the stored tensors onto the current `device`, so the
# file can be loaded whether or not a GPU is available.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
checkpoint = torch.load("/content/drive/MyDrive/vgg16_fine_tune_last_five_conv_layers/checkpt4.pt",map_location=torch.device(device))
vgg16_fine_tune_last_layer.load_state_dict(checkpoint['vgg16_fine_tune_last_layer'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [None]:
# Epochs are 0-indexed. This run covers epochs start_epoch .. n_epochs-1 and
# writes checkpt{epoch+1}.pt after each of them.
start_epoch = 4
n_epochs = 24

# Per-epoch history for this run: one entry is appended per completed epoch.
train_losses, test_losses = [], []
train_acc, test_acc = [], []
# Cumulative number of samples seen when each epoch of this run finishes.
# Built after n_epochs is set so the lengths match the history lists above.
test_counter = [idx*len(test_dataloader.dataset) for idx in range(start_epoch+1, n_epochs+1)]
train_counter = [idx*len(train_dataloader.dataset) for idx in range(start_epoch+1, n_epochs+1)]

# Create the checkpoint folder up front so that a missing directory cannot
# throw away an epoch of training when torch.save is reached.
checkpoint_dir = "/content/drive/MyDrive/vgg16_fine_tune_last_five_conv_layers"
os.makedirs(checkpoint_dir, exist_ok=True)

# Added to the per-pixel std before dividing: the std is exactly 0 wherever
# every image in the batch has the same pixel value, and 0/0 would feed NaNs
# into the network.
norm_eps = 1e-8

for epoch in range(start_epoch, n_epochs):

  # Training loop
  vgg16_fine_tune_last_layer.train()
  loss = 0
  acc = 0
  tqdm_bar = tqdm(train_dataloader, desc=f'Training Epoch {epoch} ', total=int(len(train_dataloader)))

  for batch_idx, imgs in enumerate(tqdm_bar):

    optimizer.zero_grad()

    holistic_img = imgs['holistic'].to(device)
    targets = imgs['targets'].to(device)

    # Standardise channel 0 per pixel position, using statistics taken
    # across the images of the current batch.
    _mean = holistic_img[:,0,:,:].mean(axis=0)
    _std  = holistic_img[:,0,:,:].std(axis=0)
    holistic_img[:,0,:,:] = (holistic_img[:,0,:,:]-_mean)/(_std+norm_eps)

    ## convert the 1 channel image to 3 channel image
    holistic_img_3 = holistic_img.repeat_interleave(3, dim=1)

    prediction = vgg16_fine_tune_last_layer(holistic_img_3)
    loss_calc = criterion(prediction, targets)

    loss_calc.backward()
    optimizer.step()

    loss += loss_calc.item()
    acc += accuracy(prediction, targets)
    tqdm_bar.set_postfix(loss=loss/(batch_idx+1), acc = acc/(batch_idx+1))

  train_losses.append(loss/len(train_dataloader)) ## divided by total number of batches
  train_acc.append(acc/len(train_dataloader))     ## mean of the per-batch accuracies

  ## Testing loop
  vgg16_fine_tune_last_layer.eval()
  loss = 0
  acc = 0
  tqdm_bar = tqdm(test_dataloader, desc=f'Testing Epoch {epoch} ', total=int(len(test_dataloader)))

  # No gradients are needed for evaluation; skipping autograd bookkeeping
  # saves memory and time.
  with torch.no_grad():
    for batch_idx, imgs in enumerate(tqdm_bar):

      holistic_img = imgs['holistic'].to(device)
      targets = imgs['targets'].to(device)

      # Same per-batch standardisation as in the training loop.
      _mean = holistic_img[:,0,:,:].mean(axis=0)
      _std  = holistic_img[:,0,:,:].std(axis=0)
      holistic_img[:,0,:,:] = (holistic_img[:,0,:,:]-_mean)/(_std+norm_eps)

      ## convert the 1 channel image to 3 channel image
      holistic_img_3 = holistic_img.repeat_interleave(3, dim=1)

      prediction = vgg16_fine_tune_last_layer(holistic_img_3)
      loss_calc = criterion(prediction, targets)

      loss += loss_calc.item()
      acc += accuracy(prediction, targets)
      tqdm_bar.set_postfix(loss=loss/(batch_idx+1), acc = acc/(batch_idx+1))

  test_losses.append(loss/len(test_dataloader)) ## divided by total number of batches
  test_acc.append(acc/len(test_dataloader))     ## mean of the per-batch accuracies

  # Save the full model and optimizer state after every epoch; 'epoch' stores
  # the number of completed epochs, i.e. the epoch index to resume from.
  torch.save({
        'vgg16_fine_tune_last_layer' : vgg16_fine_tune_last_layer.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train_loss' : train_losses[-1],
        'test_loss' : test_losses[-1],
        'epoch': epoch+1,
        }, f"{checkpoint_dir}/checkpt{epoch+1}.pt")

HBox(children=(FloatProgress(value=0.0, description='Training Epoch 4 ', max=4500.0, style=ProgressStyle(descr…

KeyboardInterrupt: ignored