#### Need to install pycocotools

first:
conda install git
then:
pip3 install "git+https://github.com/philferriere/cocoapi.git#egg=pycocotools&subdirectory=PythonAPI"

In [1]:
import argparse
import torch
import torch.nn as nn
import numpy as np
import os
import pickle
from data_loader_ddr import get_loader 
from build_vocab_ddr import Vocabulary
from model import EncoderCNN, DecoderRNN
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms
import nltk

In [2]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
os.getcwd()

'C:\\Users\\bisedab\\Desktop\\pytorch-tutorial\\tutorials\\03-advanced\\image_captioning'

In [4]:
#nltk.download('punkt')

# Build Vocabulary

In [5]:
!python build_vocab_ddr.py

Total vocabulary size: 36
Saved the vocabulary wrapper to './data/vocab_ddr.pkl'


# Resize Images

In [6]:
#!python resize_ddr.py

# Define Args for Running in Notebook

In [7]:
class Args:
    model_path = 'ddrmodels/'
    crop_size = 224
    vocab_path = 'data/vocab_ddr.pkl'
    image_dir = 'data/resizedddr'
    caption_path = 'data/annotations/spectrogram_2.csv'
    log_step = 10
    save_step = 100
    embed_size = 256
    hidden_size = 512
    num_layers = 3
    num_epochs = 1
    batch_size = 2
    num_workers = 2
    learning_rate = 0.001
    
args=Args()

# Test Data Loaders

In [8]:
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

In [None]:
vocab.idx2word[10]

In [None]:
# def main(args):
#     # Create model directory
#     if not os.path.exists(args.model_path):
#         os.makedirs(args.model_path)
    
#     # Image preprocessing, normalization for the pretrained resnet
#     transform = transforms.Compose([ 
#         transforms.RandomCrop(args.crop_size),
#         transforms.RandomHorizontalFlip(), 
#         transforms.ToTensor(), 
#         transforms.Normalize((0.485, 0.456, 0.406), 
#                              (0.229, 0.224, 0.225))])
    
#     # Load vocabulary wrapper
#     with open(args.vocab_path, 'rb') as f:
#         vocab = pickle.load(f)
    
#     # Build data loader
#     data_loader = get_loader(args.image_dir, args.caption_path, vocab, 
#                              transform, args.batch_size,
#                              shuffle=True, num_workers=args.num_workers) 

   
#     for i, (images, captions, lengths) in enumerate(data_loader):
            
#         # Set mini-batch dataset
#         images = images.to(device)
#         captions = captions.to(device)
#         targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
#         print(images, captions, targets)


# if __name__ == '__main__':
#     main(args)

# Train the Model

In [None]:
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device)
    

In [None]:
    d_decoder = nn.Sequential(decoder,
                              nn.Linear(len(vocab), 1),
                              nn.Sigmoid())

In [None]:
args.embed_size, args.hidden_size, len(vocab), args.num_layers

In [None]:
decoder

In [None]:
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)
    
    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([ 
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(), 
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    
    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab, 
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers) 

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device)
    
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    
    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
            #print(len(images), len(captions), len(targets))
            
            #Skip batches of 1 item
            if len(images) > 1:
                # Forward, backward and optimize
                print(images.shape)
                features = encoder(images)
                print(features.shape)
                print(captions.shape)
                print(lengths)
                outputs = decoder(features, captions, lengths)
                print(outputs.shape)
                print(targets.shape)
                loss = criterion(outputs, targets)
                decoder.zero_grad()
                encoder.zero_grad()
                loss.backward()
                optimizer.step()
                input("Press Enter to continue...")

                # Print log info
                if i % args.log_step == 0:
                    print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                          .format(epoch, args.num_epochs, i, total_step, loss.item(), np.exp(loss.item()))) 
                
            # Save the model checkpoints
            if (i+1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-{}-{}.ckpt'.format(epoch+1, i+1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path, 'encoder-{}-{}.ckpt'.format(epoch+1, i+1)))


if __name__ == '__main__':
    main(args)

torch.Size([2, 3, 224, 224])
torch.Size([2, 256])
torch.Size([2, 2421])
[2421, 1875]
torch.Size([4296, 36])
torch.Size([4296])
Press Enter to continue...
Epoch [0/1], Step [0/375], Loss: 3.5422, Perplexity: 34.5417
torch.Size([2, 3, 224, 224])
torch.Size([2, 256])
torch.Size([2, 1653])
[1653, 1503]
torch.Size([3156, 36])
torch.Size([3156])
Press Enter to continue...
torch.Size([2, 3, 224, 224])
torch.Size([2, 256])
torch.Size([2, 1943])
[1943, 1614]
torch.Size([3557, 36])
torch.Size([3557])


# Test Model

In [None]:
import torch
import matplotlib.pyplot as plt
import numpy as np 
import argparse
import pickle 
import os
from torchvision import transforms 
from build_vocab import Vocabulary
from model import EncoderCNN, DecoderRNN
from PIL import Image


In [None]:
class Args:
    image = 'C:/Users/bisedab/Desktop/pytorch-tutorial/tutorials/03-advanced/image_captioning/data/resizedddr/A_1.png'
    encoder_path = 'ddrmodels/encoder-1-300.ckpt'
    decoder_path = 'ddrmodels/decoder-1-300.ckpt'
    vocab_path = 'data/vocab_ddr.pkl'
    embed_size = 256
    hidden_size = 512
    num_layers = 3

args=Args()

In [None]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def load_image(image_path, transform=None):
    image = Image.open(image_path)
    image = image.convert('RGB')
    image = image.resize([224, 224], Image.LANCZOS)
    
    if transform is not None:
        image = transform(image).unsqueeze(0)
    
    return image

def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = load_image(args.image, transform)
    #image = image.convert('RGB')
    image_tensor = image.to(device)
    
    # Generate an caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)
    
    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    
    # Print out the image and the generated caption
    print (sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
    
if __name__ == '__main__':

    main(args)