In [None]:
import matplotlib.pyplot as plt
import numpy as np
from pycocotools.coco import COCO
from torchvision import transforms

from data_loader import get_loader

# Define a deterministic transform to pre-process the testing images.
# Random augmentation (RandomCrop / RandomHorizontalFlip) belongs to training:
# at inference time it would make the caption for one and the same image
# vary from run to run, so a fixed center crop is used and no flip is applied.
transform_test = transforms.Compose([
    transforms.Resize(256),                          # smaller edge of image resized to 256
    transforms.CenterCrop(224),                      # get the central 224x224 crop
    transforms.ToTensor(),                           # convert the PIL Image to a tensor
    transforms.Normalize((0.485, 0.456, 0.406),      # normalize image for pre-trained model
                         (0.229, 0.224, 0.225))])


In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
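# Import the model classes from model.py.
# NOTE(review): no `%load_ext autoreload` / `%autoreload 2` magics are visible in
# this notebook, so edits to model.py are NOT picked up automatically; restart
# the kernel (or add the magics) after changing model.py.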
import os
import torch
from model import EncoderCNN, DecoderRNN

# Specify the saved models to load.
encoder_file = "encoder-1.pkl"
decoder_file = "decoder-1.pkl"

# Select appropriate values for the Python variables below.
# NOTE(review): presumably these must match the sizes used when the checkpoints
# were trained, otherwise load_state_dict below will reject them — confirm.
embed_size = 1024
hidden_size = 1024

# The size of the vocabulary.
# NOTE(review): `data_loader` is not defined in any visible cell (only
# `get_loader` is imported), so on a fresh kernel this line raises NameError.
# Presumably a cell that built it via get_loader(...) was removed; restore it
# above this cell before relying on Restart & Run All.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the encoder and decoder, and set each to inference mode.
encoder = EncoderCNN(embed_size)
encoder.eval()
# NOTE(review): the literal 1 is presumably the number of RNN layers — confirm
# against DecoderRNN's signature in model.py.
decoder = DecoderRNN(embed_size, hidden_size, vocab_size, 1, device)
decoder.eval()

# Load the trained weights.
# `device` is already a torch.device, so it is passed to map_location as-is.
# SECURITY: torch.load unpickles the file; only load checkpoints you trust.
encoder.load_state_dict(torch.load(os.path.join('./models', encoder_file), map_location=device))
decoder.load_state_dict(torch.load(os.path.join('./models', decoder_file), map_location=device))

# Move models to GPU if CUDA is available.
encoder.to(device)
decoder.to(device)

In [None]:
from PIL import Image
from torchvision import transforms

In [None]:
def clean_sentence(output, vocab=None):
    """Convert a sequence of predicted word indices into a caption string.

    Args:
        output: Sequence of indices; each one is looked up in
            ``vocab.idx2word``.
        vocab: Object exposing ``idx2word`` and ``end_word``. When omitted,
            the notebook-level ``data_loader.dataset.vocab`` is used.

    Returns:
        The caption words joined by single spaces, without a leading or
        trailing space. Decoding stops at the first end-of-sentence word. If
        the vocabulary exposes a ``start_word`` attribute, that word is
        dropped as well.
    """
    if vocab is None:
        vocab = data_loader.dataset.vocab
    # Not every vocabulary object necessarily defines a start marker.
    start_word = getattr(vocab, 'start_word', None)

    words = []
    for idx in output:
        word = vocab.idx2word[idx]
        if word == vocab.end_word:
            # Anything emitted after the end token is not part of the caption.
            break
        if start_word is not None and word == start_word:
            continue
        words.append(word)
    return " ".join(words)

In [None]:
def get_prediction(pil_image, image):
    """Display an image and print the caption generated for it.

    Args:
        pil_image: The original image to show; it is passed through
            ``np.squeeze`` before being handed to ``plt.imshow``.
        image: The pre-processed image tensor fed to ``encoder``.
            Assumes a leading batch dimension is already present — TODO confirm
            against EncoderCNN.

    Returns:
        None. The caption is printed rather than returned.
    """
    plt.imshow(np.squeeze(pil_image))
    plt.title('Sample Image')
    plt.show()

    # Inference only: disable autograd so no computation graph is built.
    with torch.no_grad():
        features = encoder(image.to(device)).unsqueeze(1)
        output = decoder.sample(features)

    print(clean_sentence(output))

In [None]:
# img sample 1
img1 = 'images/sample_test.jpg'
# Force 3-channel RGB: transform_test normalizes with 3-channel mean/std, so a
# grayscale, palette or RGBA file would not match it.
pil_img1 = Image.open(img1).convert('RGB')

# Pre-process the image, then add a leading dimension of size 1.
tensor_image1 = transform_test(pil_img1)
tensor_image1 = tensor_image1.unsqueeze(0)

In [None]:
# Show sample image 1 and print its generated caption.
get_prediction(pil_img1, tensor_image1)

In [None]:
# img sample 2
img2 = 'images/sample_test_2.jpg'
# Force 3-channel RGB: transform_test normalizes with 3-channel mean/std, so a
# grayscale, palette or RGBA file would not match it.
pil_img2 = Image.open(img2).convert('RGB')

# Pre-process the image, then add a leading dimension of size 1.
tensor_image2 = transform_test(pil_img2)
tensor_image2 = tensor_image2.unsqueeze(0)

In [None]:
# Show sample image 2 and print its generated caption.
get_prediction(pil_img2, tensor_image2)