In [4]:
# usage.py

import torch
import pickle
from torchvision import transforms
from model import CNNEncoder, LSTMDecoder
from outputgeneration import generate_caption

def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # 1) Load vocab
    with open("vocab.pkl", "rb") as f:
        word2idx, idx2word = pickle.load(f)
    vocab_size = len(word2idx)
    
    # 2) Re-create model architecture (must match training)
    embed_size = 256
    hidden_size = 512
    encoder = CNNEncoder(embed_size=embed_size).to(device)
    decoder = LSTMDecoder(embed_size=embed_size, hidden_size=hidden_size, vocab_size=vocab_size).to(device)
    
    # 3) Load the saved state dicts
    encoder.load_state_dict(torch.load("encoder.pth", map_location=device))
    decoder.load_state_dict(torch.load("decoder.pth", map_location=device))
    print("Models loaded successfully.")
    
    # 4) Define transforms (same as training)
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    
    # 5) Path to test image
    image_path = r"C:\Users\olawa\Downloads\Asa.jpg"
    
    # 6) Generate caption
    caption = generate_caption(
        encoder=encoder,
        decoder=decoder,
        image_path=image_path,
        transform=transform,
        device=device,
        word2idx=word2idx,
        idx2word=idx2word,
        max_length=20
    )
    
    print("Generated Caption:", caption)

if __name__ == "__main__":
    main()


  encoder.load_state_dict(torch.load("encoder.pth", map_location=device))
  decoder.load_state_dict(torch.load("decoder.pth", map_location=device))


Models loaded successfully.
DEBUG: encoder output shape = torch.Size([1, 256])
DEBUG: initial current_input shape = torch.Size([1, 1])
DEBUG: Step 0: word_embed shape = torch.Size([1, 1, 256])
DEBUG: Step 1: word_embed shape = torch.Size([1, 1, 256])
DEBUG: Step 2: word_embed shape = torch.Size([1, 1, 256])
DEBUG: Step 3: word_embed shape = torch.Size([1, 1, 256])
DEBUG: Step 4: word_embed shape = torch.Size([1, 1, 256])
DEBUG: Step 5: word_embed shape = torch.Size([1, 1, 256])
DEBUG: Step 6: word_embed shape = torch.Size([1, 1, 256])
DEBUG: Step 7: word_embed shape = torch.Size([1, 1, 256])
DEBUG: Step 8: word_embed shape = torch.Size([1, 1, 256])
Generated Caption: a man of a a a a a
