In [61]:
import torch
from transformers import BertTokenizer

# Restore the fully pickled model object from disk and map all of its storages
# onto the CPU. The original note says the checkpoint was saved without the
# DDP wrapper.
# NOTE(review): torch.load unpickles arbitrary Python objects -- only load
# checkpoint files that come from a trusted source.
# To run on a GPU instead, pass a CUDA device as map_location and move
# `pixel_values` to that same device before generating.
model = torch.load(
    "vision_language_model_full_v4_15.pth",
    map_location="cpu",
)

# Inference only: switch to evaluation mode. Kept as the cell's last
# expression so the notebook still displays the module's repr.
model.eval()

VisionLanguageModel(
  (vision_encoder): SiglipVisionModel(
    (vision_model): SiglipVisionTransformer(
      (embeddings): SiglipVisionEmbeddings(
        (patch_embedding): Conv2d(3, 768, kernel_size=(14, 14), stride=(14, 14), padding=valid)
        (position_embedding): Embedding(392, 768)
      )
      (encoder): SiglipEncoder(
        (layers): ModuleList(
          (0-11): 12 x SiglipEncoderLayer(
            (self_attn): SiglipAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (layer_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
            (mlp): SiglipMLP(
              (fc1): Linear(in_features=768, out_features=3072, bias=True)
              (fc2): Linear(in_features=

In [65]:
from PIL import Image
import torch
from torchvision import transforms

# Image preprocessing pipeline; per the original note it mirrors the one used
# during training.
preprocess_image = transforms.Compose(
    [
        # Resize takes (height, width): every image becomes 392 x 196 pixels.
        # With the 14 x 14 patch embedding shown in the model printout this is
        # 28 * 14 = 392 patches, the size of its position embedding table.
        transforms.Resize(size=(392, 196)),
        # PIL image -> PyTorch tensor.
        transforms.ToTensor(),
        # Per-channel normalisation with fixed mean / std values.
        transforms.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225),
        ),
    ]
)

# Function to preprocess a single image
def preprocess_input_image(image_path):
    """Load one image from disk and turn it into a model-ready batch of one.

    Args:
        image_path: Path of the image file, passed to ``Image.open``.

    Returns:
        The output of the module-level ``preprocess_image`` pipeline with a
        leading batch dimension added via ``unsqueeze(0)``.
    """
    # Open inside a context manager so the underlying file handle is released
    # deterministically. ``convert`` returns an independent image object, so
    # it stays usable after the source file has been closed.
    with Image.open(image_path) as source_image:
        rgb_image = source_image.convert("RGB")  # Ensure it's in RGB format

    # Add a batch dimension in front of the preprocessed tensor.
    return preprocess_image(rgb_image).unsqueeze(0)

# Example usage: preprocess one outfit image. `pixel_values` is the batched
# tensor that a later cell passes to generate_caption.
image_path = "Filtered_Outfits/14708/complete.jpg"
pixel_values = preprocess_input_image(image_path)

# Author's scratch note, kept verbatim: "14708, 2608 on 10."
# NOTE(review): presumably outfit IDs that were tried (14708 matches the path
# above); the meaning of "on 10" is unclear -- confirm or remove.

In [66]:
import torch.nn.functional as F
# Load the pretrained "bert-base-uncased" BERT tokenizer. Per the original
# note this is the tokenizer used during training. generate_caption relies on
# its cls/sep/pad token ids and on decode().
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def generate_caption(model, pixel_values, tokenizer, max_length=392, verbose=True):
    """Greedily decode a caption for a single preprocessed image.

    Starting from the tokenizer's [CLS] id, the sequence is repeatedly padded
    to ``max_length``, run through ``model.token_embedding``,
    ``model.text_decoder`` (together with the vision features) and
    ``model.fc_out``, and extended with the arg-max token predicted at the
    last generated position.

    Args:
        model: Object exposing ``eval()``, ``vision_encoder``,
            ``token_embedding``, ``text_decoder`` and ``fc_out``.
        pixel_values: Preprocessed image batch. Only a batch of one is
            supported, because the generated sequence is created with a single
            row and ``.item()`` is called on each predicted token. The
            generated ids are created on this tensor's device.
        tokenizer: Tokenizer providing ``cls_token_id``, ``sep_token_id``,
            ``pad_token_id`` and ``decode``.
        max_length: Length every decoder input is padded to, and therefore the
            maximum number of ids (including [CLS]) that can be produced. The
            original note says it matches the vision features' sequence
            length -- TODO confirm against the decoder.
        verbose: When True (the default, matching the previous behaviour),
            print each predicted token id and the caption decoded so far.

    Returns:
        The decoded caption string with special tokens skipped.
    """
    model.eval()  # Set model to evaluation mode

    pad_id = tokenizer.pad_token_id
    # [SEP] ends the caption. Predicting [PAD] also stops generation; this
    # replaces a hard-coded 0, which is the [PAD] id of bert-base-uncased.
    stop_ids = (tokenizer.sep_token_id, pad_id)

    # Start with the [CLS] token, on the same device as the image tensor.
    generated_ids = torch.tensor([[tokenizer.cls_token_id]], device=pixel_values.device)

    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        # The image does not change between steps, so encode it once instead
        # of once per generated token.
        vision_features = model.vision_encoder(pixel_values)

        for _ in range(max_length - 1):  # Generate tokens up to max_length
            current_length = generated_ids.size(1)

            # Right-pad the current sequence with [PAD] up to max_length.
            input_ids = F.pad(generated_ids, (0, max_length - current_length), value=pad_id)

            embedded_input_ids = model.token_embedding(input_ids)
            outputs = model.text_decoder(embedded_input_ids, vision_features)
            logits = model.fc_out(outputs)

            # Greedy choice at the last real (non-padded) position only;
            # keepdim keeps the (1, 1) shape needed for concatenation.
            next_token_id = logits[:, current_length - 1, :].argmax(dim=-1, keepdim=True)
            if verbose:
                print(next_token_id)

            # Append the generated token to the sequence
            generated_ids = torch.cat([generated_ids, next_token_id], dim=1)

            if verbose:
                # Decoding the partial caption is only needed for progress output.
                current_output = tokenizer.decode(generated_ids[0].tolist(), skip_special_tokens=True)
                print(f"Current Generated Output: {current_output}")

            if next_token_id.item() in stop_ids:
                break

    # Index the single batch row (rather than squeeze) so the ids are always a
    # flat list, even when only [CLS] is present.
    return tokenizer.decode(generated_ids[0].tolist(), skip_special_tokens=True)



In [67]:

# Generate a caption for the image preprocessed earlier, using the default
# max_length. generate_caption prints each predicted token and the running
# caption as it goes; the final caption is printed here.
caption = generate_caption(model, pixel_values, tokenizer)
print("Generated Caption:", caption)


tensor([[1045]])
Current Generated Output: i
tensor([[1005]])
Current Generated Output: i '
tensor([[1049]])
Current Generated Output: i'm
tensor([[2559]])
Current Generated Output: i'm looking
tensor([[2005]])
Current Generated Output: i'm looking for
tensor([[1037]])
Current Generated Output: i'm looking for a
tensor([[10017]])
Current Generated Output: i'm looking for a casual
tensor([[1998]])
Current Generated Output: i'm looking for a casual and
tensor([[6625]])
Current Generated Output: i'm looking for a casual and comfortable
tensor([[11018]])
Current Generated Output: i'm looking for a casual and comfortable outfit
tensor([[2005]])
Current Generated Output: i'm looking for a casual and comfortable outfit for
tensor([[1037]])
Current Generated Output: i'm looking for a casual and comfortable outfit for a
tensor([[10017]])
Current Generated Output: i'm looking for a casual and comfortable outfit for a casual
tensor([[2154]])
Current Generated Output: i'm looking for a casual and 