In [None]:
# Make sure to install the required packages first
# pip install transformers torch torchvision pillow requests

import torch
from transformers import CLIPProcessor, CLIPModel, BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import requests
from io import BytesIO

# Checkpoint identifiers handed to from_pretrained for each model family
clip_checkpoint = "openai/clip-vit-base-patch16"
blip_checkpoint = "Salesforce/blip-image-captioning-base"

# CLIP: the model plus the processor that prepares its text/image inputs
clip_model = CLIPModel.from_pretrained(clip_checkpoint)
clip_processor = CLIPProcessor.from_pretrained(clip_checkpoint)

# BLIP: the caption generator plus the processor that prepares its image inputs
blip_model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)
blip_processor = BlipProcessor.from_pretrained(blip_checkpoint)

# Image source: either an http(s) URL or a path on the local filesystem.
# Example of a remote image:
# image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/600px-PNG_transparency_demonstration_1.png"
image_url = "/mnt/d/FY2024/DataSet2024/dog vs cat/dataset/training_set/dogs/dog.64.jpg"
try:
    if image_url.lower().startswith(("http://", "https://")):
        # Remote image: download the bytes, then decode them from memory.
        # The timeout keeps the cell from hanging on an unresponsive server.
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()  # Check for HTTP errors
        with Image.open(BytesIO(response.content)) as downloaded:
            # convert() decodes the pixels now, so the source can be released.
            image = downloaded.convert("RGB")
    else:
        # Local file: requests cannot fetch a bare filesystem path, so open
        # it with PIL directly instead of going through HTTP.
        with Image.open(image_url) as local_file:
            image = local_file.convert("RGB")
except Exception as e:
    print(f"Error loading image: {e}")
    # Re-raise so the cell stops here with a traceback; exit() would tear
    # down the interpreter session instead of just aborting this cell.
    raise

# ==========================
# Demonstrate CLIP
# ==========================
# Candidate labels the image is scored against
texts = ["a logo", "a dog", "a cat"]

# Build one batch holding the padded label tokens and the preprocessed image
inputs = clip_processor(
    text=texts,
    images=image,
    return_tensors="pt",
    padding=True,
)

# Inference only, so gradient tracking is switched off
with torch.no_grad():
    outputs = clip_model(**inputs)
    logits_per_image = outputs.logits_per_image  # Image-to-text similarity
    # Normalise along dim=1 so the scores printed for the labels sum to 1
    probs = torch.softmax(logits_per_image, dim=1)

print("CLIP Predictions:")
for text, prob in zip(texts, probs[0]):
    print(f"Probability of '{text}': {prob.item():.4f}")

# ==========================
# Demonstrate BLIP
# ==========================
# Preprocess the image into the tensors the captioning model consumes
blip_inputs = blip_processor(images=image, return_tensors="pt")

# Generate caption token ids without tracking gradients
with torch.no_grad():
    generated_ids = blip_model.generate(**blip_inputs)

# Turn the first generated sequence back into text, dropping special tokens
caption = blip_processor.decode(generated_ids[0], skip_special_tokens=True)

print("\nBLIP Caption:", caption, sep="\n")


In [1]:
# Make sure to install the required packages first
# pip install transformers torch torchvision pillow requests

import torch
from transformers import CLIPProcessor, CLIPModel, BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import os
import requests
from io import BytesIO

# Checkpoint names shared by each model and its processor
CLIP_CHECKPOINT = "openai/clip-vit-base-patch16"
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

# Load CLIP model and processor
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

# Load BLIP model and processor
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)
blip_processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)

# Update the image path to point to a file in Drive D via WSL
image_path = "/mnt/d/FY2024/DataSet2024/dog vs cat/dataset/training_set/dogs/dog.64.jpg"

# Fail with an exception rather than exit(): exit() would tear down the
# interpreter session, while an exception only stops this cell.
# isfile() also rejects directories, which exists() would let through.
if not os.path.isfile(image_path):
    raise FileNotFoundError(f"Image file not found at {image_path}")

# Open image using PIL
try:
    with Image.open(image_path) as image_file:
        # convert() decodes the pixels now, so the file handle can be closed
        # as soon as the with-block ends.
        image = image_file.convert("RGB")
except Exception as e:
    print(f"Error loading image: {e}")
    raise

# ==========================
# Demonstrate CLIP
# ==========================
# Labels to compare the image against
texts = ["a logo", "a dog", "a cat"]

# Prepare inputs for CLIP (labels are padded so they stack into one tensor)
clip_kwargs = dict(text=texts, images=image, return_tensors="pt", padding=True)
inputs = clip_processor(**clip_kwargs)

# Get CLIP predictions; no gradients are needed for inference
with torch.no_grad():
    outputs = clip_model(**inputs)
logits_per_image = outputs.logits_per_image  # Image-to-text similarity
probs = logits_per_image.softmax(dim=1)  # Convert to probabilities

print("CLIP Predictions:")
# probs[0] holds one score per label, in the same order as texts
for text, probability in zip(texts, probs[0].tolist()):
    print(f"Probability of '{text}': {probability:.4f}")

# ==========================
# Demonstrate BLIP
# ==========================
# Prepare inputs for BLIP
blip_inputs = blip_processor(images=image, return_tensors="pt")

# Generate captions without tracking gradients
with torch.no_grad():
    generated_ids = blip_model.generate(**blip_inputs)
    first_sequence = generated_ids[0]
    # Special tokens are stripped so only the caption text remains
    caption = blip_processor.decode(first_sequence, skip_special_tokens=True)

for line in ("\nBLIP Caption:", caption):
    print(line)


2024-09-23 14:46:56.676467: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-23 14:46:56.678044: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-23 14:46:56.713351: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


CLIP Predictions:
Probability of 'a logo': 0.0018
Probability of 'a dog': 0.9938
Probability of 'a cat': 0.0044





BLIP Caption:
a dog is sitting in the grass with its owner


In [11]:
# Make sure to install the required packages first
# pip install transformers torch torchvision pillow requests

import torch
from transformers import CLIPProcessor, CLIPModel, BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import random
import os

# Each model is paired with the processor loaded from the same checkpoint
clip_name = "openai/clip-vit-base-patch16"
blip_name = "Salesforce/blip-image-captioning-base"

# CLIP model and processor
clip_model = CLIPModel.from_pretrained(clip_name)
clip_processor = CLIPProcessor.from_pretrained(clip_name)

# BLIP model and processor
blip_model = BlipForConditionalGeneration.from_pretrained(blip_name)
blip_processor = BlipProcessor.from_pretrained(blip_name)

# Define the path to the dataset folder
dataset_path = "/mnt/d/FY2024/DataSet2024/dog vs cat/dataset/test_set/imgs"

# Upper bound on how many images the demo processes
NUM_SAMPLE_IMAGES = 10


def list_image_files(folder, extensions=('.jpg', '.png', '.jpeg')):
    """Return the sorted names of image files directly inside ``folder``.

    Args:
        folder: Directory to scan; sub-directories are not descended into.
        extensions: Tuple of lower-case suffixes that count as images.

    Returns:
        Sorted list of entry names (not full paths) whose lower-cased name
        ends with one of ``extensions``. An empty list is returned, and a
        message printed, when ``folder`` is not an existing directory.
    """
    if not os.path.isdir(folder):
        print(f"Dataset folder not found at {folder}")
        return []
    # Lower-case before matching so names such as "IMG.JPG" are kept, and
    # sort because os.listdir returns entries in arbitrary order.
    return sorted(
        name for name in os.listdir(folder)
        if name.lower().endswith(extensions)
    )


# Get a list of all image files in the dataset folder
image_files = list_image_files(dataset_path)

# Randomly select up to NUM_SAMPLE_IMAGES images. The size is clamped because
# random.sample raises ValueError when asked for more items than exist.
random_image_files = random.sample(
    image_files, min(NUM_SAMPLE_IMAGES, len(image_files))
)

# The candidate labels are identical for every image, so build them once
texts = ["a logo", "a dog", "a cat"]  # You can modify the texts to match your dataset

# Loop through the randomly selected images
for image_file in random_image_files:
    image_path = os.path.join(dataset_path, image_file)

    try:
        # Load the image using PIL. Decoding inside the with-block means the
        # file handle is closed deterministically, even if decoding fails.
        with Image.open(image_path) as opened_image:
            image = opened_image.convert("RGB")

        print(f"Processing image: {image_file}")

        # ==========================
        # Demonstrate CLIP
        # ==========================
        # Prepare inputs for CLIP
        inputs = clip_processor(text=texts, images=image, return_tensors="pt", padding=True)

        # Get CLIP predictions
        with torch.no_grad():
            outputs = clip_model(**inputs)
            logits_per_image = outputs.logits_per_image  # Image-to-text similarity
            probs = logits_per_image.softmax(dim=1)  # Convert to probabilities

        # Print CLIP results
        print(f"\nCLIP Predictions for {image_file}:")
        for i, text in enumerate(texts):
            print(f"Probability of '{text}': {probs[0][i].item():.4f}")

        # ==========================
        # Demonstrate BLIP
        # ==========================
        # Prepare inputs for BLIP
        blip_inputs = blip_processor(images=image, return_tensors="pt")

        # Generate captions
        with torch.no_grad():
            generated_ids = blip_model.generate(**blip_inputs)
            caption = blip_processor.decode(generated_ids[0], skip_special_tokens=True)

        # Print BLIP results
        print(f"\nBLIP Caption for {image_file}: {caption}")

    except Exception as e:
        # Best effort: report the failure and carry on with the next image
        print(f"Error processing {image_file}: {e}")


Processing image: dog.4137.jpg

CLIP Predictions for dog.4137.jpg:
Probability of 'a logo': 0.0006
Probability of 'a dog': 0.9938
Probability of 'a cat': 0.0057

BLIP Caption for dog.4137.jpg: a dog sitting in the grass near a fence
Processing image: dog.4116.jpg

CLIP Predictions for dog.4116.jpg:
Probability of 'a logo': 0.0005
Probability of 'a dog': 0.8899
Probability of 'a cat': 0.1097

BLIP Caption for dog.4116.jpg: a small white dog standing next to a fence
Processing image: WVlogo.jpg

CLIP Predictions for WVlogo.jpg:
Probability of 'a logo': 0.9985
Probability of 'a dog': 0.0008
Probability of 'a cat': 0.0007

BLIP Caption for WVlogo.jpg: jaguar logo on a building
Processing image: dog.4132.jpg

CLIP Predictions for dog.4132.jpg:
Probability of 'a logo': 0.0004
Probability of 'a dog': 0.9956
Probability of 'a cat': 0.0040

BLIP Caption for dog.4132.jpg: a dog with a blue collar and a white nose
Processing image: cat.4122.jpg

CLIP Predictions for cat.4122.jpg:
Probability of '