In [1]:
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Load the BLIP captioning checkpoint. The processor (used below to preprocess
# images and to decode generated token ids) and the model are both created from
# one identifier so the two cannot accidentally point at different checkpoints.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
blip_processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

def generate_blip_caption(image_path, max_length=16, num_beams=4):
    """Generate a caption for the image at ``image_path`` with the BLIP model.

    Args:
        image_path: Location of the image; passed straight to ``Image.open``.
        max_length: Value forwarded as ``max_length`` to ``blip_model.generate``.
            Defaults to 16, the value that used to be hard-coded.
        num_beams: Value forwarded as ``num_beams`` to ``blip_model.generate``.
            Defaults to 4, the value that used to be hard-coded.

    Returns:
        The caption decoded from the first generated sequence, with special
        tokens skipped.
    """
    # Open the file in a context manager so its handle is released as soon as
    # the pixel data has been copied; ``convert`` returns an independent image,
    # which stays usable after the source file is closed.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")
    inputs = blip_processor(images=image, return_tensors="pt")

    output_ids = blip_model.generate(**inputs, max_length=max_length, num_beams=num_beams)
    caption = blip_processor.decode(output_ids[0], skip_special_tokens=True)
    return caption

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import CLIPProcessor, CLIPModel
from ultralytics import YOLO
import numpy as np


In [24]:
# Load the pretrained ViT-GPT2 captioning model together with its image
# processor and tokenizer. All three come from the same checkpoint, so the
# identifier is written once.
VIT_GPT2_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"
model = VisionEncoderDecoderModel.from_pretrained(VIT_GPT2_CHECKPOINT)
feature_extractor = ViTImageProcessor.from_pretrained(VIT_GPT2_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(VIT_GPT2_CHECKPOINT)

# Load the BLIP model and its processor.
# NOTE(review): the first cell of the notebook already creates blip_processor
# and blip_model from this same checkpoint; this cell rebinds both names.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
blip_processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

# YOLOv8 weights used by detect_objects.
model_yolo = YOLO('yolov8n.pt')

In [11]:
def generate_caption(image_path, max_length=16, num_beams=4):
    """Generate a caption for the image at ``image_path`` with the ViT-GPT2 model.

    Args:
        image_path: Location of the image; passed straight to ``Image.open``.
        max_length: Value forwarded as ``max_length`` to ``model.generate``.
            Defaults to 16, the value that used to be hard-coded.
        num_beams: Value forwarded as ``num_beams`` to ``model.generate``.
            Defaults to 4, the value that used to be hard-coded.

    Returns:
        The caption decoded from the first generated sequence, with special
        tokens skipped.
    """
    # Load the image inside a context manager so the file handle is released
    # immediately; ``convert("RGB")`` guarantees a three-channel RGB image and
    # returns a copy that remains valid after the source file is closed.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values

    # Generate token ids and decode the first sequence.
    output_ids = model.generate(pixel_values, max_length=max_length, num_beams=num_beams)
    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return caption

In [21]:
def generate_blip_caption(image_path, max_length=16, num_beams=4):
    """Generate a caption for the image at ``image_path`` with the BLIP model.

    NOTE(review): a function with this name is also defined in the first cell
    of the notebook; whichever definition runs last is the one in effect.

    Args:
        image_path: Location of the image; passed straight to ``Image.open``.
        max_length: Value forwarded as ``max_length`` to ``blip_model.generate``.
            Defaults to 16, the value that used to be hard-coded.
        num_beams: Value forwarded as ``num_beams`` to ``blip_model.generate``.
            Defaults to 4, the value that used to be hard-coded.

    Returns:
        The caption decoded from the first generated sequence, with special
        tokens skipped.
    """
    # Context manager releases the file handle once the pixel data is copied;
    # the converted image is independent of the source file.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")
    inputs = blip_processor(images=image, return_tensors="pt")

    output_ids = blip_model.generate(**inputs, max_length=max_length, num_beams=num_beams)
    caption = blip_processor.decode(output_ids[0], skip_special_tokens=True)
    return caption



In [19]:
def detect_objects(image_path):
    """Run the YOLO model on an image and return the label of every detected box.

    Args:
        image_path: Location of the image; passed straight to ``Image.open``.

    Returns:
        A list with one entry per detected box, each looked up in
        ``model_yolo.names`` by the box's class id. Duplicates are kept, so the
        list can be used to count objects per class.
    """
    # Load the image; the context manager releases the file handle while the
    # converted RGB copy stays usable.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    # Hand the PIL image to YOLO directly instead of np.array(image):
    # Ultralytics interprets NumPy inputs as BGR (OpenCV convention) but PIL
    # inputs as RGB, so an RGB array would be read with red and blue swapped.
    results = model_yolo(image)

    # Collect the class label of every box across all result objects.
    return [
        model_yolo.names[int(box.cls[0])]
        for result in results
        for box in result.boxes
    ]

def _pluralize(noun):
    """Return a plural form of ``noun`` using basic English spelling rules."""
    # Labels that are irregular or already plural and must not get an extra "s".
    irregular = {
        "knife": "knives",
        "sheep": "sheep",
        "skis": "skis",
        "scissors": "scissors",
    }
    lowered = noun.lower()
    if lowered in irregular:
        return irregular[lowered]
    # Sibilant endings take "es": bus -> buses, sandwich -> sandwiches.
    if lowered.endswith(("s", "x", "z", "ch", "sh")):
        return noun + "es"
    # Consonant + "y" becomes "ies": cherry -> cherries (but toy -> toys).
    if len(lowered) > 1 and lowered.endswith("y") and lowered[-2] not in "aeiou":
        return noun[:-1] + "ies"
    return noun + "s"


def generate_simple_caption(detected_objects):
    """Build a one-sentence English summary of a list of detected labels.

    Args:
        detected_objects: Iterable of labels, one entry per detection
            (duplicates mean several instances of the same class). ``None`` or
            an empty list yields the "no objects" sentence.

    Returns:
        ``"No objects detected in the image."`` when there is nothing to
        describe, otherwise ``"The image contains <item>, <item>, ... ."`` where
        each item is ``a``/``an`` + label for a single instance or
        ``<count> <plural label>`` for several. Items appear in the order in
        which each label first occurs in ``detected_objects``.
    """
    if not detected_objects:
        return "No objects detected in the image."

    # Count in a single pass. A dict keeps first-seen order, so the caption is
    # deterministic (iterating a set of strings is not stable across runs).
    object_counts = {}
    for obj in detected_objects:
        object_counts[obj] = object_counts.get(obj, 0) + 1

    phrases = []
    for obj, count in object_counts.items():
        label = str(obj)
        if count == 1:
            # Heuristic article choice based on the first letter.
            starts_with_vowel = label.lower().startswith(("a", "e", "i", "o", "u"))
            article = "an" if starts_with_vowel else "a"
            phrases.append(f"{article} {label}")
        else:
            phrases.append(f"{count} {_pluralize(label)}")

    return "The image contains " + ", ".join(phrases) + "."



In [2]:
# Try the BLIP captioning function on the uploaded image.
# image_path is a notebook-level variable that later cells reuse.
image_path = 'Picture.png'
caption = generate_blip_caption(image_path)
print(f'Generated caption: {caption}')

Generated caption: a baby seal lying on the beach


In [22]:
# Caption the same image again with generate_blip_caption; image_path is
# expected to have been set by an earlier cell.
caption = generate_blip_caption(image_path)
print(f'Generated caption: {caption}')

Generated caption: a baby seal lying on the beach


In [25]:
# Test object detection and the rule-based caption on an example image
# (image_path is expected to have been set by an earlier cell).
detected_objects = detect_objects(image_path)
simple_caption = generate_simple_caption(detected_objects)
print(f'Detected objects: {detected_objects}')
print(f'Simple caption: {simple_caption}')



0: 416x640 1 bear, 70.1ms
Speed: 7.5ms preprocess, 70.1ms inference, 6.7ms postprocess per image at shape (1, 3, 416, 640)
Detected objects: ['bear']
Simple caption: The image contains a bear.
