In [1]:
!pip install ultralytics



# Imports

In [32]:
import os
from PIL import Image
import numpy as np
import torch
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from transformers import BlipProcessor, BlipForConditionalGeneration
from ultralytics import YOLO
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [15]:
# Überprüfen, ob eine GPU verfügbar ist
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Laden der Modelle

In [16]:
# Laden des vortrainierten Modells und der entsprechenden Tokenizer
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Laden des BLIP Modell und der entsprechenden Tokenizer
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

#Landen YOLO
model_yolo = YOLO('yolov8n.pt')  

In [35]:
# Modelle laden und auf mehrere GPUs verteilen
model = torch.nn.DataParallel(model).to(device)
blip_model = torch.nn.DataParallel(blip_model).to(device)

# Caption Generation

In [47]:
# Funktion zum Generieren der Bildbeschreibung mit ViT-GPT2
def generate_caption(image_path):
    image = Image.open(image_path).convert("RGB")
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)  # Auf GPU verschieben
    attention_mask = torch.ones(pixel_values.shape[:2], dtype=torch.long).to(device)  # Auf GPU verschieben
    output_ids = model.module.generate(pixel_values, attention_mask=attention_mask, max_length=16, num_beams=4)
    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return caption

# Funktion zum Generieren der Bildbeschreibung mit BLIP
def generate_blip_caption(image_path):
    image = Image.open(image_path).convert("RGB") 
    inputs = blip_processor(images=image, return_tensors="pt").to(device)  # Auf GPU verschieben

    output_ids = blip_model.module.generate(**inputs, max_length=16, num_beams=4)
    caption = blip_processor.decode(output_ids[0], skip_special_tokens=True)
    return caption

# Funktion zur Objekterkennung und Generierung einfacher Bildunterschriften
def detect_objects(image_path):
    # Laden des Bildes
    image = Image.open(image_path).convert("RGB")
    image_np = np.array(image)

    # Anwenden des YOLOv8 auf das Bild
    results = model_yolo(image_np, verbose=False)  # Deaktiviert die Anzeige des Bildes

    # Extrahieren der erkannten Objekte
    detected_objects = []
    for result in results:
        for box in result.boxes:
            cls_id = int(box.cls[0])
            label = model_yolo.names[cls_id]
            detected_objects.append(label)

    return detected_objects

def generate_simple_caption(detected_objects):
    if not detected_objects:
        return "No objects detected in the image."
    
    unique_objects = set(detected_objects)
    object_counts = {obj: detected_objects.count(obj) for obj in unique_objects}
    
    caption = "The image contains "
    for obj, count in object_counts.items():
        if count == 1:
            caption += f"a {obj}, "
        else:
            caption += f"{count} {obj}s, "
    
    caption = caption.rstrip(', ') + '.'
    return caption


# Load und Evaluation Funktionen

In [19]:
# Funktion zum Laden der Bildunterschriften aus der Textdatei
def load_captions(caption_file):
    with open(caption_file, 'r') as file:
        lines = file.readlines()
    caption_dict = {}
    for line in lines[1:]:  # Überspringen der Header-Zeile
        parts = line.strip().split(',')
        image_id = parts[0]
        caption = ','.join(parts[1:])
        if image_id not in caption_dict:
            caption_dict[image_id] = []
        caption_dict[image_id].append(caption)
    return caption_dict

# Funktion zur Evaluierung der generierten Bildunterschriften mit detaillierter Analyse
def evaluate_model_detailed(caption_dict, image_dir, model_func, num_images=8000, smoothing_method=None):
    scores = []
    n_gram_scores = {1: [], 2: [], 3: [], 4: []}
    
    limited_keys = list(caption_dict.keys())[:num_images]
    
    if smoothing_method:
        smoothing_function = getattr(SmoothingFunction(), smoothing_method)
    else:
        smoothing_function = None
    
    for image_id in limited_keys:
        reference_captions = caption_dict[image_id]
        image_path = os.path.join(image_dir, image_id)
        generated_caption = model_func(image_path)
        
        # Berechnen der n-Gram Präzisionen
        precisions = []
        for n in range(1, 5):
            weights = [1.0 / n] * n + [0] * (4 - n)
            n_gram_score = sentence_bleu([ref.split() for ref in reference_captions], generated_caption.split(), weights=weights[:n], smoothing_function=smoothing_function)
            n_gram_scores[n].append(n_gram_score)
            precisions.append(n_gram_score)
        
        # Hinzufügen einer kleinen Konstante, um log(0) zu vermeiden
        precisions = [p if p > 0 else 1e-9 for p in precisions]
        
        # Berechnen des geometrischen Mittels der n-Gram Präzisionen
        geometric_mean = np.exp(np.mean(np.log(precisions)))
        
        # Brevity Penalty
        ref_lengths = [len(ref.split()) for ref in reference_captions]
        hyp_length = len(generated_caption.split())
        closest_ref_length = min(ref_lengths, key=lambda ref_len: (abs(ref_len - hyp_length), ref_len))
        if hyp_length > closest_ref_length:
            brevity_penalty = 1
        else:
            brevity_penalty = np.exp(1 - closest_ref_length / hyp_length)
        
        # BLEU Score
        bleu_score = brevity_penalty * geometric_mean
        scores.append(bleu_score)
    
    average_bleu = np.mean(scores)
    average_n_gram_scores = {n: np.mean(scores) for n, scores in n_gram_scores.items()}
    
    return average_bleu, average_n_gram_scores

In [20]:
# Pfad zum Flickr8k-Bilderverzeichnis und zur Bildunterschriften-Datei
image_dir = '/kaggle/input/flickr8k/Images'
caption_file = '/kaggle/input/flickr8k/captions.txt'
    
# Laden der Bildunterschriften
caption_dict = load_captions(caption_file)

# Evaluierungsergebnisse

In [21]:
# Berechnung der BLEU-Scores für das ViT-GPT2
average_bleu_method4, n_gram_scores_method4 = evaluate_model_detailed(caption_dict, image_dir, generate_caption, num_images=8000, smoothing_method='method4')

# Ausgabe der BLEU-Scores und n-Gramm-Präzisionen
print(f"Average BLEU (method4): {average_bleu_method4}")
print(f"1-Gram Precision (method4): {n_gram_scores_method4[1]}")
print(f"2-Gram Precision (method4): {n_gram_scores_method4[2]}")
print(f"3-Gram Precision (method4): {n_gram_scores_method4[3]}")
print(f"4-Gram Precision (method4): {n_gram_scores_method4[4]}")

Average BLEU (method4): 0.3493923512893618
1-Gram Precision (method4): 0.5522633022052464
2-Gram Precision (method4): 0.3836246752095355
3-Gram Precision (method4): 0.3136509277948999
4-Gram Precision (method4): 0.27369742485294796


In [22]:
# Berechnung der BLEU-Scores für das BLIP
average_bleu_method4, n_gram_scores_method4 = evaluate_model_detailed(caption_dict, image_dir, generate_blip_caption, num_images=8000, smoothing_method='method4')

# Ausgabe der BLEU-Scores und n-Gramm-Präzisionen
print(f"Average BLEU (method4) for BLIP: {average_bleu_method4}")
print(f"1-Gram Precision (method4): {n_gram_scores_method4[1]}")
print(f"2-Gram Precision (method4): {n_gram_scores_method4[2]}")
print(f"3-Gram Precision (method4): {n_gram_scores_method4[3]}")
print(f"4-Gram Precision (method4): {n_gram_scores_method4[4]}")

Average BLEU (method4) for BLIP: 0.32479506854341345
1-Gram Precision (method4): 0.5623407034727911
2-Gram Precision (method4): 0.4147418525669168
3-Gram Precision (method4): 0.3321488673375305
4-Gram Precision (method4): 0.2786913487389565


In [48]:
# Berechnung der BLEU-Scores für das YOLO
average_bleu_method4, n_gram_scores_method4 = evaluate_model_detailed(caption_dict, image_dir, lambda img_path: generate_simple_caption(detect_objects(img_path)), num_images=8000, smoothing_method='method4')
# Ausgabe der BLEU-Scores und n-Gramm-Präzisionen
print(f"Average BLEU (method4) for YOLO: {average_bleu_method4}")
print(f"1-Gram Precision (method4): {n_gram_scores_method4[1]}")
print(f"2-Gram Precision (method4): {n_gram_scores_method4[2]}")
print(f"3-Gram Precision (method4): {n_gram_scores_method4[3]}")
print(f"4-Gram Precision (method4): {n_gram_scores_method4[4]}")

Average BLEU (method4) for YOLO: 0.11739814950202587
1-Gram Precision (method4): 0.1419689757950447
2-Gram Precision (method4): 0.1503721345494763
3-Gram Precision (method4): 0.1432906352992302
4-Gram Precision (method4): 0.13340715040035322
