In [5]:
# Install: pip install transformers torch pillow

import torch
from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration, CLIPModel, CLIPProcessor
from PIL import Image

# Load models
print("Loading models...")
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

text_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

print("✅ Models loaded!")

# Your inputs
image_path = r"C:\Users\AbdulRahman\Desktop\tst1.jpg"
user_text = "Amazing romantic dinner! #restaurant #romantic #datenight"

print(f"📸 Image: {image_path}")
print(f"💬 User text: {user_text}")

Loading models...
✅ Models loaded!
📸 Image: C:\Users\AbdulRahman\Desktop\tst1.jpg
💬 User text: Amazing romantic dinner! #restaurant #romantic #datenight


In [6]:
# Load and analyze image
image = Image.open(image_path).convert('RGB')

# Step 1: Get image description using BLIP
print("🔍 STEP 1: Image Description")
print("-" * 40)

inputs = caption_processor(image, return_tensors="pt")
out = caption_model.generate(**inputs, max_length=50)
image_description = caption_processor.decode(out[0], skip_special_tokens=True)

print(f"🤖 AI sees in image: '{image_description}'")

# Step 2: Visual classification using CLIP
print(f"\n🔍 STEP 2: Visual Classification Scores")
print("-" * 40)

# Define visual labels
visual_labels = [
    "a restaurant with dining tables",
    "a coffee shop or cafe", 
    "a bar with drinks",
    "a fitness gym with equipment",
    "a spa or beauty salon",
    "a retail store with products",
    "romantic atmosphere with candles",
    "family place with kids",
    "trendy modern design",
    "casual comfortable setting",
    "upscale elegant venue"
]

visual_results = {}

for label in visual_labels:
    # Use CLIP to score image against each visual description
    inputs = clip_processor(
        text=[label, "something completely different"],
        images=[image, image],
        return_tensors="pt",
        padding=True
    )
    
    with torch.no_grad():
        outputs = clip_model(**inputs)
        probs = outputs.logits_per_image.softmax(dim=-1)
        score = probs[0, 0].item()  # Score for the positive label
    
    # Clean label name
    clean_name = label.replace("a ", "").replace(" with", "").split()[0].title()
    if "romantic" in label:
        clean_name = "Romantic"
    elif "family" in label:
        clean_name = "Family"
    elif "trendy" in label:
        clean_name = "Trendy"
    elif "casual" in label:
        clean_name = "Casual"
    elif "upscale" in label:
        clean_name = "Upscale"
    
    visual_results[clean_name] = round(score, 3)
    print(f"   {clean_name}: {score:.3f}")

print(f"\n📊 VISUAL SCORES SUMMARY:")
sorted_visual = sorted(visual_results.items(), key=lambda x: x[1], reverse=True)
for name, score in sorted_visual[:5]:
    print(f"   🏆 {name}: {score}")

🔍 STEP 1: Image Description
----------------------------------------
🤖 AI sees in image: 'romantic couple in restaurant'

🔍 STEP 2: Visual Classification Scores
----------------------------------------
   Restaurant: 0.964
   Coffee: 0.907
   Bar: 0.966
   Fitness: 0.043
   Spor: 0.718
   Retail: 0.108
   Romantic: 1.000
   Family: 0.264
   Trendy: 0.619
   Casual: 0.890
   Upscale: 0.990

📊 VISUAL SCORES SUMMARY:
   🏆 Romantic: 1.0
   🏆 Upscale: 0.99
   🏆 Bar: 0.966
   🏆 Restaurant: 0.964
   🏆 Coffee: 0.907


In [7]:
print(f"\n🔍 STEP 3: Text Classification Scores")
print("-" * 40)

# Define text labels for classification
text_labels = [
    "restaurant serving food and meals",
    "coffee shop or cafe establishment", 
    "bar serving alcoholic drinks",
    "fitness center or gym facility",
    "spa or beauty salon services",
    "retail store selling products",
    "romantic atmosphere for couples",
    "family-friendly place for children",
    "trendy modern stylish venue",
    "casual relaxed informal setting",
    "upscale elegant expensive establishment"
]

text_results = {}

# Classify user text against each label
for label in text_labels:
    result = text_classifier(user_text, [label, "unrelated content"])
    
    # Get score for our target label
    if result['labels'][0] == label:
        score = result['scores'][0]
    else:
        score = result['scores'][1] if len(result['scores']) > 1 else 0.0
    
    # Clean label name
    clean_name = label.split()[0].title()
    if "romantic" in label:
        clean_name = "Romantic"
    elif "family" in label:
        clean_name = "Family"
    elif "trendy" in label:
        clean_name = "Trendy"
    elif "casual" in label:
        clean_name = "Casual"
    elif "upscale" in label:
        clean_name = "Upscale"
    
    text_results[clean_name] = round(score, 3)
    print(f"   {clean_name}: {score:.3f}")

print(f"\n📊 TEXT SCORES SUMMARY:")
sorted_text = sorted(text_results.items(), key=lambda x: x[1], reverse=True)
for name, score in sorted_text[:5]:
    print(f"   🏆 {name}: {score}")


🔍 STEP 3: Text Classification Scores
----------------------------------------
   Restaurant: 0.988
   Coffee: 0.221
   Bar: 0.429
   Fitness: 0.174
   Spa: 0.192
   Retail: 0.190
   Romantic: 0.980
   Family: 0.241
   Trendy: 0.676
   Casual: 0.607
   Upscale: 0.859

📊 TEXT SCORES SUMMARY:
   🏆 Restaurant: 0.988
   🏆 Romantic: 0.98
   🏆 Upscale: 0.859
   🏆 Trendy: 0.676
   🏆 Casual: 0.607


In [8]:
print(f"\n🔍 STEP 4: Enhanced Text Analysis (Text + Image Description)")
print("-" * 40)

# Combine user text with image description for better context
enhanced_text = f"{user_text}. The image shows: {image_description}"
print(f"🔗 Enhanced text: '{enhanced_text}'")

enhanced_text_results = {}

# Classify enhanced text
for label in text_labels:
    result = text_classifier(enhanced_text, [label, "unrelated content"])
    
    if result['labels'][0] == label:
        score = result['scores'][0]
    else:
        score = result['scores'][1] if len(result['scores']) > 1 else 0.0
    
    # Clean label name (same as before)
    clean_name = label.split()[0].title()
    if "romantic" in label:
        clean_name = "Romantic"
    elif "family" in label:
        clean_name = "Family"
    elif "trendy" in label:
        clean_name = "Trendy"
    elif "casual" in label:
        clean_name = "Casual"
    elif "upscale" in label:
        clean_name = "Upscale"
    
    enhanced_text_results[clean_name] = round(score, 3)
    print(f"   {clean_name}: {score:.3f}")

print(f"\n📊 ENHANCED TEXT SCORES SUMMARY:")
sorted_enhanced = sorted(enhanced_text_results.items(), key=lambda x: x[1], reverse=True)
for name, score in sorted_enhanced[:5]:
    print(f"   🏆 {name}: {score}")


🔍 STEP 4: Enhanced Text Analysis (Text + Image Description)
----------------------------------------
🔗 Enhanced text: 'Amazing romantic dinner! #restaurant #romantic #datenight. The image shows: romantic couple in restaurant'
   Restaurant: 0.983
   Coffee: 0.183
   Bar: 0.304
   Fitness: 0.133
   Spa: 0.170
   Retail: 0.150
   Romantic: 0.996
   Family: 0.263
   Trendy: 0.547
   Casual: 0.631
   Upscale: 0.772

📊 ENHANCED TEXT SCORES SUMMARY:
   🏆 Romantic: 0.996
   🏆 Restaurant: 0.983
   🏆 Upscale: 0.772
   🏆 Casual: 0.631
   🏆 Trendy: 0.547


In [9]:
print(f"\n🔗 STEP 5: Multi-Modal Fusion")
print("-" * 40)

# Get common categories
common_categories = set(visual_results.keys()) & set(enhanced_text_results.keys())

fusion_results = {}

print("Individual scores and fusion:")
for category in common_categories:
    visual_score = visual_results.get(category, 0)
    text_score = enhanced_text_results.get(category, 0)
    
    # Simple weighted fusion (60% text, 40% visual)
    fused_score = 0.6 * text_score + 0.4 * visual_score
    
    fusion_results[category] = {
        'visual': visual_score,
        'text': text_score, 
        'fused': round(fused_score, 3),
        'prediction': 'YES' if fused_score > 0.4 else 'NO'
    }
    
    print(f"   {category}:")
    print(f"      Visual: {visual_score:.3f}")
    print(f"      Text: {text_score:.3f}")
    print(f"      Fused: {fused_score:.3f} → {fusion_results[category]['prediction']}")

print(f"\n🎯 FINAL FUSION RESULTS")
print("=" * 50)

# Show positive predictions
positive_predictions = []
for category, scores in fusion_results.items():
    if scores['prediction'] == 'YES':
        positive_predictions.append((category, scores['fused']))

if positive_predictions:
    print("✅ DETECTED FEATURES:")
    for category, score in sorted(positive_predictions, key=lambda x: x[1], reverse=True):
        print(f"   🏆 {category}: {score}")
else:
    print("❌ No features detected above threshold (0.4)")

print(f"\n📊 ALL SCORES (Top 5):")
sorted_final = sorted(fusion_results.items(), key=lambda x: x[1]['fused'], reverse=True)
for category, scores in sorted_final[:5]:
    status = "✅" if scores['prediction'] == 'YES' else "❌"
    print(f"   {status} {category}: {scores['fused']} (V:{scores['visual']}, T:{scores['text']})")


🔗 STEP 5: Multi-Modal Fusion
----------------------------------------
Individual scores and fusion:
   Casual:
      Visual: 0.890
      Text: 0.631
      Fused: 0.735 → YES
   Family:
      Visual: 0.264
      Text: 0.263
      Fused: 0.263 → NO
   Retail:
      Visual: 0.108
      Text: 0.150
      Fused: 0.133 → NO
   Romantic:
      Visual: 1.000
      Text: 0.996
      Fused: 0.998 → YES
   Upscale:
      Visual: 0.990
      Text: 0.772
      Fused: 0.859 → YES
   Restaurant:
      Visual: 0.964
      Text: 0.983
      Fused: 0.975 → YES
   Fitness:
      Visual: 0.043
      Text: 0.133
      Fused: 0.097 → NO
   Trendy:
      Visual: 0.619
      Text: 0.547
      Fused: 0.576 → YES
   Coffee:
      Visual: 0.907
      Text: 0.183
      Fused: 0.473 → YES
   Bar:
      Visual: 0.966
      Text: 0.304
      Fused: 0.569 → YES

🎯 FINAL FUSION RESULTS
✅ DETECTED FEATURES:
   🏆 Romantic: 0.998
   🏆 Restaurant: 0.975
   🏆 Upscale: 0.859
   🏆 Casual: 0.735
   🏆 Trendy: 0.576
   🏆 Bar: 

In [10]:
print(f"\n📈 STEP 6: Method Comparison")
print("-" * 40)

print("How each method performed:")
print("\n1️⃣ VISUAL ONLY (CLIP + Image):")
for name, score in sorted_visual[:3]:
    print(f"   {name}: {score}")

print("\n2️⃣ TEXT ONLY (User text):")
for name, score in sorted_text[:3]:
    print(f"   {name}: {score}")

print("\n3️⃣ ENHANCED TEXT (User text + Image description):")
for name, score in sorted_enhanced[:3]:
    print(f"   {name}: {score}")

print("\n4️⃣ FUSION (60% Enhanced Text + 40% Visual):")
for category, scores in sorted_final[:3]:
    print(f"   {category}: {scores['fused']}")

print(f"\n💡 INSIGHTS:")
print("- Visual encoder good for: scene understanding, ambience detection")
print("- Text encoder good for: explicit mentions, hashtag analysis") 
print("- Enhanced text good for: combining user intent with visual context")
print("- Fusion good for: balanced understanding of both modalities")


📈 STEP 6: Method Comparison
----------------------------------------
How each method performed:

1️⃣ VISUAL ONLY (CLIP + Image):
   Romantic: 1.0
   Upscale: 0.99
   Bar: 0.966

2️⃣ TEXT ONLY (User text):
   Restaurant: 0.988
   Romantic: 0.98
   Upscale: 0.859

3️⃣ ENHANCED TEXT (User text + Image description):
   Romantic: 0.996
   Restaurant: 0.983
   Upscale: 0.772

4️⃣ FUSION (60% Enhanced Text + 40% Visual):
   Romantic: 0.998
   Restaurant: 0.975
   Upscale: 0.859

💡 INSIGHTS:
- Visual encoder good for: scene understanding, ambience detection
- Text encoder good for: explicit mentions, hashtag analysis
- Enhanced text good for: combining user intent with visual context
- Fusion good for: balanced understanding of both modalities


In [2]:
from ultralytics import YOLO
from PIL import Image
import matplotlib.pyplot as plt

# Load the pre-trained YOLOv8 model (you can try 'yolov8n.pt' for faster results)
model = YOLO('yolov8m.pt')

# Path to your image
image_path = r"C:\Users\AbdulRahman\Desktop\grad_folders\MAIN_GRAD\Travia\Implementation\AI\test\test_image_luxury_spa.jpg"

# Run inference
results = model(image_path)

# Show the image with bounding boxes (optional)
results[0].show()

# Or save the image with results
results[0].save(filename="output_with_boxes.jpg")

# Get detected object names
detected_objects = results[0].names
detected_ids = results[0].boxes.cls.cpu().tolist()  # get class IDs

# Map class IDs to object names
object_labels = [detected_objects[int(cls_id)] for cls_id in detected_ids]

# Print out detected objects
print("🧠 Detected objects:")
for label in set(object_labels):
    print(f" - {label}")


image 1/1 C:\Users\AbdulRahman\Desktop\grad_folders\MAIN_GRAD\Travia\Implementation\AI\test\test_image_luxury_spa.jpg: 480x640 1 bicycle, 2 chairs, 341.4ms
Speed: 14.0ms preprocess, 341.4ms inference, 10.4ms postprocess per image at shape (1, 3, 480, 640)
🧠 Detected objects:
 - bicycle
 - chair


In [4]:
!python --version

Python 3.11.5
