In [None]:
# Install required packages
!pip install transformers 
!pip install einops 
!pip install torchvision 
!pip install torch
!pip install pillow 
!pip install accelerate 
!pip install ipywidgets

In [None]:
# Import necessary libraries
import os
import json
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from PIL import Image

In [None]:
# Checkpoint name and loading options shared by the processor and the model
MODEL_NAME = 'allenai/Molmo-7B-D-0924'
LOAD_OPTIONS = dict(trust_remote_code=True, torch_dtype='auto', device_map='auto')

# Load the processor (used later for input preparation and for its tokenizer)
processor = AutoProcessor.from_pretrained(MODEL_NAME, **LOAD_OPTIONS)

# Load the model from the same checkpoint with the same options
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **LOAD_OPTIONS)

In [None]:
# Folder that holds the images to describe
image_folder = 'photos'

# Mapping that collects the results; starts out empty
results = dict()

In [None]:
# Define function to process a single image

# Default instruction sent to the model together with each image
_DEFAULT_PROMPT = """Analyze and describe the image in detail. Address the following aspects:

1. Main Subject: Identify the primary focus or theme of the image.
2. Visual Details: Describe key colors, shapes, and significant visual elements.
3. Text Content: Summarize any visible text, highlighting main topics and important points.
4. Layout and Structure: Outline how elements are arranged and any notable patterns.
5. Symbols and Legends: Explain symbols, legends, or keys present in the image.
6. Purpose and Context: Infer the likely purpose, audience, or context of the image.

Provide a comprehensive description covering all relevant details observed in the image."""


def process_image(image_path, prompt=None, max_new_tokens=2000):
    """Generate a text description of one image with the loaded model.

    Relies on the module-level ``processor`` and ``model`` objects.

    Args:
        image_path: Path of the image file to describe.
        prompt: Instruction text passed to the processor alongside the image.
            When ``None`` the default analysis prompt is used.
        max_new_tokens: Value passed to ``GenerationConfig(max_new_tokens=...)``.

    Returns:
        The decoded generated text: the tokens after the input prompt, with
        special tokens skipped.
    """
    if prompt is None:
        prompt = _DEFAULT_PROMPT

    # The context manager closes the file handle; convert("RGB") returns a new,
    # fully loaded image, so it stays usable after the file is closed. The
    # conversion also normalizes palette/alpha images (PNG, GIF) to RGB.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    inputs = processor.process(images=[image], text=prompt)

    # Move every input to the model's device and add a leading dimension of size 1
    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

    # Autocast must target the device the model actually runs on; any
    # non-CUDA device keeps the previous "cpu" setting.
    autocast_device = "cuda" if model.device.type == "cuda" else "cpu"
    with torch.autocast(device_type=autocast_device, enabled=True, dtype=torch.bfloat16):
        output = model.generate_from_batch(
            inputs,
            GenerationConfig(max_new_tokens=max_new_tokens, stop_strings="<|endoftext|>"),
            tokenizer=processor.tokenizer,
        )

    # Keep only the tokens that follow the input prompt
    generated_tokens = output[0, inputs['input_ids'].size(1):]
    return processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)

In [None]:
# Process all images in the folder
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and the key order of the results) reproducible between runs.
for filename in sorted(os.listdir(image_folder)):
    image_path = os.path.join(image_folder, filename)
    # Skip entries with other extensions, and directories whose name merely
    # looks like an image file.
    if not filename.lower().endswith(IMAGE_EXTENSIONS) or not os.path.isfile(image_path):
        continue
    results[filename] = process_image(image_path)
    print(f"Processed: {filename}")

In [None]:
# Write the collected results to disk as JSON indented by four spaces
with open('image_descriptions.json', 'w') as out_file:
    serialized = json.dumps(results, indent=4)
    out_file.write(serialized)

print("All images processed. Results saved to 'image_descriptions.json'")