In [None]:
pip install --upgrade openai

In [None]:
pip install safetensors

In [2]:
import torch
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    MusicgenForConditionalGeneration,
    AutoProcessor,
    pipeline,
    AutoTokenizer,
    AutoModelForTextToWaveform
)
from PIL import Image
import torchaudio
import openai

In [None]:
client = openai.OpenAI(api_key="")

In [None]:
from huggingface_hub import login
login(token="")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [21]:
def generate_image_caption(image_path):
    # Set the path to your fine-tuned model
    model_path = "/content/drive/MyDrive/fine-tuned-blip-model/model"

    # Load the processor from Hugging Face
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

    # Load your fine-tuned BLIP model
    model = BlipForConditionalGeneration.from_pretrained(model_path)

    # Move model to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Load and preprocess the image
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)  # Move inputs to the same device

    # Generate caption
    with torch.no_grad():
        caption_ids = model.generate(**inputs)

    caption = processor.batch_decode(caption_ids, skip_special_tokens=True)[0]
    return caption

In [7]:
# Corrected GPT-4 function with latest API
def refine_caption_gpt4(food_caption):
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are an AI food writer that enhances food descriptions with expressive, mouthwatering, and sensory-rich language for audio generation. Always add adjectives like 'tasty', 'savory', 'sweet', 'crispy', 'juicy', 'sizzling', 'buttery', 'rich', or 'delicious'."},
            {"role": "user", "content": f"Convert this food description into an expressive audio caption: {food_caption}"}
        ],
        max_tokens=60
    )

    refined_caption = response.choices[0].message.content.strip()
    return refined_caption

In [8]:
# Function to generate audio from caption
def generate_audio(caption, output_path="output_audio.wav"):
    # Load MusicGen model
    tokenizer = AutoTokenizer.from_pretrained("csc-unipd/tasty-musicgen-small")
    model = AutoModelForTextToWaveform.from_pretrained("csc-unipd/tasty-musicgen-small")

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model.to(device)

    print(f"\n🎵 Generating music for caption: {caption}")

    # Process input text for MusicGen
    inputs = tokenizer(caption, return_tensors="pt").to(device)

    # Generate music
    with torch.no_grad():
        audio_output = model.generate(**inputs.to(device), do_sample=True, guidance_scale=3, max_new_tokens=256)

    # Save audio file
    torchaudio.save(output_path, audio_output[0].cpu(), sample_rate=32000)
    print(f"✅ Music saved: {output_path}")

In [9]:
# Main function to integrate all steps
def main(image_path, output_audio_path="output_audio.wav"):
    print("Generating food caption from image...")
    food_caption = generate_image_caption(image_path)
    print(f"Generated Food Caption: {food_caption}")

    print("Refining caption using GPT-4...")
    refined_caption = refine_caption_gpt4(food_caption)
    print(f"GPT-4 Refined Caption: {refined_caption}")

    print("Generating audio from refined caption...")
    generate_audio(refined_caption, output_audio_path)
    print("Process complete.")

In [18]:
# Example usage
if __name__ == "__main__":
    image_path = "test_002572.jpg"  # Replace with your image file path
    main(image_path)

Generating food caption from image...
Generated Food Caption: a plate of sushi with shrimp, shrimp, and other sushi
Refining caption using GPT-4...
GPT-4 Refined Caption: "Feast your ears on this, we have before us, a vibrant plate piled with succulent sushi. Highlighting plump, briny shrimp that command center stage, each piece is lovingly glazed with a glossy sheen. Delicate, silky ribbons of the freshest catch of the
Generating audio from refined caption...

🎵 Generating music for caption: "Feast your ears on this, we have before us, a vibrant plate piled with succulent sushi. Highlighting plump, briny shrimp that command center stage, each piece is lovingly glazed with a glossy sheen. Delicate, silky ribbons of the freshest catch of the
✅ Music saved: output_audio.wav
Process complete.
