In [None]:
pip install --upgrade openai

In [None]:
import torch
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    MusicgenForConditionalGeneration,
    AutoProcessor
)
from PIL import Image
import torchaudio
import openai

In [None]:
# Do not hard-code the API key in the notebook. When `api_key` is omitted the
# OpenAI SDK reads it from the OPENAI_API_KEY environment variable and raises
# an error at construction time if the variable is not set.
client = openai.OpenAI()

In [None]:
import os

from huggingface_hub import login

# Read the Hugging Face token from the environment instead of embedding it in
# the notebook. Login is skipped when HF_TOKEN is unset.
# NOTE(review): the checkpoints used below are presumably public and
# downloadable anonymously - confirm before relying on the skip.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)

In [None]:
# Function to generate initial caption from image
def generate_image_caption(image_path, max_new_tokens=50, repetition_penalty=1.5):
    """Caption an image with the BLIP base image-captioning checkpoint.

    The processor and model are loaded with ``from_pretrained`` on every call.

    Args:
        image_path: Path (or file object) passed to ``PIL.Image.open``.
        max_new_tokens: Upper bound on the number of generated caption tokens.
            Setting it explicitly avoids relying on the model's short default
            generation length, which can cut a caption off mid-word.
        repetition_penalty: Penalty (> 1.0 discourages) applied to tokens that
            were already generated, to reduce degenerate captions that repeat
            the same word. Pass ``1.0`` to disable it.

    Returns:
        The decoded caption (special tokens removed) for the image.
    """
    model_name = "Salesforce/blip-image-captioning-base"
    processor = BlipProcessor.from_pretrained(model_name)
    model = BlipForConditionalGeneration.from_pretrained(model_name)

    # Same device selection rule as generate_audio: use the GPU when present.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # The context manager releases the file handle; convert() returns a new
    # image object that remains usable after the file is closed.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    # Model inputs must live on the same device as the model.
    inputs = processor(images=image, return_tensors="pt").to(device)
    caption_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        repetition_penalty=repetition_penalty,
    )
    caption = processor.batch_decode(caption_ids, skip_special_tokens=True)[0]

    return caption

In [None]:
# Refine a raw image caption through the OpenAI chat completions endpoint
def refine_caption_gpt4(food_caption):
    """Rewrite a food caption as an expressive, audio-friendly caption.

    Sends the caption to the ``gpt-4`` chat model through the module-level
    ``client`` and returns the first choice's text with surrounding
    whitespace removed.

    Args:
        food_caption: The caption text to refine.

    Returns:
        The refined caption. If the model reply has no text (``None`` or only
        whitespace), ``food_caption`` is returned unchanged so downstream
        steps still receive a usable caption.
    """
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are an AI designed to generate expressive, audio-friendly captions for food."},
            {"role": "user", "content": f"Convert this food description into an expressive audio caption: {food_caption}"}
        ],
        max_tokens=60
    )

    # The message content may be None; guard before calling str methods on it.
    content = response.choices[0].message.content
    refined_caption = (content or "").strip()

    # Fall back to the unrefined caption rather than returning an empty string.
    return refined_caption or food_caption

In [None]:
# Function to generate audio from caption
def generate_audio(caption, output_path="output_audio.wav", max_new_tokens=256, guidance_scale=3):
    """Generate music from a text caption with MusicGen and save it to disk.

    The ``facebook/musicgen-medium`` model and processor are loaded with
    ``from_pretrained`` on every call. Generation uses sampling
    (``do_sample=True``), so repeated calls can produce different audio.

    Args:
        caption: Text prompt passed to the MusicGen processor.
        output_path: Destination file handed to ``torchaudio.save``.
        max_new_tokens: Number of audio tokens to generate; larger values
            produce longer clips.
        guidance_scale: Guidance scale forwarded to ``model.generate``.

    Returns:
        None. The audio is written to ``output_path``.
    """
    # Load MusicGen model
    model_name = "facebook/musicgen-medium"
    model = MusicgenForConditionalGeneration.from_pretrained(model_name)
    processor = AutoProcessor.from_pretrained(model_name)

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model.to(device)

    print(f"\n🎵 Generating music for caption: {caption}")

    # Process input text for MusicGen and place it on the model's device
    inputs = processor(text=[caption], return_tensors="pt").to(device)

    # Generate music
    with torch.no_grad():
        audio_output = model.generate(
            **inputs,
            do_sample=True,
            guidance_scale=guidance_scale,
            max_new_tokens=max_new_tokens,
        )

    # Take the sampling rate from the loaded model's audio encoder config
    # instead of hard-coding it, so the saved file always matches the model.
    sample_rate = model.config.audio_encoder.sampling_rate

    # Save the first (only) item of the batch
    torchaudio.save(output_path, audio_output[0].cpu(), sample_rate=sample_rate)
    print(f"✅ Music saved: {output_path}")

In [None]:
# End-to-end pipeline: image -> caption -> refined caption -> audio file
def main(image_path, output_audio_path="output_audio.wav"):
    """Run the captioning, refinement and audio-generation steps in order.

    Progress and intermediate captions are printed along the way.

    Args:
        image_path: Image handed to ``generate_image_caption``.
        output_audio_path: Destination passed to ``generate_audio``.

    Returns:
        None.
    """
    # Step 1: caption the image.
    print("Generating food caption from image...")
    raw_caption = generate_image_caption(image_path)
    print(f"Generated Food Caption: {raw_caption}")

    # Step 2: refine the raw caption with the chat model.
    print("Refining caption using GPT-4...")
    polished_caption = refine_caption_gpt4(raw_caption)
    print(f"GPT-4 Refined Caption: {polished_caption}")

    # Step 3: generate audio from the refined caption.
    print("Generating audio from refined caption...")
    generate_audio(polished_caption, output_audio_path)

    print("Process complete.")

In [None]:
# Demo run: send one sample image through the full pipeline
if __name__ == "__main__":
    # Replace with your image file path
    image_path = "test_001038.jpg"
    main(image_path=image_path)

Generating food caption from image...
Generated Food Caption: a group of macaron macarons macarons macarons macarons macaro
Refining caption using GPT-4...
GPT-4 Refined Caption: "Feast your senses on a delightful medley of vibrantly colored macarons! One taste of these cute little treats will whisk you straight to a charming Parisian bakery. Indulge in the sweet symphony of flavors concealed in every bite. Bon appétit!"
Generating audio from refined caption...


config.json:   0%|          | 0.00/7.87k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/8.04G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/8.04G [00:00<?, ?B/s]

  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "leng

generation_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]


🎵 Generating music for caption: "Feast your senses on a delightful medley of vibrantly colored macarons! One taste of these cute little treats will whisk you straight to a charming Parisian bakery. Indulge in the sweet symphony of flavors concealed in every bite. Bon appétit!"
✅ Music saved: output_audio.wav
Process complete.
