In [None]:
# !python3 -m pip install -U git+https://github.com/facebookresearch/audiocraft#egg=audiocraft
# !pip install -q openai
# !pip install git+https://github.com/huggingface/transformers.git
# !pip install datasets
# !pip install fiftyone

In [None]:
# !pip install langchain pymongo bs4 openai tiktoken gradio requests lxml argparse unstructured

In [None]:
import torchaudio
import IPython
import numpy as np
import openai
import transformers

from IPython.display import Audio
from audiocraft.models import musicgen
from audiocraft.utils.notebook import display_audio
import torch
import base64
import requests
import torch.nn as nn

In [None]:
from pymongo import MongoClient
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import MongoDBAtlasVectorSearch
from langchain.document_loaders import DirectoryLoader
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
import gradio as gr
from gradio.themes.base import Base
from tqdm import tqdm
from openai import OpenAI
import pandas as pd

In [None]:
import fiftyone as fo
import fiftyone.zoo as foz
from datasets import load_dataset
from transformers import AutoProcessor, ClapModel, ClapProcessor, AutoFeatureExtractor, ClapTextModelWithProjection, AutoTokenizer
import soundfile as sf
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import librosa

In [None]:
# Credentials are read from the environment so that real keys are never saved
# inside the notebook. Each value falls back to an empty string (the previous
# placeholder) when its variable is not set.
open_ai_key = os.environ.get("OPENAI_API_KEY", "")
mongo_uri = os.environ.get("MONGODB_URI", "")

In [None]:
# Caption table: its 'caption' column is indexed later with the row ids that
# come back from the vector search (and with randomly sampled row ids).
df = pd.read_csv('/content/musiccaps_caption.csv')
# OpenAI API client used by the chat-completion and transcription calls below.
# `OpenAI` is the class from the `openai` package here: that import comes after
# the langchain one of the same name and therefore shadows it.
client = OpenAI(api_key = open_ai_key)

# MusicGen

In [None]:
# Load the pretrained 'medium' MusicGen checkpoint. The GPU is used when one is
# available; otherwise the model is placed on the CPU instead of failing.
model = musicgen.MusicGen.get_pretrained(
    'medium',
    device='cuda' if torch.cuda.is_available() else 'cpu',
)
# Every generated clip is 8 seconds long.
# NOTE(review): the `more_details` text sent to GPT-4 says "15 seconds" - confirm
# which of the two values is intended.
model.set_generation_params(duration=8)

def get_music(prompt):
    """Generate audio for one text prompt with the module-level MusicGen ``model``.

    The prompt is wrapped in a single-element list, generation runs with the
    progress display enabled, and whatever ``model.generate`` returns is handed
    back unchanged.
    """
    descriptions = [prompt]
    return model.generate(descriptions, progress=True)

# Extracting Context Aware Examples

In [None]:
# Connect to MongoDB with the configured URI and select the collection that is
# handed to the vector store further down.
mongo_client = MongoClient(mongo_uri)
dbName = "MLProj_keywords"
collectionName = "collection_of_keywords_blobs"
collection = mongo_client[dbName][collectionName]

In [None]:
# Embedding model passed to the vector store; it uses the same OpenAI key as the chat client.
embeddings = OpenAIEmbeddings(openai_api_key=open_ai_key)

In [None]:
# Vector store over the MongoDB collection; query_data() runs its similarity searches here.
# NOTE(review): no index name is passed, so the library default applies - confirm it
# matches the search index configured in Atlas.
vectorStore = MongoDBAtlasVectorSearch( collection, embeddings )

In [None]:
def query_data(query):
    """Run a similarity search for ``query`` and split the hits into two lists.

    The search is executed on the module-level ``vectorStore`` with its default
    settings (no explicit result count or index name is passed).

    Returns:
        A ``(contents, rows)`` tuple: the ``page_content`` of every matching
        document and the value stored under ``metadata['row']`` for each one,
        both in the order the store returned them.
    """
    hits = list(vectorStore.similarity_search(query))
    contents = [hit.page_content for hit in hits]
    row_ids = [hit.metadata['row'] for hit in hits]
    return contents, row_ids

# Get Keywords from OpenAI

In [None]:
def get_keywords(user_prompt):
    """Ask GPT-4 for ten comma-separated mood/theme keywords describing ``user_prompt``.

    Sends a fixed system instruction plus the user's text to the chat
    completions endpoint through the module-level ``client`` and returns the
    text content of the first choice.
    """
    system_instruction = "Please provide 10 keywords that capture the thematic elements and emotional essence of the given text, suitable for inspiring music composition. The keywords should evoke the main themes and emotions, including one keyword related to the text's specific subject (like a nightclub, gaming, sports, movie, or religious). Avoid nouns, locations, specific or technical terms, focusing on broader themes and the overall mood. Output only the keywords separated by comma."
    conversation = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": user_prompt},
    ]

    response = client.chat.completions.create(model="gpt-4", messages=conversation)
    first_choice = response.choices[0]
    return first_choice.message.content

# Generating Music From Input Text

In [None]:
# def get_keywords_text(user_prompt):

#     completion = client.chat.completions.create(
#       model="gpt-4",
#       messages=[
#         {"role": "system", "content": "Please provide 10 keywords that capture the thematic elements and emotional essence of the article, suitable for inspiring music composition. The keywords should evoke the main themes and emotions, including one keyword related to the article’s specific subject (like a nightclub, gaming, sports, movie or religious). Avoid nouns, locations, specific or technical terms, focusing on broader themes and the overall mood. Output only the keywords separated by comma."},
#         {"role": "user", "content": user_prompt}
#       ]
#     )

#     # print(completion.choices[0].message)
#     return completion.choices[0].message.content

In [None]:
# Background on MusicGen that is embedded in the system prompt sent to GPT-4.
# NOTE(review): the text says "15 seconds" while the model is configured with
# duration=8 - confirm which value is intended.
more_details = "The model will generate 15 seconds of audio based on the description you provided. The model was trained with description from a stock music catalog, descriptions that will work best should include some level of details on the instruments present, along with some intended use case (e.g. adding “perfect for a commercial” can somehow help)."

def get_prompts(user_prompt):
    """Turn free-form user text into a MusicGen prompt written by GPT-4.

    Steps:
      1. extract keywords from the text with ``get_keywords``;
      2. look up similar entries with ``query_data`` and fetch the matching
         captions from ``df['caption']`` to use as example prompts;
      3. ask GPT-4 for a MusicGen prompt, showing it those examples.

    At most four examples are included. Fewer matches (including none) are
    handled instead of raising ``IndexError``; with exactly four examples the
    system prompt is identical to the previous hard-coded template.

    Args:
        user_prompt: The text whose context/emotion should drive the music.

    Returns:
        The message content of the first completion choice.
    """
    keywords_query = get_keywords(user_prompt)
    _, keywords_rows = query_data(keywords_query)

    examples = [df['caption'][row] for row in list(keywords_rows)[:4]]

    system_prompt = (
        "You are a music generation assistant, which helps to write prompts for MusicGen which is a text-to-music generation model. "
        "Your task is to take in the user input text provided to you and understand the context/emotion of it and generate an appropriate prompt for MusicGen, "
        "by appropriate I mean that the prompt should only be concerned about the musical details and should not mention any details regarding the user input. "
        f"More Details: {more_details} Only output the prompt and nothing else."
    )
    if examples:
        numbered = [f"{position}. {caption}" for position, caption in enumerate(examples, start=1)]
        # Reproduces the original layout "1. a 2. b, 3. c, 4. d": a single space
        # after the first example, ", " between all the following ones.
        examples_text = numbered[0]
        if len(numbered) > 1:
            examples_text += " " + ", ".join(numbered[1:])
        system_prompt += " Some example prompts for you to understand the kind of language MusicGen expects: " + examples_text

    completion = client.chat.completions.create(
      model="gpt-4",
      messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
      ]
    )

    return completion.choices[0].message.content

In [None]:
def get_music_from_text(user_prompt):
    """Generate music for a piece of text.

    The text is first converted into a MusicGen prompt by ``get_prompts``; that
    prompt is passed to ``get_music`` and the first item of the result is returned.
    """
    generated = get_music(get_prompts(user_prompt))
    return generated[0]

In [None]:
# query = "A free retro Nintendo game emulator is now live on the Apple App Store, and because of recent changes made to Apple's app review guidelines, it's actually permitted to be on there. Whether Nintendo will take kindly to it is another matter, given its stance that supporting emulation also supports the illegal piracy of our products. In case you missed it, earlier this week, Apple made a tweak to its guidelines surrounding mini apps, mini-games, streaming games, chatbots, plug-ins, and game emulators, which can be found under point 4.7 of its guidelines. From now on, retro game console emulator apps can offer to download games, although developers are responsible for all such software offered in your app, including ensuring that such software complies with these Guidelines and all applicable laws. As such, emulators are officially allowed to be on the App Store, and Delta, developed by Riley Testut, is one of them. It's not the first Nintendo game emulator to be released following the rule changes, as a Nintendo Entertainment System emulator named Bimmy emerged this week before it was pulled by its own developer out of fear (via The Verge)."
# music_from_text = get_music_from_text(query)

In [None]:
# display_audio(music_from_text, 32000)

# Generating Music from Audio

In [None]:
# `more_details` is not redefined here: this function now delegates to
# get_prompts(), which uses the identical text defined next to it.

def get_prompts_audio(user_audio_path):
    """Transcribe an audio file and turn the transcript into a MusicGen prompt.

    The file is transcribed with the ``whisper-1`` model (plain-text response).
    The transcript then goes through ``get_prompts``, i.e. the same keyword
    extraction, example lookup and GPT-4 prompt writing used for text input;
    this function previously carried a verbatim copy of that logic.

    Args:
        user_audio_path: Path of the audio file to transcribe.

    Returns:
        The MusicGen prompt returned by ``get_prompts`` for the transcript.
    """
    with open(user_audio_path, "rb") as audio_file:
        # Use the audio.transcriptions.create method to transcribe the audio file
        user_prompt = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="text"
        )

    return get_prompts(user_prompt)

In [None]:
def get_music_from_audio(user_audio_path):
    """Generate music for an audio recording.

    ``get_prompts_audio`` derives a MusicGen prompt from the file at
    ``user_audio_path``; the prompt is passed to ``get_music`` and the first
    item of the result is returned.
    """
    generated = get_music(get_prompts_audio(user_audio_path))
    return generated[0]

In [None]:
# audio_path = ""
# music_from_audio = get_music_from_audio(audio_path)

In [None]:
# display_audio(music_from_audio, 32000)

# Generating Music from Image

In [None]:
def encode_image(image_path):
  """Return the contents of the file at ``image_path`` as a base64 string.

  The file is read in binary mode and the base64 bytes are decoded as UTF-8.
  """
  with open(image_path, "rb") as image_file:
    raw_bytes = image_file.read()
  encoded = base64.b64encode(raw_bytes)
  return encoded.decode('utf-8')

In [None]:
# Background on MusicGen that is embedded in the system prompt sent to GPT-4.
more_details = "The model will generate 15 seconds of audio based on the description you provided. The model was trained with description from a stock music catalog, descriptions that will work best should include some level of details on the instruments present, along with some intended use case (e.g. adding “perfect for a commercial” can somehow help)."

def get_prompts_image(user_image_path, random_sample = False):
    """Describe an image with GPT-4 Turbo and turn the description into a MusicGen prompt.

    Steps:
      1. send the base64-encoded image to the chat completions endpoint and
         ask for a description of its content, context and emotion;
      2. pick example captions from ``df['caption']`` - either rows matched
         through ``get_keywords`` + ``query_data`` or, when ``random_sample``
         is true, randomly sampled rows;
      3. ask GPT-4 for a MusicGen prompt, showing it those examples.

    Changes from the earlier version: the HTTP response status is checked, the
    request has a timeout, the data-URL MIME type follows the file extension
    (JPEG remains the fallback) and fewer than four examples no longer raise
    ``IndexError``. With exactly four examples the system prompt is unchanged.

    Args:
        user_image_path: Path of the image file.
        random_sample: Use randomly sampled captions as examples instead of
            the context-aware ones.

    Returns:
        The message content of the first completion choice.

    Raises:
        The HTTP error raised by ``response.raise_for_status()`` when the
        image-description request fails.
    """
    import mimetypes  # local import keeps this cell self-contained

    ################################################
    # Get user_prompt from image

    # Getting the base64 string
    base64_image = encode_image(user_image_path)

    # Use the real image type when the extension reveals it; JPEG was the
    # previously hard-coded value and stays as the fallback.
    guessed_type, _ = mimetypes.guess_type(user_image_path)
    if guessed_type and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = "image/jpeg"

    headers = {
      "Content-Type": "application/json",
      "Authorization": f"Bearer {open_ai_key}"
    }

    payload = {
      "model": "gpt-4-turbo",
      "messages": [
        {
          "role": "user",
          "content": [
            {
              "type": "text",
              "text": "What’s in this image? Try to describe in a way that we get to understand the context and the emotion of the image."
            },
            {
              "type": "image_url",
              "image_url": {
                "url": f"data:{mime_type};base64,{base64_image}"
              }
            }
          ]
        }
      ],
      "max_tokens": 300
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=120,  # avoid hanging forever on a stalled connection
    )
    # Surface API failures (bad key, rate limit, ...) as an HTTP error instead
    # of an opaque KeyError on the missing 'choices' field.
    response.raise_for_status()
    user_prompt = response.json()['choices'][0]['message']['content']

    ################################################

    if random_sample:
      # Never request more rows than the table holds.
      example_rows = list(df.sample(n=min(4, len(df))).index)
    else:
      keywords_query = get_keywords(user_prompt)
      _, keywords_rows = query_data(keywords_query)
      example_rows = list(keywords_rows)[:4]
    examples = [df['caption'][row] for row in example_rows]

    system_prompt = (
        "You are a music generation assistant, which helps to write prompts for MusicGen which is a text-to-music generation model. "
        "Your task is to take in the user input text provided to you and understand the context/emotion of it and generate an appropriate prompt for MusicGen, "
        "by appropriate I mean that the prompt should only be concerned about the musical details and should not mention any details regarding the user input. "
        f"More Details: {more_details} Only output the prompt and nothing else."
    )
    if examples:
        numbered = [f"{position}. {caption}" for position, caption in enumerate(examples, start=1)]
        # Reproduces the original layout "1. a 2. b, 3. c, 4. d".
        examples_text = numbered[0]
        if len(numbered) > 1:
            examples_text += " " + ", ".join(numbered[1:])
        system_prompt += " Some example prompts for you to understand the kind of language MusicGen expects: " + examples_text

    completion = client.chat.completions.create(
      model="gpt-4",
      messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
      ]
    )

    return completion.choices[0].message.content

In [None]:
def get_music_from_image(user_image_path, random_sample = False):
    """Generate music for an image and also report the prompt that was used.

    ``get_prompts_image`` produces the MusicGen prompt (``random_sample`` is
    forwarded to it) and ``get_music`` generates from that prompt.

    Returns:
        A ``(first generated item, prompt)`` tuple.
    """
    prompt_text = get_prompts_image(user_image_path, random_sample)
    generated = get_music(prompt_text)
    return generated[0], prompt_text

In [None]:
# music_from_image, music_gen_prompt = get_music_from_image('/content/download.jpeg')

In [None]:
# display_audio(music_from_image, 32000)

# MS-COCO Dataset to Check CLAP Scores on

In [None]:
# Load up to 50 samples from the validation split of COCO-2017 through the
# FiftyOne dataset zoo. The evaluation loop further down reads the image files
# straight from disk rather than going through this `dataset` object.
dataset = foz.load_zoo_dataset(
    "coco-2017",
    split="validation",
    # label_types=["segmentations"],
    # classes=["cat", "dog"],
    max_samples=50,
)

In [None]:
# CLAP model used to score how well generated audio matches its text prompt.
# The model, the audio feature extractor and the text tokenizer are all loaded
# from the same "laion/larger_clap_general" checkpoint. The model is left on
# the default device (the `.to(0)` call is commented out).
clap_model = ClapModel.from_pretrained("laion/larger_clap_general") #.to(0)
clap_processor = AutoFeatureExtractor.from_pretrained("laion/larger_clap_general")
clap_tokenizer = AutoTokenizer.from_pretrained("laion/larger_clap_general")

In [None]:
# Specify the directory path
directory_path = '/root/fiftyone/coco-2017/validation/data/'

# Generated clips are saved here. The folder is created up front so that
# writing the first .wav file cannot fail on a missing directory.
output_dir = '/content/music'
os.makedirs(output_dir, exist_ok=True)

count = 0

cos_scores_context = []
cos_scores_random = []

# Built once and reused. Both embeddings are flattened to 1-D before the call,
# so dim=0 compares them as plain vectors.
cos = nn.CosineSimilarity(dim=0, eps=1e-6)


def _score_image(image_path, random_sample, label, scores, index):
    """Generate music for one image and record its CLAP text/audio similarity.

    Args:
        image_path: Path of the image to generate music for.
        random_sample: Forwarded to ``get_music_from_image`` (random example
            captions instead of context-aware ones).
        label: "context" or "random"; used in the output file name and in the
            printed messages.
        scores: List that the cosine score of this run is appended to.
        index: Running image number used in the output file name.

    Returns:
        The ``(generated audio, prompt)`` pair from ``get_music_from_image``.
    """
    music_gen, music_prompt = get_music_from_image(image_path, random_sample)
    display_audio(music_gen, 32000)
    waveform = music_gen.cpu().numpy()[0]  # first row of the generated array
    sf.write(f'{output_dir}/output_{label}_{index}.wav', waveform, 32000)
    print(f"music_prompt_{label} : ", music_prompt)

    text_inputs = clap_tokenizer(music_prompt, padding=True, return_tensors="pt")
    # The audio is resampled from 32 kHz to 48 kHz before feature extraction;
    # the rate is passed along explicitly so the extractor can validate it.
    resampled_audio = librosa.resample(waveform, orig_sr=32000, target_sr=48000)
    audio_inputs = clap_processor(torch.tensor(resampled_audio), sampling_rate=48000, return_tensors="pt")

    # Scoring only: no gradients are needed.
    with torch.no_grad():
        text_features = clap_model.get_text_features(**text_inputs)
        audio_features = clap_model.get_audio_features(**audio_inputs)
        cos_score = cos(text_features.reshape(-1), audio_features.reshape(-1))

    scores.append(cos_score.detach().numpy())
    print("\ncos_score : ", cos_score)
    print("mean cos_score : ", np.mean(scores))
    return music_gen, music_prompt


# Loop through each file in the directory
for filename in tqdm(sorted(os.listdir(directory_path))):
    if not filename.endswith('.jpg'):
        continue

    print("\n#################################################")
    print("Count : ", count)
    print("filename : ",filename)
    # Create the full path by joining the directory path and the filename
    full_path = os.path.join(directory_path, filename)

    # Load the image from the file
    img = mpimg.imread(full_path)

    # Display the image
    plt.imshow(img)
    plt.title(f"Displaying: {filename}")
    plt.axis('off')  # Turn off axis numbers and ticks
    plt.show()

    # Context: example captions retrieved through the keyword vector search
    print("\nContext : ")
    music_gen_context, music_prompt_context = _score_image(
        full_path, False, "context", cos_scores_context, count
    )

    # Random: example captions sampled at random (baseline)
    print("\nRandom : ")
    music_gen_random, music_prompt_random = _score_image(
        full_path, True, "random", cos_scores_random, count
    )

    count += 1
    # if count == 50:
    #   break

In [None]:
# # Save to an audio file
# sf.write('/content/output.wav', music_gen.cpu().numpy()[0], 32000)