### **Dependencies**

In [1]:
# !pip install einops
# !pip install flashy
# !pip install hydra-core
# !pip install hydra-colorlog
# !pip install julius
# !pip install num2words
# !pip install numpy
# !pip install sentencepiece
# !pip install spacy==3.7.2
!pip install thinc==8.2.3
# !pip install torch
# !pip install torchaudio
# !pip install huggingface_hub
# !pip install tqdm
# !pip install transformers
# !pip install xformers
# !pip install demucs
# !pip install librosa
# !pip install gradio
# !pip install torchmetrics
!pip install encodec



[0m

# **Iterative Code**

In [4]:
import torch
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from IPython.display import Audio, display

def generate_audio(inp, model, duration):
    """Generate audio for a text prompt with a Hugging Face MusicGen model.

    Args:
        inp: Text prompt (or list of prompts) handed to the processor.
        model: A loaded ``MusicgenForConditionalGeneration`` instance; it is
            moved to CUDA when available, otherwise to the CPU.
        duration: Requested clip length in seconds.

    Returns:
        ``(audio_data, sampling_rate)``: the first channel of the first
        generated item as a NumPy array, and the audio encoder sampling rate.
    """
    processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model.to(device)
    audio_config = model.config.audio_encoder
    sampling_rate = audio_config.sampling_rate
    inputs = processor(
        text=inp,
        padding=True,
        return_tensors="pt",
    )

    # The decoder emits one token step per audio-encoder frame, so the token
    # budget is duration * frame_rate. The previous code computed
    # duration * sampling_rate, never used it, and always generated 64 tokens.
    max_new_tokens = max(1, int(duration * audio_config.frame_rate))

    print("Generating audio...")
    with torch.no_grad():
        audio_values = model.generate(
            **inputs.to(device),
            max_new_tokens=max_new_tokens,
        )

    audio_data = audio_values[0, 0].cpu().numpy()
    return audio_data, sampling_rate

def process_input(user_input, artist_input, model, duration):
    """Build the text prompt, validate the inputs and generate the audio.

    Args:
        user_input: Free-text description; may be empty.
        artist_input: Second prompt fragment; may be empty.
        model: Generation model handed to ``generate_audio``.
        duration: Clip length in seconds (number or numeric string).

    Returns:
        The ``(audio_data, sampling_rate)`` pair from ``generate_audio``, or
        ``(None, None)`` when the prompt is empty or ``duration`` is not a
        positive finite number.
    """
    print("User input:", user_input)
    print("Artist input:", artist_input)
    print("Duration:", duration)

    # Join only the non-empty parts so a missing second field no longer
    # leaves a dangling ", " at the end of the prompt.
    s = ", ".join(part for part in (user_input, artist_input) if part)
    print("Combined input:", s)

    # input() yields strings; reject bad durations here instead of letting
    # float() raise in the middle of the pipeline.
    try:
        seconds = float(duration)
    except (TypeError, ValueError):
        seconds = 0.0

    if not s or not 0 < seconds < float("inf"):
        print("Invalid input")
        return None, None

    audio_data, sampling_rate = generate_audio(s, model, seconds)
    return audio_data, sampling_rate

def main():
    """Load MusicGen, read the three console inputs and play the generated clip."""
    model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

    user_input = input("Enter user input: ")
    artist_input = input("Enter artist input: ")
    duration_input = input("Enter tune duration (in seconds): ")

    waveform, rate = process_input(user_input, artist_input, model, duration_input)

    # Guard clause: a None in either slot means nothing was generated.
    if waveform is None or rate is None:
        print("There is something wrong with the audio generation")
        return

    print("Audio file generated")
    # Render an inline audio player in the notebook output.
    display(Audio(waveform, rate=rate))

if __name__ == "__main__":
    main()


Enter user input: asdadasd
Enter artist input: fasfsdfs
Enter tune duration (in seconds): 1
User input: asdadasd
Artist input: fasfsdfs
Duration: 1
Combined input: asdadasd, fasfsdfs


OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB. GPU 0 has a total capacty of 14.75 GiB of which 1.06 MiB is free. Process 1236 has 5.32 GiB memory in use. Process 6492 has 6.14 GiB memory in use. Process 62859 has 3.29 GiB memory in use. Of the allocated memory 3.06 GiB is allocated by PyTorch, and 100.56 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
%cd /content/sample_data/text-tune-ai

In [None]:
pwd

In [None]:
!git clone https://github.com/facebookresearch/audiocraft/


In [None]:
pwd

In [None]:
%cd /content/sample_data/text-tune-ai


In [None]:
  pip install -r requirements.txt

In [None]:
!pip install audiocraft

In [None]:
pwd

In [None]:
import torch, torchaudio, IPython
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from audiocraft.audiocraft.models import MusicGen
from IPython.display import Audio, display


def generate_music_tensors(prompt, model, duration, sr):
    """Generate audio for ``prompt`` and trim it to ``duration`` seconds.

    Args:
        prompt: Text description, sent to ``model.generate`` as a one-item list.
        model: Object exposing ``set_generation_params`` and ``generate``.
        duration: Requested length in seconds (number or numeric string).
        sr: Sample rate used to turn ``duration`` into a sample count.

    Returns:
        The first item of the ``generate`` result, cut to at most
        ``duration * sr`` entries along its last axis.

    Raises:
        ValueError: If ``duration`` is not numeric.
    """
    # Parse once. Mixing int(duration) and float(duration) made a fractional
    # string such as "2.5" fail in int() before anything was generated.
    seconds = float(duration)
    model.set_generation_params(
        use_sampling=True,
        top_k=250,
        duration=seconds,
    )
    print("Your custom tune is under generation....")
    output = model.generate(
        descriptions=[prompt],
        progress=True,
        return_tokens=True,
    )

    # return_tokens=True was requested, so the audio is the first item.
    audio = output[0]
    # Trim along the last axis (time). The previous audio[:, :n] indexed the
    # second axis, which for a [batch, channels, samples] tensor is channels,
    # so nothing was ever trimmed.
    return audio[..., :int(seconds * sr)]

def process_input(user_input, artist_input, model, duration):
    """Build the text prompt, validate the inputs and generate the audio.

    Args:
        user_input: Free-text description; may be empty.
        artist_input: Second prompt fragment; may be empty.
        model: Generation model handed to ``generate_music_tensors``.
        duration: Clip length in seconds (number or numeric string).

    Returns:
        ``(music_tensors, 32000)`` on success, ``(None, None)`` when the
        prompt is empty or ``duration`` is not a positive finite number.
    """
    print("User input:", user_input)
    print("Artist input:", artist_input)
    print("Duration:", duration)
    sr = 32000

    # Join only the non-empty parts so a missing second field no longer
    # leaves a dangling ", " at the end of the prompt.
    s = ", ".join(part for part in (user_input, artist_input) if part)
    print("Combined input:", s)

    # input() yields strings; reject bad durations here instead of crashing
    # inside the generator.
    try:
        seconds = float(duration)
    except (TypeError, ValueError):
        seconds = 0.0

    if not s or not 0 < seconds < float("inf"):
        print("Invalid input")
        return None, None

    music_tensors = generate_music_tensors(s, model, seconds, sr)
    return music_tensors, sr

def main():
    """Load MusicGen, read the three console inputs and play the generated clip."""
    model = MusicGen.get_pretrained('facebook/musicgen-small')

    description = input("Enter user input: ")
    mood = input("Enter your theme/tune mood: ")
    seconds_text = input("Enter tune duration (in seconds): ")

    waveform, rate = process_input(description, mood, model, seconds_text)

    # Guard clause: a None in either slot means nothing was generated.
    if waveform is None or rate is None:
        print("There is something wrong with the audio generation")
        return

    print("Audio file generated")
    # Move to CPU and drop size-1 axes before handing the samples to the player.
    samples = waveform.cpu().numpy().squeeze()
    IPython.display.display(IPython.display.Audio(samples, rate=32000))

if __name__ == "__main__":
    main()


In [None]:
import torch, torchaudio, IPython
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from audiocraft.models import MusicGen
from IPython.display import Audio, display


def generate_music_tensors(prompt, model, duration, sr):
    """Generate audio for ``prompt`` and trim it to ``duration`` seconds.

    Args:
        prompt: Text description, sent to ``model.generate`` as a one-item list.
        model: Object exposing ``set_generation_params`` and ``generate``.
        duration: Requested length in seconds (number or numeric string).
        sr: Sample rate used to turn ``duration`` into a sample count.

    Returns:
        The first item of the ``generate`` result, cut to at most
        ``duration * sr`` entries along its last axis.

    Raises:
        ValueError: If ``duration`` is not numeric.
    """
    # Parse once. Mixing int(duration) and float(duration) made a fractional
    # string such as "2.5" fail in int() before anything was generated.
    seconds = float(duration)
    model.set_generation_params(
        use_sampling=True,
        top_k=250,
        duration=seconds,
    )
    print("Your custom tune is under generation....")
    output = model.generate(
        descriptions=[prompt],
        progress=True,
        return_tokens=True,
    )

    # return_tokens=True was requested, so the audio is the first item.
    audio = output[0]
    # Trim along the last axis (time). The previous audio[:, :n] indexed the
    # second axis, which for a [batch, channels, samples] tensor is channels,
    # so nothing was ever trimmed.
    return audio[..., :int(seconds * sr)]

def process_input(user_prompt, theme, model, duration):
    """Build the text prompt, validate the inputs and generate the audio.

    Args:
        user_prompt: Free-text description; may be empty.
        theme: Mood/theme text appended to the prompt; may be empty.
        model: Generation model handed to ``generate_music_tensors``.
        duration: Clip length in seconds (number or numeric string).

    Returns:
        ``(music_tensors, 32000)`` on success, ``(None, None)`` when the
        prompt is empty or ``duration`` is not a positive finite number.
    """
    print("User input:", user_prompt)
    print("Theme:", theme)
    print("Duration:", duration)
    sr = 32000

    # Join only the non-empty parts so a missing theme no longer leaves a
    # dangling ", " at the end of the prompt.
    s = ", ".join(part for part in (user_prompt, theme) if part)
    print("Combined input:", s)

    # input() yields strings; reject bad durations here instead of crashing
    # inside the generator.
    try:
        seconds = float(duration)
    except (TypeError, ValueError):
        seconds = 0.0

    if not s or not 0 < seconds < float("inf"):
        print("Invalid input")
        return None, None

    music_tensors = generate_music_tensors(s, model, seconds, sr)
    return music_tensors, sr

def main():
    """Load MusicGen, read the three console inputs and play the generated clip."""
    model = MusicGen.get_pretrained('facebook/musicgen-small')

    description = input("Enter user input: ")
    mood = input("Enter your theme/tune mood: ")
    seconds_text = input("Enter tune duration (in seconds): ")

    waveform, rate = process_input(description, mood, model, seconds_text)

    # Guard clause: a None in either slot means nothing was generated.
    if waveform is None or rate is None:
        print("There is something wrong with the audio generation")
        return

    print("Audio file generated")
    # Move to CPU and drop size-1 axes before handing the samples to the player.
    samples = waveform.cpu().numpy().squeeze()
    IPython.display.display(IPython.display.Audio(samples, rate=32000))

if __name__ == "__main__":
    main()


In [None]:
import torch, torchaudio, IPython
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from audiocraft.models import MusicGen
from IPython.display import Audio, display


def generate_music_tensors(prompt, model, duration, sr):
    """Generate audio for ``prompt`` and trim it to ``duration`` seconds.

    Args:
        prompt: Text description, sent to ``model.generate`` as a one-item list.
        model: Object exposing ``set_generation_params`` and ``generate``.
        duration: Requested length in seconds (number or numeric string).
        sr: Sample rate used to turn ``duration`` into a sample count.

    Returns:
        The first item of the ``generate`` result, cut to at most
        ``duration * sr`` entries along its last axis.

    Raises:
        ValueError: If ``duration`` is not numeric.
    """
    # Parse once. Mixing int(duration) and float(duration) made a fractional
    # string such as "2.5" fail in int() before anything was generated.
    seconds = float(duration)
    model.set_generation_params(
        use_sampling=True,
        top_k=250,
        duration=seconds,
    )
    print("Your custom tune is under generation....")
    output = model.generate(
        descriptions=[prompt],
        progress=True,
        return_tokens=True,
    )

    # return_tokens=True was requested, so the audio is the first item.
    audio = output[0]
    # Trim along the last axis (time). The previous audio[:, :n] indexed the
    # second axis, which for a [batch, channels, samples] tensor is channels,
    # so nothing was ever trimmed.
    return audio[..., :int(seconds * sr)]

def process_input(user_prompt, theme, model, duration, sr):
    """Build the text prompt, validate the inputs and generate the audio.

    Args:
        user_prompt: Free-text description; may be empty.
        theme: Mood/theme text appended to the prompt; may be empty.
        model: Generation model handed to ``generate_music_tensors``.
        duration: Clip length in seconds (number or numeric string).
        sr: Sample rate, passed through and returned unchanged.

    Returns:
        ``(music_tensors, sr)`` on success, ``(None, None)`` when the prompt
        is empty or ``duration`` is not a positive finite number.
    """
    print("User input:", user_prompt)
    print("Theme:", theme)
    print("Duration:", duration)

    # Join only the non-empty parts so a missing theme no longer leaves a
    # dangling ", " at the end of the prompt.
    s = ", ".join(part for part in (user_prompt, theme) if part)
    print("Combined prompt:", s)

    # input() yields strings; reject bad durations here instead of crashing
    # inside the generator.
    try:
        seconds = float(duration)
    except (TypeError, ValueError):
        seconds = 0.0

    if not s or not 0 < seconds < float("inf"):
        print("Invalid prompt")
        return None, None

    music_tensors = generate_music_tensors(s, model, seconds, sr)
    return music_tensors, sr

def main():
    """Load MusicGen, read the three console inputs and play the generated clip."""
    model = MusicGen.get_pretrained('facebook/musicgen-small')

    description = input("Enter user input: ")
    mood = input("Enter your theme/tune mood: ")
    seconds_text = input("Enter tune duration (in seconds): ")

    # 32000 is the sample rate handed to process_input and echoed back by it.
    waveform, rate = process_input(description, mood, model, seconds_text, 32000)

    # Guard clause: a None in either slot means nothing was generated.
    if waveform is None or rate is None:
        print("There is something wrong with the audio generation")
        return

    print("Audio file generated")
    # Move to CPU and drop size-1 axes before handing the samples to the player.
    samples = waveform.cpu().numpy().squeeze()
    IPython.display.display(IPython.display.Audio(samples, rate=rate))

if __name__ == "__main__":
    main()


In [None]:
pwd

In [None]:
import torch, torchaudio, IPython
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from audiocraft.models import MusicGen
from IPython.display import Audio, display


def generate_music_tensors(prompt, model, duration, sr):
    """Generate audio for ``prompt`` and trim it to ``duration`` seconds.

    Args:
        prompt: Text description, sent to ``model.generate`` as a one-item list.
        model: Object exposing ``set_generation_params`` and ``generate``.
        duration: Requested length in seconds (number or numeric string).
        sr: Sample rate used to turn ``duration`` into a sample count.

    Returns:
        The first item of the ``generate`` result, cut to at most
        ``duration * sr`` entries along its last axis.

    Raises:
        ValueError: If ``duration`` is not numeric.
    """
    # Parse once. Mixing int(duration) and float(duration) made a fractional
    # string such as "2.5" fail in int() before anything was generated.
    seconds = float(duration)
    model.set_generation_params(
        use_sampling=True,
        top_k=250,
        duration=seconds,
    )
    print("Your custom tune is under generation....")
    output = model.generate(
        descriptions=[prompt],
        progress=True,
        return_tokens=True,
    )

    # return_tokens=True was requested, so the audio is the first item.
    audio = output[0]
    # Trim along the last axis (time). The previous audio[:, :n] indexed the
    # second axis, which for a [batch, channels, samples] tensor is channels,
    # so nothing was ever trimmed.
    return audio[..., :int(seconds * sr)]

def process_input(user_prompt, theme, model, duration, sr):
    """Build the text prompt, validate the inputs and generate the audio.

    Args:
        user_prompt: Free-text description; may be empty.
        theme: Mood/theme text appended to the prompt; may be empty.
        model: Generation model handed to ``generate_music_tensors``.
        duration: Clip length in seconds (number or numeric string).
        sr: Sample rate, passed through and returned unchanged.

    Returns:
        ``(music_tensors, sr)`` on success, ``(None, None)`` when the prompt
        is empty or ``duration`` is not a positive finite number.
    """
    print("User input:", user_prompt)
    print("Theme:", theme)
    print("Duration:", duration)

    # Join only the non-empty parts so a missing theme no longer leaves a
    # dangling ", " at the end of the prompt.
    s = ", ".join(part for part in (user_prompt, theme) if part)
    print("Combined prompt:", s)

    # input() yields strings; reject bad durations here instead of crashing
    # inside the generator.
    try:
        seconds = float(duration)
    except (TypeError, ValueError):
        seconds = 0.0

    if not s or not 0 < seconds < float("inf"):
        print("Invalid prompt")
        return None, None

    music_tensors = generate_music_tensors(s, model, seconds, sr)
    return music_tensors, sr

def main():
    """Load MusicGen, read the three console inputs and play the generated clip."""
    model = MusicGen.get_pretrained('facebook/musicgen-small')

    description = input("Enter user input: ")
    mood = input("Enter your theme/tune mood: ")
    seconds_text = input("Enter tune duration (in seconds): ")

    # 32000 is the sample rate handed to process_input and echoed back by it.
    waveform, rate = process_input(description, mood, model, seconds_text, 32000)

    # Guard clause: a None in either slot means nothing was generated.
    if waveform is None or rate is None:
        print("There is something wrong with the audio generation")
        return

    print("Audio file generated")
    # Move to CPU and drop size-1 axes before handing the samples to the player.
    samples = waveform.cpu().numpy().squeeze()
    IPython.display.display(IPython.display.Audio(samples, rate=rate))

if __name__ == "__main__":
    main()


In [None]:
import torch, torchaudio, IPython
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from audiocraft.models import MusicGen
from IPython.display import Audio, display


def generate_music_tensors(prompt, model, duration, sr):
    """Generate audio for ``prompt`` and trim it to ``duration`` seconds.

    Args:
        prompt: Text description, sent to ``model.generate`` as a one-item list.
        model: Object exposing ``set_generation_params`` and ``generate``.
        duration: Requested length in seconds (number or numeric string).
        sr: Sample rate used to turn ``duration`` into a sample count.

    Returns:
        The first item of the ``generate`` result, cut to at most
        ``duration * sr`` entries along its last axis.

    Raises:
        ValueError: If ``duration`` is not numeric.
    """
    # Parse once. Mixing int(duration) and float(duration) made a fractional
    # string such as "2.5" fail in int() before anything was generated.
    seconds = float(duration)
    model.set_generation_params(
        use_sampling=True,
        top_k=250,
        duration=seconds,
    )
    print("Your custom tune is under generation....")
    output = model.generate(
        descriptions=[prompt],
        progress=True,
        return_tokens=True,
    )

    # return_tokens=True was requested, so the audio is the first item.
    audio = output[0]
    # Trim along the last axis (time). The previous audio[:, :n] indexed the
    # second axis, which for a [batch, channels, samples] tensor is channels,
    # so nothing was ever trimmed.
    return audio[..., :int(seconds * sr)]

def process_input(user_prompt, theme, model, duration, sr):
    """Build the text prompt, validate the inputs and generate the audio.

    Args:
        user_prompt: Free-text description; may be empty.
        theme: Mood/theme text appended to the prompt; may be empty.
        model: Generation model handed to ``generate_music_tensors``.
        duration: Clip length in seconds (number or numeric string).
        sr: Sample rate, passed through and returned unchanged.

    Returns:
        ``(music_tensors, sr)`` on success, ``(None, None)`` when the prompt
        is empty or ``duration`` is not a positive finite number.
    """
    print("User input:", user_prompt)
    print("Theme:", theme)
    print("Duration:", duration)

    # Join only the non-empty parts so a missing theme no longer leaves a
    # dangling ", " at the end of the prompt.
    s = ", ".join(part for part in (user_prompt, theme) if part)
    print("Combined prompt:", s)

    # input() yields strings; reject bad durations here instead of crashing
    # inside the generator.
    try:
        seconds = float(duration)
    except (TypeError, ValueError):
        seconds = 0.0

    if not s or not 0 < seconds < float("inf"):
        print("Invalid prompt")
        return None, None

    music_tensors = generate_music_tensors(s, model, seconds, sr)
    return music_tensors, sr

def main():
    """Load stereo MusicGen, read the three console inputs and play the clip."""
    model = MusicGen.get_pretrained('facebook/musicgen-stereo-small')

    description = input("Enter user input: ")
    mood = input("Enter your theme/tune mood: ")
    seconds_text = input("Enter tune duration (in seconds): ")

    # 32000 is the sample rate handed to process_input and echoed back by it.
    waveform, rate = process_input(description, mood, model, seconds_text, 32000)

    # Guard clause: a None in either slot means nothing was generated.
    if waveform is None or rate is None:
        print("There is something wrong with the audio generation")
        return

    print("Audio file generated")
    # Move to CPU and drop size-1 axes before handing the samples to the player.
    samples = waveform.cpu().numpy().squeeze()
    IPython.display.display(IPython.display.Audio(samples, rate=rate))

if __name__ == "__main__":
    main()


In [None]:
!pip install openai

In [None]:
import torch, torchaudio, IPython
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from audiocraft.models import MusicGen
from IPython.display import Audio, display


def generate_music_tensors(prompt, model, duration, sr):
    """Generate audio for ``prompt`` and trim it to ``duration`` seconds.

    Args:
        prompt: Text description, sent to ``model.generate`` as a one-item list.
        model: Object exposing ``set_generation_params`` and ``generate``.
        duration: Requested length in seconds (number or numeric string).
        sr: Sample rate used to turn ``duration`` into a sample count.

    Returns:
        The first item of the ``generate`` result, cut to at most
        ``duration * sr`` entries along its last axis.

    Raises:
        ValueError: If ``duration`` is not numeric.
    """
    # Parse once. Mixing int(duration) and float(duration) made a fractional
    # string such as "2.5" fail in int() before anything was generated.
    seconds = float(duration)
    model.set_generation_params(
        use_sampling=True,
        top_k=250,
        duration=seconds,
    )
    print("Your custom tune is under generation....")
    output = model.generate(
        descriptions=[prompt],
        progress=True,
        return_tokens=True,
    )

    # return_tokens=True was requested, so the audio is the first item.
    audio = output[0]
    # Trim along the last axis (time). The previous audio[:, :n] indexed the
    # second axis, which for a [batch, channels, samples] tensor is channels,
    # so nothing was ever trimmed.
    return audio[..., :int(seconds * sr)]

def process_input(user_prompt, theme, model, duration, sr):
    """Build the text prompt, validate the inputs and generate the audio.

    Args:
        user_prompt: Free-text description; may be empty.
        theme: Mood/theme text appended to the prompt; may be empty.
        model: Generation model handed to ``generate_music_tensors``.
        duration: Clip length in seconds (number or numeric string).
        sr: Sample rate, passed through and returned unchanged.

    Returns:
        ``(music_tensors, sr)`` on success, ``(None, None)`` when the prompt
        is empty or ``duration`` is not a positive finite number.
    """
    print("User input:", user_prompt)
    print("Theme:", theme)
    print("Duration:", duration)

    # Join only the non-empty parts so a missing theme no longer leaves a
    # dangling ", " at the end of the prompt.
    s = ", ".join(part for part in (user_prompt, theme) if part)
    print("Combined prompt:", s)

    # input() yields strings; reject bad durations here instead of crashing
    # inside the generator.
    try:
        seconds = float(duration)
    except (TypeError, ValueError):
        seconds = 0.0

    if not s or not 0 < seconds < float("inf"):
        print("Invalid prompt")
        return None, None

    music_tensors = generate_music_tensors(s, model, seconds, sr)
    return music_tensors, sr

def main():
    """Load stereo MusicGen, read the three console inputs and play the clip."""
    model = MusicGen.get_pretrained('facebook/musicgen-stereo-small')

    description = input("Enter user input: ")
    mood = input("Enter your theme/tune mood: ")
    seconds_text = input("Enter tune duration (in seconds): ")

    # 32000 is the sample rate handed to process_input and echoed back by it.
    waveform, rate = process_input(description, mood, model, seconds_text, 32000)

    # Guard clause: a None in either slot means nothing was generated.
    if waveform is None or rate is None:
        print("There is something wrong with the audio generation")
        return

    print("Audio file generated")
    # Move to CPU and drop size-1 axes before handing the samples to the player.
    samples = waveform.cpu().numpy().squeeze()
    IPython.display.display(IPython.display.Audio(samples, rate=rate))

if __name__ == "__main__":
    main()


In [None]:
pip install --upgrade openai

In [None]:
import torch, torchaudio, IPython
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from audiocraft.models import MusicGen
from IPython.display import Audio, display
import openai


def query_gpt(user_prompt, theme):
    """Ask the chat model for a one-line description of the requested vibe.

    The API key is read from the ``OPENAI_API_KEY`` environment variable.
    The key that used to be hard-coded here was committed with the notebook
    and should be revoked.

    Args:
        user_prompt: The user's music description, embedded in the request.
        theme: The purpose/mood, embedded in the request.

    Returns:
        The reply text when the completion finished with reason ``"stop"``;
        otherwise ``""`` (missing key, no choices, unfinished completion, or
        any exception raised by the request).
    """
    import os  # local import: this cell's import list has no os

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("OPENAI_API_KEY is not set; skipping the GPT request")
        return ""
    openai.api_key = api_key

    try:
        response = openai.chat.completions.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "system", "content": "You are a music expert, skilled in explaining intricacies in music vibe with contextual flair."},
                {"role": "user", "content": f"I am trying to get a highlevel description for {user_prompt} vibe of music for {theme} purpose. Can you please give me that one line music vibe explaining the rhythm."}
            ]
        )
        print(f"GPT Response: {response}")

        if not response.choices:
            print("No choices found in the response")
            return ""

        choice = response.choices[0]
        # Previously this path fell off the end and returned None, which the
        # caller then tried to concatenate to a string.
        if choice.finish_reason != "stop":
            print("GPT response did not finish normally:", choice.finish_reason)
            return ""

        content = choice.message.content
        print("Content:", content)
        return content or ""

    except Exception as e:
        # Best effort: any API failure degrades to "no description".
        print(f"An error occurred during the OpenAI API request: {e}")
        return ""

def generate_music_tensors(prompt, model, duration, sr):
    """Generate audio for ``prompt`` and trim it to ``duration`` seconds.

    Args:
        prompt: Text description, sent to ``model.generate`` as a one-item list.
        model: Object exposing ``set_generation_params`` and ``generate``.
        duration: Requested length in seconds (number or numeric string).
        sr: Sample rate used to turn ``duration`` into a sample count.

    Returns:
        The first item of the ``generate`` result, cut to at most
        ``duration * sr`` entries along its last axis.

    Raises:
        ValueError: If ``duration`` is not numeric.
    """
    # Parse once. Mixing int(duration) and float(duration) made a fractional
    # string such as "2.5" fail in int() before anything was generated.
    seconds = float(duration)
    model.set_generation_params(
        use_sampling=True,
        top_k=250,
        duration=seconds,
    )
    print("Your custom tune is under generation....")
    output = model.generate(
        descriptions=[prompt],
        progress=True,
        return_tokens=True,
    )

    # return_tokens=True was requested, so the audio is the first item.
    audio = output[0]
    # Trim along the last axis (time). The previous audio[:, :n] indexed the
    # second axis, which for a [batch, channels, samples] tensor is channels,
    # so nothing was ever trimmed.
    return audio[..., :int(seconds * sr)]

def process_input(user_prompt, theme, model, duration, sr):
    """Build a GPT-enriched prompt, validate the inputs and generate the audio.

    Args:
        user_prompt: Free-text description; may be empty.
        theme: Mood/theme; when non-empty it is expanded through ``query_gpt``.
        model: Generation model handed to ``generate_music_tensors``.
        duration: Clip length in seconds (number or numeric string).
        sr: Sample rate, passed through and returned unchanged.

    Returns:
        ``(music_tensors, sr)`` on success, ``(None, None)`` when the prompt
        is empty or ``duration`` is not a positive finite number.
    """
    print("User input:", user_prompt)
    print("Theme:", theme)
    print("Duration:", duration)

    parts = [user_prompt] if user_prompt else []
    if theme:
        res = query_gpt(user_prompt, theme)
        # Treat any falsy reply (empty string or None) as "no description";
        # the old `res != ""` check let None through to a string concatenation.
        if res:
            parts.append(res)

    # Joining avoids the dangling ", " left behind when no description is added.
    s = ", ".join(parts)
    print("Combined prompt:", s)

    # input() yields strings; reject bad durations here instead of crashing
    # inside the generator.
    try:
        seconds = float(duration)
    except (TypeError, ValueError):
        seconds = 0.0

    if not s or not 0 < seconds < float("inf"):
        print("Invalid prompt")
        return None, None

    music_tensors = generate_music_tensors(s, model, seconds, sr)
    return music_tensors, sr

def main():
    """Load stereo MusicGen, read the three console inputs and play the clip."""
    model = MusicGen.get_pretrained('facebook/musicgen-stereo-small')

    description = input("Enter user input: ")
    mood = input("Enter your theme/tune mood: ")
    seconds_text = input("Enter tune duration (in seconds): ")

    # 32000 is the sample rate handed to process_input and echoed back by it.
    waveform, rate = process_input(description, mood, model, seconds_text, 32000)

    # Guard clause: a None in either slot means nothing was generated.
    if waveform is None or rate is None:
        print("There is something wrong with the audio generation")
        return

    print("Audio file generated")
    # Move to CPU and drop size-1 axes before handing the samples to the player.
    samples = waveform.cpu().numpy().squeeze()
    IPython.display.display(IPython.display.Audio(samples, rate=rate))

if __name__ == "__main__":
    main()


In [None]:
import torch, torchaudio, IPython
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from audiocraft.models import MusicGen
from IPython.display import Audio, display


def generate_music_tensors(prompt, model, duration, sr):
    """Generate audio for ``prompt`` and trim it to ``duration`` seconds.

    Args:
        prompt: Text description, sent to ``model.generate`` as a one-item list.
        model: Object exposing ``set_generation_params`` and ``generate``.
        duration: Requested length in seconds (number or numeric string).
        sr: Sample rate used to turn ``duration`` into a sample count.

    Returns:
        The first item of the ``generate`` result, cut to at most
        ``duration * sr`` entries along its last axis.

    Raises:
        ValueError: If ``duration`` is not numeric.
    """
    # Parse once. Mixing int(duration) and float(duration) made a fractional
    # string such as "2.5" fail in int() before anything was generated.
    seconds = float(duration)
    model.set_generation_params(
        use_sampling=True,
        top_k=250,
        duration=seconds,
    )
    print("Your custom tune is under generation....")
    output = model.generate(
        descriptions=[prompt],
        progress=True,
        return_tokens=True,
    )

    # return_tokens=True was requested, so the audio is the first item.
    audio = output[0]
    # Trim along the last axis (time). The previous audio[:, :n] indexed the
    # second axis, which for a [batch, channels, samples] tensor is channels,
    # so nothing was ever trimmed.
    return audio[..., :int(seconds * sr)]

def process_input(user_prompt, theme, model, duration, sr):
    """Build the text prompt, validate the inputs and generate the audio.

    Args:
        user_prompt: Free-text description; may be empty.
        theme: Mood/theme text appended to the prompt; may be empty.
        model: Generation model handed to ``generate_music_tensors``.
        duration: Clip length in seconds (number or numeric string).
        sr: Sample rate, passed through and returned unchanged.

    Returns:
        ``(music_tensors, sr)`` on success, ``(None, None)`` when the prompt
        is empty or ``duration`` is not a positive finite number.
    """
    print("User input:", user_prompt)
    print("Theme:", theme)
    print("Duration:", duration)

    # Join only the non-empty parts so a missing theme no longer leaves a
    # dangling ", " at the end of the prompt.
    s = ", ".join(part for part in (user_prompt, theme) if part)
    print("Combined prompt:", s)

    # input() yields strings; reject bad durations here instead of crashing
    # inside the generator.
    try:
        seconds = float(duration)
    except (TypeError, ValueError):
        seconds = 0.0

    if not s or not 0 < seconds < float("inf"):
        print("Invalid prompt")
        return None, None

    music_tensors = generate_music_tensors(s, model, seconds, sr)
    return music_tensors, sr

def main():
    """Load stereo MusicGen, read the three console inputs and play the clip."""
    model = MusicGen.get_pretrained('facebook/musicgen-stereo-small')

    description = input("Enter user input: ")
    mood = input("Enter your theme/tune mood: ")
    seconds_text = input("Enter tune duration (in seconds): ")

    # 32000 is the sample rate handed to process_input and echoed back by it.
    waveform, rate = process_input(description, mood, model, seconds_text, 32000)

    # Guard clause: a None in either slot means nothing was generated.
    if waveform is None or rate is None:
        print("There is something wrong with the audio generation")
        return

    print("Audio file generated")
    # Move to CPU and drop size-1 axes before handing the samples to the player.
    samples = waveform.cpu().numpy().squeeze()
    IPython.display.display(IPython.display.Audio(samples, rate=rate))

if __name__ == "__main__":
    main()


In [None]:
import torch, torchaudio, IPython
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from audiocraft.models import MusicGen
from IPython.display import Audio, display


def generate_music_tensors(prompt, model, duration, sr):
    """Generate audio for ``prompt`` and trim it to ``duration`` seconds.

    Args:
        prompt: Text description, sent to ``model.generate`` as a one-item list.
        model: Object exposing ``set_generation_params`` and ``generate``.
        duration: Requested length in seconds (number or numeric string).
        sr: Sample rate used to turn ``duration`` into a sample count.

    Returns:
        The first item of the ``generate`` result, cut to at most
        ``duration * sr`` entries along its last axis.

    Raises:
        ValueError: If ``duration`` is not numeric.
    """
    # Parse once. Mixing int(duration) and float(duration) made a fractional
    # string such as "2.5" fail in int() before anything was generated.
    seconds = float(duration)
    model.set_generation_params(
        use_sampling=True,
        top_k=250,
        duration=seconds,
    )
    print("Your custom tune is under generation....")
    output = model.generate(
        descriptions=[prompt],
        progress=True,
        return_tokens=True,
    )

    # return_tokens=True was requested, so the audio is the first item.
    audio = output[0]
    # Trim along the last axis (time). The previous audio[:, :n] indexed the
    # second axis, which for a [batch, channels, samples] tensor is channels,
    # so nothing was ever trimmed.
    return audio[..., :int(seconds * sr)]

def process_input(user_prompt, theme, model, duration, sr):
    """Build the text prompt, validate the inputs and generate the audio.

    Args:
        user_prompt: Free-text description; may be empty.
        theme: Mood/theme text appended to the prompt; may be empty.
        model: Generation model handed to ``generate_music_tensors``.
        duration: Clip length in seconds (number or numeric string).
        sr: Sample rate, passed through and returned unchanged.

    Returns:
        ``(music_tensors, sr)`` on success, ``(None, None)`` when the prompt
        is empty or ``duration`` is not a positive finite number.
    """
    print("User input:", user_prompt)
    print("Theme:", theme)
    print("Duration:", duration)

    # Join only the non-empty parts so a missing theme no longer leaves a
    # dangling ", " at the end of the prompt.
    s = ", ".join(part for part in (user_prompt, theme) if part)
    print("Combined prompt:", s)

    # input() yields strings; reject bad durations here instead of crashing
    # inside the generator.
    try:
        seconds = float(duration)
    except (TypeError, ValueError):
        seconds = 0.0

    if not s or not 0 < seconds < float("inf"):
        print("Invalid prompt")
        return None, None

    music_tensors = generate_music_tensors(s, model, seconds, sr)
    return music_tensors, sr

def main():
    """Load stereo MusicGen, read the three console inputs and play the clip."""
    model = MusicGen.get_pretrained('facebook/musicgen-stereo-small')

    description = input("Enter user input: ")
    mood = input("Enter your theme/tune mood: ")
    seconds_text = input("Enter tune duration (in seconds): ")

    # 32000 is the sample rate handed to process_input and echoed back by it.
    waveform, rate = process_input(description, mood, model, seconds_text, 32000)

    # Guard clause: a None in either slot means nothing was generated.
    if waveform is None or rate is None:
        print("There is something wrong with the audio generation")
        return

    print("Audio file generated")
    # Move to CPU and drop size-1 axes before handing the samples to the player.
    samples = waveform.cpu().numpy().squeeze()
    IPython.display.display(IPython.display.Audio(samples, rate=rate))

if __name__ == "__main__":
    main()


In [None]:
%cd /content/sample_data/text-tune-ai
!git clone https://github.com/facebookresearch/audiocraft/
%cd /content/sample_data/text-tune-ai
!pip install audiocraft

In [None]:
pip install -r requirements.txt

CLAP

In [None]:
import torch, torchaudio, IPython
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from audiocraft.models import MusicGen
from IPython.display import Audio, display


def generate_music_tensors(prompt, model, duration, sr):
    """Generate audio for ``prompt`` and trim it to ``duration`` seconds.

    Args:
        prompt: Text description, sent to ``model.generate`` as a one-item list.
        model: Object exposing ``set_generation_params`` and ``generate``.
        duration: Requested length in seconds (number or numeric string).
        sr: Sample rate used to turn ``duration`` into a sample count.

    Returns:
        The first item of the ``generate`` result, cut to at most
        ``duration * sr`` entries along its last axis.

    Raises:
        ValueError: If ``duration`` is not numeric.
    """
    # Parse once. Mixing int(duration) and float(duration) made a fractional
    # string such as "2.5" fail in int() before anything was generated.
    seconds = float(duration)
    model.set_generation_params(
        use_sampling=True,
        top_k=250,
        duration=seconds,
    )
    print("Your custom tune is under generation....")
    output = model.generate(
        descriptions=[prompt],
        progress=True,
        return_tokens=True,
    )

    # return_tokens=True was requested, so the audio is the first item.
    audio = output[0]
    # Trim along the last axis (time). The previous audio[:, :n] indexed the
    # second axis, which for a [batch, channels, samples] tensor is channels,
    # so nothing was ever trimmed.
    return audio[..., :int(seconds * sr)]

def process_input(user_prompt, theme, model, duration, sr):
    """Build the text prompt, validate the inputs and generate the audio.

    Args:
        user_prompt: Free-text description; may be empty.
        theme: Mood/theme text appended to the prompt; may be empty.
        model: Generation model handed to ``generate_music_tensors``.
        duration: Clip length in seconds (number or numeric string).
        sr: Sample rate, passed through and returned unchanged.

    Returns:
        ``(music_tensors, sr)`` on success, ``(None, None)`` when the prompt
        is empty or ``duration`` is not a positive finite number.
    """
    print("User input:", user_prompt)
    print("Theme:", theme)
    print("Duration:", duration)

    # Join only the non-empty parts so a missing theme no longer leaves a
    # dangling ", " at the end of the prompt.
    s = ", ".join(part for part in (user_prompt, theme) if part)
    print("Combined prompt:", s)

    # input() yields strings; reject bad durations here instead of crashing
    # inside the generator.
    try:
        seconds = float(duration)
    except (TypeError, ValueError):
        seconds = 0.0

    if not s or not 0 < seconds < float("inf"):
        print("Invalid prompt")
        return None, None

    music_tensors = generate_music_tensors(s, model, seconds, sr)
    return music_tensors, sr

def main():
    """Load stereo MusicGen, read the three console inputs and play the clip.

    The playback rate is taken from the loaded model rather than hard-coded.
    """
    model = MusicGen.get_pretrained('facebook/musicgen-stereo-small')

    user_prompt = input("Enter user input: ")
    theme = input("Enter your theme/tune mood: ")
    duration = input("Enter tune duration (in seconds): ")
    # Use the rate the model actually produces. The previous hard-coded 48000
    # disagreed with the 32000 used for the same checkpoint elsewhere in this
    # notebook, so the clip played back at the wrong speed and pitch.
    sampling_rate = model.sample_rate

    audio_data, sampling_rate = process_input(user_prompt, theme, model, duration, sampling_rate)

    if audio_data is not None and sampling_rate is not None:
        print("Audio file generated")
        # Move to CPU and drop size-1 axes before handing the samples to the player.
        IPython.display.display(IPython.display.Audio(audio_data.cpu().numpy().squeeze(), rate=sampling_rate))
    else:
        print("There is something wrong with the audio generation")

if __name__ == "__main__":
    main()


In [None]:
import torch, IPython
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from audiocraft.models import MusicGen
from IPython.display import Audio, display


def generate_music_tensors(prompt, model, duration, sr):
    """Generate audio for ``prompt`` and trim it to ``duration`` seconds.

    Args:
        prompt: Text description, sent to ``model.generate`` as a one-item list.
        model: Object exposing ``set_generation_params`` and ``generate``.
        duration: Requested length in seconds (number or numeric string).
        sr: Sample rate used to turn ``duration`` into a sample count.

    Returns:
        The first item of the ``generate`` result, cut to at most
        ``duration * sr`` entries along its last axis.

    Raises:
        ValueError: If ``duration`` is not numeric.
    """
    # Parse once. Mixing int(duration) and float(duration) made a fractional
    # string such as "2.5" fail in int() before anything was generated.
    seconds = float(duration)
    model.set_generation_params(
        use_sampling=True,
        top_k=250,
        duration=seconds,
    )
    print("Your custom tune is under generation....")
    output = model.generate(
        descriptions=[prompt],
        progress=True,
        return_tokens=True,
    )

    # return_tokens=True was requested, so the audio is the first item.
    audio = output[0]
    # Trim along the last axis (time). The previous audio[:, :n] indexed the
    # second axis, which for a [batch, channels, samples] tensor is channels,
    # so nothing was ever trimmed.
    return audio[..., :int(seconds * sr)]

def process_input(user_prompt, theme, model, duration, sr):
    """Build the text prompt, validate the inputs and generate the audio.

    Args:
        user_prompt: Free-text description; may be empty.
        theme: Mood/theme text appended to the prompt; may be empty.
        model: Generation model handed to ``generate_music_tensors``.
        duration: Clip length in seconds (number or numeric string).
        sr: Sample rate, passed through and returned unchanged.

    Returns:
        ``(music_tensors, sr)`` on success, ``(None, None)`` when the prompt
        is empty or ``duration`` is not a positive finite number.
    """
    print("User input:", user_prompt)
    print("Theme:", theme)
    print("Duration:", duration)

    # Join only the non-empty parts so a missing theme no longer leaves a
    # dangling ", " at the end of the prompt.
    s = ", ".join(part for part in (user_prompt, theme) if part)
    print("Combined prompt:", s)

    # input() yields strings; reject bad durations here instead of crashing
    # inside the generator.
    try:
        seconds = float(duration)
    except (TypeError, ValueError):
        seconds = 0.0

    if not s or not 0 < seconds < float("inf"):
        print("Invalid prompt")
        return None, None

    music_tensors = generate_music_tensors(s, model, seconds, sr)
    return music_tensors, sr

def main():
    """Load stereo MusicGen, read the three console inputs and play the clip.

    The playback rate is taken from the loaded model rather than hard-coded.
    """
    model = MusicGen.get_pretrained('facebook/musicgen-stereo-small')

    user_prompt = input("Enter user input: ")
    theme = input("Enter your theme/tune mood: ")
    duration = input("Enter tune duration (in seconds): ")
    # Use the rate the model actually produces. The previous hard-coded 48000
    # disagreed with the 32000 used for the same checkpoint elsewhere in this
    # notebook, so the clip played back at the wrong speed and pitch.
    sampling_rate = model.sample_rate

    audio_data, sampling_rate = process_input(user_prompt, theme, model, duration, sampling_rate)

    if audio_data is not None and sampling_rate is not None:
        print("Audio file generated")
        # Move to CPU and drop size-1 axes before handing the samples to the player.
        IPython.display.display(IPython.display.Audio(audio_data.cpu().numpy().squeeze(), rate=sampling_rate))
    else:
        print("There is something wrong with the audio generation")

if __name__ == "__main__":
    main()


In [None]:
! git clone https://github.com/mikexue7/audiocraft.git

In [None]:
import IPython
from audiocraft.models import MusicGen
from IPython.display import display
from scipy.io import wavfile


def generate_music_tensors(prompt, model, duration, sr):
    """Generate audio for ``prompt`` and trim it to ``duration`` seconds.

    Args:
        prompt: Text description, sent to ``model.generate`` as a one-item list.
        model: Object exposing ``set_generation_params`` and ``generate``.
        duration: Requested length in seconds (number or numeric string).
        sr: Sample rate used to turn ``duration`` into a sample count.

    Returns:
        The first item of the ``generate`` result, cut to at most
        ``duration * sr`` entries along its last axis.

    Raises:
        ValueError: If ``duration`` is not numeric.
    """
    # Parse once. Mixing int(duration) and float(duration) made a fractional
    # string such as "2.5" fail in int() before anything was generated.
    seconds = float(duration)
    model.set_generation_params(
        use_sampling=True,
        top_k=250,
        duration=seconds,
    )
    print("Your custom tune is under generation....")
    output = model.generate(
        descriptions=[prompt],
        progress=True,
        return_tokens=True,
    )

    # return_tokens=True was requested, so the audio is the first item.
    audio = output[0]
    # Trim along the last axis (time). The previous audio[:, :n] indexed the
    # second axis, which for a [batch, channels, samples] tensor is channels,
    # so nothing was ever trimmed.
    return audio[..., :int(seconds * sr)]

def process_input(user_prompt, theme, model, duration, sr):
    """Build the text prompt, validate the inputs and generate the audio.

    Args:
        user_prompt: Free-text description; may be empty.
        theme: Mood/theme text appended to the prompt; may be empty.
        model: Generation model handed to ``generate_music_tensors``.
        duration: Clip length in seconds (number or numeric string).
        sr: Sample rate, passed through and returned unchanged.

    Returns:
        ``(music_tensors, sr)`` on success, ``(None, None)`` when the prompt
        is empty or ``duration`` is not a positive finite number.
    """
    print("User input:", user_prompt)
    print("Theme:", theme)
    print("Duration:", duration)

    # Join only the non-empty parts so a missing theme no longer leaves a
    # dangling ", " at the end of the prompt.
    s = ", ".join(part for part in (user_prompt, theme) if part)
    print("Combined prompt:", s)

    # input() yields strings; reject bad durations here instead of crashing
    # inside the generator.
    try:
        seconds = float(duration)
    except (TypeError, ValueError):
        seconds = 0.0

    if not s or not 0 < seconds < float("inf"):
        print("Invalid prompt")
        return None, None

    music_tensors = generate_music_tensors(s, model, seconds, sr)
    return music_tensors, sr

def main():
    """Load MusicGen, read the console inputs, then play and save the clip.

    The clip is written as a WAV file under the notebook's ``output`` folder,
    which is created when missing.
    """
    import os  # local import: this cell's import list has no os

    model = MusicGen.get_pretrained('facebook/musicgen-small')

    user_prompt = input("Enter user input: ")
    theme = input("Enter your theme/tune mood: ")
    duration = input("Enter tune duration (in seconds): ")
    sampling_rate = 32000

    audio_data, sampling_rate = process_input(user_prompt, theme, model, duration, sampling_rate)

    if audio_data is not None and sampling_rate is not None:
        print("Audio file generated")
        IPython.display.display(IPython.display.Audio(audio_data.cpu().numpy().squeeze(), rate=sampling_rate))

        out_path = "/content/sample_data/text-tune-ai/output/download2.wav"
        # wavfile.write does not create parent folders; without this a fresh
        # checkout fails after the expensive generation has already finished.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        # [0, 0] selects the first item and its first channel for the file.
        wavfile.write(out_path, rate=sampling_rate, data=audio_data[0, 0].cpu().numpy())
    else:
        print("There is something wrong with the audio generation")

if __name__ == "__main__":
    main()


In [None]:
!pip install openai

In [None]:
import IPython
from audiocraft.models import MusicGen
from IPython.display import display
from scipy.io import wavfile
import openai


def query_gpt(user_prompt, theme):
    """Ask the chat model for a one-line description of the requested vibe.

    The API key is read from the ``OPENAI_API_KEY`` environment variable so
    nobody has to paste a secret into the notebook.

    Args:
        user_prompt: The user's music description, embedded in the request.
        theme: The purpose/mood, embedded in the request.

    Returns:
        The reply text when the completion finished with reason ``"stop"``;
        otherwise ``""`` (missing key, no choices, unfinished completion, or
        any exception raised by the request).
    """
    import os  # local import: this cell's import list has no os

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("OPENAI_API_KEY is not set; skipping the GPT request")
        return ""
    openai.api_key = api_key

    try:
        response = openai.chat.completions.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "system", "content": "You are a music expert, skilled in explaining intricacies in music vibe with contextual flair."},
                {"role": "user", "content": f"I am trying to get a highlevel description for {user_prompt} vibe of music for {theme} purpose. Can you please give me that one line music vibe explaining the rhythm."}
            ]
        )
        print(f"GPT Response: {response}")

        if not response.choices:
            print("No choices found in the response")
            return ""

        choice = response.choices[0]
        # Previously this path fell off the end and returned None, which the
        # caller then tried to concatenate to a string.
        if choice.finish_reason != "stop":
            print("GPT response did not finish normally:", choice.finish_reason)
            return ""

        content = choice.message.content
        print("Content:", content)
        return content or ""

    except Exception as e:
        # Best effort: any API failure degrades to "no description".
        print(f"An error occurred during the OpenAI API request: {e}")
        return ""

def generate_music_tensors(prompt, model, duration, sr):
    """Generate audio for ``prompt`` and trim it to ``duration`` seconds.

    Args:
        prompt: Text description, sent to ``model.generate`` as a one-item list.
        model: Object exposing ``set_generation_params`` and ``generate``.
        duration: Requested length in seconds (number or numeric string).
        sr: Sample rate used to turn ``duration`` into a sample count.

    Returns:
        The first item of the ``generate`` result, cut to at most
        ``duration * sr`` entries along its last axis.

    Raises:
        ValueError: If ``duration`` is not numeric.
    """
    # Parse once. Mixing int(duration) and float(duration) made a fractional
    # string such as "2.5" fail in int() before anything was generated.
    seconds = float(duration)
    model.set_generation_params(
        use_sampling=True,
        top_k=250,
        duration=seconds,
    )
    print("Your custom tune is under generation....")
    output = model.generate(
        descriptions=[prompt],
        progress=True,
        return_tokens=True,
    )

    # return_tokens=True was requested, so the audio is the first item.
    audio = output[0]
    # Trim along the last axis (time). The previous audio[:, :n] indexed the
    # second axis, which for a [batch, channels, samples] tensor is channels,
    # so nothing was ever trimmed.
    return audio[..., :int(seconds * sr)]

def process_input(user_prompt, theme, model, duration, sr):
    """Build a GPT-enriched prompt, validate the inputs and generate the audio.

    Args:
        user_prompt: Free-text description; may be empty.
        theme: Mood/theme; when non-empty it is expanded through ``query_gpt``.
        model: Generation model handed to ``generate_music_tensors``.
        duration: Clip length in seconds (number or numeric string).
        sr: Sample rate, passed through and returned unchanged.

    Returns:
        ``(music_tensors, sr)`` on success, ``(None, None)`` when the prompt
        is empty or ``duration`` is not a positive finite number.
    """
    print("User input:", user_prompt)
    print("Theme:", theme)
    print("Duration:", duration)

    parts = [user_prompt] if user_prompt else []
    if theme:
        res = query_gpt(user_prompt, theme)
        # Treat any falsy reply (empty string or None) as "no description";
        # the old `res != ""` check let None through to a string concatenation.
        if res:
            parts.append(res)

    # Joining avoids the dangling ", " left behind when no description is added.
    s = ", ".join(parts)
    print("Combined prompt:", s)

    # input() yields strings; reject bad durations here instead of crashing
    # inside the generator.
    try:
        seconds = float(duration)
    except (TypeError, ValueError):
        seconds = 0.0

    if not s or not 0 < seconds < float("inf"):
        print("Invalid prompt")
        return None, None

    music_tensors = generate_music_tensors(s, model, seconds, sr)
    return music_tensors, sr

def main():
    """Interactive demo: prompt for inputs, generate a tune, play and save it.

    Loads the ``facebook/musicgen-small`` checkpoint, reads the prompt, theme
    and duration from stdin, and on success displays an audio player and
    writes the waveform to a WAV file under the project's output directory.
    """
    import os  # local import: only needed to prepare the output directory

    model = MusicGen.get_pretrained('facebook/musicgen-small')

    user_prompt = input("Enter user input: ")
    theme = input("Enter your theme/tune mood: ")
    duration = input("Enter tune duration (in seconds): ")
    sampling_rate = 32000
    output_path = "/content/sample_data/text-tune-ai/output/download3.wav"

    audio_data, sampling_rate = process_input(user_prompt, theme, model, duration, sampling_rate)

    if audio_data is not None and sampling_rate is not None:
        print("Audio file generated")
        IPython.display.display(IPython.display.Audio(audio_data.cpu().numpy().squeeze(), rate=sampling_rate))
        # Create the target folder first so a missing directory cannot throw
        # away the result of the (slow) generation step.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # audio_data[0, 0] selects the first item's first channel as a 1-D waveform.
        wavfile.write(output_path, rate=sampling_rate, data=audio_data[0, 0].cpu().numpy())
    else:
        print("There is something wrong with the audio generation")

# Entry point: start the interactive demo when this code runs as the top-level program.
if __name__ == "__main__":
    main()


In [None]:
import IPython
from audiocraft.models import MusicGen
from IPython.display import display
from scipy.io import wavfile
import openai
import numpy as np
import librosa

def query_gpt(user_prompt, theme):
    """Ask the OpenAI chat API for a one-line description of the music vibe.

    The API key is read from the ``OPENAI_API_KEY`` environment variable
    instead of being embedded in the notebook; secrets committed to source
    must be treated as leaked and revoked.

    Args:
        user_prompt: The user's description of the music they want.
        theme: The purpose/mood the music is meant for.

    Returns:
        The reply text when the first choice finished with reason ``"stop"``.
        An empty string in every other case: no choices, an unfinished reply,
        empty content, or any error raised by the request.
    """
    import os  # local import: only needed to look up the API key

    api_key = os.environ.get("OPENAI_API_KEY")
    if api_key:
        openai.api_key = api_key
    try:
        response = openai.chat.completions.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "system", "content": "You are a music expert, skilled in explaining intricacies in music vibe with contextual flair."},
                {"role": "user", "content": f"I am trying to get a highlevel description for {user_prompt} vibe of music for {theme} purpose. Can you please give me that one line music vibe explaining the rhythm."}
            ]
        )
        print(f"GPT Response: {response}")
        if not response.choices:
            print("No choices found in the response")
            return ""

        choice = response.choices[0]
        if choice.finish_reason != "stop":
            # Always hand back a string so callers can concatenate safely.
            print("GPT reply did not finish normally:", choice.finish_reason)
            return ""

        content = choice.message.content or ""
        print("Content:", content)
        return content

    except Exception as e:
        # Best effort: any API failure degrades to "no extra description".
        print(f"An error occurred during the OpenAI API request: {e}")
        return ""

def generate_music_tensors(prompt, model, duration, sr):
    """Generate audio for ``prompt`` and trim it to ``duration`` seconds.

    Args:
        prompt: Text description, sent to ``model.generate`` as a one-item
            ``descriptions`` list.
        model: Object exposing ``set_generation_params`` and ``generate``
            (``main`` passes an audiocraft ``MusicGen`` instance).
        duration: Requested length in seconds, as a number or numeric string
            (``main`` passes the raw ``input()`` text). Fractional values such
            as ``"7.5"`` are accepted.
        sr: Sample rate in Hz, used to convert ``duration`` to a sample count.

    Returns:
        The first element returned by ``model.generate``, with its last axis
        cut to at most ``duration * sr`` samples.

    Raises:
        ValueError: If ``duration`` cannot be parsed as a number.
    """
    # Parse once so generation length and trimming use the same value.
    seconds = float(duration)
    model.set_generation_params(
        use_sampling=True,
        top_k=250,
        duration=seconds
    )
    print("Your custom tune is under generation....")
    output = model.generate(
        descriptions=[prompt],
        progress=True,
        return_tokens=True
    )

    # return_tokens=True makes generate() return a pair; the audio comes first.
    audio = output[0]
    # Time is the last axis (callers read the waveform as audio[0, 0]), so the
    # trim has to happen there; slicing axis 1 would leave the length unchanged.
    return audio[..., :int(seconds * sr)]

def compute_fad_score(generated_audio, target_audio, sr):
    """Return the mean absolute difference between two magnitude spectrograms.

    Despite the name, this is a plain spectrogram distance and not a Frechet
    Audio Distance: it compares the STFT magnitudes of the two signals
    element by element and averages the absolute differences.

    Args:
        generated_audio: Generated waveform supporting
            ``.squeeze().cpu().numpy()`` (callers pass the generation output).
        target_audio: Reference waveform as an array-like of samples.
        sr: Unused; kept so existing callers keep working.

    Returns:
        The mean absolute spectrogram difference as a NumPy float.
    """
    generated = np.atleast_1d(generated_audio.squeeze().cpu().numpy())
    target = np.atleast_1d(np.asarray(target_audio).squeeze())

    # The two clips rarely have the same length; compare only the overlapping
    # part so both spectrograms have the same shape and can be subtracted.
    n_samples = min(generated.shape[-1], target.shape[-1])
    gen_spec = np.abs(librosa.stft(generated[..., :n_samples]))
    target_spec = np.abs(librosa.stft(target[..., :n_samples]))

    fad_score = np.mean(np.abs(gen_spec - target_spec))
    return fad_score

def process_input(user_prompt, theme, model, duration, sr):
    """Build the generation prompt and produce audio for it.

    The prompt is the user's text followed, when a theme is given, by the
    description returned from ``query_gpt(user_prompt, theme)``; the pieces
    are joined with ", ".

    Args:
        user_prompt: Free-form text describing the desired music.
        theme: Mood/purpose text; when empty, ``query_gpt`` is not called.
        model: Model object forwarded to ``generate_music_tensors``.
        duration: Requested length in seconds (number or numeric string).
        sr: Sample rate in Hz, forwarded to ``generate_music_tensors``.

    Returns:
        ``(music_tensors, combined_prompt)`` on success, or ``(None, None)``
        when the combined prompt or the duration is empty.
    """
    print("User input:", user_prompt)
    print("Theme:", theme)
    print("Duration:", duration)

    parts = []
    if user_prompt:
        parts.append(user_prompt)
    if theme:
        res = query_gpt(user_prompt, theme)
        # Keep only real text: a missing or empty reply (None or "") must not
        # reach the prompt, where concatenating it would fail or add noise.
        if res:
            parts.append(res)

    # Joining avoids a dangling ", " when there is no theme description.
    s = ", ".join(parts)

    print("Combined prompt:", s)
    if s and duration:
        music_tensors = generate_music_tensors(s, model, duration, sr)
        return music_tensors, s

    print("Invalid prompt")
    return None, None

def main():
    """Interactive demo: generate a tune, save it, and score it against a reference.

    Loads the ``facebook/musicgen-small`` checkpoint, reads the prompt, theme
    and duration from stdin, plays and saves the generated waveform, then
    compares it with a previously saved reference clip via
    ``compute_fad_score``. The comparison is skipped when the reference file
    does not exist.
    """
    import os  # local import: only needed for the filesystem checks below

    model = MusicGen.get_pretrained('facebook/musicgen-small')

    user_prompt = input("Enter user input: ")
    theme = input("Enter your theme/tune mood: ")
    duration = input("Enter tune duration (in seconds): ")
    sampling_rate = 32000
    output_path = "/content/sample_data/text-tune-ai/output/download3.wav"
    reference_path = '/content/sample_data/text-tune-ai/output/download2.wav'

    audio_data, theme_description = process_input(user_prompt, theme, model, duration, sampling_rate)

    if audio_data is not None and theme_description is not None:
        print("Audio file generated")

        IPython.display.display(IPython.display.Audio(audio_data.cpu().numpy().squeeze(), rate=sampling_rate))
        # Create the target folder first so a missing directory cannot throw
        # away the result of the (slow) generation step.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # audio_data[0, 0] selects the first item's first channel as a 1-D waveform.
        wavfile.write(output_path, rate=sampling_rate, data=audio_data[0, 0].cpu().numpy())

        # The score needs a reference clip from an earlier run; without one
        # there is nothing to compare against.
        if not os.path.isfile(reference_path):
            print("Reference audio not found, skipping FAD score:", reference_path)
            return
        target_audio, _ = librosa.load(reference_path, sr=sampling_rate)

        # Calculate FAD Score
        fad_score = compute_fad_score(audio_data, target_audio, sampling_rate)
        print("FAD Score:", fad_score)

    else:
        print("There is something wrong with the audio generation")

# Entry point: start the interactive generate-and-score demo when this code runs as the top-level program.
if __name__ == "__main__":
    main()


In [None]:
pip install datasets

In [None]:
# Environment setup: work from the project folder, fetch the audiocraft sources, then install dependencies.
%cd /content/sample_data/text-tune-ai
!git clone https://github.com/facebookresearch/audiocraft/
# NOTE(review): this re-enters the same project folder rather than the fresh `audiocraft/` clone,
# so the requirements.txt installed below is the one in text-tune-ai — confirm that is intended.
%cd /content/sample_data/text-tune-ai
!pip install -r requirements.txt
! pip install datasets
!pip install audiocraft

In [None]:
from datasets import load_dataset
from audiocraft.models import MusicGen
from IPython.display import display
from scipy.io import wavfile
import numpy as np
import librosa

def generate_music_tensors(prompt, model, duration, sr):
    """Generate audio for ``prompt`` and trim it to ``duration`` seconds.

    Args:
        prompt: Text description, sent to ``model.generate`` as a one-item
            ``descriptions`` list.
        model: Object exposing ``set_generation_params`` and ``generate``
            (``main`` passes an audiocraft ``MusicGen`` instance).
        duration: Requested length in seconds, as a number or numeric string.
            Fractional values such as ``"7.5"`` are accepted.
        sr: Sample rate in Hz, used to convert ``duration`` to a sample count.

    Returns:
        The first element returned by ``model.generate``, with its last axis
        cut to at most ``duration * sr`` samples.

    Raises:
        ValueError: If ``duration`` cannot be parsed as a number.
    """
    # Parse once so generation length and trimming use the same value.
    seconds = float(duration)
    model.set_generation_params(
        use_sampling=True,
        top_k=250,
        duration=seconds
    )
    print("Your custom tune is under generation....")
    output = model.generate(
        descriptions=[prompt],
        progress=True,
        return_tokens=True
    )

    # return_tokens=True makes generate() return a pair; the audio comes first.
    audio = output[0]
    # Time is the last axis of the generated audio, so the trim has to happen
    # there; slicing axis 1 would leave the length unchanged.
    return audio[..., :int(seconds * sr)]

def compute_fad_score(generated_audio, target_audio, sr):
    """Return the mean absolute difference between two magnitude spectrograms.

    Despite the name, this is a plain spectrogram distance and not a Frechet
    Audio Distance: it compares the STFT magnitudes of the two signals
    element by element and averages the absolute differences.

    Args:
        generated_audio: Generated waveform supporting
            ``.squeeze().cpu().numpy()`` (callers pass the generation output).
        target_audio: Reference waveform as an array-like of samples.
        sr: Unused; kept so existing callers keep working.

    Returns:
        The mean absolute spectrogram difference as a NumPy float.
    """
    generated = np.atleast_1d(generated_audio.squeeze().cpu().numpy())
    target = np.atleast_1d(np.asarray(target_audio).squeeze())

    # The two clips rarely have the same length; compare only the overlapping
    # part so both spectrograms have the same shape and can be subtracted.
    n_samples = min(generated.shape[-1], target.shape[-1])
    gen_spec = np.abs(librosa.stft(generated[..., :n_samples]))
    target_spec = np.abs(librosa.stft(target[..., :n_samples]))

    fad_score = np.mean(np.abs(gen_spec - target_spec))
    return fad_score

def get_caption_audio(musiccaps_data, prompt, sr=32000):
    """Find the first caption containing ``prompt`` and load its audio.

    Matching is a case-insensitive substring test with surrounding whitespace
    ignored on both the caption and the prompt.

    NOTE(review): this reads an ``audio_path`` column and loads it as a local
    file — confirm the dataset in use actually provides that column; the
    caption metadata alone may not ship audio.

    Args:
        musiccaps_data: A dataset exposing ``filter`` and column access, or a
            mapping of split name to such a dataset (the ``"train"`` split is
            used when present, otherwise the first split).
        prompt: Text to look for inside the captions.
        sr: Sample rate in Hz to load the audio at.

    Returns:
        ``(caption, audio)`` for the first matching row, where ``audio`` is
        the waveform returned by ``librosa.load``.

    Raises:
        IndexError: If no caption contains ``prompt``.
    """
    # A split mapping has no "caption" column of its own, so pick one split
    # before filtering.
    if isinstance(musiccaps_data, dict):
        if "train" in musiccaps_data:
            musiccaps_data = musiccaps_data["train"]
        else:
            musiccaps_data = next(iter(musiccaps_data.values()))

    needle = prompt.strip().lower()
    filtered_data = musiccaps_data.filter(
        lambda example: needle in example["caption"].strip().lower()
    )

    captions = filtered_data["caption"]
    if len(captions) == 0:
        raise IndexError(f"No caption contains {prompt!r}")

    caption = captions[0]
    audio_path = filtered_data["audio_path"][0]

    audio, _ = librosa.load(audio_path, sr=sr)
    return caption, audio

def main():
    """Generate music for the first MusicCaps captions and score each result.

    For each of the first ``num_prompts`` training captions, generates a clip
    with ``facebook/musicgen-small``, loads the matching reference audio via
    ``get_caption_audio`` and prints the ``compute_fad_score`` value, followed
    by the average over all scored prompts.
    """
    musiccaps_data = load_dataset("google/MusicCaps")
    # Captions are read from the "train" split, so look up the reference audio
    # in that same split rather than passing the whole split mapping.
    train_split = musiccaps_data["train"]

    num_prompts = 10
    model = MusicGen.get_pretrained('facebook/musicgen-small')
    duration = 10
    sampling_rate = 32000

    captions = train_split["caption"]
    prompts = captions[:num_prompts]

    all_fad_scores = []
    for prompt in prompts:
        music_tensors = generate_music_tensors(prompt, model, duration, sampling_rate)
        caption, target_audio = get_caption_audio(train_split, prompt)
        fad_score = compute_fad_score(music_tensors, target_audio, sampling_rate)
        all_fad_scores.append(fad_score)
        print(f"FAD Score for '{prompt}':", fad_score)

    if all_fad_scores:
        average_fad_score = sum(all_fad_scores) / len(all_fad_scores)
        # Report the number of scores actually averaged; it can be smaller
        # than num_prompts when the split has fewer rows.
        print(f"Average FAD Score for {len(all_fad_scores)} prompts:", average_fad_score)

# Entry point: run the MusicCaps evaluation loop when this code runs as the top-level program.
if __name__ == "__main__":
    main()
