In [1]:
import os
from pathlib import Path
import requests
from tqdm import tqdm
from typing import Union, List

import math
import torch
import numpy as np
import audiotools
import dasp_pytorch
import auraloss
# import laion_clap
from audiotools import AudioSignal

from transformers import BertForMaskedLM

from scipy import signal
import matplotlib.pyplot as plt

import helper

# dir(helper)

In [2]:
# Project layout, resolved from the directory the notebook kernel runs in
# (that directory is treated as the notebooks folder; its parent is the project root).
NOTEBOOKS_DIR = Path.cwd()
PROJECT_DIR = NOTEBOOKS_DIR.parent

# Top-level project folders.
ASSETS_DIR = PROJECT_DIR / "assets"
PRETRAINED_DIR = PROJECT_DIR / "pretrained"
DATA_DIR = PROJECT_DIR / "data"
RUNS_DIR = PROJECT_DIR / "runs"
EXPERIMENTS_DIR = PROJECT_DIR / "experiments"

# Audealize comparison experiment, and the folder holding its exported reference renders.
EXP_AUDEALIZE_DIR = EXPERIMENTS_DIR / "audealize_comp"
EXPORT_EXAMPLES_DIR = EXP_AUDEALIZE_DIR / "audealize_version"

print(ASSETS_DIR)

/home/annie/research/text2fx/assets


### Setting Up Audealize Ground Truth Files

##### Loading the Audealize word ↔ EQ settings

In [3]:
# Audealize descriptor data (word <> EQ gain values), stored as JSON.
# The path is built from NOTEBOOKS_DIR instead of a hard-coded absolute home
# directory so the notebook runs from any checkout of the project.
# It is kept as a plain string, matching what the helper was given before.
file_path = str(NOTEBOOKS_DIR / "audealize_data" / "eqdescriptors.json")

# The ten descriptor words used throughout the comparison below.
top10_eq = ["warm", "cold", "soft", "loud", "happy", "bright", "soothing", "harsh", "heavy", "cool"]
settings_dict = helper.get_settings_for_words(file_path, top10_eq)

# Centre frequencies (Hz) of the 40 EQ bands; each word's gain values are
# paired with these bands to form (frequency, gain) tuples.
freq_bands = [20, 50, 83, 120, 161, 208, 259, 318, 383, 455, 537, 628, 729, 843, 971,
              1114, 1273, 1452, 1652, 1875, 2126, 2406, 2719, 3070, 3462, 3901,
              4392, 4941, 5556, 6244, 7014, 7875, 8839, 9917, 11124, 12474, 13984,
              15675, 17566, 19682]

converted_settings_dict = helper.convert_to_freq_gain_tuples(settings_dict, freq_bands)

# Convert all parameters into tensors for the EQ processing code.
tensor_settings = helper.convert_to_tensors(converted_settings_dict)

In [4]:
# Gather every Audealize example file found under the assets folder.
# To restrict the search to one instrument, add a second keyword,
# e.g. ["audealize", "piano"].
all_audealize_samples = helper.load_and_find_path_with_keyword(
    ASSETS_DIR,
    ["audealize"],
    returnSingle=False,
)

print(all_audealize_samples)

[PosixPath('/home/annie/research/text2fx/assets/audealize_examples/drums.wav'), PosixPath('/home/annie/research/text2fx/assets/audealize_examples/guitar.wav'), PosixPath('/home/annie/research/text2fx/assets/audealize_examples/piano.wav')]


##### Generating Audealize Ground Truth Examples (uncomment the cell below to regenerate)

In [7]:
# NOTE: this cell is intentionally disabled. Uncomment it to (re)generate the
# Audealize ground-truth renders: it applies each word's EQ settings from
# `tensor_settings` to one clean input file and writes the result to
# EXPORT_EXAMPLES_DIR / <audio_type> / <word>.wav.
# NOTE(review): mkdir below is called without parents=True, so
# EXPORT_EXAMPLES_DIR must already exist when this runs.
# NOTE(review): the `fs` returned by dasp_apply_EQ_file is unused; the output
# signal takes its sample rate from the loaded input instead.

# # Original Input
# audio_type = "speech"

# #loading clean file
# input_raw= helper.load_and_find_path_with_keyword(ASSETS_DIR, ["225"], returnSingle=True) #searches for file
# input_sig = AudioSignal(input_raw).to_mono()

# # generating output files
# for word, freq_gains in tensor_settings.items():
#     filtered_sig, fs = helper.dasp_apply_EQ_file(input_raw, freq_gains)
#     filter_out = AudioSignal(filtered_sig,input_sig.sample_rate)
#     print(f'saving {word}')

#     EXPORT_EX_DIR = Path(EXPORT_EXAMPLES_DIR / f"{audio_type}")
#     EXPORT_EX_DIR.mkdir(exist_ok=True)

#     filter_out.write(Path(EXPORT_EX_DIR, f"{word}.wav"))

saving warm




saving cold
saving soft
saving loud
saving happy
saving bright
saving soothing
saving harsh
saving heavy
saving cool


### Comparing Audealize Files with Other Embeddings

### Audealize vs. MS-CLAP
- FX: EQ only
- Words: Top 10 most frequent EQ words (warm 64, cold 34, soft 29, loud 26, happy 22, bright 19, soothing 17, harsh 16, heavy 15, cool 14)



##### Audio Type: Guitar Riff

In [18]:
# NOTE(review): the markdown header above says "Guitar Riff" but this cell
# loads the 'speech' files — confirm which audio type is intended here.
keyword = 'speech'

# Audealize reference renders for this audio type: one file per descriptor
# word, so the count printed below is expected to be 10.
audealize_out_ALL = helper.load_and_find_path_with_keyword(
    EXPORT_EXAMPLES_DIR, [keyword], returnSingle=False
)
print(len(audealize_out_ALL))

# All MS-CLAP "final" outputs for the same audio type.
# NOTE(review): the recorded run printed 20 for 'speech', not 10 — presumably
# more than one variant per word exists for speech; confirm.
MS_CLAP_OUTPUTS = EXP_AUDEALIZE_DIR / "ms_clap"
msclap_out_ALL = helper.load_and_find_path_with_keyword(MS_CLAP_OUTPUTS, [keyword, "final"])
print(len(msclap_out_ALL))

10
20


In [9]:
target_word = 'warm'

# Locate the pair of guitar renders for this one descriptor word:
# the Audealize reference and the MS-CLAP "final" output.
audealize_out_word = helper.load_and_find_path_with_keyword(
    EXPORT_EXAMPLES_DIR, ["guitar", target_word], returnSingle=True
)
msclap_out_word = helper.load_and_find_path_with_keyword(
    MS_CLAP_OUTPUTS, ["guitar", "final", target_word], returnSingle=True
)

print(audealize_out_word, msclap_out_word, sep="\n")

/home/annie/research/text2fx/experiments/audealize_comp/audealize_version/guitar/warm.wav
/home/annie/research/text2fx/experiments/audealize_comp/ms_clap/guitar/this_sound_is_warm/final.wav


In [10]:
# Loss between the Audealize reference and the MS-CLAP output located above
# (the metric itself is defined in helper.compare_loss_anyfiles).
helper.compare_loss_anyfiles(audealize_out_word, msclap_out_word)

tensor(7.3112)

In [11]:
def calculate_losses(
    audio_type: str,
    output_dir: Path = MS_CLAP_OUTPUTS,
    verbose: bool = True,
    extra_keywords: Union[List[str], None] = None,
) -> List[tuple]:
    """Compare each Audealize reference with the matching optimised output.

    For every word in ``top10_eq`` this looks up the Audealize reference file
    under ``EXPORT_EXAMPLES_DIR`` and the corresponding "final" file under
    ``output_dir``, computes ``helper.compare_loss_anyfiles`` between the two,
    and prints one ``audio_type word loss`` line per word.

    Args:
        audio_type: Keyword identifying the audio clip (e.g. "guitar"); it is
            used as a search keyword for both lookups.
        output_dir: Directory searched for the optimised outputs. Defaults to
            the MS-CLAP output folder.
        verbose: When true, additionally print the two resolved file paths for
            each word. The per-word loss line is printed regardless.
        extra_keywords: Optional additional keywords appended to the output-file
            lookup, for cases where ``[audio_type, "final", word]`` alone does
            not identify a single file (e.g. ``["normal"]``). ``None`` keeps
            the original three-keyword lookup.

    Returns:
        A list of ``(word, loss)`` tuples in ``top10_eq`` order, where ``loss``
        is whatever ``helper.compare_loss_anyfiles`` returns.
    """
    audio_keyword = str(audio_type)
    # Copy so the caller's list is never mutated or aliased.
    trailing_keywords = list(extra_keywords) if extra_keywords else []

    losses = []
    for word in top10_eq:
        audealize_out_word = helper.load_and_find_path_with_keyword(
            EXPORT_EXAMPLES_DIR, [audio_keyword, word], returnSingle=True
        )
        msclap_out_word = helper.load_and_find_path_with_keyword(
            output_dir, [audio_keyword, "final", word, *trailing_keywords], returnSingle=True
        )
        if verbose:
            print(f'AUDEALIZE_OUT PATH: {audealize_out_word}')
            print(f'MS_CLAP_OUT PATH: {msclap_out_word}')

        loss = helper.compare_loss_anyfiles(audealize_out_word, msclap_out_word)
        losses.append((word, loss))
        print(audio_type, word, loss)
    return losses

In [12]:
# Per-word losses for the guitar clip; verbose=False only hides the resolved
# file paths, the loss line for each word is still printed.
losses = calculate_losses("guitar", verbose=False)

guitar warm tensor(7.3112)
guitar cold tensor(2.0035)
guitar soft tensor(2.0129)
guitar loud tensor(4.6183)
guitar happy tensor(2.5699)
guitar bright tensor(9.2267)
guitar soothing tensor(1.3768)
guitar harsh tensor(3.9483)
guitar heavy tensor(3.5747)
guitar cool tensor(1.4394)


In [13]:
# Per-word losses for the drums clip (overwrites the previous `losses`).
losses = calculate_losses("drums", verbose=False)

drums warm tensor(1.6901)
drums cold tensor(1.0105)
drums soft tensor(3.5292)
drums loud tensor(2.5581)
drums happy tensor(1.3005)
drums bright tensor(2.5930)
drums soothing tensor(1.8868)
drums harsh tensor(2.3616)
drums heavy tensor(4.1008)
drums cool tensor(2.0344)


In [14]:
# Per-word losses for the piano clip (overwrites the previous `losses`).
losses = calculate_losses("piano", verbose=False)

piano warm tensor(1.4968)
piano cold tensor(1.9354)
piano soft tensor(2.5631)
piano loud tensor(4.7190)
piano happy tensor(2.0785)
piano bright tensor(6.0401)
piano soothing tensor(2.0321)
piano harsh tensor(4.4739)
piano heavy tensor(3.2222)
piano cool tensor(1.9128)


In [17]:
# For speech, the MS-CLAP output lookup needs an extra keyword ("normal") in addition to the audio type, "final" and the word.

In [15]:
def calculate_losses_superlatives(
    audio_type: str,
    output_dir: Path = MS_CLAP_OUTPUTS,
    verbose: bool = True,
    variant: str = "normal",
) -> List[tuple]:
    """Compare Audealize references with one variant of the optimised outputs.

    Works like a per-word loss sweep over ``top10_eq``, but the output-file
    lookup includes one extra keyword (``variant``) on top of
    ``[audio_type, "final", word]``. One ``audio_type word loss`` line is
    printed per word.

    Args:
        audio_type: Keyword identifying the audio clip (e.g. "speech"); it is
            used as a search keyword for both lookups.
        output_dir: Directory searched for the optimised outputs. Defaults to
            the MS-CLAP output folder.
        verbose: When true, additionally print the two resolved file paths for
            each word. The per-word loss line is printed regardless.
        variant: Extra keyword selecting which output variant to compare.
            Defaults to "normal", the value that was previously hard-coded.
            NOTE(review): other valid variant names depend on how the output
            folders are named — confirm against the experiment directory.

    Returns:
        A list of ``(word, loss)`` tuples in ``top10_eq`` order, where ``loss``
        is whatever ``helper.compare_loss_anyfiles`` returns.
    """
    audio_keyword = str(audio_type)

    losses = []
    for word in top10_eq:
        audealize_out_word = helper.load_and_find_path_with_keyword(
            EXPORT_EXAMPLES_DIR, [audio_keyword, word], returnSingle=True
        )
        msclap_out_word = helper.load_and_find_path_with_keyword(
            output_dir, [audio_keyword, "final", word, variant], returnSingle=True
        )
        if verbose:
            print(f'AUDEALIZE_OUT PATH: {audealize_out_word}')
            print(f'MS_CLAP_OUT PATH: {msclap_out_word}')

        loss = helper.compare_loss_anyfiles(audealize_out_word, msclap_out_word)
        losses.append((word, loss))
        print(audio_type, word, loss)
    return losses

In [16]:
# Per-word losses for the speech clip, using the lookup that adds the
# "normal" keyword (overwrites the previous `losses`).
losses = calculate_losses_superlatives("speech", verbose=False)

speech warm tensor(1.6165)
speech cold tensor(1.5096)
speech soft tensor(2.4517)
speech loud tensor(3.8767)
speech happy tensor(1.7991)
speech bright tensor(1.7889)
speech soothing tensor(1.4891)
speech harsh tensor(2.5302)
speech heavy tensor(3.7243)
speech cool tensor(1.7167)
