## Run experiments

In [1]:
import lancedb

In [2]:
# Connect to the LanceDB database stored under a path relative to the notebook's working directory.
# `db` is the shared connection used by every later cell.
uri = "../../data/lancedb-data/audio-lancedb"
db = lancedb.connect(uri)



In [3]:
import pandas as pd

def test_method(test_fn, embed_fn=None):
    """
    Run a retrieval function over the example queries, one augmentation condition at a time.

    Parameters:
    test_fn (callable): Takes a query vector and returns an object exposing `.to_pandas()`;
        the resulting frame must contain a "song_num" column.
    embed_fn (callable, optional): If given, applied to each query's stored vector before
        the result is passed to `test_fn`.

    Returns:
    tuple: `(song_num_actual, song_num_retrieved)`. The first is a list with the song number
        of every query; the second is a parallel list of lists holding the retrieved song
        numbers in the order `test_fn` returned them.
    """
    queries_tbl = db.open_table("audio_example_queries")
    song_num_actual = []
    song_num_retrieved = []

    # In order: unmodified clips, offset only, pitch shift only, time stretch only,
    # pitch shift + time stretch, and all three augmentations combined.
    conditions = [
        "(offset == 0) and (pitch_shift == 0) and (time_stretch == 1.0)",
        "(offset != 0) and (pitch_shift == 0) and (time_stretch == 1.0)",
        "(offset == 0) and (pitch_shift != 0) and (time_stretch == 1.0)",
        "(offset == 0) and (pitch_shift == 0) and (time_stretch != 1.0)",
        "(offset == 0) and (pitch_shift != 0) and (time_stretch != 1.0)",
        "(offset != 0) and (pitch_shift != 0) and (time_stretch != 1.0)",
    ]

    for condition in conditions:
        print(f"Running test for condition: {condition}")
        filtered_tbl = queries_tbl.search().where(condition).select(["song_num", "vector"])

        for _, row in filtered_tbl.to_pandas().iterrows():
            # Keep the (optionally embedded) query in a local rather than assigning an
            # array back into the row Series yielded by iterrows().
            query_vector = row["vector"]
            if embed_fn is not None:
                query_vector = embed_fn(query_vector)

            song_num_actual.append(row["song_num"])
            retrieved_df = test_fn(query_vector).to_pandas()
            # Column order is the ranking order, so a plain list is all MRR needs.
            song_num_retrieved.append(retrieved_df["song_num"].tolist())
    return song_num_actual, song_num_retrieved


def calculate_mrr(actual_songs, retrieved_songs):
    """
    Calculate Mean Reciprocal Rank (MRR) for a list of song retrievals.

    Parameters:
    actual_songs (list of int): A list of the actual song numbers.
    retrieved_songs (list of sequence of int): One ranked sequence of retrieved song
        numbers per query, best match first.

    Returns:
    float: The Mean Reciprocal Rank (MRR) score. A query whose actual song is missing from
        its retrieved sequence contributes 0. Returns 0.0 when there are no queries.
    """
    reciprocal_ranks = []

    for actual, retrieved in zip(actual_songs, retrieved_songs):
        try:
            # Find the rank (1-indexed) of the first occurrence of the actual song.
            # list() lets tuples and arrays be ranked the same way as lists.
            rank = list(retrieved).index(actual) + 1
            reciprocal_ranks.append(1 / rank)
        except ValueError:
            # If the actual song is not in the retrieved list, reciprocal rank is 0
            reciprocal_ranks.append(0.0)

    # An empty evaluation set has no ranks to average; avoid dividing by zero.
    if not reciprocal_ranks:
        return 0.0

    # Calculate the mean of the reciprocal ranks
    return sum(reciprocal_ranks) / len(reciprocal_ranks)

In [4]:
db_tbl = db.open_table("audio_dataset")

def default_search(query_vector, limit=3):
    """
    Build a vector search against the `audio_dataset` table.

    Parameters:
    query_vector: The vector to search with; passed unchanged to `db_tbl.search`.
    limit (int): Maximum number of matches to request. Defaults to 3, the value that was
        previously hard-coded.

    Returns:
    The object returned by `.limit(...)`; `test_method` calls `.to_pandas()` on it.
    """
    return db_tbl.search(query_vector).limit(limit)

In [5]:
# Baseline: search the dataset table with the stored query vectors as-is (no embed_fn).
actual, retrieved = test_method(default_search)

Running test for condition: (offset == 0) and (pitch_shift == 0) and (time_stretch == 1.0)
Running test for condition: (offset != 0) and (pitch_shift == 0) and (time_stretch == 1.0)
Running test for condition: (offset == 0) and (pitch_shift != 0) and (time_stretch == 1.0)
Running test for condition: (offset == 0) and (pitch_shift == 0) and (time_stretch != 1.0)
Running test for condition: (offset == 0) and (pitch_shift != 0) and (time_stretch != 1.0)
Running test for condition: (offset != 0) and (pitch_shift != 0) and (time_stretch != 1.0)


In [6]:
# MRR of the baseline run; a query scores 0 when its song is not among the hits default_search returned.
calculate_mrr(actual, retrieved)

0.2

## Track with wandb

In [9]:
import wandb
# Authenticate with Weights & Biases before starting a run.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mangeline-georgian[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [10]:
import wandb

# Start a W&B run for the baseline experiment (stored vectors searched without feature extraction).
wandb.init(
    # set the wandb project where this run will be logged
    project="children-song-dataset-retrieval",

    # track hyperparameters and run metadata
    config={
    # no embed_fn was applied to the queries in the baseline run
    "embedding": "none",
    # presumably the distance metric used by the table search — TODO confirm
    "retrieval": "l2",
    }
)

In [11]:
# Record the baseline MRR on the active run, then close the run.
wandb.log({"mrr": calculate_mrr(actual, retrieved)})
wandb.finish()

VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mrr,▁

0,1
mrr,0.2


## Try with feature extraction

In [12]:
# Take the stored vector of a single dataset row as a sample input for the feature-extraction example.
audio = db_tbl.search().limit(1).select(["song_num", "vector"]).to_pandas().iloc[0]["vector"]

In [13]:
import librosa
import numpy as np

def extract_features(audio, sr=44100, aggregate="summary_stat"):
    """
    Turn an audio signal into one flat feature vector built from MFCC, chroma and
    mel-spectrogram features.

    Parameters:
    audio: The signal handed to librosa as `y` (assumed to be a 1-D array of samples — TODO confirm).
    sr (int): Sample rate forwarded to every librosa call. Defaults to 44100.
    aggregate (str): "summary_stat" reduces each feature matrix to its per-row mean
        followed by its per-row standard deviation (taken along axis 1); any other
        value flattens each feature matrix in full.

    Returns:
    numpy.ndarray: 1-D concatenation of the MFCC part, the chroma part and the
        mel-spectrogram part, in that order.
    """
    # Order matters: it fixes the layout of the returned vector.
    feature_maps = (
        librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13),
        librosa.feature.chroma_stft(y=audio, sr=sr),
        librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128),
    )

    if aggregate == "summary_stat":
        # Per feature: [means..., stds...] computed along axis 1.
        parts = [
            np.concatenate([np.mean(fmap, axis=1), np.std(fmap, axis=1)])
            for fmap in feature_maps
        ]
    else:
        # Keep every value of each feature matrix.
        parts = [fmap.flatten() for fmap in feature_maps]

    return np.concatenate(parts)

# Example usage: any aggregate other than "summary_stat" (here "full") flattens the
# feature matrices instead of reducing them to mean/std.
feat = extract_features(audio, aggregate="full")
print(feat.shape)

(131886,)


In [14]:
# Load the whole dataset table into a pandas DataFrame.
audio_df = db_tbl.to_pandas()

In [15]:
# Peek at the first rows to check the available columns.
audio_df.head()

Unnamed: 0,vector,sample_rate,pitch_shift,time_stretch,offset,chunk_num,song_num,song_version,filename
0,"[-4.5776367e-05, -1.5258789e-05, 1.5258789e-05...",44100,0,1.0,0,1,1,a,en001a_chunk1_offset0_ps0_ts1.00
1,"[0.071502686, 0.074157715, 0.07461548, 0.07556...",44100,0,1.0,0,2,1,a,en001a_chunk2_offset0_ps0_ts1.00
2,"[-0.1270752, -0.11282349, -0.07298279, -0.0144...",44100,0,1.0,0,3,1,a,en001a_chunk3_offset0_ps0_ts1.00
3,"[-0.059661865, -0.06428528, -0.06500244, -0.06...",44100,0,1.0,0,4,1,a,en001a_chunk4_offset0_ps0_ts1.00
4,"[-0.047698975, -0.04866028, -0.048797607, -0.0...",44100,0,1.0,0,5,1,a,en001a_chunk5_offset0_ps0_ts1.00


In [18]:
## Todo re-embed the data with the new features
db_setup = False

audio_df = db_tbl.to_pandas()
# Write in roughly five batches. max(1, ...) keeps the range() step non-zero when the
# table has fewer than five rows.
batch_size = max(1, len(audio_df) // 5)
for i in range(0, len(audio_df), batch_size):
    print(i)
    sound_arrays = [
        {
            # Embed this row's own stored vector. Using the global sample `audio` here
            # would give every row the same embedding.
            # sr stays at extract_features' default so dataset and query embeddings are
            # computed the same way — assumes all clips share that sample rate, TODO confirm.
            "vector": extract_features(row["vector"], aggregate="summary_stat"),
            "sample_rate": row["sample_rate"],
            "offset": row["offset"],
            "pitch_shift": row["pitch_shift"],
            "time_stretch": row["time_stretch"],
            "song_num": row["song_num"],
            "song_version": row["song_version"],
            "chunk_num": row["chunk_num"],
            "filename": row["filename"],
        }
        for _, row in audio_df.iloc[i:i + batch_size].iterrows()
    ]

    if db_setup:
        feat_tbl.add(sound_arrays)
    else:
        # The first batch (re)creates the table; overwrite replaces any partial table left
        # behind by an interrupted earlier run, so the cell can be re-run from scratch.
        feat_tbl = db.create_table("audio_feat_eng_sumstat", data=sound_arrays, mode="overwrite")
        db_setup = True

    


0


KeyboardInterrupt: 

In [None]:
sumstat_tbl = db.open_table("audio_feat_eng_sumstat")

def default_search_sumstat(query_vector, limit=3):
    """
    Build a vector search against the `audio_feat_eng_sumstat` table.

    Parameters:
    query_vector: The vector to search with; passed unchanged to `sumstat_tbl.search`.
    limit (int): Maximum number of matches to request. Defaults to 3, the value that was
        previously hard-coded.

    Returns:
    The object returned by `.limit(...)`; `test_method` calls `.to_pandas()` on it.
    """
    return sumstat_tbl.search(query_vector).limit(limit)

In [None]:
# NOTE(review): no visible cell creates "audio_feat_eng_full" — confirm the table exists
# before running this cell.
fullfeat_tbl = db.open_table("audio_feat_eng_full")

def default_search_fullfeat(query_vector, limit=3):
    """
    Build a vector search against the `audio_feat_eng_full` table.

    Parameters:
    query_vector: The vector to search with; passed unchanged to `fullfeat_tbl.search`.
    limit (int): Maximum number of matches to request. Defaults to 3, the value that was
        previously hard-coded.

    Returns:
    The object returned by `.limit(...)`; `test_method` calls `.to_pandas()` on it.
    """
    return fullfeat_tbl.search(query_vector).limit(limit)

In [None]:
# embed_fn must be a callable that maps a stored query vector into the same feature space
# as the table being searched, so each search function is paired with the matching
# extract_features mode.
actual, retrieved = test_method(
    default_search_sumstat,
    embed_fn=lambda query_audio: extract_features(query_audio, aggregate="summary_stat"),
)
print(calculate_mrr(actual, retrieved))

# Full (flattened) features are evaluated against the full-feature table.
actual, retrieved = test_method(
    default_search_fullfeat,
    embed_fn=lambda query_audio: extract_features(query_audio, aggregate="full"),
)
print(calculate_mrr(actual, retrieved))

## TODO: if retrieval quality is still not good enough, try embedding the audio with HuBERT