In [48]:
from transformers import AutoFeatureExtractor, AutoModel
from IPython.display import Audio as player
from datasets import load_dataset, Audio
from qdrant_client import QdrantClient
from qdrant_client.http import models
from os.path import join
from pathlib import Path
from glob import glob
import pandas as pd
import numpy as np
import librosa
import torch

In [49]:
# Client for the Qdrant instance at localhost:6333.
client = QdrantClient(
    host="localhost",
    port=6333,
)

In [50]:
my_collection = "music_collection"

# NOTE: per the qdrant-client docs, recreate_collection deletes any existing
# collection with this name before creating it, so re-running this cell wipes
# previously stored points.
# size=768 presumably matches the embedding model's output width -- TODO confirm.
vector_params = models.VectorParams(
    size=768,
    distance=models.Distance.COSINE,
)
client.recreate_collection(collection_name=my_collection, vectors_config=vector_params)

True

# 2. Data Prep

In [51]:
# Root folder of the dataset, given relative to the current working directory.
# (Plain string literal: the previous f-string had no placeholders.)
data_path = Path("ludwig-dataset")
data_path

PosixPath('ludwig-dataset')

In [52]:
# Only the "latin" folder of the dataset is loaded here.
data_dir = data_path.joinpath("mp3", "mp3", "latin")

# Shuffle with a fixed seed (42) and keep the first 400 rows of the shuffled set.
music_data = (
    load_dataset("audiofolder", data_dir=data_dir, split="train", drop_labels=True)
    .shuffle(seed=42)
    .select(range(400))
)
music_data

Resolving data files:   0%|          | 0/979 [00:00<?, ?it/s]

Dataset({
    features: ['audio'],
    num_rows: 400
})

In [53]:
# Peek at one record to see the structure of a decoded audio sample.
music_data[115]

{'audio': {'path': '/Users/zero/projects/python-playground/DS-playground/qdrant-search/ludwig-dataset/mp3/mp3/latin/1g8TA3JM0K0kxtQi4n38qk.mp3',
  'array': array([-0.0751303 , -0.11050164, -0.11514139, ...,  0.22492811,
          0.2993504 ,  0.22477728]),
  'sampling_rate': 44100}}

In [54]:
# Play the clip at the sampling rate stored with it instead of a hard-coded
# 44100 Hz, so playback speed stays correct if a clip uses a different rate.
sample_audio = music_data[115]["audio"]
player(sample_audio["array"], rate=sample_audio["sampling_rate"])

In [55]:
# Track ID = audio file name without its extension.
# Path(...).stem follows the platform's path rules and strips only the final
# suffix, unlike splitting on "/" and removing every ".mp3" substring.
# NOTE: indexing a row also returns the decoded waveform (see the sample record
# shown earlier), so this loop gets slow on large selections.
ids = [Path(music_data[i]["audio"]["path"]).stem for i in range(len(music_data))]

# Positional index of every row: 0 .. len(music_data) - 1.
index = list(range(len(music_data)))
ids[:4], index[:4]

(['4CpcXFmmVEmkVt3exUvNNZ',
  '2NWa2R7J4gerg2p6WEDvJx',
  '5JxCu2Vn1fL7lcTHESxBRD',
  '7uzTm4WG1hFLcj4QPGkVZm'],
 [0, 1, 2, 3])

In [56]:
# Attach the positional index and the track ID to every row, then show the
# last row as a sanity check.
music_data = music_data.add_column("index", index).add_column("ids", ids)
music_data[-1]

{'audio': {'path': '/Users/zero/projects/python-playground/DS-playground/qdrant-search/ludwig-dataset/mp3/mp3/latin/4O6hfbAp9y65YyaUS1hQk7.mp3',
  'array': array([0.00000000e+00, 1.19851318e-09, 1.27332223e-09, ...,
         8.65739230e-02, 9.57795307e-02, 9.86581109e-02]),
  'sampling_rate': 44100},
 'index': 399,
 'ids': '4O6hfbAp9y65YyaUS1hQk7'}

In [57]:
# Track metadata is read from labels.json at the dataset root.
label_path = data_path.joinpath("labels.json")
labels = pd.read_json(label_path)
labels.head()

Unnamed: 0,tracks
35ecMLCJ1x2giJuvHLrI1t,{'otherSubgenres': {'L': [{'S': 'electronic---...
3p0EUhkUeCNrBIZwkjmeYe,"{'otherSubgenres': {'L': []}, 'artist': {'S': ..."
0rb6HvdvWJRkyhxsfFf1ep,"{'otherSubgenres': {'L': [{'S': 'rock'}, {'S':..."
4ssD5IkaicvM3L2Ff8FPWQ,"{'otherSubgenres': {'L': []}, 'artist': {'S': ..."
586ncAs8cYRTBlrxMDfmSP,{'otherSubgenres': {'L': [{'S': 'electronic---...


In [58]:
def get_metadata(x):
    """Extract the artist, genre, name and subgenres of one track record.

    Parameters
    ----------
    x : mapping
        One entry of the ``tracks`` column. Each field of interest is expected
        to be wrapped in a single-key dict, e.g. ``{"S": "Riovolt"}``.

    Returns
    -------
    pd.Series
        Indexed by ``artist``, ``genre``, ``name`` and ``subgenres``. A field
        that is missing, empty, or not wrapped in a dict becomes ``"Unknown"``.
    """
    cols = ["artist", "genre", "name", "subgenres"]
    list_of_cols = []
    for col in cols:
        try:
            # Unwrap the single-key dict by taking its first value.
            mdata = list(x[col].values())[0]
        except (KeyError, IndexError, TypeError, AttributeError):
            # Only lookup failures mean "field not available"; anything else
            # (e.g. KeyboardInterrupt) must propagate instead of being hidden.
            mdata = "Unknown"
        list_of_cols.append(mdata)
    return pd.Series(list_of_cols, index=cols)

In [59]:
# Expand every track record into one column per metadata field, then turn the
# track-ID row labels into an ordinary "index" column.
track_metadata = labels["tracks"].apply(get_metadata)
clean_labels = track_metadata.reset_index()
clean_labels.head()

Unnamed: 0,index,artist,genre,name,subgenres
0,35ecMLCJ1x2giJuvHLrI1t,Riovolt,electronic,It Ain't Over 'till It's Over,"[{'S': 'electronic---ambient'}, {'S': 'electro..."
1,3p0EUhkUeCNrBIZwkjmeYe,R.L. Burnside,blues,Fireman Ring the Bell,[{'S': 'blues---country blues'}]
2,0rb6HvdvWJRkyhxsfFf1ep,Chapterhouse,rock,Falling Down,[{'S': 'rock---shoegaze'}]
3,4ssD5IkaicvM3L2Ff8FPWQ,Lowell Fulsom,funk / soul,Tramp,[{'S': 'funk / soul---rhythm & blues'}]
4,586ncAs8cYRTBlrxMDfmSP,Paul Ellis,electronic,Dissolve,[{'S': 'electronic---ambient'}]


In [60]:
def get_vals(genres):
    """Flatten a list of dicts into a flat list of their values.

    Parameters
    ----------
    genres : list of dict, or str
        Subgenre entries such as ``[{"S": "rock---shoegaze"}]``. A plain
        string (the ``"Unknown"`` placeholder that ``get_metadata`` emits for
        a missing field) is accepted and treated as "no subgenres".

    Returns
    -------
    list
        Every value of every dict, in order; ``[]`` for an empty list or for
        a string placeholder.
    """
    # Iterating a string yields characters, and str has no .values(), so the
    # placeholder has to be short-circuited before the comprehension.
    if isinstance(genres, str):
        return []
    return [val for entry in genres for val in entry.values()]

# Replace each row's list of {"S": ...} dicts with a flat list of subgenre names.
# NOTE: this overwrites the column in place, so re-running the cell feeds
# already-flattened strings back into get_vals, which expects dicts.
clean_labels["subgenres"] = clean_labels["subgenres"].apply(get_vals)
clean_labels["subgenres"].head()

0    [electronic---ambient, electronic---downtempo,...
1                              [blues---country blues]
2                                    [rock---shoegaze]
3                       [funk / soul---rhythm & blues]
4                               [electronic---ambient]
Name: subgenres, dtype: object