## QBH (Query by Humming)

In [47]:
import os, sys, csv
import essentia
from essentia import *
from essentia.standard import *
from essentia import standard as stdess
from pylab import *
from numpy import *
import matplotlib.pyplot as plt
import numpy as np
import scipy
import librosa
import librosa.display
import IPython.display as ipd
import pandas as pd
import glob
import pickle

### parameters

# Analysis hop and frame sizes handed to chromagram_extraction
# (presumably in samples -- confirm against the essentia documentation).
hopSize = 128
frameSize = 2048
# Sample rate requested from the audio loader; melody_extraction also reads
# this global to build its time axis.
sampleRate = 44100
# NOTE(review): not read by any code visible in this notebook --
# melody_extraction hard-codes guessUnvoiced=False.
guessUnvoiced = True

# Kernel size of the median filter applied to the quantized pitch track.
median_filter_size = 21
# Number of consecutive chromagram columns summed into one reduced column.
block_chromagram = 150
# Reference frequency (Hz) for the Hz -> cents conversion.
ref_note_hz = 55.0

# Note names for twelve frequencies starting at 55 Hz; presumably meant as the
# y-axis labels of plot_chromagram -- confirm.
notes = librosa.hz_to_note([55.0,58.27,61.74,65.41,69.30,73.42,77.78,82.41,87.31,92.50,97.99,103.83])

### directories

# Folder that is globbed for the reference "*.wav" files.
ref_path = "/home/acalhau/Documentos/Dataset/audios_orig_covers"
# Folder in which the recorded query file is looked up.
query_path = "/home/acalhau/my_env/query_by_humming/src"
# Folder where the chroma and result pickles are written and read back.
results_path = "/home/acalhau/Documentos/tests/qbh"
# CSV file loaded with pandas in the interface cell to look up song titles.
metadata_path = "/home/acalhau/Documentos/Dataset/metadata/TabInPython.csv"

### read audio

def read_audio_essentia(filename, sampleRate):
    """Load an audio file as a mono signal with essentia's ``MonoLoader``.

    Parameters
    ----------
    filename : path of the audio file handed to the loader.
    sampleRate : sample rate requested from the loader.

    Returns
    -------
    tuple
        ``(audio, sampleRate)`` -- the loaded signal and the sample rate that
        was passed in.
    """
    # configure the loader and run it in one go; channels are combined with "mix"
    audio = essentia.standard.MonoLoader(
        filename=filename,
        sampleRate=sampleRate,
        downmix="mix",
    )()
    return audio, sampleRate

### melody extraction

def melody_extraction(audio, framesize, hopsize):
    """Extract the predominant melody of a signal.

    The signal is passed through ``EqualLoudness`` and then through
    ``PredominantPitchMelodia``; both operate on the whole signal, so no
    frame-wise loop is needed here.

    Parameters
    ----------
    audio : audio signal to analyse.
    framesize : value passed as ``frameSize`` to the pitch extractor.
    hopsize : value passed as ``hopSize`` to the pitch extractor.

    Returns
    -------
    tuple
        ``(pitch, pitchConf, time)`` -- the two outputs of the extractor plus
        a time axis with one entry per pitch value, evenly spaced from 0 to
        ``len(audio) / sampleRate`` (``sampleRate`` is the module-level
        setting, not a parameter).
    """
    filtered = EqualLoudness()(audio)

    extractor = PredominantPitchMelodia(
        frameSize=framesize,
        hopSize=hopsize,
        maxFrequency=20000.0,
        minFrequency=80.0,
        magnitudeThreshold=40,
        guessUnvoiced=False,
    )
    pitch, pitchConf = extractor(filtered)

    # time axis spans the full duration of the (filtered) signal
    duration = len(filtered) / sampleRate
    time = np.linspace(0.0, duration, len(pitch))

    return pitch, pitchConf, time

### scales conversions

def hz_to_cents(hz, fref=440.0):
    """Convert frequencies in Hz to cents relative to ``fref``.

    Entries equal to 0 Hz skip the logarithm and come out as 0 cents.
    """
    ratio = np.asanyarray(hz) / fref
    octaves = np.zeros_like(ratio)
    # log2 is evaluated only where the ratio is non-zero; the rest keep the zero fill
    np.log2(ratio, out=octaves, where=(ratio != 0.))
    return 1200 * octaves

def cents_to_hz(cents, fref=440.0):
    """Convert cents relative to ``fref`` back to frequencies in Hz."""
    octaves = np.asanyarray(cents) / 1200
    return fref * 2 ** octaves

def hz_to_idx_semitone(hz, fref=440.0, pcpsize=12.0):
    """Map frequencies in Hz to a 1-based pitch-class index.

    The distance to ``fref`` is rounded to the nearest of ``pcpsize`` steps
    per octave, wrapped into one octave and shifted by one, so ``fref``
    itself has index 1. A frequency of 0 Hz skips the logarithm and also
    yields index 1.
    """
    ratio = np.asanyarray(hz) / fref
    octaves = np.zeros_like(ratio)
    np.log2(ratio, out=octaves, where=(ratio != 0.))
    nearest_step = np.round(pcpsize * octaves)
    return nearest_step % pcpsize + 1

def semitone_dist_to_hz(semitone, fref=440.0, pcpsize=12):
    """Return the frequency that lies ``semitone`` steps away from ``fref``.

    ``pcpsize`` is the number of steps per octave.
    """
    octaves = np.asanyarray(semitone) / pcpsize
    return fref * 2 ** octaves

def hz_to_octs(hz, tuning=0.0, pcpsize=12, fref=440.0):
    """Convert frequencies in Hz to octave numbers.

    Octave 0 sits at ``fref * 2 ** (tuning / pcpsize - 4)`` (27.5 Hz with the
    defaults). A frequency of 0 Hz skips the logarithm and maps to 0.
    """
    # reference of octave 0: fref shifted by the tuning offset, four octaves down
    octave_zero_hz = fref * 2.0 ** ((tuning / pcpsize) - 4)
    ratio = np.asanyarray(hz) / float(octave_zero_hz)
    octs = np.zeros_like(ratio)
    np.log2(ratio, out=octs, where=(ratio != 0.))
    return octs

def octs_to_hz(octs, tuning=0.0, pcpsize=12, fref=440.0):
    """Convert octave numbers to frequencies in Hz.

    Octave 0 sits at ``fref * 2 ** (tuning / pcpsize - 4)`` (27.5 Hz with the
    defaults).
    """
    octave_zero_hz = float(fref * 2.0 ** ((tuning / pcpsize) - 4))
    scale = 2.0 ** np.asanyarray(octs)
    return octave_zero_hz * scale

def midi_to_hz(p):
    """Convert MIDI note numbers to Hz, with note 69 at 440 Hz."""
    semitones_from_69 = np.asanyarray(p) - 69.0
    return 440.0 * (2.0 ** (semitones_from_69 / 12.0))

def hz_to_midi(hz, A440=440.0):
    """Convert frequencies in Hz to (fractional) MIDI note numbers.

    ``A440`` is the frequency assigned to note 69. A frequency of 0 Hz skips
    the logarithm and therefore also comes out as 69.
    """
    ratio = np.asanyarray(hz) / float(A440)
    octaves = np.zeros_like(ratio)
    np.log2(ratio, out=octaves, where=(ratio != 0.))
    return 12 * octaves + 69

### semitone quantization

def quantization_semitone(pitch_cents):
    """Snap a pitch track in cents onto a semitone grid.

    The grid has 76 bins, each 100 cents wide and centred on 0, 100, ...,
    7500 cents, so the bin edges are -50, 50, ..., 7550 cents. The edges are
    written directly in cents: converting semitone offsets to Hz and back to
    cents gives the same numbers for any reference frequency, but adds
    floating-point noise to the bin centres.

    Parameters
    ----------
    pitch_cents : array-like of pitch values in cents.

    Returns
    -------
    numpy.ndarray
        The centre (in cents) of the bin each value falls into. Values
        outside the grid are clamped to the nearest bin instead of raising
        ``IndexError`` (above the top edge) or wrapping around to a bogus
        centre (below the bottom edge). NaN entries and anything quantized
        below 1 cent are returned as 0.0.
    """
    pitch_cents = np.asarray(pitch_cents, dtype=float)

    edges = 100.0 * np.arange(-0.5, 76, 1)

    # digitize yields 0 below the first edge and len(edges) above the last
    # one (NaN included); clip so that both edges[idx] and edges[idx - 1]
    # always address a real bin.
    idx = np.clip(np.digitize(pitch_cents, bins=edges), 1, len(edges) - 1)
    cents_q = (edges[idx] + edges[idx - 1]) / 2

    silent = np.isnan(pitch_cents) | (cents_q < 1.0)
    return np.where(silent, 0.0, cents_q)

### semitones to octaves and chromagram

def semitone_to_octave(cents, pitchConf):
    """Fold a quantized pitch track into one octave and build a chromagram.

    Parameters
    ----------
    cents : array-like of pitch values in cents; entries that are not
        greater than 1.0 are skipped.
    pitchConf : array-like with one confidence value per entry of ``cents``.

    Returns
    -------
    tuple
        ``(semitones_q, chromagram)``. ``semitones_q`` holds the pitch class
        (0..11) of every used entry and 0 elsewhere. ``chromagram`` has shape
        ``(12, len(cents))`` and carries the confidence of each used entry in
        the row of its pitch class.
    """
    cents = np.asarray(cents)
    pitchConf = np.asarray(pitchConf)

    semitones_q = np.zeros_like(cents)
    chromagram = np.zeros((12, len(cents)))

    used = np.flatnonzero(cents > 1.0)

    # Round to the nearest semitone *before* wrapping into the octave.
    # Truncating (cents / 100) % 12 would put a value such as 6.9999999 --
    # ordinary floating-point noise -- into the semitone below.
    pitch_class = np.rint(cents[used] / 100).astype(int) % 12

    semitones_q[used] = pitch_class
    chromagram[pitch_class, used] = pitchConf[used]

    return semitones_q, chromagram

### chromagram reduction

def chromagram_reduction(chromagram, block, time=None):
    """Shrink a chromagram along its column axis by summing blocks of columns.

    Parameters
    ----------
    chromagram : 2-D array; columns are grouped in runs of ``block``.
    block : number of columns per group (the last group may be shorter).
    time : optional sequence with one entry per column.

    Returns
    -------
    tuple
        ``(chromagram_reduced, time_reduced)``. Each reduced column is the
        sum of one group of input columns. ``time_reduced`` holds the mean of
        ``time`` over each group, or ``None`` when ``time`` is not given.
    """
    n_rows = chromagram.shape[0]
    n_cols = chromagram.shape[1]
    nframes = int(np.ceil(n_cols / block))

    chromagram_reduced = np.zeros((n_rows, nframes))
    time_reduced = np.zeros((nframes,)) if time is not None else None

    for k in range(nframes):
        # the last window is cut short at the final column
        window = slice(k * block, min((k + 1) * block, n_cols))
        chromagram_reduced[:, k] = np.sum(chromagram[:, window], axis=1)
        if time_reduced is not None:
            time_reduced[k] = np.mean(time[window])

    return chromagram_reduced, time_reduced

### plot chromagram

def plot_chromagram(chroma, times, notes, mappable = None):
    """Draw a chromagram as an image with note and time tick labels.

    Parameters
    ----------
    chroma : 2-D array shown with ``plt.imshow``.
    times : sequence used to label the x ticks; indices past its end are
        clamped to the last entry.
    notes : labels placed on the twelve y ticks.
    mappable : forwarded to ``plt.colorbar``.

    Returns
    -------
    The image object returned by ``plt.imshow``.
    """
    n_xticks = 10

    plt.figure(figsize=(20, 8))
    image = plt.imshow(chroma, origin='lower', aspect="auto")

    # y axis: twelve ticks labelled with the supplied note names
    plt.yticks(np.arange(12), notes)
    plt.ylabel("Notes")

    # x axis: evenly spaced column positions, each labelled with its time
    tick_cols = np.linspace(0, chroma.shape[1], n_xticks)
    last = len(times) - 1
    tick_labels = ["{:4.2f}".format(times[min(int(col), last)]) for col in tick_cols]
    plt.xticks(tick_cols, tick_labels)
    plt.xlabel("Time (sec)")

    plt.title("Chromagram")
    plt.colorbar(mappable, use_gridspec=True)
    plt.show()
    return image

### chromagram extraction

def chromagram_extraction(audio, frameSize, hopSize):
    """Run the full pipeline from an audio signal to a reduced chromagram.

    Steps: melody extraction, Hz -> cents (relative to ``ref_note_hz``),
    semitone quantization, median filtering with kernel size
    ``median_filter_size``, chromagram construction, and reduction in blocks
    of ``block_chromagram`` columns.

    Parameters
    ----------
    audio : audio signal to analyse.
    frameSize, hopSize : forwarded to ``melody_extraction``.

    Returns
    -------
    tuple
        ``(chromagram_reduced, time_reduced)`` as produced by
        ``chromagram_reduction``.
    """
    # 1) melody: pitch track, per-frame confidence and time axis
    f0_hz, confidence, frame_times = melody_extraction(audio, frameSize, hopSize)

    # 2) Hz -> cents -> semitone grid
    cents_quantized = quantization_semitone(hz_to_cents(f0_hz, fref=ref_note_hz))

    # 3) median filter over the quantized track
    cents_smoothed = scipy.signal.medfilt(cents_quantized, median_filter_size)

    # 4) chromagram (the folded pitch track returned alongside is not needed)
    _, chroma_full = semitone_to_octave(cents_smoothed, confidence)

    # 5) block-wise reduction along time
    return chromagram_reduction(chroma_full, block_chromagram, time=frame_times)

## Algorithm

In [1858]:
### load and extract reference dataset

# Every "*.wav" file directly inside ref_path is treated as one reference song.
filenames = glob.glob(os.path.join(ref_path,"*.wav"))

# NOTE(review): ref_melody is never filled or read in the visible code.
ref_melody = []
# Maps song id -> {"songid", "chroma", "time"}.
ref_dic = {}

for k,filepath in enumerate(filenames):
    # The file name minus its last four characters (the ".wav" suffix) is the song id.
    file = os.path.basename(filepath)[:-4]
    #print("{:d} - {:d} - {}".format(k, len(filenames), file))

    ref_dic[file] = {}
    ref_dic[file]["songid"] = file
    
    audio, fs = read_audio_essentia(filepath, sampleRate)
    chromagram_reduced, time_reduced = chromagram_extraction(audio, frameSize, hopSize)
    
    # Stored transposed (one row per reduced time block) and cast to float32.
    ref_dic[file]["chroma"] = chromagram_reduced.T.astype(np.float32)
    ref_dic[file]["time"] = time_reduced
    
### saving dataset_reference in a pickle file

fileRef = "ChromaRef.pkl"

# The whole dictionary is written once, after all files have been processed.
with open(os.path.join(results_path, fileRef),"wb") as f:
    pickle.dump(ref_dic,f)

0 - 260 - 126_1
1 - 260 - 1438_2
2 - 260 - 542_1
3 - 260 - 1576
4 - 260 - 2045_2
5 - 260 - 1209_2
6 - 260 - 2113_2
7 - 260 - 941_2
8 - 260 - 1568_2
9 - 260 - 912
10 - 260 - 1488_2
11 - 260 - 2269_1
12 - 260 - 2090
13 - 260 - 1441
14 - 260 - 2031_2
15 - 260 - 593_orig
16 - 260 - 479_2
17 - 260 - 1224_1
18 - 260 - 1616_2
19 - 260 - 2146_1
20 - 260 - 593_1
21 - 260 - 70_2
22 - 260 - 1835_2
23 - 260 - 1209
24 - 260 - 768_1
25 - 260 - 663_2
26 - 260 - 1546_2
27 - 260 - 1579
28 - 260 - 941
29 - 260 - 668_orig
30 - 260 - 2024_2
31 - 260 - 1849_1
32 - 260 - 2133
33 - 260 - 593
34 - 260 - 1194_1
35 - 260 - 2024_1
36 - 260 - 14
37 - 260 - 253_1
38 - 260 - 2074_2
39 - 260 - 1832
40 - 260 - 1832_1
41 - 260 - 253_2
42 - 260 - 708
43 - 260 - 70_1
44 - 260 - 70_orig
45 - 260 - 1175_2
46 - 260 - 2074
47 - 260 - 1142_2
48 - 260 - 1592
49 - 260 - 2090_2
50 - 260 - 2113_1
51 - 260 - 60
52 - 260 - 597
53 - 260 - 597_2
54 - 260 - 479_1
55 - 260 - 1808_2
56 - 260 - 1252
57 - 260 - 1411
58 - 260 - 2024
59 - 

In [23]:
### recording query
import sounddevice as sd
from scipy.io.wavfile import write
import wavio as wv

from datetime import datetime

# datetime object containing current date and time
now = datetime.now()

# File name for the recording: "q" + day, month, hour, minute, second + ".wav"
# (the year is not part of the name). It is computed once, when this cell runs.
dt_string = now.strftime("q%d%m%H%M%S.wav")
#print(dt_string)	

def recording_sounddevice():
    """Record ten seconds of single-channel audio and save it as a WAV file.

    The file is written under the name held in the module-level ``dt_string``
    (a bare file name, so it is resolved against the current working
    directory), at 44100 Hz with a sample width of 2 bytes.
    """
    freq = 44100      # sampling frequency
    duration = 10     # recording duration
    n_samples = int(duration * freq)

    # start the recorder, then wait for it before touching the buffer
    recording = sd.rec(n_samples, samplerate=freq, channels=1)
    sd.wait()

    # write the NumPy buffer out as an audio file
    wv.write(dt_string, recording, freq, sampwidth=2)


recording_sounddevice()

In [24]:
# Embed an audio player for the file named in dt_string so the recording can be checked by ear.
print('Sonification of audio):')
ipd.display(ipd.Audio(dt_string, rate=44100))

Sonification of audio):


In [76]:
### load and extract query dataset

# Look for the recorded file (name in dt_string) inside query_path.
# NOTE(review): the recording cell writes dt_string as a bare file name, so this
# only finds it when the working directory is query_path -- confirm.
filenames = glob.glob(os.path.join(query_path,dt_string))

# Maps query id -> {"query", "chroma", "time"}.
query_dic = {}

for k,filepath in enumerate(filenames):
    # The file name minus its last four characters (the ".wav" suffix) is the query id.
    file = os.path.basename(filepath)[:-4]
    #print("{:d} - {:d} - {}".format(k, len(filenames), file))
    query = file.split(".")[0]
    query_dic[file] = {}
    query_dic[file]["query"] = query
    #query_dic[file]["songid"] = Queries_df[Queries_df["Query ID"] == file]["Song ID"].values.astype(str)[0]
    
    audio, fs = read_audio_essentia(filepath, sampleRate)
    chromagram_reduced, time_reduced = chromagram_extraction(audio, frameSize, hopSize)
    
    # Stored transposed (one row per reduced time block) and cast to float32,
    # the same layout used for the reference chromas.
    query_dic[file]["chroma"] = chromagram_reduced.T.astype(np.float32)
    query_dic[file]["time"] = time_reduced
    
### saving dataset_query in a pickle file

fileQuery = "ChromaQuery.pkl"

with open(os.path.join(results_path, fileQuery),"wb") as f:
    pickle.dump(query_dic,f)

In [77]:
### read chromas

# Reload both dictionaries from the pickles in results_path, replacing whatever
# query_dic / ref_dic currently hold in the notebook.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles produced by this notebook.
fileQuery = "ChromaQuery.pkl"

with open(os.path.join(results_path, fileQuery),"rb") as f:
    query_dic = pickle.load(f)

fileRef = "ChromaRef.pkl"

with open(os.path.join(results_path, fileRef),"rb") as f:
    ref_dic = pickle.load(f)

In [79]:
### comparisons

#pChromaCrossSim = ChromaCrossSimilarity(frameStackSize=1, binarizePercentile=0.095)

#pCoverSongSim = CoverSongSimilarity(alignmentType="serra09", 
#                                    distanceType = 'asymmetric')

# Author's notes on the two distance types:
#distanceType = 'symmetric': returns the similarity without normalization
#distanceType = 'asymmetric': returns the normalized distance (np.sqrt(yFrames) / distance)

algorithm = "serra09" # {"serra09", "chen17", "typeI"}
distanceType = "symmetric"
disOnset=0.5
disExtension=0.5
# NOTE(review): cellmax is assigned but not used by any code visible here.
#cellmax = True
cellmax = False

pChromaCrossSim = stdess.ChromaCrossSimilarity(frameStackSize=1)
pCoverSongSim = stdess.CoverSongSimilarity(alignmentType=algorithm, 
                                           distanceType= distanceType,
                                           disOnset=disOnset,disExtension=disExtension) 
# Maps query id -> {"query": ..., "res": [one record per reference song]}.
results = {}

for k,q in enumerate(query_dic.keys()):
    #print("{:d} - {:d} - {}".format(k, len(query_dic), query_dic[q]["query"]))
    
    results[q] = {"query": query_dic[q]["query"],
                  #"songid": query_dic[q]["songid"],
                  "res": []}
    
    # Compare this query against every reference song.
    for r in ref_dic.keys():

        # Cross-similarity matrix of the two chroma sequences, then the
        # alignment score matrix and the scalar the algorithm calls "distance".
        csm = pChromaCrossSim(query_dic[q]["chroma"], ref_dic[r]["chroma"])
        scoreMatrix, distance = pCoverSongSim(csm)
     
        ## symmetric normalization (disabled; P and idmax are not defined in the visible code)
        #distance = (idmax[0])/((idmax[1])-(P[0][1])) # normalized sim1
        #distance =(P[-1][0])/((P[-1][1])-(P[0][1])) # normalized sim2
    
    
        # Keep both matrices alongside the score; they are pickled with the results.
        res = {}
        res["scoreMatrix"] = scoreMatrix
        res["distance"] = distance
        res["csm"] = csm
        res["ref"] = r
        
        results[q]["res"].append(res)
        # print(res['ref'], res['distance'])
        
        ## normalization for symmetric (disabled)
        #distance = (P[-1][0])/(((P[-1][1])-(P[0][1])+1))
        
fileResults = "Results.pkl"

with open(os.path.join(results_path, fileResults),"wb") as f:
    pickle.dump(results,f)

In [80]:
### results ordered by distances

def distSort(elem):
    """Sort key: the value stored under "distance" in a result record."""
    score = elem["distance"]
    return score

# Results are keyed by the recording's file name without its ".wav" suffix.
query = dt_string[:-4]

# Highest "distance" value first.
results[query]["res"].sort(reverse=True, key=distSort)

### decide whether the best candidate counts as a match

# A match is reported only when the runner-up scores below 90% of the top score.
# NOTE(review): indexing [1] needs at least two results and the top-10 listing
# needs at least ten; fewer reference songs would raise IndexError.
if (0.9*(results[query]["res"][0]['distance'])) <= (results[query]["res"][1]['distance']):
    print('Não foi encontrada correspondência!') 
else:
    print('A música correspondente é:',(results[query]["res"][0]['ref']))
 
    print('Top 10:\n','\n'.join([str(results[query]["res"][n]['ref']) 
                             + ' | ' + str(results[query]["res"][n]['distance']) for n in range(0,10)]))


A música correspondente é: 708_1
Top 10:
 708_1 | 4.0
1568_2 | 3.5
1786_1 | 3.5
937_1 | 3.5
96_2 | 3.5
1441_1 | 3.5
1194 | 3.5
126_1 | 3.0
542_1 | 3.0
1209_2 | 3.0


## LookingForMusic - Interface

In [87]:
import PySimpleGUI as sg 
from PIL import Image
import sounddevice as sd
from scipy.io.wavfile import write
import wavio as wv
from datetime import datetime

# Colour theme for every PySimpleGUI window created afterwards.
sg.theme('dark grey 9')
  
# datetime object containing current date and time
now = datetime.now()

# File name for the recording: "q" + day, month, hour, minute, second + ".wav"
# (the year is not part of the name). It is computed once, when this cell runs,
# so every recording made from the window reuses the same file name.
dt_string = now.strftime("q%d%m%H%M%S.wav")
#print(dt_string)	

def recording_sounddevice():
    """Record ten seconds of single-channel audio and save it as a WAV file.

    The file is written under the name held in the module-level ``dt_string``
    (a bare file name, so it is resolved against the current working
    directory), at 44100 Hz with a sample width of 2 bytes.
    """
    freq = 44100      # sampling frequency
    duration = 10     # recording duration
    n_samples = int(duration * freq)

    # start the recorder, then wait for it before touching the buffer
    recording = sd.rec(n_samples, samplerate=freq, channels=1)
    sd.wait()

    # write the NumPy buffer out as an audio file
    wv.write(dt_string, recording, freq, sampwidth=2)
    
def load_query():
    """Extract the chroma of the recorded query and pickle it.

    The file named by ``dt_string`` is looked up inside ``query_path``. For
    each file found, a record with its query id, reduced chromagram
    (transposed, float32) and reduced time axis is stored under the file name
    without its last four characters. The resulting dictionary is written to
    "ChromaQuery.pkl" in ``results_path``; nothing is returned.
    """
    query_dic = {}

    for filepath in glob.glob(os.path.join(query_path, dt_string)):
        file = os.path.basename(filepath)[:-4]

        audio, fs = read_audio_essentia(filepath, sampleRate)
        chromagram_reduced, time_reduced = chromagram_extraction(audio, frameSize, hopSize)

        query_dic[file] = {
            "query": file.split(".")[0],
            "chroma": chromagram_reduced.T.astype(np.float32),
            "time": time_reduced,
        }

    # save the query dictionary in a pickle file
    with open(os.path.join(results_path, "ChromaQuery.pkl"), "wb") as f:
        pickle.dump(query_dic, f)

def distSort(elem):
    """Sort key: the value stored under "distance" in a result record."""
    score = elem["distance"]
    return score
  
    
# Window contents: three lines of text followed by a single image button.
# The button image is loaded from 'buttonBlue.png', a path relative to the
# current working directory.
layout = [
    [sg.Text('Welcome From LookingForMusic!')],
    [sg.Text('You can sing, hum or whistle your song.')],
    [sg.Text('Press the button:')],
    [sg.Button(image_filename='buttonBlue.png', image_size=(380,380), image_subsample=4, border_width=0)]
]

window = sg.Window('LookingForMusic', layout)


# Main loop of the interface: every event other than closing the window
# triggers one full record -> extract -> compare -> report cycle.
while True:             # Event Loop
    event, values = window.read()
    if event == sg.WIN_CLOSED:
        break
    else:
        # Blocks while the recording is made.
        recording_sounddevice()
        
        # NOTE(review): the metadata CSV and the reference pickle are re-read on
        # every cycle; they could be loaded once before the loop.
        Queries_df = pd.read_csv(metadata_path, sep=",")
        
        print('A pesquisar...')
        
        # Extracts the query chroma and writes it to "ChromaQuery.pkl".
        load_query()
        
        ### read chromas ###
        fileQuery = "ChromaQuery.pkl"
        with open(os.path.join(results_path, fileQuery),"rb") as f:
            query_dic = pickle.load(f)

        fileRef = "ChromaRef.pkl"
        with open(os.path.join(results_path, fileRef),"rb") as f:
            ref_dic = pickle.load(f)
        ###########################################################
        
        algorithm = "serra09" # {"serra09", "chen17", "typeI"}
        distanceType = "symmetric"
        disOnset=0.5
        disExtension=0.5
        # NOTE(review): cellmax is assigned but not used by any code visible here.
        #cellmax = True
        cellmax = False

        pChromaCrossSim = stdess.ChromaCrossSimilarity(frameStackSize=1)
        pCoverSongSim = stdess.CoverSongSimilarity(alignmentType=algorithm, 
                                                   distanceType= distanceType,
                                                   disOnset=disOnset,disExtension=disExtension) 
        # Maps query id -> {"query": ..., "res": [one record per reference song]}.
        results = {}

        for k,q in enumerate(query_dic.keys()):
            #print("{:d} - {:d} - {}".format(k, len(query_dic), query_dic[q]["query"]))

            results[q] = {"query": query_dic[q]["query"],
                          #"songid": query_dic[q]["songid"],
                          "res": []}

            # Compare this query against every reference song.
            for r in ref_dic.keys():

                csm = pChromaCrossSim(query_dic[q]["chroma"], ref_dic[r]["chroma"])
                scoreMatrix, distance = pCoverSongSim(csm)

                ## symmetric normalization (disabled; P and idmax are not defined in the visible code)
                #distance = (idmax[0])/((idmax[1])-(P[0][1])) # normalized sim1
                #distance =(P[-1][0])/((P[-1][1])-(P[0][1])) # normalized sim2


                res = {}
                res["scoreMatrix"] = scoreMatrix
                res["distance"] = distance
                res["csm"] = csm
                res["ref"] = r

                results[q]["res"].append(res)

        fileResults = "Results.pkl"

        with open(os.path.join(results_path, fileResults),"wb") as f:
            pickle.dump(results,f)
        
        ### callback ###
        # Results are keyed by the recording's file name without its ".wav" suffix.
        # NOTE(review): if load_query found no file, this key is missing and the
        # next line raises KeyError.
        query =(dt_string[:-4])
        
        # Highest "distance" value first.
        results[query]["res"].sort(reverse=True, key=distSort)

        print('Pesquisa concluída!')
        ### look up the title of the best candidate in the metadata table

        #songid = Queries_df[Queries_df["Song ID"] == (results[query]["res"][0]['ref'])]["Title"].values.astype(str)[0]
        
        # A match is reported only when the runner-up scores below 90% of the top score.
        if (0.9*(results[query]["res"][0]['distance'])) <= (results[query]["res"][1]['distance']):
            #if(Queries_df[Queries_df["Song ID"] == (results[query]["res"][0]['ref'])]["Title"].values.astype(str)[0]))
            sg.popup('Results', 'The result of your search was: ', 'No match found!')
            #songid = 'No match found!'
        else:
            # NOTE(review): the trailing [0] raises IndexError when no row of the
            # CSV has a "Song ID" equal to the best reference id.
            sg.popup('Results', 'The result of your search was: ', Queries_df[Queries_df["Song ID"] == (results[query]["res"][0]['ref'])]["Title"].values.astype(str)[0])
            #songid = 'The corresponding song is:', Queries_df[Queries_df["Song ID"] == (results[query]["res"][0]['ref'])]["Title"].values.astype(str)[0]

        #x = ('Top 10:\n','\n'.join([str(results[query]["res"][n]['ref']) 
        #                             + ' | ' + str(results[query]["res"][n]['distance']) for n in range(0,10)]))


        #sg.popup('Results', 'The result of your search was: ', songid)
        

# Reached only after the window has been closed by the user.
window.close()

A pesquisar...
Pesquisa concluída!
A pesquisar...
Pesquisa concluída!
A pesquisar...
Pesquisa concluída!
