In [3]:
import numpy as np
from python_bash_code import file_crop_beat
import time
import librosa.display
import matplotlib.pyplot as plt
import IPython.display as ipd
from libfmp.b.b_plot import plot_signal, plot_chromagram
from libfmp.c3.c3s2_dtw_plot import plot_matrix_with_points
from synctoolbox.feature.utils import estimate_tuning
from synctoolbox.feature.pitch_onset import audio_to_pitch_onset_features
from synctoolbox.feature.dlnco import pitch_onset_features_to_DLNCO
from synctoolbox.feature.pitch import audio_to_pitch_features
from synctoolbox.feature.chroma import pitch_to_chroma, quantize_chroma
from synctoolbox.feature.chroma import quantized_chroma_to_CENS
from synctoolbox.dtw.utils import compute_optimal_chroma_shift
from synctoolbox.dtw.utils import shift_chroma_vectors
from synctoolbox.dtw.mrmsdtw import sync_via_mrmsdtw
from synctoolbox.dtw.utils import make_path_strictly_monotonic

import pandas as pd
import scipy
from synctoolbox.dtw.utils import evaluate_synchronized_positions
# Column names applied to the header-less beat annotation CSV files: time column, then beat column.
header_name = ["time", "beat"]


from synctoolbox.dtw.core import compute_warping_path
from synctoolbox.dtw.cost import cosine_distance

In [4]:
def get_features_from_audio(audio, tuning_offset, visualize=True):
    """Compute the quantized chroma and DLNCO features of one recording.

    Reads the notebook-level globals ``Fs`` and ``feature_rate``, so the
    parameters cell must have been executed before this function is called.

    Parameters
    ----------
    audio
        Audio signal, forwarded as ``f_audio`` to the synctoolbox extractors.
    tuning_offset
        Tuning offset forwarded to both the pitch and the pitch-onset extractor.
    visualize : bool, optional
        Currently unused by this function; kept so existing calls keep working.

    Returns
    -------
    tuple
        ``(quantized chroma features, DLNCO features)``. The DLNCO features are
        computed with ``feature_sequence_length`` set to the number of columns
        of the quantized chroma matrix.
    """
    # Chroma branch: pitch features -> chroma -> quantized chroma.
    pitch_features = audio_to_pitch_features(
        f_audio=audio,
        Fs=Fs,
        tuning_offset=tuning_offset,
        feature_rate=feature_rate,
    )
    quantized_chroma = quantize_chroma(f_chroma=pitch_to_chroma(f_pitch=pitch_features))

    # Onset branch: pitch-onset peaks -> DLNCO, sized to match the chroma sequence.
    onset_peaks = audio_to_pitch_onset_features(f_audio=audio, Fs=Fs, tuning_offset=tuning_offset)
    n_chroma_frames = quantized_chroma.shape[1]
    dlnco = pitch_onset_features_to_DLNCO(
        f_peaks=onset_peaks,
        feature_rate=feature_rate,
        feature_sequence_length=n_chroma_frames,
    )
    return quantized_chroma, dlnco

In [5]:
# --- Parameters ---
# Bars 147.1 and 329.1 are correctly annotated
# (presumably these bound the reliably annotated range - TODO confirm).

# Excerpt boundaries; they are used for cropping and appear in the cropped file names.
beat_start = 147.1
beat_stop = 210.1

# Audio sampling rate and feature rate shared by every feature extraction step.
Fs = 22050
feature_rate = 25

# Synchronization settings forwarded to sync_via_mrmsdtw.
step_weights = np.array([1.5, 1.5, 2.0])  # DTW step weights (up, left and diagonal)
threshold_rec = 10**6

# Switches: crop the input files, save the alignment result, compare it with the MATCH result.
crop = False
save = True
compare = True

In [6]:
# Folder holding both recordings and their beat annotation files.
# Defined once so the notebook can be pointed at another location by editing a single line.
DATA_DIR = "/home/osboxes/automatic_alignement/data_sync_experiment"

# Extension-less audio paths; the cells below append ".mp3" (cropping input)
# or "_s<beat_start>_e<beat_stop>.wav" (cropped excerpt).
audio_path_ref = DATA_DIR + "/1965_Bernstein_Mahler_IX-1"
audio_path_align = DATA_DIR + "/2003_Gielen_Mahler_IX-1"

# Extension-less beat annotation paths; the cells below append ".txt" (cropping input)
# or "_s<beat_start>_e<beat_stop>.csv" (cropped excerpt).
file_path_ref = DATA_DIR + "/1965_Bernstein_Mahler_IX-1_bpb_147-329"
file_path_align = DATA_DIR + "/2003_Gielen_Mahler_IX-1_bpb_CU_final_updated_22-11-2021"

In [7]:
# Optional cropping step, executed only when `crop` is enabled.
# For each recording, the annotation file (.txt) and the audio file (.mp3)
# are handed to file_crop_beat together with the excerpt boundaries.
if crop:
    for annotation_stem, audio_stem in (
        (file_path_ref, audio_path_ref),
        (file_path_align, audio_path_align),
    ):
        file_crop_beat(
            beat_start=beat_start,
            beat_stop=beat_stop,
            data_path=annotation_stem + '.txt',
            audio_path=audio_stem + '.mp3',
        )

In [8]:
# Suffix shared by the cropped excerpt files, e.g. "_s147.1_e210.1".
crop_suffix = "_s" + str(beat_start) + "_e" + str(beat_stop)

# Load both cropped excerpts at the common sampling rate.
# `sr` is passed by keyword: recent librosa releases accept only the path
# positionally, so `librosa.load(path, Fs)` raises a TypeError there.
audio_ref, _ = librosa.load(audio_path_ref + crop_suffix + ".wav", sr=Fs)
audio_align, _ = librosa.load(audio_path_align + crop_suffix + ".wav", sr=Fs)

# Estimate the tuning of each recording separately and extract its features.
tuning_offset_ref = estimate_tuning(audio_ref, Fs)
tuning_offset_align = estimate_tuning(audio_align, Fs)

f_chroma_quantized_ref, f_DLNCO_ref = get_features_from_audio(audio_ref, tuning_offset_ref)
f_chroma_quantized_align, f_DLNCO_align = get_features_from_audio(audio_align, tuning_offset_align)

# Coarse CENS features, used only to estimate the chroma shift between the recordings.
# NOTE(review): with feature_rate = 25 a downsampling factor of 50 would give 0.5 Hz
# features rather than the 1 Hz the variable names suggest - confirm against the
# quantized_chroma_to_CENS signature.
f_cens_1hz_ref = quantized_chroma_to_CENS(f_chroma_quantized_ref, 201, 50, feature_rate)[0]
f_cens_1hz_align = quantized_chroma_to_CENS(f_chroma_quantized_align, 201, 50, feature_rate)[0]
opt_chroma_shift = compute_optimal_chroma_shift(f_cens_1hz_ref, f_cens_1hz_align)

# Apply the estimated shift to both feature types of the recording to align.
f_chroma_quantized_align = shift_chroma_vectors(f_chroma_quantized_align, opt_chroma_shift)
f_DLNCO_align = shift_chroma_vectors(f_DLNCO_align, opt_chroma_shift)

# Warping path between the reference (index 0) and the recording to align (index 1).
wp = sync_via_mrmsdtw(
    f_chroma1=f_chroma_quantized_ref,
    f_onset1=f_DLNCO_ref,
    f_chroma2=f_chroma_quantized_align,
    f_onset2=f_DLNCO_align,
    input_feature_rate=feature_rate,
    step_weights=step_weights,
    threshold_rec=threshold_rec,
)

................................................................................................................................................................................................................................................................................................................................................................

In [9]:
# Sanity check: print the array shapes of the loaded excerpts (aligned recording first, then reference).
print(audio_align.shape, audio_ref.shape)


(4367848,) (4431024,)


In [10]:
# Post-process the warping path with synctoolbox's make_path_strictly_monotonic;
# the result `wp2` is what the beat transfer below interpolates over.
wp2 = make_path_strictly_monotonic(wp)

In [11]:
# Load the cropped beat annotations (header-less CSV: time, beat) of both recordings.
annotation_suffix = "_s" + str(beat_start) + "_e" + str(beat_stop) + ".csv"
beat_annotations_ref = pd.read_csv(filepath_or_buffer=file_path_ref + annotation_suffix, names=header_name)
beat_annotations_align = pd.read_csv(filepath_or_buffer=file_path_align + annotation_suffix, names=header_name)

# Keep only the beats of the aligned recording that also exist in the reference.
beat_annotations_align = beat_annotations_align.loc[
    beat_annotations_align['beat'].isin(beat_annotations_ref['beat'])
].reset_index(drop=True)

# Transfer every reference beat time onto the aligned recording by linear
# interpolation along the warping path (frame indices divided by the feature rate).
beat_positions_ref_transferred_to_align = scipy.interpolate.interp1d(
    wp2[0] / feature_rate, wp2[1] / feature_rate, kind='linear'
)(beat_annotations_ref["time"])

# The filter above only removes extra beats from the aligned recording. If the
# reference holds beats that the aligned recording lacks, the two position arrays
# would differ in length, so restrict the transferred positions to the shared beats
# for the evaluation. When all beats are shared the mask is all True and nothing changes.
# Assumes beat labels are unique and in the same order in both files - TODO confirm.
ref_beat_in_align = beat_annotations_ref["beat"].isin(beat_annotations_align["beat"]).to_numpy()

# Positions are converted to milliseconds for the evaluation.
mean_absolute_error, accuracy_at_tolerances = evaluate_synchronized_positions(
    beat_annotations_align["time"] * 1000,
    np.asarray(beat_positions_ref_transferred_to_align)[ref_beat_in_align] * 1000,
)

Measure transfer from recording 1 to 2 yielded:

Mean absolute error (MAE): 85.35ms (standard deviation: 84.03ms)

Accuracy of transferred positions at different tolerances:
			Accuracy
################################
Tolerance: 10 ms 	11.86 %
Tolerance: 20 ms 	23.32 %
Tolerance: 30 ms 	31.62 %
Tolerance: 40 ms 	38.34 %
Tolerance: 50 ms 	41.90 %
Tolerance: 60 ms 	49.01 %
Tolerance: 70 ms 	55.34 %
Tolerance: 80 ms 	59.29 %
Tolerance: 90 ms 	64.82 %
Tolerance: 100 ms 	68.38 %
Tolerance: 150 ms 	83.00 %
Tolerance: 250 ms 	94.86 %


In [12]:
# Save the annotations transferred to the aligned audio as a header-less CSV
# with the columns (time, beat) and no index column.
if save:
    transferred_csv_path = (
        '/home/osboxes/automatic_alignement/data_sync_experiment/Comparison_Match_MrMsDTW/MrMsDTW_1965ref_2003_'
        + '{}_{}.csv'.format(beat_start, beat_stop)
    )
    beat_position_transfered_ref_to_align = pd.DataFrame(
        data=beat_positions_ref_transferred_to_align,
        columns=["time"],
    ).assign(beat=beat_annotations_ref["beat"])
    beat_position_transfered_ref_to_align.to_csv(transferred_csv_path, header=False, index=False)

In [13]:
# Comparison between the annotations of the aligned recording and the MATCH results.
# NOTE(review): the file name suggests the MATCH run used the 2003 recording as
# reference and the 1965 recording as target, while its positions are compared with
# `beat_annotations_align` here; together with the very large error in the recorded
# output this may indicate mismatched recordings or columns - confirm.
# NOTE: this overwrites `mean_absolute_error` and `accuracy_at_tolerances` from the
# MrMsDTW evaluation.
if compare:
    match_result_path = (
        '/home/osboxes/automatic_alignement/data_sync_experiment/Comparison_Match_MrMsDTW/MATCH_2003ref_1965_'
        + '{}_{}.txt'.format(beat_start, beat_stop)
    )
    match_anotation_align = np.loadtxt(match_result_path, delimiter=',')
    # Prepend a [0, 0] row to the loaded comma-separated table.
    leading_row = np.array([[0, 0]])
    match_anotation_align = np.concatenate((leading_row, match_anotation_align))
    # The first column is taken as the positions and converted to milliseconds.
    match_positions_ms = match_anotation_align[:, 0] * 1000
    mean_absolute_error, accuracy_at_tolerances = evaluate_synchronized_positions(
        beat_annotations_align["time"] * 1000,
        match_positions_ms,
    )

Measure transfer from recording 1 to 2 yielded:

Mean absolute error (MAE): 4090.12ms (standard deviation: 2060.01ms)

Accuracy of transferred positions at different tolerances:
			Accuracy
################################
Tolerance: 10 ms 	0.79 %
Tolerance: 20 ms 	0.79 %
Tolerance: 30 ms 	0.79 %
Tolerance: 40 ms 	0.79 %
Tolerance: 50 ms 	0.79 %
Tolerance: 60 ms 	0.79 %
Tolerance: 70 ms 	0.79 %
Tolerance: 80 ms 	0.79 %
Tolerance: 90 ms 	0.79 %
Tolerance: 100 ms 	0.79 %
Tolerance: 150 ms 	0.79 %
Tolerance: 250 ms 	3.16 %
