In [2]:
import utils as utils
import importlib
import numpy as np
import mir_eval
import data
import pickle
import librosa

import torch as th
import torch.nn as nn
import torch.nn.functional as F

from scipy.signal import find_peaks

importlib.reload(utils)
importlib.reload(data)

train_dataset_path = './data/train'
test_dataset_path = './data/test'

device = 'cuda' if th.cuda.is_available() else 'cpu'

In [5]:
# Model descrbied in the paper plus droput
class OnsetDetectionCNN(nn.Module):
    def __init__(self):
        super(OnsetDetectionCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 10, kernel_size=(3, 7))
        self.pool1 = nn.MaxPool2d(kernel_size=(3, 1))
        self.conv2 = nn.Conv2d(10, 20, kernel_size=(3, 3))
        self.pool2 = nn.MaxPool2d(kernel_size=(3, 1))
        self.fc1 = nn.Linear(20 * 7 * 8, 256)
        self.fc2 = nn.Linear(256, 1)
        self.dropout = nn.Dropout(0.5)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = x.view(-1, 20 * 7 * 8)
        x = self.dropout(F.relu(self.fc1(x)))  # Apply dropout
        x = self.sigmoid(self.fc2(x))
        return x

# Initialize the model
model = OnsetDetectionCNN()

In [6]:
model.load_state_dict(th.load('best_model.pth'))
with open('mean_std.pkl', 'rb') as file:
    data = pickle.load(file)
mean = data['mean']
std = data['std']

# This is the almost the same as the prediction function in the onset detection but here we just use the onset signal 
# and no prediction
def raw_onset_signal(model, x, mean=mean, std=std, frame_size=15):
    model = model.to(device)
    model.eval()
    x = x.to(device)
    mean = mean.to(device)
    std = std.to(device)
    x = (x - mean) / std

    half_frame_size = frame_size // 2
    num_frames = x.shape[2]
    onset_predictions = []

    with th.no_grad():
        for j in range(half_frame_size, num_frames - half_frame_size):
            start = j - half_frame_size
            end = j + half_frame_size + 1
            input_frame = x[:, :, start:end].unsqueeze(0).float()
            output = model(input_frame).squeeze().cpu().item()
            onset_predictions.append(output)
    onset_predictions = np.array(onset_predictions)
    onset_signal = np.convolve(onset_predictions, np.hamming(10), mode='same')
    return onset_signal

def autocorrelate(signal, lag):
    r = np.zeros(len(signal) - lag)
    for t in range(len(signal) - lag):
        r[t] = signal[t + lag] * signal[t]
    return np.sum(r)

def to_bpm(max_r):
    return 60 * utils.SAMPLING_RATE / utils.HOP_LENGTH / (max_r + 25)

def autocorrelate_tao(signal, min_tao=25, max_tao=87):
    return np.array([autocorrelate(signal, tao) for tao in range(min_tao, max_tao)])

def get_tempo(model, x, top_n=2):
    onset_signal_res = raw_onset_signal(model, x)
    taos = autocorrelate_tao(onset_signal_res)
    peaks = find_peaks(taos)[0]
    highest_peaks = np.argsort(-taos[peaks])[:top_n]

    return list(reversed([to_bpm(r) for r in peaks[highest_peaks]]))

In [7]:
# def estimate_beats(onset_signal, tempo_bpm):
#     # Convert tempo to frame period
#     tempo_period = 60 / tempo_bpm * utils.SAMPLING_RATE / utils.HOP_LENGTH  # in frames
#     max_lag = int(tempo_period * 1.5)
#     min_lag = int(tempo_period * 0.5)
    
#     # Dynamic programming for beat tracking
#     dp = np.zeros(len(onset_signal))
#     backtrack = np.zeros(len(onset_signal), dtype=int)

#     for t in range(min_lag, len(onset_signal)):
#         max_score = -np.inf
#         best_lag = 0
#         for lag in range(min_lag, max_lag):
#             if t - lag >= 0:
#                 score = onset_signal[t] + dp[t - lag]
#                 if score > max_score:
#                     max_score = score
#                     best_lag = lag
#         dp[t] = max_score
#         backtrack[t] = best_lag
    
#     # Backtrack to find beat positions
#     beat_positions = []
#     t = np.argmax(dp)
#     while t >= min_lag:
#         beat_positions.append(t)
#         t -= backtrack[t]
    
#     beat_positions = beat_positions[::-1]  # reverse to get the correct order
#     return np.array(beat_positions) * utils.HOP_LENGTH / utils.SAMPLING_RATE


In [8]:
# def custom_beat_tracking(spec, sample_rate, tempo_estimations):    
#     num_frames = spec.shape[-1]
#     duration = num_frames * utils.HOP_LENGTH / sample_rate

#     onset_signal = raw_onset_signal(model, spec)

#     # Calculate the average tempo if multiple tempo estimations are provided
#     if isinstance(tempo_estimations, (list, np.ndarray)):
#         tempo = np.mean(tempo_estimations)
#     else:
#         tempo = tempo_estimations

#     taos = autocorrelate_tao(onset_signal)
#     peaks = find_peaks(taos)[0]

#     # Convert peak indices to time
#     peak_times = librosa.frames_to_time(peaks, sr=sample_rate, hop_length=utils.HOP_LENGTH)
        
#     # Initialize beat times list
#     beat_times = []

#     beat_interval = 60.0 / tempo

#     # Align detected peaks with expected beat intervals
#     for peak_time in peak_times:
#         current_beat_time = peak_time
#         while current_beat_time < duration:
#             beat_times.append(current_beat_time)
#             current_beat_time += beat_interval
    
#     return np.array(beat_times)


In [9]:
# Load the paths and then split them into train and test set (validation set in our case for now).
wav_files_paths_train, beat_files_paths_train, _, _ = utils.load_dataset_paths(train_dataset_path, is_train_dataset=True)

In [10]:
# Prepare train data
features_train, sample_rates_train = utils.preprocess_audio(wav_files_paths_train)

# tempo_train = utils.load_tempo_annotations_from_files(y_train_paths)

100%|██████████| 127/127 [00:04<00:00, 26.93it/s]


In [30]:
onsets = [raw_onset_signal(model, x, mean, std) for x in features_train]

In [31]:
tempos = [get_tempo(model, x) for x in features_train]

In [38]:
# import os
# import librosa
# import numpy as np
# import torch
# from torch.utils.data import Dataset, DataLoader

# class BeatTrackingDataset(Dataset):
#     def __init__(self, directory):
#         self.directory = directory
#         self.file_pairs = self._load_file_pairs()
#         self.data = []
#         self.labels = []
#         self._prepare_dataset()
    
#     def _load_file_pairs(self):
#         file_pairs = []
#         for file in os.listdir(self.directory):
#             if file.endswith(".wav"):
#                 wav_file = os.path.join(self.directory, file)
#                 beat_file = os.path.join(self.directory, file.replace(".wav", ".beats.gt"))
#                 if os.path.exists(beat_file):
#                     file_pairs.append((wav_file, beat_file))
#         return file_pairs

#     def _load_annotations(self, annotations_file):
#         with open(annotations_file, 'r') as f:
#             annotations = [float(line.strip().split("\t")[0]) for line in f]
#         return annotations

#     def _prepare_dataset(self):
#         for wav_file, beat_file in self.file_pairs:
#             y, sr = librosa.load(wav_file, sr=None)
#             mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
#             onset_env = librosa.onset.onset_strength(y=y, sr=sr)
#             beat_times = self._load_annotations(beat_file)
#             beat_frames = librosa.time_to_frames(beat_times, sr=sr)
#             labels = np.zeros(len(onset_env))
#             labels[beat_frames] = 1  # Mark the beats
#             self.data.append(np.vstack([mfcc, onset_env]))
#             self.labels.append(labels)
    
#     def __len__(self):
#         return len(self.data)
    
#     def __getitem__(self, idx):
#         return torch.tensor(self.data[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.float32)


In [34]:
# def create_dataloader(dir, batch_size=1, shuffle=True):
#     dataset = BeatTrackingDataset(dir)
#     return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [31]:
# import torch.nn as nn
# import torch.nn.functional as F

# class Chomp1d(nn.Module):
#     def __init__(self, chomp_size):
#         super(Chomp1d, self).__init__()
#         self.chomp_size = chomp_size

#     def forward(self, x):
#         return x[:, :, :-self.chomp_size]

# class TemporalBlock(nn.Module):
#     def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2):
#         super(TemporalBlock, self).__init__()
#         self.conv1 = nn.Conv1d(n_inputs, n_outputs, kernel_size,
#                                stride=stride, padding=padding, dilation=dilation)
#         self.chomp1 = Chomp1d(padding)
#         self.relu1 = nn.ReLU()
#         self.dropout1 = nn.Dropout(dropout)

#         self.conv2 = nn.Conv1d(n_outputs, n_outputs, kernel_size,
#                                stride=stride, padding=padding, dilation=dilation)
#         self.chomp2 = Chomp1d(padding)
#         self.relu2 = nn.ReLU()
#         self.dropout2 = nn.Dropout(dropout)

#         self.net = nn.Sequential(self.conv1, self.chomp1, self.relu1, self.dropout1,
#                                  self.conv2, self.chomp2, self.relu2, self.dropout2)
#         self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if n_inputs != n_outputs else None
#         self.relu = nn.ReLU()
#         self.init_weights()

#     def init_weights(self):
#         self.conv1.weight.data.normal_(0, 0.01)
#         self.conv2.weight.data.normal_(0, 0.01)
#         if self.downsample is not None:
#             self.downsample.weight.data.normal_(0, 0.01)

#     def forward(self, x):
#         out = self.net(x)
#         res = x if self.downsample is None else self.downsample(x)
#         return self.relu(out + res)

# class TemporalConvNet(nn.Module):
#     def __init__(self, num_inputs, num_channels, kernel_size=2, dropout=0.2):
#         super(TemporalConvNet, self).__init__()
#         layers = []
#         num_levels = len(num_channels)
#         for i in range(num_levels):
#             dilation_size = 2 ** i
#             in_channels = num_inputs if i == 0 else num_channels[i-1]
#             out_channels = num_channels[i]
#             layers += [TemporalBlock(in_channels, out_channels, kernel_size, stride=1, dilation=dilation_size,
#                                      padding=(kernel_size-1) * dilation_size, dropout=dropout)]

#         self.network = nn.Sequential(*layers)

#     def forward(self, x):
#         return self.network(x)


In [32]:
# class BeatTrackingModel(nn.Module):
#     def __init__(self, num_inputs, num_channels, kernel_size=2, dropout=0.2):
#         super(BeatTrackingModel, self).__init__()
#         self.tcn = TemporalConvNet(num_inputs, num_channels, kernel_size, dropout)
#         self.linear = nn.Linear(num_channels[-1], 1)

#     def forward(self, x):
#         y1 = self.tcn(x)
#         o = self.linear(y1.transpose(1, 2))
#         return torch.sigmoid(o)


In [None]:
# import torch.optim as optim

# def train_model(model, dataloader, num_epochs=50, lr=0.001):
#     criterion = nn.BCELoss()
#     optimizer = optim.Adam(model.parameters(), lr=lr)

#     for epoch in range(num_epochs):
#         model.train()
#         running_loss = 0.0
#         for inputs, labels in dataloader:
#             inputs, labels = inputs.to(device), labels.to(device)

#             optimizer.zero_grad()
#             outputs = model(inputs)
#             outputs = outputs.squeeze(-1)
#             print(labels)
#             loss = criterion(outputs, labels)
#             loss.backward()
#             optimizer.step()

#             running_loss += loss.item()
        
#         print(f"Epoch {epoch+1}, Loss: {running_loss/len(dataloader)}")

# # Example usage:
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# model = BeatTrackingModel(num_inputs=14, num_channels=[25, 25, 25], kernel_size=2, dropout=0.2).to(device)
# dataloader = create_dataloader(train_dataset_path)
# train_model(model, dataloader)


In [42]:
# def predict_beats(model, file_path):
#     y, sr = librosa.load(file_path, sr=None)
#     mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
#     onset_env = librosa.onset.onset_strength(y=y, sr=sr)
#     data = np.vstack([mfcc, onset_env])
#     data = torch.tensor(data, dtype=torch.float32).unsqueeze(0).to(device)
    
#     with torch.no_grad():
#         model.eval()
#         output = model(data)
#         output = output.squeeze().cpu().numpy()
    
#     predicted_beats = np.where(output > 0.5)[0]
#     beat_times = librosa.frames_to_time(predicted_beats, sr=sr)
#     return beat_times

# # Example usage:
# beat_times = predict_beats(model, "data/test/test02.wav")
# print(beat_times)


[]


In [None]:
# import numpy as np
# import librosa
# import math


# class BeatAgent:
#     def __init__(self, start_time, tempo, onset_times, sr, hop_length, inner_window, outer_window, score=0):
#         self.start_time = start_time
#         self.current_time = start_time
#         self.tempo = tempo
#         self.onset_times = onset_times
#         self.sr = sr
#         self.hop_length = hop_length
#         self.beat_interval = 60.0 / tempo
#         self.beat_times = [start_time]
#         self.score = score
#         self.inner_window = inner_window
#         self.outer_window = outer_window
#         self.prev_event = None

#     def update(self, duration):
#         while self.current_time < duration:
#             self.prev_event = self.current_time
#             # next_value = self.current_time + self.beat_interval
#             # nearest_event = self.__find_nearest(self.onset_times, next_value)
#             # print(type(self.onset_times))
#             # nearest_event = min(self.onset_times, key=lambda x: abs(x - next_value))
    
#             # if abs(nearest_event - next_value) <= self.inner_window and self.prev_event is not None and self.prev_event < nearest_event:
#             #     self.current_time = nearest_event
#             # else:
#             self.current_time += self.beat_interval

#             self.beat_times.append(self.current_time)
    
#     def __find_nearest(self, array, value):
#         idx = np.searchsorted(array, value, side="left")
#         if idx > 0 and (idx == len(array) or math.fabs(value - array[idx-1]) < math.fabs(value - array[idx])):
#             return array[idx-1]
#         else:
#             return array[idx]

#     def calculate_score(self):
#         beat_intervals = np.diff(self.beat_times)
#         mean_interval = np.mean(beat_intervals)
#         self.score = np.sum((beat_intervals - mean_interval) ** 2)
    
#     def process_event(self, event_time):
#         time_diff = abs(event_time - self.current_time)

#         if time_diff <= self.inner_window and self.prev_event is not None and event_time > self.prev_event:
#             self.prev_event = self.current_time
#             self.current_time = event_time
#             self.beat_times.append(event_time)
#             return True
#         if time_diff <= self.outer_window:
#             return False
#         return False
    
#     def __str__(self):
#         return "Start time: " + str(self.start_time) + ", Tempo:" + str(self.tempo)

# def multiple_agent_beat_tracking(wav_file, onsets, tempo_estimations):
#     y, sr = librosa.load(wav_file)
#     duration = librosa.get_duration(y=y, sr=sr)

#     agents = []
#     for tempo in tempo_estimations:
#         for onset_time in onsets:
#             agents.append(BeatAgent(start_time=onset_time, tempo=tempo, onset_times=onsets, sr=sr, hop_length=utils.HOP_LENGTH, inner_window=0.05, outer_window=0.1))
    
#     print(len(agents))

#     for event_time in onsets:
#         new_agents = []
#         for agent in agents:
#             if not agent.process_event(event_time):
#                 cloned_agent = BeatAgent(agent.current_time, agent.tempo, agent.onset_times, agent.sr, agent.hop_length, agent.inner_window, agent.outer_window, score=agent.score)
#                 if cloned_agent.process_event(event_time):
#                     new_agents.append(cloned_agent)
#         agents.extend(new_agents)

#     for agent in agents:
#         agent.update(duration)
#         agent.calculate_score()

#     best_agent = max(agents, key=lambda agent: agent.score)
#     print(best_agent)

#     return np.array(best_agent.beat_times)

# wav_file = "data/train/Media-105810(5.0-15.0).wav"
# # wav_file = "data/test/test48.wav"
# ft_, sr_ = utils.preprocess_audio([wav_file])
# ft_ = ft_[0]
# sr_ = sr_[0]
# # onset_ = raw_onset_signal(model, ft_, mean, std)
# # peaks, _ = scipy.signal.find_peaks(onset_, height=np.max(onset_) * 0.5)

# # onset_times = librosa.frames_to_time(peaks, sr=utils.SAMPLING_RATE, hop_length=utils.HOP_LENGTH)
# f=open("data/train/Media-105810(5.0-15.0).onsets.gt", "r")
# lines=f.readlines()
# true_onsets=[]
# for x in lines:
#     true_onsets.append(float(x.split('\t')[0]))
# f.close()

# tempo_ = get_tempo(model, ft_)
# beats_ = multiple_agent_beat_tracking(wav_file, true_onsets, tempo_)
# beats_

In [22]:
import numpy as np

class BeatAgent:
    def __init__(self, id, start_time, tempo_hypothesis, initial_tempo, inner_window, outer_window, parent_agent=None):
        self.id = str(id)
        self.start_time = start_time
        self.initial_tempo = initial_tempo
        self.tempo_hypothesis = tempo_hypothesis
        self.beat_interval = 60.0 / tempo_hypothesis
        self.next_prediction = start_time + self.beat_interval
        # self.current_time = start_time
        self.accepted_events = [start_time]
        self.inner_window = inner_window
        self.outer_window = outer_window
        self.moved = False
        self.score = 0
        # self.source = ["init"]
    
        # this means it is a sub agebt that is created from an outer_window prediction
        if parent_agent is not None:
            self.score = parent_agent.score
            self.moved = True
            self.next_prediction = parent_agent.next_prediction + self.beat_interval
            self.accepted_events = parent_agent.accepted_events.copy()
            # self.source = parent_agent.source.copy()
            self.accepted_events.append(parent_agent.next_prediction)
            # self.source.append("new agent")
    
    def process_event(self, event):
        event_diff = abs(event - self.next_prediction)
       
        if event == self.next_prediction: # the event predicted
            # print("Exact\n")
            self.accepted_events.append(event)
            # self.source.append("exact")
            self.score += self.outer_window
            # self.current_time = event
            self.next_prediction = event + self.beat_interval
            self.moved = True
            return None
        elif event_diff <= self.inner_window: # the event is inside the inner window
            # print("Inner Window\n")
            self.accepted_events.append(event)
            # self.source.append("inner")
            self.__update_tempo_hypothesis(event)
            self.score += (self.outer_window - event_diff)
            # self.current_time = event
            self.next_prediction = event + self.beat_interval
            self.moved = True
            return None
        elif event_diff <= self.outer_window: # the event is inside the outer window
            # print("Outer Window\n")
            new_agent = BeatAgent(self.id + "a", self.start_time, self. tempo_hypothesis, 
                                  self.initial_tempo, self.inner_window, self.outer_window, self)

            self.accepted_events.append(event)
            # self.source.append("outer")
            self.__update_tempo_hypothesis(event)
            self.score += (self.outer_window - event_diff)
            # self.current_time = event
            self.next_prediction = event + self.beat_interval
            self.moved = True
            return new_agent # return a new agent
        elif event > self.next_prediction + self.outer_window:
            # print("Interpolated Beat\n")
            # interpolated beat, grants no score
            self.accepted_events.append(event)
            # self.current_time = event
            self.next_prediction = event + self.beat_interval
            # self.source.append("interpolate")
            self.moved = True
            return None
        
        # print("Ignore\n")
    
    def __update_tempo_hypothesis(self, event):
        self.tempo_hypothesis = self.tempo_hypothesis + (event - self.next_prediction)
        self.beat_interval = 60.0 / self.tempo_hypothesis
    
    def __str__(self):
        return "Start time: " + str(self.start_time) + ", Tempo:" + str(self.initial_tempo)

def prune_similiar_agents(agents): # 
    agents_tempo_map = {}
    agents_to_keep = []

    for agent in agents:
        th = round(agent.tempo_hypothesis, 3)
        if not agent.moved:
            agents_to_keep.append(agent)
        elif th not in agents_tempo_map:
            agents_tempo_map[th] = agent
        elif agent.score > agents_tempo_map[th].score:
                agents_tempo_map[th] = agent

    for th in agents_tempo_map:
        agents_to_keep.append(agents_tempo_map[th])
    
    return agents_to_keep

def multiple_agent_beat_tracking(onsets, tempo_estimations):
    id = 0
    agents = []
    for tempo in tempo_estimations:
        for idx in range(0, min(10, len(onsets))):
            agents.append(BeatAgent(id, onsets[idx], tempo, tempo, 0.05, 0.1))
            id += 1
    
    # agents = [BeatAgent(onsets[0], tempo_estimations[1], tempo_estimations[1], 0.055, 0.1)]
    for event_time in onsets:
        next_agents = []
        for agent in agents:
            if agent.start_time < event_time:
                new_agent = agent.process_event(event_time)
                if new_agent is not None:
                    next_agents.append(new_agent)
        
        
        
        agents += next_agents
        agents = prune_similiar_agents(agents)

    best_agent: BeatAgent = max(agents, key=lambda agent: agent.score)
    # print(best_agent)
    # print(best_agent.id)
    # print(best_agent.source)

    return np.array(best_agent.accepted_events)

wav_file = "data/train/Media-105810(5.0-15.0).wav"
# wav_file = "data/test/test48.wav"
ft_, sr_ = utils.preprocess_audio([wav_file])
ft_ = ft_[0]
# sr_ = sr_[0]
# onset_ = raw_onset_signal(model, ft_, mean, std)

# import scipy
# peaks, _ = scipy.signal.find_peaks(onset_, height=np.max(onset_) * 0.5)

# onset_times = librosa.frames_to_time(peaks, sr=utils.SAMPLING_RATE, hop_length=utils.HOP_LENGTH)

###### used the ground truth onsets in order to not blame outside factors
f=open("data/train/Media-105810(5.0-15.0).onsets.gt", "r")
f=open("data/train/train20.onsets.gt", "r")
lines=f.readlines()
true_onsets=[]
for x in lines:
    true_onsets.append(float(x.split('\t')[0]))
f.close()
######

tempo_ = get_tempo(model, ft_)
beats_ = multiple_agent_beat_tracking(true_onsets, tempo_)
beats_

100%|██████████| 1/1 [00:00<00:00, 39.49it/s]


array([ 0.07764172,  0.36208617,  0.65088435,  0.98776318,  1.33297052,
        1.66983359,  2.01      ,  2.29369615,  2.63065354,  2.95764172,
        3.29461798,  3.64      ,  3.97696035,  4.34068027,  4.67758999,
        5.00244898,  5.3393815 ,  5.67219955,  5.96      ,  6.29703331,
        6.6430839 ,  6.93      ,  7.26711111,  7.58131519,  7.870839  ,
        8.16979592,  8.45786848,  8.75      ,  9.11      ,  9.44745399,
        9.78213152, 10.11959078, 10.46      , 10.74      , 11.07756274,
       11.40027211, 11.68      , 12.01770089, 12.34358277, 12.61424036,
       12.88707483, 13.17442177, 13.51249281, 13.85      , 14.10104308,
       14.41160998, 14.74990072, 15.09      , 15.42828729, 15.69      ,
       16.03      , 16.36843042, 16.72      , 17.07      , 17.40838321,
       17.72843537, 18.06685357, 18.39      , 18.65      , 18.94      ,
       19.27869007, 19.60780045, 19.94650883, 20.24562358, 20.58440769,
       20.92408163, 21.23      , 21.56884528, 21.9       , 22.19

In [23]:
features_train, sample_rates_train = utils.preprocess_audio(wav_files_paths_train)

100%|██████████| 126/126 [00:04<00:00, 28.72it/s]


In [None]:
beatsT = {}
tempos = {}
for _, (ft, wav_file) in enumerate(zip(features_train, wav_files_paths_train)):
    filename = wav_file.split('/')[-1].replace('.wav', '')
    # on = get_onset_preds(model, ft, mean, std)
    f=open(wav_file.replace(".wav", ".onsets.gt"), "r")
    lines=f.readlines()
    true_onsets=[]
    for x in lines:
        true_onsets.append(float(x))
    f.close()
    temp = get_tempo(model, ft)
    beatsT[filename] = multiple_agent_beat_tracking(true_onsets, temp)

In [None]:
pred = {}
if "./data/train/ff123_bloodline.wav" in wav_files_paths_train:
    wav_files_paths_train.remove("./data/train/ff123_bloodline.wav")

for filename in wav_files_paths_train:
    filename = filename.split('/')[-1].replace('.wav', '')
    pred[filename] = {'beats': beatsT[filename]}

In [26]:
target = {}
if "./data/train/ff123_bloodline.wav" in wav_files_paths_train:
    wav_files_paths_train.remove("./data/train/ff123_bloodline.wav")

for filename in wav_files_paths_train:
    wav_f = filename.split('/')[-1].replace('.wav', '')
    beat_filename = filename.replace(".wav", ".beats.gt")
    f=open(beat_filename, "r")
    lines=f.readlines()
    result=[]
    for x in lines:
        result.append(float(x.split('\t')[0]))
    f.close()
    target[wav_f] = {'beats': result}

In [27]:
def evaluate_loop(submission, target):
    sum_f = 0.
    for target_key, target_value in target.items():
        if target_key in submission:
            reference = target_value['beats']
            estimated = submission[target_key]['beats']
            f = mir_eval.beat.f_measure(
                np.array(reference),
                np.array(estimated),
                f_measure_threshold=0.07  # 70 [ms]
            )
        else:
            f = 0.

        sum_f += f
    return sum_f / len(target)


print(evaluate_loop(pred, target))

0.4899458469356703


In [None]:
import json
file_path = 'beats.json'

# Open the file in write mode and save the dictionary
with open(file_path, 'w') as f:
    json.dump(pred, f, indent=4)