In [1]:
import sys
import os
if os.path.abspath('../sample') not in sys.path:
    sys.path.insert(0, os.path.abspath('../sample'))
if os.path.abspath('../model') not in sys.path:
    sys.path.insert(0, os.path.abspath('../model'))

import msaf

import overall_scripts as scr
import data_manipulation as dm
import pandas as pd
import numpy as np
pd.set_option('precision', 4)

In this notebook are presented the results of the baseline, computed with MSAF.

We restricted the baseline to three algorithms:
 - CNMF [1],
 - Foote's novelty [2],
 - Sepctral Clustering [3].

In [2]:
desired_algos = ["cnmf", "foote", "scluster"]

In [4]:
folder = "C:\\Users\\amarmore\\Desktop\\Audio samples\\RWC Pop\\Entire RWC"
annotations_type = "MIREX10"
annotations_folder = "C:\\Users\\amarmore\\Desktop\\Audio samples\\RWC Pop\\annotations\\{}\\".format(annotations_type)

Below is the code to compute these scores.

In [3]:
def parse_all_algos(song_path, references_segments, bars):
    # Function which computes all frontiers for this song by the desired algorithms,
    # and then computes segmentation scores.
    zero_five_to_return = []
    three_to_return = []
    
    five_rates = []
    three_rates = []

    for algo in desired_algos:
        boundaries, _ = msaf.process(song_path, boundaries_id=algo)
        segments = np.array(dm.frontiers_to_segments(boundaries))

        zero_five_to_return.append(dm.compute_score_of_segmentation(
            references_segments, segments, window_length=0.5))
        three_to_return.append(dm.compute_score_of_segmentation(
            references_segments, segments, window_length=3))
        
        five_rates.append(dm.compute_rates_of_segmentation(
            references_segments, segments, window_length=0.5))
        original_rates = dm.compute_rates_of_segmentation(
            references_segments, segments, window_length=3)
        three_rates.append(original_rates)
        
        aligned_segments = dm.align_segments_on_bars(segments, bars)
        zero_five_to_return.append(dm.compute_score_of_segmentation(
            references_segments, aligned_segments, window_length=0.5))
        three_to_return.append(dm.compute_score_of_segmentation(
            references_segments, aligned_segments, window_length=3))
        
        five_rates.append(dm.compute_rates_of_segmentation(
            references_segments, aligned_segments, window_length=0.5))
        aligned_rates = dm.compute_rates_of_segmentation(
            references_segments, aligned_segments, window_length=3)
        three_rates.append(aligned_rates)
        
    return zero_five_to_return, three_to_return, five_rates, three_rates

In [1]:
# Script which parses all songs of RWC, computes its frontiers for all algorithms, and then 
zero_point_five_results = []
three_seconds_results = []
five_rates_results = []
three_rates_results = []

paths = scr.load_RWC_dataset(folder, annotations_type = annotations_type)
persisted_path = "C:\\Users\\amarmore\\Desktop\\data_persisted\\"

for song_and_annotations in paths:
    song_path = folder + "\\" + song_and_annotations[0]
    print(song_and_annotations[0])

    annot_path = annotations_folder + song_and_annotations[1]
    annotations = dm.get_segmentation_from_txt(annot_path, annotations_type)
    references_segments = np.array(annotations)[:,0:2]
    
    bars = scr.load_bars(persisted_path, song_and_annotations[0].replace(".wav",""))
    this_zero, this_three, five_rates, three_rates = parse_all_algos(song_path, references_segments, bars)

    zero_point_five_results.append(this_zero)
    three_seconds_results.append(this_three)
    
    five_rates_results.append(five_rates)
    three_rates_results.append(three_rates)

In [6]:
zerofive = np.array(zero_point_five_results)
three = np.array(three_seconds_results)

all_algos = [alg for alg in desired_algos]

params = ['Original', 'Aligned on downbeats']
line = []
subline = []
for i in all_algos:
    for j in params:
        line.append(i)
        subline.append(j)
arr = []
col = [np.array(['0.5 seconds','0.5 seconds','0.5 seconds','3 seconds','3 seconds','3 seconds']),
    np.array(['Precision', 'Recall', 'F measure','Precision', 'Recall', 'F measure'])]

nested_lines = [np.array(line), np.array(subline)]

for i in range(len(line)):
    arr.append([np.mean(zerofive[:,i,0]),np.mean(zerofive[:,i,1]), np.mean(zerofive[:,i,2]),
            np.mean(three[:,i,0]),np.mean(three[:,i,1]), np.mean(three[:,i,2])])

pd.DataFrame(np.array(arr), index=nested_lines, columns=col)

Unnamed: 0_level_0,Unnamed: 1_level_0,0.5 seconds,0.5 seconds,0.5 seconds,3 seconds,3 seconds,3 seconds
Unnamed: 0_level_1,Unnamed: 1_level_1,Precision,Recall,F measure,Precision,Recall,F measure
cnmf,Original,0.2284,0.2146,0.2152,0.4676,0.4517,0.4469
cnmf,Aligned on downbeats,0.3157,0.2811,0.2881,0.5068,0.4537,0.4653
foote,Original,0.2965,0.223,0.2514,0.6389,0.4859,0.5449
foote,Aligned on downbeats,0.4203,0.2995,0.3448,0.6706,0.4766,0.5501
scluster,Original,0.3123,0.3045,0.2944,0.6065,0.6084,0.5812
scluster,Aligned on downbeats,0.4921,0.4503,0.4501,0.6554,0.6056,0.603


In [7]:
zerofive = np.array(five_rates_results)
three = np.array(three_rates_results)

all_algos = [alg for alg in desired_algos]

params = ['Original', 'Aligned on downbeats']
line = []
subline = []
for i in all_algos:
    for j in params:
        line.append(i)
        subline.append(j)
arr = []
col = [np.array(['0.5 seconds','0.5 seconds','0.5 seconds','3 seconds','3 seconds','3 seconds']),
    np.array(['TP', 'FP', 'FN','TP', 'FP', 'FN'])]

nested_lines = [np.array(line), np.array(subline)]

for i in range(len(line)):
    arr.append([np.mean(zerofive[:,i,0]),np.mean(zerofive[:,i,1]), np.mean(zerofive[:,i,2]),
            np.mean(three[:,i,0]),np.mean(three[:,i,1]), np.mean(three[:,i,2])])

pd.DataFrame(np.array(arr), index=nested_lines, columns=col)

Unnamed: 0_level_0,Unnamed: 1_level_0,0.5 seconds,0.5 seconds,0.5 seconds,3 seconds,3 seconds,3 seconds
Unnamed: 0_level_1,Unnamed: 1_level_1,TP,FP,FN,TP,FP,FN
cnmf,Original,3.96,14.99,14.85,8.49,10.46,10.32
cnmf,Aligned on downbeats,5.22,12.52,13.59,8.49,9.25,10.32
foote,Original,4.16,10.23,14.65,9.13,5.26,9.68
foote,Aligned on downbeats,5.58,7.93,13.23,8.92,4.59,9.89
scluster,Original,5.71,15.18,13.1,11.37,9.52,7.44
scluster,Aligned on downbeats,8.4,10.85,10.41,11.29,7.96,7.52


In [8]:
# Results on all dataframe on AIST annotations
a = [[0.20186862214734336, 0.18139564043366033, 0.1830411815179583, 0.46862395942965773, 0.4366175604196435, 0.43346937034328387], [0.24980960734838084, 0.23978915917754587, 0.23557080631425656, 0.45848130838215917, 0.44478809340851344, 0.4353684372544839], [0.2564757675245291, 0.1809890098619268, 0.20838316045420405, 0.6464279778099439, 0.4682207797709985, 0.5331560155987777], [0.31329167872816077, 0.24336851057887532, 0.26935347999993325, 0.6001700324641334, 0.4664109003509763, 0.5161388533581156], [0.28168488560559163, 0.2539043758497925, 0.25324629708941093, 0.6140897607452479, 0.5980661310846564, 0.5754178377879772], [0.3803138010597339, 0.3695549783650638, 0.3594369001849247, 0.5952420795968422, 0.5944136047089068, 0.5694506859212222]]

In [9]:
pd.DataFrame(np.array(a), index=nested_lines, columns=col)

Unnamed: 0_level_0,Unnamed: 1_level_0,0.5 seconds,0.5 seconds,0.5 seconds,3 seconds,3 seconds,3 seconds
Unnamed: 0_level_1,Unnamed: 1_level_1,TP,FP,FN,TP,FP,FN
cnmf,Original,0.2019,0.1814,0.183,0.4686,0.4366,0.4335
cnmf,Aligned on downbeats,0.2498,0.2398,0.2356,0.4585,0.4448,0.4354
foote,Original,0.2565,0.181,0.2084,0.6464,0.4682,0.5332
foote,Aligned on downbeats,0.3133,0.2434,0.2694,0.6002,0.4664,0.5161
scluster,Original,0.2817,0.2539,0.2532,0.6141,0.5981,0.5754
scluster,Aligned on downbeats,0.3803,0.3696,0.3594,0.5952,0.5944,0.5695


In [10]:
np.std(np.array(zero_point_five_results)[:,2,2])

0.06842143876723239

# References

[1] Nieto, O., & Jehan, T. (2013, May). Convex non-negative matrix factorization for automatic music structure identification. In 2013 IEEE International Conference on Acoustics, Speech and Signal Processing (pp. 236-240). IEEE.

[2] Foote, J. (2000, July). Automatic audio segmentation using a measure of audio novelty. In 2000 IEEE International Conference on Multimedia and Expo. ICME2000. Proceedings. Latest Advances in the Fast Changing World of Multimedia (Cat. No. 00TH8532) (Vol. 1, pp. 452-455). IEEE.

[3] McFee, B., & Ellis, D. (2014). Analyzing Song Structure with Spectral Clustering. In ISMIR (pp. 405-410).