In [3]:
# FUNCTIONS

# for MIR-QBSH query
def read_semitone_from_MIR_query(dir_query):
    """Load the semitone sequence of every MIR-QBSH query CSV in a folder.

    Parameters
    ----------
    dir_query : str
        Folder holding one ``.csv`` file per query. Each file must contain a
        ``semitone`` column. The third dash-separated field of the file name
        (extension removed) is used as the ground-truth label.

    Returns
    -------
    list_query_name : list of str
        File names without the ``.csv`` extension.
    df_query : list of numpy.ndarray
        Flattened ``semitone`` column of each file, aligned with the names.
    truth : list of str
        Ground-truth label of each query, aligned with the names.

    Raises
    ------
    IndexError
        If a file name has fewer than three dash-separated fields.
    """
    list_query_name = []
    df_query = []
    truth = []

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for file in sorted(os.listdir(dir_query)):
        if not file.endswith(".csv"):
            continue
        query_name = file.replace(".csv", "")
        temp_df = pd.read_csv(os.path.join(dir_query, file), usecols=["semitone"])
        list_query_name.append(query_name)
        df_query.append(temp_df.to_numpy().flatten())
        # Split the extension-free name so the label never carries ".csv" when
        # it is the last field (same rule as the manual-label reader).
        truth.append(query_name.split("-")[2])

    return list_query_name, df_query, truth

# for MIR-QBSH manually labelled query
def read_semitone_from_MIR_manual_query(dir_query):
    """Load the manually labelled pitch sequence of every ``.pv`` file in a folder.

    Each file is read as header-less numeric text. Every cell is treated as
    one pitch value; zeros and non-numeric cells are dropped and the rest are
    rounded to the nearest integer.

    Parameters
    ----------
    dir_query : str
        Folder holding one ``.pv`` file per query. The third dash-separated
        field of the file name (extension removed) is the ground-truth label.

    Returns
    -------
    list_query_name : list of str
        File names without the ``.pv`` extension.
    df_query : list of list of int
        Rounded non-zero pitch values of each file, aligned with the names.
    truth : list of str
        Ground-truth label of each query, aligned with the names.

    Raises
    ------
    IndexError
        If a file name has fewer than three dash-separated fields.
    """
    list_query_name = []
    df_query = []
    truth = []

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for file in sorted(os.listdir(dir_query)):
        if not file.endswith(".pv"):
            continue
        query_name = file.replace(".pv", "")
        # header=None: with the default header inference the first pitch value
        # of a header-less file is consumed as a column name and lost.
        raw = pd.read_csv(os.path.join(dir_query, file), header=None)
        # Coerce so that a textual header line (if a file has one) becomes NaN
        # instead of breaking round() below.
        values = pd.to_numeric(raw.to_numpy().flatten(), errors="coerce")
        # Zeros are skipped (presumably unvoiced frames - confirm with the
        # dataset description); NaN cannot be rounded and is skipped as well.
        round_df = [round(float(v)) for v in values if pd.notna(v) and v != 0]

        list_query_name.append(query_name)
        df_query.append(round_df)
        truth.append(query_name.split("-")[2])

    return list_query_name, df_query, truth

# for IOACAS_QBH query
def read_semitone_from_IOACAS_query(dir_query):
    """Load the semitone sequence and ground truth of every IOACAS_QBH query.

    The folder must contain ``query_truth.list``: a tab-separated file without
    a header whose first column is the path of a ``.wav`` recording and whose
    second column is the ground-truth label of that recording. Every ``.csv``
    file in the folder is matched to a recording through the part of its file
    name that precedes the first dash.

    Parameters
    ----------
    dir_query : str
        Folder holding ``query_truth.list`` and one ``.csv`` file (with a
        ``semitone`` column) per query.

    Returns
    -------
    list_query_name : list of str
        File names without the ``.csv`` extension.
    df_query : list of numpy.ndarray
        Flattened ``semitone`` column of each file, aligned with the names.
    truth : list of str
        Ground-truth label of each query as a string, aligned with the names.

    Raises
    ------
    ValueError
        If a query file has no matching entry in ``query_truth.list``.
    """
    # ntpath understands both "\\" and "/" separators on every platform, so
    # the recording name is found whichever separator the list file uses.
    import ntpath

    list_query_name = []
    df_query = []
    truth = []

    truth_file = os.path.join(dir_query, "query_truth.list")
    truth_df = pd.read_csv(truth_file, header=None, sep="\t")

    # Recording name (no folder, no ".wav") -> label. setdefault keeps the
    # first entry when a recording is listed more than once.
    truth_by_wav = {}
    for wav, label in zip(truth_df[0], truth_df[1].to_numpy().flatten()):
        wav_name = ntpath.basename(wav).replace(".wav", "")
        truth_by_wav.setdefault(wav_name, label)

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for file in sorted(os.listdir(dir_query)):
        if not file.endswith(".csv"):
            continue
        recording = file.split("-")[0]
        if recording not in truth_by_wav:
            raise ValueError(
                "no ground truth for query file %r in %s" % (file, truth_file)
            )
        temp_df = pd.read_csv(os.path.join(dir_query, file), usecols=["semitone"])
        list_query_name.append(file.replace(".csv", ""))
        df_query.append(temp_df.to_numpy().flatten())
        truth.append(str(truth_by_wav[recording]))

    return list_query_name, df_query, truth

# for MIR-QBSH and IOACAS-QBH database
def read_note_from_midi(dir_midi):
    """Load the note sequence of every MIDI-derived CSV in a folder.

    Parameters
    ----------
    dir_midi : str
        Folder holding one ``.csv`` file per database song; each file must
        contain a ``note_index`` column.

    Returns
    -------
    list of str
        File names with the ``.mid.csv`` suffix removed, in directory order.
    list of numpy.ndarray
        Flattened ``note_index`` column of each file, aligned with the names.
    """
    midi_names = []
    note_sequences = []

    for entry in os.listdir(dir_midi):
        # Anything that is not a CSV export is ignored.
        if not entry.endswith(".csv"):
            continue
        midi_names.append(entry.replace(".mid.csv", ""))
        notes = pd.read_csv(os.path.join(dir_midi, entry), usecols=["note_index"])
        note_sequences.append(notes.to_numpy().flatten())

    return midi_names, note_sequences

# calculate relative distance
def calc_rel_dis(df):
    """Convert a pitch sequence into its list of relative distances.

    Each distance is ``float(df[k + 1] - df[k])`` for neighbouring positions.
    The resulting list is then passed through ``remove_consecutive`` and that
    function's result is returned.

    Parameters
    ----------
    df : sequence of numbers
        Indexable sequence supporting ``len``; fewer than two elements yield
        an empty list of distances.

    Returns
    -------
    list of float
    """
    steps = [float(df[k + 1] - df[k]) for k in range(len(df) - 1)]
    return remove_consecutive(steps)

# collapse runs of equal consecutive values in a list of relative distances
def remove_consecutive(list):
    """Collapse every run of equal adjacent values into a single value.

    The argument is modified in place and the same list object is returned.
    Deleting items one at a time from the middle of a list costs quadratic
    time on long runs, so the survivors are collected in a single pass and
    written back with one slice assignment.

    Parameters
    ----------
    list : list
        Values to deduplicate. The name shadows the builtin ``list``; it is
        kept unchanged so existing keyword callers continue to work.

    Returns
    -------
    list
        The same object that was passed in, with adjacent duplicates removed.
    """
    items = list
    last = len(items) - 1
    # An element survives when it is the final one or differs from its right
    # neighbour, i.e. the last element of each run is the one that is kept.
    kept = [
        value
        for pos, value in enumerate(items)
        if pos == last or not (value == items[pos + 1])
    ]
    items[:] = kept
    return items

# order the database entries by similarity score, most similar first
def get_all_rank(ls, names=None):
    """Return the database entry names ordered from highest to lowest score.

    Entries with equal scores keep their original relative order. Every entry
    appears exactly once in the result, including entries whose score is 0,
    and the input list is left untouched.

    Parameters
    ----------
    ls : list of float
        Similarity score of each database entry.
    names : list of str, optional
        Name of each database entry, aligned with ``ls``. When omitted, the
        notebook-level ``list_dir_midi`` is used.

    Returns
    -------
    list of str
        Entry names, best match first.
    """
    if names is None:
        # Backward-compatible default: the name list created when the MIDI
        # database is loaded.
        names = list_dir_midi
    # sorted() is stable even with reverse=True, so ties stay in input order.
    order = sorted(range(len(ls)), key=lambda pos: ls[pos], reverse=True)
    return [names[pos] for pos in order]

# get rank from ground truth
def get_rank(rank, truth):
    """Return the 1-based position of ``truth`` inside ``rank``.

    Raises ``ValueError`` (from ``rank.index``) when ``truth`` is not present.
    """
    zero_based = rank.index(truth)
    return zero_based + 1

# get top ten rank
def get_top_ten(rank):
    """Return the first ten entries of ``rank`` (all of them if it is shorter)."""
    limit = 10
    return rank[:limit]

# count MRR from a list of rank from some queries
def count_MRR(list_rank):
    """Compute the mean reciprocal rank (MRR) of a list of 1-based ranks.

    Parameters
    ----------
    list_rank : list of int
        Rank of the correct answer for each query, starting at 1.

    Returns
    -------
    float
        Mean of ``1 / rank`` rounded to two decimals, or ``0.0`` when no rank
        is given (instead of failing with a division by zero).

    Raises
    ------
    ZeroDivisionError
        If any rank is 0.
    """
    if len(list_rank) == 0:
        return 0.0

    reciprocal_sum = 0
    for rank in list_rank:
        reciprocal_sum += 1 / rank
    return round(reciprocal_sum / len(list_rank), 2)

In [4]:
# read midi and query data
# insert it into dataframe

import os
import pandas as pd
import edit_distance
import time

folder_data = "Data query dan midi"

# Query set to evaluate: keep exactly one `dir_query` assignment active.
# The folder name also selects the reader further down, so the words
# "manual" and "IOACAS" in these names are significant.

# test query
dir_query = os.path.join(folder_data, "Test query IOACAS_QBH")
# dir_query = os.path.join(folder_data, "Test query IOACAS_QBH_DNN-LSTM")
# dir_query = os.path.join(folder_data, "Test query MIR-QBSH")
# dir_query = os.path.join(folder_data, "Test query MIR-QBSH_DNN-LSTM")
# dir_query = os.path.join(folder_data, "Test query MIR-QBSH_manual label")

# all query
# dir_query = os.path.join(folder_data, "Query_IOACAS_QBH")
# dir_query = os.path.join(folder_data, "Query_IOACAS_QBH_DNN-LSTM")
# dir_query = os.path.join(folder_data, "Query_MIR_QBSH")
# dir_query = os.path.join(folder_data, "Query_MIR_QBSH_DNN-LSTM")
# dir_query = os.path.join(folder_data, "Query_MIR_QBSH_manual label")

# all midi: keep exactly one `dir_midi` assignment active
# dir_midi = os.path.join(folder_data, "Database midi MIR-QBSH")
dir_midi = os.path.join(folder_data, "Database midi IOACAS-QBH")


# read the pitch sequence of every query; the reader is chosen from the
# folder name ("manual" is tested first, then "IOACAS", otherwise MIR-QBSH)
if "manual" in dir_query:
    list_query_name, df_query, truth = read_semitone_from_MIR_manual_query(dir_query)
elif "IOACAS" in dir_query:
    list_query_name, df_query, truth = read_semitone_from_IOACAS_query(dir_query)
else:
    list_query_name, df_query, truth = read_semitone_from_MIR_query(dir_query)

# read the note sequence of every midi in the database
list_dir_midi, df_midi = read_note_from_midi(dir_midi)

# relative distances of every query (hypotheses) and every midi (references)
dis_query = [calc_rel_dis(query) for query in df_query]
dis_midi = [calc_rel_dis(midi) for midi in df_midi]

list_rank = []
list_time = []

# match every query against the whole midi database
for i, hyp in enumerate(dis_query):
    # perf_counter is monotonic and high resolution, unlike the wall clock
    start_time = time.perf_counter()

    # similarity ratio between this query and each midi, in database order
    list_ratio = [
        edit_distance.SequenceMatcher(a=ref, b=hyp).ratio() for ref in dis_midi
    ]

    rank = get_all_rank(list_ratio)
    rankTruth = get_rank(rank, truth[i])
    topTen = get_top_ten(rank)

    list_rank.append(rankTruth)

    # print("query",list_query_name[i],"top ten is",topTen)
    print("query",list_query_name[i],"truth is on rank",rankTruth)

    list_time.append(round(time.perf_counter() - start_time, 3))

# summary; guarded so an empty query folder does not end in ZeroDivisionError
if list_rank:
    print("MRR:", count_MRR(list_rank))
    print("Avg time:", round(sum(list_time)/len(list_time), 3))
else:
    print("No query files found in", dir_query)

query 001_001-parselmouth truth is on rank 119
query 001_002-parselmouth truth is on rank 138
MRR: 0.01
Avg time: 55.9


In [3]:
# create inverted index for 2-grams, 3-grams, 4-grams in all midi

import hashedindex
index = hashedindex.HashedIndex()

# The n-gram sizes below are counted in notes, so an n-gram is stored as
# n-1 consecutive relative distances.
#
# example:
# midi relative distance : +1 +1 +4
# 2-gram terms : +1 and +4
# 3-gram terms : (+1, +1) and (+1, +4)
# 4-gram term  : (+1, +1, +4)

# print(dis_midi[0])

# 2-grams: every single relative distance is a term (a bare value, not a tuple)
for midi_pos, intervals in enumerate(dis_midi):
    for interval in intervals:
        index.add_term_occurrence(interval, list_dir_midi[midi_pos])

# test 2-grams
# index.get_documents((2.0))

# 3-grams: every pair of neighbouring relative distances is a term
for midi_pos, intervals in enumerate(dis_midi):
    for pair in zip(intervals, intervals[1:]):
        index.add_term_occurrence(pair, list_dir_midi[midi_pos])

# test 3-grams
# index.get_documents((2.0, 0.0))

# 4-grams: every triple of neighbouring relative distances is a term
for midi_pos, intervals in enumerate(dis_midi):
    for triple in zip(intervals, intervals[1:], intervals[2:]):
        index.add_term_occurrence(triple, list_dir_midi[midi_pos])

# test 4-grams
# index.get_documents((2.0, 0.0, -2.0))