In [22]:
# read midi and query data
# insert it into dataframe

import os
import pandas as pd

dir_query = "Data query MIR-QBSH"
dir_midi = "Database midi MIR-QBSH"
dir_query_DNN = os.path.join(dir_query, "DNN-LSTM")

# to be used as index in inverted index database
list_dir_midi = []

df_query = []
df_midi = []
df_query_DNN = []
    
# read and insert semitone from query to list of dataframe
for file in os.listdir(dir_query):
    if file.endswith(".csv"):
        file_path = os.path.join(dir_query, file)
        fields = ["semitone"]
        temp_df = pd.read_csv(file_path, usecols=fields)
        df_query.append(temp_df.to_numpy().flatten())

# read and insert note_index or semitone from midi to list of dataframe
for file in os.listdir(dir_midi):
    if file.endswith(".csv"):
        list_dir_midi.append(file.replace(".mid.csv",""))
        file_path = os.path.join(dir_midi, file)
        fields = ["note_index"]
        temp_df = pd.read_csv(file_path, usecols=fields)
        df_midi.append(temp_df.to_numpy().flatten())

import math
def convert_f0_to_semitone(freq):
    res = round(12*math.log((freq/440),2)+69)
    return res

# read f0 from extracted melody (DNN-LSTM) and convert it to semitone then insert to list of dataframe
for file in os.listdir(dir_query_DNN):
    if file.endswith(".csv"):
        file_path = os.path.join(dir_query_DNN, file)
        fields = [1]
        temp_df = pd.read_csv(file_path, header=None, sep="\t", usecols=fields)
        temp_arr = temp_df.to_numpy().flatten()
        res_arr = []
        for freq in temp_arr:
            if freq != 0.0:
                res_arr.append(convert_f0_to_semitone(freq))
        df_query_DNN.append(res_arr)

In [24]:
# calculate relative distance
def calc_rel_dis(df):
    res = []
    length = len(df)
    for i in range(length-1):
        dis = float(df[i+1] - df[i])
        res.append(dis)
    res = remove_consecutive(res)
    return res

# remove consecutive distance in list of relative distance 
def remove_consecutive(list):
    i = 0
    while i < len(list)-1:
        if list[i] == list[i+1]:
            del list[i]
        else:
            i = i+1
    return list

# calculate relative distance in all query
dis_query = []
for query in df_query:
    dis_query.append(calc_rel_dis(query))

# calculate relative distance in all midi
dis_midi = []
for midi in df_midi:
    dis_midi.append(calc_rel_dis(midi))

# calculate relative distance in query from melody extraction DNN-LSTM
dis_query_DNN = []
for query in df_query_DNN:
    dis_query_DNN.append(calc_rel_dis(query))

# print(dis_query)
# print(dis_midi)
# print(dis_query_DNN)

In [25]:
# create inverted index for 2-grams, 3-grams, 4-grams in all midi

import hashedindex
index = hashedindex.HashedIndex()

# example:
# midi relative distance : +1 +1 +4
# 2-grams are +1 and +4
# 3-grams are +1 +1 and +1 +3
# 4-grams is +1 +1 +4

# print(dis_midi[0])

# inserting 2-grams as inverted index
for midiNumber in range(len(dis_midi)):
    for note in dis_midi[midiNumber]:
        index.add_term_occurrence(note, list_dir_midi[midiNumber])

# test 2-grams
index.get_documents((2.0))

# inserting 3-grams as inverted index
for midiNumber in range(len(dis_midi)):
    for noteNumber in range(len(dis_midi[midiNumber])-1):
        term = (dis_midi[midiNumber][noteNumber], dis_midi[midiNumber][noteNumber+1])
        index.add_term_occurrence(term, list_dir_midi[midiNumber])

# test 3-grams
index.get_documents((2.0, 0.0))

# inserting 4-grams as inverted index
for midiNumber in range(len(dis_midi)):
    for noteNumber in range(len(dis_midi[midiNumber])-2):
        term = (dis_midi[midiNumber][noteNumber], dis_midi[midiNumber][noteNumber+1], dis_midi[midiNumber][noteNumber+2])
        index.add_term_occurrence(term, list_dir_midi[midiNumber])

# test 4-grams
index.get_documents((2.0, 0.0, -2.0))

Counter({'00001': 4,
         '00005': 2,
         '00014': 2,
         '00015': 2,
         '00017': 3,
         '00019': 4,
         '00023': 4,
         '00025': 1,
         '00027': 1,
         '00028': 4,
         '00029': 1,
         '00032': 6,
         '00036': 2,
         '00037': 2,
         '00038': 3,
         '00043': 2})

In [26]:
import edit_distance

# ref = dis_midi
# hyp = dis_query

counter = 0
list_ratio = []

def get_top_ten(ls):
    res = []
    for i in range(10):
        temp = ls.index(max(ls))
        res.append(temp)
        ls.pop(temp)
    return res

# test match with query data extracted with PRAAT on various parameter setting
for i in range(len(dis_query)):
    hyp = dis_query[i]
    list_ratio = []
    for ref in dis_midi:
        sm = edit_distance.SequenceMatcher(a=ref, b=hyp)
        list_ratio.append(sm.ratio())
    res = get_top_ten(list_ratio)
    print("query PRAAT",i,"top ten is",res)

# test match with query data extracted with DNN-LSTM
for i in range(len(dis_query_DNN)):
    hyp = dis_query_DNN[i]
    list_ratio = []
    for ref in dis_midi:
        sm = edit_distance.SequenceMatcher(a=ref, b=hyp)
        list_ratio.append(sm.ratio())
    res = get_top_ten(list_ratio)
    print("query DNN-LSTM",i,"top ten is",res)

# print(list_ratio)
# print(max(list_ratio))
# print(list_ratio.index(max(list_ratio)))

query PRAAT 0 top ten is [45, 31, 22, 43, 13, 5, 24, 23, 31, 27]
query PRAAT 1 top ten is [45, 31, 22, 43, 13, 24, 5, 23, 31, 27]
query PRAAT 2 top ten is [45, 31, 22, 13, 25, 41, 24, 5, 31, 27]
query PRAAT 3 top ten is [45, 22, 30, 31, 25, 5, 24, 39, 12, 30]
query PRAAT 4 top ten is [45, 22, 30, 43, 31, 26, 25, 32, 13, 5]
query PRAAT 5 top ten is [45, 22, 30, 31, 25, 25, 40, 5, 12, 30]
query DNN-LSTM 0 top ten is [45, 5, 12, 43, 29, 24, 24, 4, 19, 26]
query DNN-LSTM 1 top ten is [45, 31, 13, 5, 18, 19, 22, 20, 38, 26]
