In [20]:
import pandas as pd
import hyperloglog
import copy

In [43]:
# Load the pre-computed query and song tables from pickle files in the working directory.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
queries_sets_df = pd.read_pickle("./queries_sets.pkl")
songs_sets_df = pd.read_pickle("./songs_sets.pkl")

In [4]:
# Collect every column whose name ends in "_set"; these are the columns the
# similarity computations iterate over.
cols = queries_sets_df.columns
set_cols = [col_name for col_name in cols if col_name.endswith("_set")]
set_cols

['pitch_diff-ngram_5_set',
 'UpSameDown-ngram_5_set',
 'pitch_diff-ngram_8_set',
 'UpSameDown-ngram_8_set',
 'pitch_diff-ngram_10_set',
 'UpSameDown-ngram_10_set',
 'pitch_diff-ngram_12_set',
 'UpSameDown-ngram_12_set',
 'pitch_diff-multi_gram_5_10_set',
 'UpSameDown-multi_gram_5_10_set']

In [5]:
def create_hll_from_set(l, error_rate):
    """Build a HyperLogLog sketch containing every item of an iterable.

    Parameters
    ----------
    l : iterable
        Items to insert; each one is passed to the sketch's ``add`` method.
    error_rate : float
        Forwarded to ``hyperloglog.HyperLogLog(error_rate=...)``.

    Returns
    -------
    hyperloglog.HyperLogLog
        A new sketch with all items of ``l`` added.
    """
    sketch = hyperloglog.HyperLogLog(error_rate=error_rate)
    for element in l:
        sketch.add(element)
    return sketch

In [6]:
# Error rates handed to hyperloglog.HyperLogLog; one family of sketch columns is built per rate.
# NOTE(review): the column suffixes visible in later outputs (hll_4, hll_8, hll_12, hll_16)
# suggest these rates map to precisions 4, 8, 12 and 16 — confirm against the library.
hll_config_list = [0.26, 0.065, 0.018, 0.005]
# Filled by the sketch-building loop with the `.p` value the library reports for each rate.
hll_p_list = []

In [7]:
# Build one HyperLogLog sketch column per (error rate, set column) for both
# the query table and the song table.
for error_rate in hll_config_list:
    # Instantiate a throw-away sketch only to read the `.p` value the library
    # derives from the error rate; it is used as the column-name suffix.
    p = hyperloglog.HyperLogLog(error_rate).p
    # Record each p once so that re-running this cell does not duplicate
    # entries (duplicates would make the later similarity loop redo work).
    if p not in hll_p_list:
        hll_p_list.append(p)
    for set_col in set_cols:
        hll_col = f"{set_col}_hll_{p}"
        queries_sets_df[hll_col] = queries_sets_df[set_col].apply(
            lambda items: create_hll_from_set(items, error_rate)
        )
        songs_sets_df[hll_col] = songs_sets_df[set_col].apply(
            lambda items: create_hll_from_set(items, error_rate)
        )

In [8]:
# Preview the first two rows of the query table.
queries_sets_df.head(2)

Unnamed: 0,Query ID,Song ID,pitch_diff-ngram_5_set,UpSameDown-ngram_5_set,pitch_diff-ngram_8_set,UpSameDown-ngram_8_set,pitch_diff-ngram_10_set,UpSameDown-ngram_10_set,pitch_diff-ngram_12_set,UpSameDown-ngram_12_set,...,pitch_diff-ngram_5_set_hll_16,UpSameDown-ngram_5_set_hll_16,pitch_diff-ngram_8_set_hll_16,UpSameDown-ngram_8_set_hll_16,pitch_diff-ngram_10_set_hll_16,UpSameDown-ngram_10_set_hll_16,pitch_diff-ngram_12_set_hll_16,UpSameDown-ngram_12_set_hll_16,pitch_diff-multi_gram_5_10_set_hll_16,UpSameDown-multi_gram_5_10_set_hll_16
0,q1,1118,"{(-1, 0, -2, -2, -3), (-2, 5, -1, 0, -2), (-3,...","{('D', 'U', 'D', 'S', 'D'), ('D', 'D', 'D', 'S...","{(-2, 5, -1, 0, -2, -2, -3, 0), (2, 0, -2, 5, ...","{('S', 'D', 'D', 'D', 'S', 'U', 'U', 'U'), ('S...","{(-2, 5, -1, 0, -2, -2, -3, 0, 3, 2), (2, 0, -...","{('D', 'U', 'D', 'S', 'D', 'D', 'D', 'S', 'U',...","{(0, -2, 5, -1, 0, -2, -2, -3, 0, 3, 2, 3), (2...","{('S', 'D', 'U', 'D', 'S', 'D', 'D', 'D', 'S',...",...,<hyperloglog.hll.HyperLogLog object at 0x15f8b...,<hyperloglog.hll.HyperLogLog object at 0x15f8d...,<hyperloglog.hll.HyperLogLog object at 0x15f8e...,<hyperloglog.hll.HyperLogLog object at 0x15f8e...,<hyperloglog.hll.HyperLogLog object at 0x15f92...,<hyperloglog.hll.HyperLogLog object at 0x15f94...,<hyperloglog.hll.HyperLogLog object at 0x15f95...,<hyperloglog.hll.HyperLogLog object at 0x15f97...,<hyperloglog.hll.HyperLogLog object at 0x15f99...,<hyperloglog.hll.HyperLogLog object at 0x15f9b...
1,q2,1438,"{(-1, -2, 10, 2, -2), (2, -2, -4, -2, 2), (1, ...","{('D', 'U', 'U', 'D', 'D'), ('D', 'D', 'D', 'D...","{(-4, -2, 2, 6, -6, -2, -2, -2), (2, -2, -4, -...","{('D', 'U', 'U', 'D', 'D', 'D', 'U', 'U'), ('U...","{(-2, -4, -2, 2, 6, -6, -2, -2, -2, 0), (1, -1...","{('U', 'U', 'D', 'D', 'D', 'U', 'U', 'D', 'D',...","{(-2, -4, -2, 2, 6, -6, -2, -2, -2, 0, 2, -2),...","{('D', 'U', 'U', 'D', 'D', 'D', 'U', 'U', 'D',...",...,<hyperloglog.hll.HyperLogLog object at 0x15f8e...,<hyperloglog.hll.HyperLogLog object at 0x15f90...,<hyperloglog.hll.HyperLogLog object at 0x15f92...,<hyperloglog.hll.HyperLogLog object at 0x15f94...,<hyperloglog.hll.HyperLogLog object at 0x15f95...,<hyperloglog.hll.HyperLogLog object at 0x15f97...,<hyperloglog.hll.HyperLogLog object at 0x15f99...,<hyperloglog.hll.HyperLogLog object at 0x15f9b...,<hyperloglog.hll.HyperLogLog object at 0x15f9d...,<hyperloglog.hll.HyperLogLog object at 0x15f9e...


In [9]:
# Preview the first two rows of the song table.
songs_sets_df.head(2)

Unnamed: 0,Song ID,pitch_diff-ngram_5_set,UpSameDown-ngram_5_set,pitch_diff-ngram_8_set,UpSameDown-ngram_8_set,pitch_diff-ngram_10_set,UpSameDown-ngram_10_set,pitch_diff-ngram_12_set,UpSameDown-ngram_12_set,pitch_diff-multi_gram_5_10_set,...,pitch_diff-ngram_5_set_hll_16,UpSameDown-ngram_5_set_hll_16,pitch_diff-ngram_8_set_hll_16,UpSameDown-ngram_8_set_hll_16,pitch_diff-ngram_10_set_hll_16,UpSameDown-ngram_10_set_hll_16,pitch_diff-ngram_12_set_hll_16,UpSameDown-ngram_12_set_hll_16,pitch_diff-multi_gram_5_10_set_hll_16,UpSameDown-multi_gram_5_10_set_hll_16
0,1,"{(2, 0, 1, -1, 1), (0, 1, 6, -3, 0), (1, 12, -...","{('D', 'U', 'U', 'D', 'D'), ('U', 'S', 'U', 'D...","{(2, -16, -4, 11, -11, 0, 11, 5), (-1, 1, 0, -...","{('D', 'D', 'U', 'D', 'U', 'D', 'S', 'D'), ('U...","{(6, 0, -2, 3, -3, 2, -1, -1, 2, -2), (9, -12,...","{('D', 'D', 'U', 'D', 'U', 'U', 'D', 'U', 'S',...","{(3, 0, -1, 2, 0, -1, 3, -1, 2, -2, 0, -1), (-...","{('U', 'S', 'D', 'U', 'S', 'D', 'D', 'U', 'D',...","{(-2, 1, 1, -1, 1, 0, -2), (-3, 2, -1, -1, 2),...",...,<hyperloglog.hll.HyperLogLog object at 0x15f8b...,<hyperloglog.hll.HyperLogLog object at 0x15f8d...,<hyperloglog.hll.HyperLogLog object at 0x15f8f...,<hyperloglog.hll.HyperLogLog object at 0x15f90...,<hyperloglog.hll.HyperLogLog object at 0x15f92...,<hyperloglog.hll.HyperLogLog object at 0x15f94...,<hyperloglog.hll.HyperLogLog object at 0x15f96...,<hyperloglog.hll.HyperLogLog object at 0x15f97...,<hyperloglog.hll.HyperLogLog object at 0x15f99...,<hyperloglog.hll.HyperLogLog object at 0x15f9d...
1,4,"{(24, -24, -9, 9, -15), (0, 4, 0, 2, 0), (16, ...","{('D', 'U', 'U', 'D', 'D'), ('U', 'S', 'U', 'D...","{(8, -8, 8, -8, 2, 17, 0, -19), (0, -5, -14, 0...","{('D', 'U', 'D', 'S', 'U', 'D', 'U', 'S'), ('D...","{(28, -24, 24, -24, -9, 9, -15, 15, -16, 7), (...","{('D', 'U', 'U', 'S', 'D', 'S', 'D', 'S', 'U',...","{(-2, 0, -1, 0, 0, 0, 1, -1, 0, 0, 5, -33), (0...","{('U', 'D', 'S', 'S', 'S', 'S', 'D', 'U', 'U',...","{(-35, 7, 9, 19, 0, -5, -14, 0, -16), (24, -24...",...,<hyperloglog.hll.HyperLogLog object at 0x15f8e...,<hyperloglog.hll.HyperLogLog object at 0x15f90...,<hyperloglog.hll.HyperLogLog object at 0x15f92...,<hyperloglog.hll.HyperLogLog object at 0x15f94...,<hyperloglog.hll.HyperLogLog object at 0x15f96...,<hyperloglog.hll.HyperLogLog object at 0x15f97...,<hyperloglog.hll.HyperLogLog object at 0x15f99...,<hyperloglog.hll.HyperLogLog object at 0x15f9b...,<hyperloglog.hll.HyperLogLog object at 0x15f9d...,<hyperloglog.hll.HyperLogLog object at 0x15f9f...


# Calculate Similarities

In [10]:
 # Pair every query ID with every song ID (Cartesian product of the two ID columns).
 cross_df = pd.merge(
     queries_sets_df[["Query ID"]],
     songs_sets_df[["Song ID"]],
     how="cross",
 )

In [14]:
# Display the (query, song) pair table.
cross_df

Unnamed: 0,Query ID,Song ID
0,q1,1
1,q1,4
2,q1,7
3,q1,14
4,q1,15
...,...,...
56753,q118,2419
56754,q118,2421
56755,q118,2424
56756,q118,2429


In [12]:
def get_overlap_coef(a,b):
    """Return the overlap coefficient |a ∩ b| / min(|a|, |b|) of two sets.

    Parameters
    ----------
    a : set
        First collection; must provide ``intersection`` and ``len``.
    b : set
        Second collection.

    Returns
    -------
    float or int
        The coefficient, or the sentinel ``-1`` when the smaller of the two
        inputs is empty (the ratio is undefined in that case).
    """
    shared = a.intersection(b)
    smaller_size = min(len(a), len(b))
    # A zero denominator means at least one input is empty: report the sentinel.
    return len(shared) / smaller_size if smaller_size else -1

def get_set_similarity(query_id, song_id, set_col):
    """Overlap coefficient between one query's set and one song's set.

    Reads the module-level ``queries_sets_df`` and ``songs_sets_df``. For each,
    the first row whose ID column equals the given ID is used.

    Parameters
    ----------
    query_id : value compared against the "Query ID" column.
    song_id : value compared against the "Song ID" column.
    set_col : str
        Name of the column holding the sets to compare.

    Returns
    -------
    The result of ``get_overlap_coef`` on the two looked-up values.

    Raises
    ------
    IndexError
        If no row matches ``query_id`` or ``song_id``.
    """
    query_rows = queries_sets_df[queries_sets_df["Query ID"] == query_id]
    song_rows = songs_sets_df[songs_sets_df["Song ID"] == song_id]
    return get_overlap_coef(
        query_rows.iloc[0][set_col],
        song_rows.iloc[0][set_col],
    )

In [15]:
# SIMILARITY CALCULATED FROM THE EXACT SETS (baseline)
for set_col in set_cols:
    sim_col = f"similarity_{set_col}_baseline"
    # Build ID -> set lookup tables once per column instead of re-filtering
    # both DataFrames for every (query, song) pair. drop_duplicates keeps the
    # first row per ID, matching a "first matching row" lookup.
    query_sets = (
        queries_sets_df.drop_duplicates("Query ID")
        .set_index("Query ID")[set_col]
        .to_dict()
    )
    song_sets = (
        songs_sets_df.drop_duplicates("Song ID")
        .set_index("Song ID")[set_col]
        .to_dict()
    )
    # One similarity per row of cross_df, in row order.
    cross_df[sim_col] = [
        get_overlap_coef(query_sets[query_id], song_sets[song_id])
        for query_id, song_id in zip(cross_df["Query ID"], cross_df["Song ID"])
    ]
    # Checkpoint after every column so a long run can be resumed.
    # The ./checkpoint directory must already exist.
    cross_df.to_pickle(f"./checkpoint/{set_col}.pkl")

In [31]:
# HLL++ FUNCTIONS
def get_merged_hll(hll1, hll2):
    """Return a new sketch equal to ``hll1`` updated with ``hll2``.

    Parameters
    ----------
    hll1 : object copied with ``copy.copy``; must provide ``update``.
    hll2 : object passed to ``update`` on the copy.

    Returns
    -------
    The shallow copy of ``hll1`` after ``update(hll2)`` has been called on it.
    """
    # NOTE(review): this is a shallow copy, so the result initially shares its
    # attributes with hll1. It leaves hll1 untouched only if update() rebinds
    # its state rather than mutating it in place — confirm against the library.
    union_sketch = copy.copy(hll1)
    union_sketch.update(hll2)
    return union_sketch

def estimate_overlapping_coef(hll1: hyperloglog.HyperLogLog, hll2:hyperloglog.HyperLogLog):
    """Estimate the overlap coefficient of two sets from their sketches.

    The intersection size is estimated by inclusion-exclusion:
    ``card(hll1) + card(hll2) - card(hll1 merged with hll2)``, and then divided
    by the smaller of the two individual cardinality estimates.

    Returns
    -------
    The estimated coefficient, or the sentinel ``-1`` when the smaller
    cardinality estimate is zero. The value is not clamped, so it is whatever
    the arithmetic on the estimates yields.
    """
    size_a, size_b = hll1.card(), hll2.card()
    size_union = get_merged_hll(hll1, hll2).card()
    smaller = min(size_a, size_b)
    if smaller == 0:
        return -1
    # Inclusion-exclusion: |A ∩ B| = |A| + |B| - |A ∪ B|.
    estimated_overlap = size_a + size_b - size_union
    return estimated_overlap / smaller
def get_set_similarity_by_hll(query_id, song_id, set_col, hll_p):
    """Estimated overlap coefficient between one query and one song via sketches.

    Reads the module-level ``queries_sets_df`` and ``songs_sets_df``. The sketch
    column is named ``f"{set_col}_hll_{hll_p}"``, and the first row matching
    each ID is used.

    Parameters
    ----------
    query_id : value compared against the "Query ID" column.
    song_id : value compared against the "Song ID" column.
    set_col : str
        Base set-column name.
    hll_p : suffix identifying which sketch column to read.

    Returns
    -------
    The result of ``estimate_overlapping_coef`` on the two looked-up sketches.

    Raises
    ------
    IndexError
        If no row matches ``query_id`` or ``song_id``.
    """
    hll_col = f"{set_col}_hll_{hll_p}"
    query_rows = queries_sets_df[queries_sets_df["Query ID"] == query_id]
    song_rows = songs_sets_df[songs_sets_df["Song ID"] == song_id]
    return estimate_overlapping_coef(
        query_rows.iloc[0][hll_col],
        song_rows.iloc[0][hll_col],
    )

In [32]:
# SIMILARITY BY HLL
for set_col in set_cols:
    for hll_p in hll_p_list:
        sim_col = f"similarity_{set_col}_hll_{hll_p}"
        sketch_col = f"{set_col}_hll_{hll_p}"
        # Build ID -> sketch lookup tables once per column instead of
        # re-filtering both DataFrames for every (query, song) pair.
        # drop_duplicates keeps the first row per ID, matching a
        # "first matching row" lookup.
        query_sketches = (
            queries_sets_df.drop_duplicates("Query ID")
            .set_index("Query ID")[sketch_col]
            .to_dict()
        )
        song_sketches = (
            songs_sets_df.drop_duplicates("Song ID")
            .set_index("Song ID")[sketch_col]
            .to_dict()
        )
        # One estimated similarity per row of cross_df, in row order.
        cross_df[sim_col] = [
            estimate_overlapping_coef(query_sketches[query_id], song_sketches[song_id])
            for query_id, song_id in zip(cross_df["Query ID"], cross_df["Song ID"])
        ]
        # Checkpoint after every column so a long run can be resumed.
        # The ./checkpoint directory must already exist.
        cross_df.to_pickle(f"./checkpoint/{sim_col}.pkl")

IOStream.flush timed out


In [39]:
# Reload the pair table from a checkpoint file.
# NOTE(review): the file name matches the pattern the HLL similarity loop writes for its
# final (set column, precision) combination, so this presumably holds every similarity
# column — confirm before relying on it.
# Unpickling can execute arbitrary code; only load checkpoints you produced yourself.
cross_df = pd.read_pickle("./checkpoint/similarity_UpSameDown-multi_gram_5_10_set_hll_16.pkl")

In [44]:
# Song IDs to exclude from the evaluation (labelled "non vocals" by the variable name).
non_vocals_song_ids = [1081, 126, 1212, 1536, 1655, 995, 1786]
# Drop every pair that points at an excluded song ...
cross_df = cross_df.loc[~cross_df["Song ID"].isin(non_vocals_song_ids)]
# ... and every query whose "Song ID" value is an excluded song.
queries_sets_df = queries_sets_df.loc[
    ~queries_sets_df["Song ID"].isin(non_vocals_song_ids)
]

In [46]:
# compute index for each query
def get_index_of_song_in_query(query_id,song_id, col_sim):
    """Return the 1-based rank of ``song_id`` among one query's candidates.

    The rows of the module-level ``cross_df`` belonging to ``query_id`` are
    sorted by ``col_sim`` in descending order; the rank is the position of the
    first occurrence of ``song_id`` in that ordering. The order among rows
    with equal similarity is whatever ``sort_values`` produces.

    Parameters
    ----------
    query_id : value compared against the "Query ID" column.
    song_id : song to locate in the ranked list.
    col_sim : str
        Name of the similarity column to rank by.

    Returns
    -------
    int or None
        1-based position of ``song_id``, or ``None`` if it is not among the
        query's rows.
    """
    ranked_song_ids = (
        cross_df.loc[cross_df["Query ID"] == query_id]
        .sort_values(by=[col_sim], ascending=False)["Song ID"]
        .tolist()
    )
    try:
        return ranked_song_ids.index(song_id) + 1
    except ValueError:
        # song_id is not present among this query's candidates.
        return None

In [47]:
# Start the result table from the (query, song) ID pairs, then add one rank
# column per baseline similarity column.
queries_indexes_df = queries_sets_df.loc[:, ["Query ID", "Song ID"]].copy()
for set_col in set_cols:
    sim_col = f"similarity_{set_col}_baseline"
    col_index = f"{set_col}_index"
    queries_indexes_df[col_index] = queries_indexes_df.apply(
        lambda row: get_index_of_song_in_query(row["Query ID"], row["Song ID"], sim_col),
        axis=1,
    )

In [48]:
# Add one rank column per (set column, HLL precision) similarity column.
for set_col in set_cols:
    for hll_p in hll_p_list:
        sim_col = f"similarity_{set_col}_hll_{hll_p}"
        col_index = f"{set_col}_hll_{hll_p}_index"
        queries_indexes_df[col_index] = queries_indexes_df.apply(
            lambda row: get_index_of_song_in_query(row["Query ID"], row["Song ID"], sim_col),
            axis=1,
        )

In [49]:
# Display the per-query rank table (one rank column per similarity column).
queries_indexes_df

Unnamed: 0,Query ID,Song ID,pitch_diff-ngram_5_set_index,UpSameDown-ngram_5_set_index,pitch_diff-ngram_8_set_index,UpSameDown-ngram_8_set_index,pitch_diff-ngram_10_set_index,UpSameDown-ngram_10_set_index,pitch_diff-ngram_12_set_index,UpSameDown-ngram_12_set_index,...,UpSameDown-ngram_12_set_hll_12_index,UpSameDown-ngram_12_set_hll_16_index,pitch_diff-multi_gram_5_10_set_hll_4_index,pitch_diff-multi_gram_5_10_set_hll_8_index,pitch_diff-multi_gram_5_10_set_hll_12_index,pitch_diff-multi_gram_5_10_set_hll_16_index,UpSameDown-multi_gram_5_10_set_hll_4_index,UpSameDown-multi_gram_5_10_set_hll_8_index,UpSameDown-multi_gram_5_10_set_hll_12_index,UpSameDown-multi_gram_5_10_set_hll_16_index
0,q1,1118,401,425,398,406,398,398,398,398,...,165,127,440,266,211,140,116,151,342,451
1,q2,1438,14,143,59,102,59,178,59,113,...,433,390,26,332,290,391,400,166,133,100
2,q3,1546,58,9,7,37,6,196,6,30,...,469,62,25,271,447,14,87,8,17,10
3,q4,1579,16,56,47,76,46,119,46,92,...,308,113,183,417,333,270,397,17,63,63
4,q5,1808,1,159,181,209,181,246,181,206,...,331,300,153,101,38,1,416,100,143,201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,q114,479,1,69,284,19,284,45,284,258,...,8,411,23,77,39,6,144,77,16,29
114,q115,1747,239,93,205,61,205,178,205,251,...,205,433,181,198,52,139,148,270,108,67
115,q116,2024,228,453,236,326,236,185,236,245,...,308,105,195,250,328,382,90,424,445,444
116,q117,281,337,329,334,396,334,334,334,334,...,334,334,338,384,433,167,443,470,432,453


In [50]:
# Persist the per-query rank table to the working directory.
queries_indexes_df.to_pickle("./queries_indexes_df.pkl")