## Import necessary libraries

In [61]:
import numpy as np
import pandas as pd
import math
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from progress.bar import IncrementalBar
import pickle
from sklearn.metrics import ndcg_score, dcg_score
from sklearn.preprocessing import MinMaxScaler

## Uploading datasets

In [2]:
df_names = pd.read_table('id_information_mmsr.tsv')
df_names = df_names.drop(df_names[df_names['id'] == '03Oc9WeMEmyLLQbj'].index)
df_names.head(5)

Unnamed: 0,id,artist,song,album_name
0,01Yfj2T3YTwJ1Yfy,We As Human,Take The Bullets Away (feat. Lacey Sturm),We As Human
1,01gyRHLquwXDlhkO,The Notorious B.I.G.,Somebody's Gotta Die,Life After Death (Remastered Edition)
2,01rMxQv6vhyE1oQX,Against the Current,Chasing Ghosts,In Our Bones
3,02RGE9FNH65RtMS7,Barthezz,Infected,Trance - The Early Years (1997-2002)
4,02ZnlCGZEbkfCDxo,Laura Pausini,Tra Te E Il Mare,The Best of Laura Pausini - E Ritorno Da Te


In [3]:
df_bert = pd.read_table('id_lyrics_bert_mmsr.tsv')
df_bert = df_bert.drop(df_bert[df_bert['id'] == '03Oc9WeMEmyLLQbj'].index)
df_bert.head(5)

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,758,759,760,761,762,763,764,765,766,767
0,h48f46ZsT9h0Z5Dm,-0.015378,0.041258,-0.004096,-0.061025,-0.012853,0.028773,-0.045051,-0.016451,-0.007118,...,-0.042241,-0.014954,0.047022,-0.015661,-0.007859,0.016969,-0.021641,-0.003476,-0.012301,-0.022135
1,PV5EXN6AIVBqvsLO,0.040497,0.061796,-0.017192,-0.012942,-0.040331,0.019082,0.001662,-0.036522,-0.022991,...,-0.025968,-0.023056,-0.019058,0.010935,-0.010183,-0.011044,-0.03608,-0.002666,-0.005304,-0.016923
2,z0PzOinYUixzCTGQ,0.002563,0.024931,-0.021617,-0.016013,0.002885,-0.016344,-0.070383,-0.032799,-0.004034,...,-0.043801,0.004436,0.047337,0.011598,-0.000111,0.041575,0.001195,0.013947,-0.003072,-0.002914
3,LZUFTnAB77X15RSz,-0.006931,0.05865,0.009551,-0.046672,0.00276,-0.041869,-0.046719,-0.006524,-0.017413,...,-0.020559,-0.007128,0.049158,-0.020805,-0.019466,0.004021,-0.015934,-0.038246,-0.026774,-0.005274
4,nASk24WIR6Eihbut,0.011613,0.034645,-0.009625,-0.029288,0.011753,0.009602,-0.018495,-0.026956,-0.023878,...,-0.025768,-0.011833,0.027585,0.016521,0.001613,-0.04685,-0.010579,0.02419,-0.03107,-0.037652


In [4]:
df_word2vec = pd.read_table('id_lyrics_word2vec_mmsr.tsv')
df_word2vec = df_word2vec.drop(df_word2vec[df_word2vec['id'] == '03Oc9WeMEmyLLQbj'].index)
df_word2vec.head(5)

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,290,291,292,293,294,295,296,297,298,299
0,vo7FwIGCk6YHfkIA,0.033718,0.014378,0.026128,0.042627,-0.048268,0.015804,0.032547,-0.020543,0.025851,...,-0.018858,0.016099,-0.040894,-0.005447,-0.01622,-0.014547,0.017679,-0.040378,-0.015388,-0.003768
1,KbiCjEDMzzHYn6zF,0.032313,0.01301,0.024825,0.028559,-0.024753,-0.007209,0.018488,-0.037101,0.031233,...,-0.04438,0.023839,-0.051918,0.023653,-4.8e-05,0.011242,0.023039,-0.017589,0.015194,-0.01718
2,pzQHBx7Y3N1uVlpT,0.030761,0.016845,0.019547,0.045954,-0.022321,-0.002361,0.03786,-0.028647,0.029722,...,-0.042202,0.037827,-0.035356,0.017195,-0.032505,-0.011599,-0.000328,-0.034397,-0.002424,-0.022258
3,xwJc7oM2dkbh17Sy,0.045185,0.01367,0.025823,0.032239,-0.008347,-0.015055,0.018252,-0.024198,0.032537,...,-0.004318,0.021038,-0.059454,0.025127,-0.013767,-0.015213,-0.002923,-0.024353,0.008792,-0.004122
4,Gh4yUdMJrPrPAJjb,0.017956,0.018557,0.020527,0.042157,-0.019683,-0.00484,0.021483,-0.030211,0.0236,...,-0.014065,0.03784,-0.047416,-0.011087,-0.024351,-0.012404,0.009918,-0.0281,0.011513,0.001832


In [5]:
df_tfidf = pd.read_table('id_lyrics_tf-idf_mmsr.tsv')
df_tfidf = df_tfidf.drop(df_tfidf[df_tfidf['id'] == '03Oc9WeMEmyLLQbj'].index)
df_tfidf.head(5)

Unnamed: 0,id,abl,accept,across,act,addict,afraid,age,ago,ah,...,yea,yeah,year,yellow,yes,yesterday,yet,yo,young,youth
0,h48f46ZsT9h0Z5Dm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.149783,0.0,0.0,0.0,0.0
1,PV5EXN6AIVBqvsLO,0.0,0.0,0.0,0.0,0.0,0.327025,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,z0PzOinYUixzCTGQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,LZUFTnAB77X15RSz,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,nASk24WIR6Eihbut,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 3 chosen songs

In [6]:
song1_id = 'ziT77Si01mOb5oZg'
song1 = df_names[df_names['id'] == 'ziT77Si01mOb5oZg']; song1

Unnamed: 0,id,artist,song,album_name
10051,ziT77Si01mOb5oZg,Frank Sinatra,Jingle Bells,Ultimate Christmas


In [7]:
song2_id = 'cVZd1wCtRYIqRnaV'
song2 = df_names[df_names['id'] == 'cVZd1wCtRYIqRnaV']; song2

Unnamed: 0,id,artist,song,album_name
6209,cVZd1wCtRYIqRnaV,Stevie Wonder,Happy Birthday,Hotter Than July


In [8]:
song3_id = 'h0Jaex0Pdbn3aVXv'
song3 = df_names[df_names['id'] == 'h0Jaex0Pdbn3aVXv']; song3

Unnamed: 0,id,artist,song,album_name
6970,h0Jaex0Pdbn3aVXv,Green Day,Wake Me Up When September Ends,American Idiot


## Function returns artist and song name with given id

In [9]:
def song_info(id_):
    return df_names[df_names['id'] == id_]

## Task 1. Random Baseline.

Regardless of the query track, this retrieval system randomly selects N tracks from the rest of the catalog. Make sure that the system produces new results for each query / run.

In [10]:
def random_baseline(song_id, n_retrieved):
    return df_names.loc[(df_names['id'] != song_id)].sample(n=n_retrieved)

In [11]:
random_baseline(song1_id, 10)

Unnamed: 0,id,artist,song,album_name
5567,YKTnZCH4ynRi5GAW,Whitney Houston,If I Told You That,My Love Is Your Love
4779,TFGOLgaXWeOgjPZB,Deathspell Omega,First Prayer,"Si Monumentum Requires, Circumspice"
9722,xWiOsPbEMh5AGBcO,Los Angeles De Charly,Un Sueño,Un Sueño
6329,dKgW8TWUQEkrWNxx,Megadeth,We the People,Th1rt3en
4408,R299Ek2s6duUufTp,Dave Stewart,Heart Of Stone,Greetings From The Gutter
1174,74uyXcvkfhzI1YJb,Galaxie 500,Don't Let Our Youth Go to Waste,Peel Sessions
13,079OXXqfC6koNC3B,Aiden,One Love,Conviction
1291,7mcRquWxlc4uvWQD,Hollywood Undead,Black Dahlia,Swan Songs
3489,LSbtaNWBHzNKHa2D,Jex Thoth,Nothing Left to Die,Jex Thoth
2042,CIFZmQkOQQ4Y40YI,Julieta Venegas,Canciones De Amor,Limon Y Sal


In [12]:
random_baseline(song2_id, 10)

Unnamed: 0,id,artist,song,album_name
2558,FWgz1hVIq59Ve6SY,PJ Harvey,On Battleship Hill,Let England Shake
1882,BIB78kmrA1zWsQr5,Art of Dying,Get Through This,Art of Dying
1427,8YRNYHuhRlBIKMN1,Xiu Xiu,Dr. Troll,Knife Play
5283,WUQwg3HyiTo7i35A,Grimes,Belly of the Beat,Art Angels
5635,YrBggDDGL9aF4mgY,Dead Kennedys,Rawhide,"Plastic Surgery Disasters/In God We Trust, Inc."
2896,HaRuIk8jhyisuHfH,The Meters,Fire on the Bayou,Fire On The Bayou
7135,hwM4yvaymUCuH11P,Nelly Furtado,Be OK,The Spirit Indestructible (Deluxe Version)
54,0KEwSB8tPsFGrv8I,Silverstein,Brookfield,Short Songs
634,3kAIoRnx8aJhupuI,Three Days Grace,The Real You,Human
5880,aPIHQ5FrkcywhYs6,Dilsinho,12 Horas,12 Horas


In [13]:
random_baseline(song3_id, 10)

Unnamed: 0,id,artist,song,album_name
4048,OvClJpZlofWbkj4F,Shannon and the Clams,It's Gonna Go Away,Onion
7864,mOwhoDPh4nMkGwqc,Kate Voegele,Who You Are Without Me,A Fine Mess (Deluxe)
4058,Ox9RbAMdARpA4qBK,Merda,Nirvana dos Pobres,Carlos
9835,yHeHLXeI0YnQIWhT,Unknown Mortal Orchestra,American Guilt,Sex & Food
2381,EP90KqtSjQv1O77E,At the Drive-In,Sleepwalk Capsules,Relationship Of Command
2170,D1p6ITKuwRAs1DDS,Aurora,Under the Water,All My Demons Greeting Me as a Friend (Deluxe)
4352,Qh5XjFCInkeOAYzu,Supergrass,Time,I Should Coco
5320,WiinvNsURKU6cRWE,Chelsea Wolfe,Sick,Pain Is Beauty
5897,aV6iiLELM1HOrME0,Jason Mraz,The Remedy (I Won't Worry),Waiting For My Rocket To Come
3862,NlcvgDSe08fm6Xxs,October Tide,October Insight,Grey Dawn


## Task 2. Text-based(cos-sim, tf-idf).

Given a query, this retrieval system selects the N tracks that are most similar to the query track. The similarity is measured as cosine similarity between the tf-idf representations of the lyrics of the tracks. I.e.

**𝑠𝑖𝑚(𝑞𝑢𝑒𝑟𝑦, 𝑡𝑎𝑟𝑔𝑒𝑡_𝑡𝑟𝑎𝑐𝑘) = 𝑐𝑜𝑠(𝑡𝑓_𝑖𝑑𝑓(𝑞𝑢𝑒𝑟𝑦), 𝑡𝑓_𝑖𝑑𝑓(𝑡𝑎𝑟𝑔𝑒𝑡_𝑡𝑟𝑎𝑐𝑘))**


In [14]:
def cos_sim_tfidf(song_id, n_retrieved):
    song_vec = df_tfidf.loc[df_tfidf['id'] == song_id] #target song vector
    df_temp = df_tfidf.loc[(df_tfidf['id'] != song_id)].copy() #make copy id because we want to add new column later, this dataset without target song
    cosine_sim = cosine_similarity(df_temp.iloc[:, 1:], song_vec.iloc[:, 1:]) #similarity between songs from dataset and target song
    df_temp['cos_sim'] = cosine_sim #add column with counted similarity
    ids = df_temp.sort_values(by='cos_sim', ascending=False).head(n_retrieved)[['id', 'cos_sim']] #take 10 the greatest similarity values, we need id and similarity value
    result = pd.merge(ids, df_names, how='left', on='id') #merge table to represent names of found songs
    return result


In [15]:
cos_sim_tfidf(song1_id, 10)

Unnamed: 0,id,cos_sim,artist,song,album_name
0,u8bj2RyzoYZ99dWB,0.963528,Gwen Stefani,Jingle Bells,You Make It Feel Like Christmas (Deluxe Edition)
1,blZ9zSQBqOMxcPhN,0.601615,Vanessa Carlton,Hear the Bells,Hear The Bells
2,eU30OjpKt9zzV6R6,0.5269,Lil Xan,Saved by the Bell,TOTAL XANARCHY
3,JtwyzoBa2N48HsHo,0.507277,The Beatles,Michelle,Rubber Soul (Remastered)
4,xSbRgzIgvXuoeLpL,0.457886,The Black Heart Procession,Your Church Is Red,2
5,hxGDHGbn3KTlfp6d,0.407539,Metallica,For Whom The Bell Tolls - Remastered,Ride The Lightning (Deluxe Remaster)
6,J4onmjAmjdnYYbpX,0.372241,The Faint,Southern Belles In London Sing,Wet From Birth
7,mTbTSXakQDcIH7MK,0.365819,Dire Straits,Portobello Belle,Communiqué
8,PK2m4Mc7MnPT6az8,0.331044,Gregory and the Hawk,Voice Like a Bell,Moenie and Kitchi
9,pZijIGqH2GBH2U3X,0.31738,The Free Design,Kites Are Fun,The Best Of The Free Design: Kites Are Fun


In [16]:
cos_sim_tfidf(song2_id, 10)

Unnamed: 0,id,cos_sim,artist,song,album_name
0,QGPyoU4oyDpmGm0m,0.892669,Square Heads,Happy,"Dancefloor Paradise, Vol. 2"
1,eSpC62OBSPJmrFu7,0.858457,Al Bano & Romina Power,Felicità,"Songs from Sanremo, the Best of the Fest"
2,GyqCH8loNjnRjawN,0.857303,Korn,Dead,Follow The Leader
3,oLmtBolG45CcMgz0,0.748004,Charlie Brown JR.,Proibida Pra Mim (Grazon),Transpiração Continua Prolongada
4,H9UigE72iHKkRNpk,0.719876,Silvio Rodríguez,Pequeña serenata diurna,Días y Flores
5,GsS3uiZvZoH8ARUT,0.68918,Arvingarna,I Do,"Raggarbilshits, Vol. 4 - Raggarrock & Rockabilly"
6,31nn0gSAPo0rAhB6,0.671809,A Fine Frenzy,Happier,Bomb In A Birdcage
7,rK4DAgNAHJaV9HtE,0.667695,Os Mutantes,Balada Do Louco,20 Grandes Sucessos De Os Mutantes
8,CdPc16EXMPRHOEb2,0.657794,Al Green,Love and Happiness,I'm Still in Love with You
9,81AdkjflYHfLuWDr,0.59088,Björk,Moon,Biophilia


In [17]:
cos_sim_tfidf(song3_id, 10)

Unnamed: 0,id,cos_sim,artist,song,album_name
0,CeT7g4rNcepZjBVp,0.535697,Emigrate,Wake Up,Emigrate
1,sqKauAUDFoIc6EJR,0.521682,Lostprophets,Wake Up (Make a Move),Start Something
2,Ux3rD6Ys6oLdtmYO,0.501407,Goldfinger,Open Your Eyes,Open Your Eyes
3,Wcg7LhYoTvJtmsY4,0.482077,The Rigs,All the King's Men,Gray
4,kCjoZhap3XdiCWsC,0.421284,Against the Current,Wasteland,In Our Bones
5,LqxTLLYk5xYqvlM8,0.41167,Nightwish,End of All Hope,Decades
6,plqyJyRTIreqCTfQ,0.404016,Evanescence,Bring Me to Life,Fallen
7,YgAA3RDfYbqnSQtr,0.403344,Cellar Darling,Black Moon,This Is the Sound
8,t1qQ9tz2kXYVjKHr,0.397533,Youth of Today,Wake up and Live,We're Not In This Alone
9,Cd9coQC9xk9saea7,0.391455,Pet Shop Boys,I Want to Wake Up,Actually


## Task 3. Text-based(cos-sim, \<feature>)

Similar to Text-based(cos-sim, tf-idf), however choose a different text-based feature instead of tf-idf (i.e., word2vec or BERT representations of the lyrics)

**𝑠𝑖𝑚(𝑞𝑢𝑒𝑟𝑦, 𝑡𝑎𝑟𝑔𝑒𝑡_𝑡𝑟𝑎𝑐𝑘) = 𝑐𝑜𝑠(< 𝑓𝑒𝑎𝑡𝑢𝑟𝑒 > (𝑞𝑢𝑒𝑟𝑦), < 𝑓𝑒𝑎𝑡𝑢𝑟𝑒 > (𝑡𝑎𝑟𝑔𝑒𝑡_𝑡𝑟𝑎𝑐𝑘))**

In [18]:
def cos_sim_bert(song_id, n_retrieved):
    song_vec = df_bert.loc[df_bert['id'] == song_id] #target song vector
    df_temp = df_bert.loc[(df_bert['id'] != song_id)].copy() #make copy id because we want to add new column later, this dataset without target song
    cosine_sim = cosine_similarity(df_temp.iloc[:, 1:], song_vec.iloc[:, 1:]) #similarity between songs from dataset and target song
    df_temp['cos_sim'] = cosine_sim #add column with counted similarity
    ids = df_temp.sort_values(by='cos_sim', ascending=False).head(n_retrieved)[['id', 'cos_sim']] #take 10 the greatest similarity values, we need id and similarity value
    result = pd.merge(ids, df_names, how='left', on='id') #merge table to represent names of found songs
    return result

In [19]:
cos_sim_bert(song1_id, 10)

Unnamed: 0,id,cos_sim,artist,song,album_name
0,u8bj2RyzoYZ99dWB,0.951122,Gwen Stefani,Jingle Bells,You Make It Feel Like Christmas (Deluxe Edition)
1,doTmvQlJVL1JRO4V,0.662801,Robert Johnson,Hellhound On My Trail,King Of The Delta Blues
2,SJZTstFdLSYvbRAi,0.623144,Change,The Glow of Love,The Glow Of Love
3,YzXWwWKeFMKNgkU7,0.617038,Hot Chip,"You Ride, We Ride, In My Ride",Coming On Strong
4,9ScGeeaW8XcxgePd,0.603126,Kelly Clarkson,Every Christmas,Wrapped In Red
5,zHozLx4GhJsG7xLJ,0.598624,Cyndi Lauper,Christmas Conga,Merry Christmas...Have A Nice Life
6,LArarDy0SyTJDoIZ,0.597165,Eric Clapton,Circus,Unplugged (Deluxe Edition)
7,afmSDk2caOd8CCfx,0.592372,B*Witched,Rollercoaster,C'est la Vie: The Collection
8,cItkoIyGIr9LrIJS,0.590946,Beirut,Elephant Gun,Elephant Gun
9,aYfhVF6MlwbLPm0i,0.590712,Rihanna,We Ride,A Girl Like Me


In [20]:
cos_sim_bert(song2_id, 10)

Unnamed: 0,id,cos_sim,artist,song,album_name
0,fs640kJd58UgtiG7,0.694113,Vanessa da Mata,Meu Aniversário,"Bicicletas, Bolos E Outras Alegrias"
1,AFvEsDvmJ09O9pdS,0.555861,Kool & The Gang,Celebration - Single Version,Celebration / Morning Star
2,0zhmog3KHJTjp2BX,0.554781,Alicia Keys,New Day,Girl On Fire
3,2Q44O3s5OQrptQ3Z,0.552973,Regina Spektor,Human of the Year,Far
4,51IQ5d92dMnJtrWQ,0.513817,Jason Mraz,Have It All,Know.
5,CcR8OC6ncESxbL7I,0.508457,The Aquabats,"Hello, Good Night",The Aquabats! vs the Floating Eye of Death! an...
6,D96QLsUhVAMDuT8Z,0.504234,Mariah Carey,Auld Lang Syne (The New Year's Anthem),Merry Christmas II You
7,JtoOtR9E14ZA4Gbj,0.497811,Xiu Xiu,Luber,Knife Play
8,tP9GWTDSQyLlRAYo,0.495605,Wang Chung,Everybody Have Fun Tonight,Mosaic
9,GwO4jQ8fWWKnH56l,0.482777,Carlos Baute,Te regalo,Baute


In [21]:
cos_sim_bert(song3_id, 10)

Unnamed: 0,id,cos_sim,artist,song,album_name
0,0tzYEg8KodbJHl8p,0.735971,Pink Floyd,Fat Old Sun,Atom Heart Mother
1,S2zUefe3rhSEbpVr,0.724557,Summoning,Where Hope and Daylight Die,Stronghold
2,tf0I08yIKJr8NVpx,0.720513,Los Tres,Déjate Caer,La Espada & la Pared
3,VXClwkVXs0JFKWHd,0.719888,Boyz II Men,4 Seasons Of Loneliness,Evolution
4,Q6hGRbMaOnbg4UdS,0.716033,Five Finger Death Punch,When the Seasons Change,And Justice for None (Deluxe)
5,iRBJ4a4wpFlzHVIX,0.713933,Heart,Soul of the Sea,Dreamboat Annie
6,ydS4NguMgrD2NgdG,0.713904,Oasis,Falling Down,Falling Down - EP
7,Edn4Ls88ymmQKn8w,0.712668,Electric Light Orchestra,Summer and Lightning,Out of the Blue
8,4zOF93S2zfaD5h1a,0.711576,Dinah Washington,September in the Rain,The Collection
9,ZctViWjWYSA62gVU,0.710427,Draconian,Rivers Between Us,Sovran


## Task 4. Text-based(\<similarity>, \<feature>)

Similar to Text-based(cos-sim, <feature>), however choose a new combination of similarity measure and text-based feature (e.g., you can use cos-sim with a representation of the lyrics not selected for previous systems yet)

    
**𝑠𝑖𝑚(𝑞𝑢𝑒𝑟𝑦, 𝑡𝑎𝑟𝑔𝑒𝑡_𝑡𝑟𝑎𝑐𝑘) = <𝑠𝑖𝑚𝑖𝑙𝑎𝑟𝑖𝑡𝑦> (<𝑓𝑒𝑎𝑡𝑢𝑟𝑒>(𝑞𝑢𝑒𝑟𝑦), <𝑓𝑒𝑎𝑡𝑢𝑟𝑒>(𝑡𝑎𝑟𝑔𝑒𝑡_𝑡𝑟𝑎𝑐𝑘))**

In [22]:
def euc_sim(song_id, n_retrieved):
    song_vec = df_word2vec.loc[df_word2vec['id'] == song_id] #target song vector
    df_temp = df_word2vec.loc[(df_word2vec['id'] != song_id)].copy() #make copy id because we want to add new column later, this dataset without target song
    euc_sim = euclidean_distances(df_temp.iloc[:, 1:], song_vec.iloc[:, 1:]) #similarity between songs from dataset and target song
    df_temp['euc_sim'] = euc_sim #add column with counted similarity
    ids = df_temp.sort_values(by='euc_sim', ascending=True).head(n_retrieved)[['id', 'euc_sim']] #take 10 the greatest similarity values, we need id and similarity value
    result = pd.merge(ids, df_names, how='left', on='id') #merge table to represent names of found songs
    return result

In [23]:
euc_sim(song1_id, 10)

Unnamed: 0,id,euc_sim,artist,song,album_name
0,u8bj2RyzoYZ99dWB,0.073473,Gwen Stefani,Jingle Bells,You Make It Feel Like Christmas (Deluxe Edition)
1,dyPSAz5u5gGVbs9R,0.253046,Nat King Cole,The Christmas Song (Merry Christmas To You),The Christmas Song (Expanded Edition)
2,zHozLx4GhJsG7xLJ,0.264748,Cyndi Lauper,Christmas Conga,Merry Christmas...Have A Nice Life
3,4gMuVWDrTgZsGrlR,0.2685,Pigeon John,The Bomb,Dragon Slayer
4,W5corTpjrk7QEawo,0.269803,Faun,Walpurgisnacht,Luna
5,tbF2tUUkAXDzk7sZ,0.271496,Kult,Celina,Tata Kazika
6,QPUaa2orGuoeXzwk,0.271715,Little Feat,Dixie Chicken,Dixie Chicken
7,V9weUlcyzPJ2qbRn,0.272722,Caravan,Winter Wine,In The Land Of Grey And Pink
8,CCP8zIK9PsURC8VV,0.273672,Ella Fitzgerald,Manhattan,Ella Fitzgerald Sings The Rodgers And Hart Son...
9,40FzEJFj72Vme3Kn,0.273943,Led Zeppelin,Bron-Y-Aur Stomp,Led Zeppelin III (Remastered)


In [24]:
euc_sim(song2_id, 10)

Unnamed: 0,id,euc_sim,artist,song,album_name
0,qmAwzbPTnoB5Ud8f,0.234464,Dead Fish,Bem-Vindo ao Clube,Zero e Um
1,H9UigE72iHKkRNpk,0.239648,Silvio Rodríguez,Pequeña serenata diurna,Días y Flores
2,GsS3uiZvZoH8ARUT,0.24318,Arvingarna,I Do,"Raggarbilshits, Vol. 4 - Raggarrock & Rockabilly"
3,ASZ3TvYABS68ym1Y,0.248095,Emicida,Fica Mais um Pouco Amor,Pra Quem Já Mordeu um Cachorro por Comida Até ...
4,aSk7qb8ig31liJIy,0.249444,Metallica,Sabbra Cadabra,"Garage, Inc."
5,QGPyoU4oyDpmGm0m,0.251775,Square Heads,Happy,"Dancefloor Paradise, Vol. 2"
6,3ZqnBypsQP9ukEiQ,0.25213,Só Pra Contrariar,Essa Tal Liberdade,Só Pra Contrariar
7,ire6sDD2ryFx62Vx,0.253008,Supertramp,My Kind Of Lady,Famous Last Words (Remastered)
8,AakQO4qHS9HwrKm6,0.253158,Alanis Morissette,Unsent,Supposed Former Infatuation Junkie
9,8zc42n00FUtlDOAT,0.255237,Johnny Cash,Call Your Mother,Out Among The Stars


In [25]:
euc_sim(song3_id, 10)

Unnamed: 0,id,euc_sim,artist,song,album_name
0,hFBwWlfhBbw9jKXV,0.23548,Midlake,Children Of The Grounds,The Courage of Others (Deluxe)
1,nqyxZqm7A9fptn5V,0.239017,Wild Beasts,End Come Too Soon,Smother
2,hsI2T9Or9aptYVy0,0.24609,Electric Wizard,Dunwich,Witchcult Today
3,Wcg7LhYoTvJtmsY4,0.247607,The Rigs,All the King's Men,Gray
4,Dr5vijFUZDaMyGsg,0.25038,Tom Waits,Please Wake Me Up,Franks Wild Years
5,MRKfMY8CKVHj42Ur,0.251849,Simon & Garfunkel,The Times They Are a-Changin',"Wednesday Morning, 3 A.M."
6,tUS2uk30ob9oNuH5,0.251931,Kula Shaker,Tattva,Kollected - The Best Of Kula Shaker
7,N1QLEVutobFBMeEL,0.253083,Mad Season,Long Gone Day,Above
8,a83pn8nBj23Vaxwn,0.253126,Periphery,Letter Experiment,Periphery
9,OZ48jM8qOIuMrATq,0.253201,Arab Strap,The First Big Weekend,Arab Strap


# Task 2. 
# Extend your framework with audio-based retrieval systems and with evaluation metrics.

## Uploading new datasets

In [26]:
df_genres = pd.read_table('datasets2/id_genres_mmsr.tsv')
print(df_genres.shape)
df_genres.head(3)

(10094, 2)


Unnamed: 0,id,genre
0,01Yfj2T3YTwJ1Yfy,"['rock', 'christian rock']"
1,01gyRHLquwXDlhkO,"['hip hop', 'rap', 'grindcore', 'death metal']"
2,01rMxQv6vhyE1oQX,"['rock', 'pop punk']"


In [27]:
df_blf_correlation = pd.read_table('datasets2/id_blf_correlation_mmsr.tsv')
print(df_blf_correlation.shape)
df_blf_correlation.head(3)

(10094, 1327)


Unnamed: 0,id,BLF_CORR0000,BLF_CORR0001,BLF_CORR0002,BLF_CORR0003,BLF_CORR0004,BLF_CORR0005,BLF_CORR0006,BLF_CORR0007,BLF_CORR0008,...,BLF_CORR1316,BLF_CORR1317,BLF_CORR1318,BLF_CORR1319,BLF_CORR1320,BLF_CORR1321,BLF_CORR1322,BLF_CORR1323,BLF_CORR1324,BLF_CORR1325
0,01Yfj2T3YTwJ1Yfy,0.000214,6.7e-05,1.7e-05,8.7e-05,5.8e-05,2.5e-05,3.6e-05,4.1e-05,5e-05,...,0.00058,0.000464,0.000425,0.000394,0.000613,0.000538,0.000502,0.000662,0.000628,0.000674
1,01gyRHLquwXDlhkO,0.000398,0.000209,0.000155,0.000209,0.00024,0.000224,0.000161,0.000173,0.000206,...,0.000691,0.000648,0.000647,0.000628,0.000697,0.000673,0.000655,0.000698,0.000683,0.000722
2,01rMxQv6vhyE1oQX,0.000298,0.000195,0.000171,0.000155,0.00015,0.000123,0.000126,0.000195,0.000123,...,0.000696,0.000651,0.00063,0.000555,0.000714,0.000685,0.000593,0.000717,0.000632,0.000654


In [28]:
df_blf_deltaspectral = pd.read_table('datasets2/id_blf_deltaspectral_mmsr.tsv')
print(df_blf_deltaspectral.shape)
df_blf_deltaspectral.head(3)

(10094, 1373)


Unnamed: 0,id,BLF_DELTASPEC0000,BLF_DELTASPEC0001,BLF_DELTASPEC0002,BLF_DELTASPEC0003,BLF_DELTASPEC0004,BLF_DELTASPEC0005,BLF_DELTASPEC0006,BLF_DELTASPEC0007,BLF_DELTASPEC0008,...,BLF_DELTASPEC1362,BLF_DELTASPEC1363,BLF_DELTASPEC1364,BLF_DELTASPEC1365,BLF_DELTASPEC1366,BLF_DELTASPEC1367,BLF_DELTASPEC1368,BLF_DELTASPEC1369,BLF_DELTASPEC1370,BLF_DELTASPEC1371
0,01Yfj2T3YTwJ1Yfy,0.001019,0.001776,0.002718,0.003279,0.004129,0.005231,0.006555,0.008247,0.009574,...,0.001386,0.001666,0.002214,0.002593,0.002973,0.003554,0.004606,0.005651,0.008254,0.0103
1,01gyRHLquwXDlhkO,0.000765,0.001683,0.002564,0.003738,0.004624,0.005544,0.00689,0.008402,0.009779,...,0.002544,0.003277,0.004547,0.005516,0.006514,0.007409,0.009188,0.011052,0.013919,0.017609
2,01rMxQv6vhyE1oQX,0.000678,0.001656,0.002383,0.003411,0.004727,0.005761,0.006907,0.008467,0.010124,...,0.001708,0.00207,0.00278,0.003646,0.004317,0.005285,0.006619,0.008662,0.013065,0.017379


In [29]:
df_blf_logfluc = pd.read_table('datasets2/id_blf_logfluc_mmsr.tsv')
print(df_blf_logfluc.shape)
df_blf_logfluc.head(3)

(10094, 3627)


Unnamed: 0,id,BLF_LOGFLUC0000,BLF_LOGFLUC0001,BLF_LOGFLUC0002,BLF_LOGFLUC0003,BLF_LOGFLUC0004,BLF_LOGFLUC0005,BLF_LOGFLUC0006,BLF_LOGFLUC0007,BLF_LOGFLUC0008,...,BLF_LOGFLUC3616,BLF_LOGFLUC3617,BLF_LOGFLUC3618,BLF_LOGFLUC3619,BLF_LOGFLUC3620,BLF_LOGFLUC3621,BLF_LOGFLUC3622,BLF_LOGFLUC3623,BLF_LOGFLUC3624,BLF_LOGFLUC3625
0,01Yfj2T3YTwJ1Yfy,12.67,22.0678,27.61,33.9833,37.2812,56.5949,67.1749,63.3629,62.9385,...,149.858,259.296,168.937,154.707,171.351,157.083,110.29,128.348,126.531,127.228
1,01gyRHLquwXDlhkO,19.6428,39.935,61.3363,61.0409,52.5288,49.235,70.3674,95.1341,91.6213,...,314.139,310.399,445.451,657.284,310.006,242.571,240.38,327.039,234.797,168.6
2,01rMxQv6vhyE1oQX,47.7222,54.3341,51.7435,58.862,72.509,72.269,88.1807,110.241,105.247,...,368.808,406.217,424.9,307.085,282.668,335.363,233.361,181.856,218.627,216.821


In [30]:
df_blf_spectral = pd.read_table('datasets2/id_blf_spectral_mmsr.tsv')
print(df_blf_spectral.shape)
df_blf_spectral.head(3)

(10094, 981)


Unnamed: 0,id,BLF_SPEC0000,BLF_SPEC0001,BLF_SPEC0002,BLF_SPEC0003,BLF_SPEC0004,BLF_SPEC0005,BLF_SPEC0006,BLF_SPEC0007,BLF_SPEC0008,...,BLF_SPEC0970,BLF_SPEC0971,BLF_SPEC0972,BLF_SPEC0973,BLF_SPEC0974,BLF_SPEC0975,BLF_SPEC0976,BLF_SPEC0977,BLF_SPEC0978,BLF_SPEC0979
0,01Yfj2T3YTwJ1Yfy,-0.024948,-0.02097,-0.018111,-0.015447,-0.013916,-0.0119,-0.010288,-0.008677,-0.007709,...,-0.020602,-0.019743,-0.0191,-0.018427,-0.017445,-0.016213,-0.014709,-0.013374,-0.011931,-0.009248
1,01gyRHLquwXDlhkO,-0.030967,-0.026333,-0.023328,-0.021668,-0.019498,-0.016839,-0.015814,-0.013334,-0.010337,...,-0.019705,-0.017564,-0.015868,-0.013447,-0.010809,-0.008927,-0.007321,-0.005487,-0.004354,-0.002419
2,01rMxQv6vhyE1oQX,-0.03293,-0.02668,-0.022407,-0.020073,-0.018294,-0.015516,-0.014381,-0.012597,-0.011017,...,-0.017512,-0.01574,-0.013784,-0.012927,-0.011979,-0.010119,-0.008757,-0.006146,-0.00453,-0.003551


In [31]:
df_blf_spectralcontrast = pd.read_table('datasets2/id_blf_spectralcontrast_mmsr.tsv')
print(df_blf_spectralcontrast.shape)
df_blf_spectralcontrast.head(3)

(10094, 801)


Unnamed: 0,id,BLF_SPEC_CTRS000,BLF_SPEC_CTRS001,BLF_SPEC_CTRS002,BLF_SPEC_CTRS003,BLF_SPEC_CTRS004,BLF_SPEC_CTRS005,BLF_SPEC_CTRS006,BLF_SPEC_CTRS007,BLF_SPEC_CTRS008,...,BLF_SPEC_CTRS790,BLF_SPEC_CTRS791,BLF_SPEC_CTRS792,BLF_SPEC_CTRS793,BLF_SPEC_CTRS794,BLF_SPEC_CTRS795,BLF_SPEC_CTRS796,BLF_SPEC_CTRS797,BLF_SPEC_CTRS798,BLF_SPEC_CTRS799
0,01Yfj2T3YTwJ1Yfy,0.022039,0.024196,0.02515,0.027585,0.027636,0.028242,0.028864,0.028893,0.03054,...,0.016565,0.016715,0.0174,0.017658,0.018747,0.019526,0.020388,0.021293,0.023746,0.024963
1,01gyRHLquwXDlhkO,0.02655,0.029981,0.03121,0.03362,0.034753,0.035093,0.03665,0.037344,0.038178,...,0.016232,0.016667,0.016972,0.017353,0.018233,0.018639,0.019531,0.020057,0.021328,0.023968
2,01rMxQv6vhyE1oQX,0.025287,0.027151,0.029092,0.031247,0.032969,0.034853,0.035937,0.036726,0.037167,...,0.016187,0.016409,0.016655,0.01719,0.017506,0.017768,0.0188,0.019388,0.020185,0.021915


In [32]:
df_blf_vardeltaspectral = pd.read_table('datasets2/id_blf_vardeltaspectral_mmsr.tsv')
print(df_blf_vardeltaspectral.shape)
df_blf_vardeltaspectral.head(3)

(10094, 1345)


Unnamed: 0,id,BLF_VARDELTASPEC0000,BLF_VARDELTASPEC0001,BLF_VARDELTASPEC0002,BLF_VARDELTASPEC0003,BLF_VARDELTASPEC0004,BLF_VARDELTASPEC0005,BLF_VARDELTASPEC0006,BLF_VARDELTASPEC0007,BLF_VARDELTASPEC0008,...,BLF_VARDELTASPEC1334,BLF_VARDELTASPEC1335,BLF_VARDELTASPEC1336,BLF_VARDELTASPEC1337,BLF_VARDELTASPEC1338,BLF_VARDELTASPEC1339,BLF_VARDELTASPEC1340,BLF_VARDELTASPEC1341,BLF_VARDELTASPEC1342,BLF_VARDELTASPEC1343
0,01Yfj2T3YTwJ1Yfy,0.003155,0.003171,0.003297,0.003305,0.003286,0.00331,0.003451,0.003573,0.003793,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,01gyRHLquwXDlhkO,0.002139,0.002204,0.002274,0.002412,0.002491,0.002545,0.002631,0.002779,0.002918,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3e-05,0.000373,0.001137
2,01rMxQv6vhyE1oQX,0.005088,0.005149,0.005265,0.00521,0.005175,0.005162,0.005091,0.005073,0.005083,...,1.9e-05,7.3e-05,0.000146,0.000326,0.00038,0.000526,0.000682,0.000788,0.001042,0.001372


In [33]:
df_ivec256 = pd.read_table('datasets2/id_ivec256_mmsr.tsv')
print(df_ivec256.shape)
df_ivec256.head(3)

(10094, 101)


Unnamed: 0,id,i000,i001,i002,i003,i004,i005,i006,i007,i008,...,i090,i091,i092,i093,i094,i095,i096,i097,i098,i099
0,9ErLUJOzu2Lvqwbq,-0.550995,-1.27902,-1.24256,1.519003,0.44714,0.631381,1.992752,-0.125328,0.695475,...,0.944976,-0.641641,0.826282,0.917799,-0.221755,-0.352181,-0.080357,-0.168821,0.594059,0.381152
1,MTWv5ooA00iAD8Ms,0.416361,-0.574562,0.548378,1.226106,-0.511511,0.364488,-0.396014,-0.141275,0.941728,...,-0.841524,-0.150413,0.002063,-0.988203,0.555925,-0.25478,-0.61988,-0.288284,-0.451433,-0.448236
2,mTwXhqc4op8iTl4j,0.757799,0.010479,-1.139051,-0.470724,0.657703,-0.41888,2.199934,0.470342,-0.857265,...,-0.759628,-1.454212,0.057121,1.704225,0.851462,-0.682136,-0.156861,0.507655,-0.265312,0.2941


In [34]:
df_ivec512 = pd.read_table('datasets2/id_ivec512_mmsr.tsv')
print(df_ivec512.shape)
df_ivec512.head(3)

(10094, 201)


Unnamed: 0,id,i000,i001,i002,i003,i004,i005,i006,i007,i008,...,i190,i191,i192,i193,i194,i195,i196,i197,i198,i199
0,9ErLUJOzu2Lvqwbq,-0.182113,-1.164677,1.217906,1.523667,0.250631,0.972648,-1.546015,-0.324394,1.25998,...,0.388401,-0.971913,-0.061513,-0.634554,-0.059094,0.073818,-0.163722,-0.766171,-0.407999,0.782485
1,MTWv5ooA00iAD8Ms,-0.50111,-0.687928,-0.564722,1.124452,-0.627486,0.373352,0.735315,0.055225,1.296702,...,-0.204821,1.04515,-0.421246,-0.067555,-1.093585,0.347184,-0.078408,-1.069177,0.002857,-0.9578
2,mTwXhqc4op8iTl4j,0.07192,-0.036369,1.097213,-0.424786,0.657797,-0.181957,-2.140793,-0.663082,-0.359884,...,0.761319,-0.826769,0.667669,1.226585,0.080257,0.313287,0.29468,-0.358866,0.078518,0.928089


In [35]:
df_ivec1024 = pd.read_table('datasets2/id_ivec1024_mmsr.tsv')
print(df_ivec1024.shape)
df_ivec1024.head(3)

(10094, 401)


Unnamed: 0,id,i000,i001,i002,i003,i004,i005,i006,i007,i008,...,i390,i391,i392,i393,i394,i395,i396,i397,i398,i399
0,9ErLUJOzu2Lvqwbq,-0.369911,1.17734,1.312063,1.567216,-0.236066,1.036267,-0.865492,-1.155326,0.305451,...,-0.544016,0.478495,-0.910013,-0.110639,-0.191088,0.258979,-0.738401,-0.474054,0.019739,-0.558993
1,MTWv5ooA00iAD8Ms,-0.605797,0.635961,-0.552902,1.230392,0.523198,0.435429,0.243145,0.616434,0.078302,...,-0.180578,1.637344,0.318935,1.783739,0.084676,0.049646,0.253696,-1.441109,-0.424085,0.821698
2,mTwXhqc4op8iTl4j,0.367864,-0.009319,1.120976,-0.452208,-0.52904,-0.275539,-0.707632,-2.208474,0.172713,...,-0.559399,0.338486,0.125648,1.239078,-0.129108,0.705474,-1.129312,0.101266,-0.022682,0.007973


In [36]:
df_mfcc_bow = pd.read_table('datasets2/id_mfcc_bow_mmsr.tsv')
print(df_mfcc_bow.shape)
df_mfcc_bow.head(3)

(10094, 501)


Unnamed: 0,id,mfccB000,mfccB001,mfccB002,mfccB003,mfccB004,mfccB005,mfccB006,mfccB007,mfccB008,...,mfccB490,mfccB491,mfccB492,mfccB493,mfccB494,mfccB495,mfccB496,mfccB497,mfccB498,mfccB499
0,GDGURAgKxNPEuXzM,0.0,0.0,2.09691,0.0,0.0,0.0,1.982271,2.722634,0.69897,...,1.740363,2.604226,0.0,0.0,1.30103,0.0,0.0,0.0,1.278754,2.235529
1,gdHmkEd0uy5HzhlM,0.69897,0.0,2.143015,0.0,0.30103,0.0,2.945961,2.139879,0.90309,...,1.653212,1.819544,0.0,0.30103,1.50515,0.0,0.30103,1.176091,2.371068,2.367356
2,y6nm14TvZDNrdBbc,1.491362,0.0,1.414973,0.0,0.30103,0.0,2.78533,2.245513,1.322219,...,1.826075,2.605305,0.30103,0.0,0.0,1.431364,0.0,0.30103,2.326336,2.730782


In [37]:
df_mfcc_stats = pd.read_table('datasets2/id_mfcc_stats_mmsr.tsv')
print(df_mfcc_stats.shape)
df_mfcc_stats.head(3)

(10094, 105)


Unnamed: 0,id,MFCC000,MFCC001,MFCC002,MFCC003,MFCC004,MFCC005,MFCC006,MFCC007,MFCC008,...,cov_9_9,cov_9_10,cov_9_11,cov_9_12,cov_10_10,cov_10_11,cov_10_12,cov_11_11,cov_11_12,cov_12_12
0,9ErLUJOzu2Lvqwbq,24.062305,-24.529358,0.992749,11.64836,9.184493,7.888308,3.549652,3.135545,-4.26757,...,93.488292,33.682528,-3.576354,-21.538019,76.832969,22.308741,-10.750334,64.227492,25.354757,75.786649
1,MTWv5ooA00iAD8Ms,24.377205,-9.601337,-0.591649,-1.563359,-3.887833,-5.740052,-3.548983,-4.365711,3.474253,...,143.537407,76.636133,33.551934,31.9129,146.343073,61.383011,33.601789,95.405675,40.896044,87.043994
2,mTwXhqc4op8iTl4j,23.252638,-14.410173,-9.571658,7.999164,0.506061,7.472953,2.90832,14.135953,-3.479433,...,90.249964,21.49207,11.622519,7.554839,91.891904,16.390867,7.394409,79.469247,23.994299,51.304981


In [38]:
df_musicnn = pd.read_table('datasets2/id_musicnn_mmsr.tsv')
print(df_musicnn.shape)
df_musicnn.head(3)

(10094, 51)


Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,40,41,42,43,44,45,46,47,48,49
0,01rMxQv6vhyE1oQX,0.251818,0.004347,0.078437,0.056584,0.012362,0.088567,0.06189,0.171026,0.045924,...,0.055074,0.011236,0.000931,0.00109,0.007918,0.020941,0.060759,0.003712,0.040641,0.00138
1,01Yfj2T3YTwJ1Yfy,0.129031,0.001123,0.006577,0.082643,0.002042,0.095536,0.049312,0.686902,0.154331,...,0.004771,0.005219,0.000255,0.000172,0.008303,0.026122,0.002091,0.243107,0.005494,0.000828
2,01gyRHLquwXDlhkO,0.026824,0.000873,0.009361,0.313308,0.002543,0.126715,0.176702,0.053236,0.131477,...,0.02447,0.036648,8.8e-05,0.000145,0.006926,0.038333,0.005486,0.003181,0.0207,0.001349


## Retrival systems (new four)

### **Audio-based(\<similarity\>, MFCC):** 
Similar to Text-based(\<similarity\>, \<feature\>), however choose as feature one of the representations of the MFCCs (BoW or statistical descriptors).

In [39]:
def cos_sim_mfcc(song_id, n_retrieved):
    song_vec = df_mfcc_bow.loc[df_mfcc_bow['id'] == song_id] #target song vector
    df_temp = df_mfcc_bow.loc[(df_mfcc_bow['id'] != song_id)].copy() #make copy id because we want to add new column later, this dataset without target song
    cosine_sim = cosine_similarity(df_temp.iloc[:, 1:], song_vec.iloc[:, 1:]) #similarity between songs from dataset and target song
    df_temp['cos_sim'] = cosine_sim #add column with counted similarity
    ids = df_temp.sort_values(by='cos_sim', ascending=False).head(n_retrieved)[['id', 'cos_sim']] #take 10 the greatest similarity values, we need id and similarity value
    result = pd.merge(ids, df_names, how='left', on='id') #merge table to represent names of found songs
    return result

In [40]:
cos_sim_mfcc(song1_id, 5)

Unnamed: 0,id,cos_sim,artist,song,album_name
0,5FyAwzQMj5n6SAvk,0.949796,Bob Marley & The Wailers,Small Axe,The Complete Upsetter Collection
1,s4LTPAlO9IN14O2h,0.943829,Michael Kiwanuka,Any Day Will Do Fine,Home Again
2,RGtndAGQNSdi0Yzb,0.94261,Santana,Smooth (feat. Rob Thomas),Supernatural (Remastered)
3,blJfTtBbirLvxzXx,0.942596,Simon & Garfunkel,Last Night I Had the Strangest Dream,"Wednesday Morning, 3 A.M."
4,x8M8VsXIxI7hSW6Y,0.941477,Gilberto Gil,Esotérico,Um Banda Um


### **Audio-based(\<similarity\>, \<feature\>):** 
Implement three retrieval systems, similar to Audio-based(\<similarity>, \<feature>). Choose as feature:
**at least one of the BLFs**

In [41]:
def cos_sim_blf(song_id, n_retrieved):
    song_vec = df_blf_logfluc.loc[df_blf_logfluc['id'] == song_id] #target song vector
    df_temp = df_blf_logfluc.loc[(df_blf_logfluc['id'] != song_id)].copy() #make copy id because we want to add new column later, this dataset without target song
    cosine_sim = cosine_similarity(df_temp.iloc[:, 1:], song_vec.iloc[:, 1:]) #similarity between songs from dataset and target song
    df_temp['cos_sim'] = cosine_sim #add column with counted similarity
    ids = df_temp.sort_values(by='cos_sim', ascending=False).head(n_retrieved)[['id', 'cos_sim']] #take 10 the greatest similarity values, we need id and similarity value
    result = pd.merge(ids, df_names, how='left', on='id') #merge table to represent names of found songs
    return result

In [42]:
cos_sim_blf(song1_id, 5)

Unnamed: 0,id,cos_sim,artist,song,album_name
0,6DiqMk5OTw4xE8tG,0.97367,Elvis Costello,Alison,My Aim Is True
1,BtMqsXzovF8ErgnV,0.972743,Rebecca Ferguson,Fairytale (Let Me Live My Life This Way),Heaven
2,iLQhFM4jLbwfzWUO,0.972463,Herbert Grönemeyer,Demo (Letzter Tag),Was Muss Muss - Best Of
3,eeWobtoefMYDwkyt,0.971719,Van Morrison,T. B. Sheets,Blowin' Your Mind!
4,hCu2X5Tke9ATZ4uV,0.971106,Juanes,Volverte A Ver,Mi Sangre


### **Audio-based(\<similarity\>, \<feature\>):** 
Implement three retrieval systems, similar to Audio-based(\<similarity>, \<feature>). Choose as feature:
**at least one of the i-vectors**

In [43]:
def cos_sim_ivec(song_id, n_retrieved):
    song_vec = df_ivec1024.loc[df_ivec1024['id'] == song_id] #target song vector
    df_temp = df_ivec1024.loc[(df_ivec1024['id'] != song_id)].copy() #make copy id because we want to add new column later, this dataset without target song
    cosine_sim = cosine_similarity(df_temp.iloc[:, 1:], song_vec.iloc[:, 1:]) #similarity between songs from dataset and target song
    df_temp['cos_sim'] = cosine_sim #add column with counted similarity
    ids = df_temp.sort_values(by='cos_sim', ascending=False).head(n_retrieved)[['id', 'cos_sim']] #take 10 the greatest similarity values, we need id and similarity value
    result = pd.merge(ids, df_names, how='left', on='id') #merge table to represent names of found songs
    return result

In [44]:
cos_sim_ivec(song1_id, 5)

Unnamed: 0,id,cos_sim,artist,song,album_name
0,KFbfr0fWWCTGXIJj,0.19274,Ingrid Michaelson,You And I,Be OK
1,rD4GCW87Qaoc3ytr,0.187248,One Night Only,Nothing Left,Can You Feel It
2,sP1S3wvw4rfgUoG0,0.185808,Cyndi Lauper,Rain on Me,Bring Ya To The Brink
3,DNARs8zFlQJeVRY7,0.18428,Grateful Dead,Cumberland Blues,Workingman's Dead
4,FjpuaUU3m7bP27e8,0.181342,Take That,Pray,Everything Changes


### **Audio-based(\<similarity\>, \<feature\>):** 
Implement three retrieval systems, similar to Audio-based(\<similarity>, \<feature>). Choose as feature:
**DNN-based features (musicnn)**

In [45]:
def cos_sim_musicnn(song_id, n_retrieved):
    song_vec = df_musicnn.loc[df_musicnn['id'] == song_id] #target song vector
    df_temp = df_musicnn.loc[(df_musicnn['id'] != song_id)].copy() #make copy id because we want to add new column later, this dataset without target song
    cosine_sim = cosine_similarity(df_temp.iloc[:, 1:], song_vec.iloc[:, 1:]) #similarity between songs from dataset and target song
    df_temp['cos_sim'] = cosine_sim #add column with counted similarity
    ids = df_temp.sort_values(by='cos_sim', ascending=False).head(n_retrieved)[['id', 'cos_sim']] #take 10 the greatest similarity values, we need id and similarity value
    result = pd.merge(ids, df_names, how='left', on='id') #merge table to represent names of found songs
    return result

In [46]:
cos_sim_musicnn(song1_id, 5)

Unnamed: 0,id,cos_sim,artist,song,album_name
0,eyJANYgffN1L2zlu,0.989368,Israel Houghton,Others,Love God. Love People. (The London Sessions)
1,Va7m59PuAzNti3VZ,0.987775,Westlife,More Than Words,Westlife
2,sxlF5zsaV3LGkdUs,0.987664,The Byrds,Satisfied Mind,Turn! Turn! Turn!
3,FazIOZxQqINAZlfv,0.98572,Tom Waits,Last Leaf,Bad As Me (Deluxe Edition Remastered)
4,L76yXbX4hMnIwhiE,0.985252,John Legend,All of Me,Love In The Future (Expanded Edition)


## Evaluation part

### Accuracy:
#### Precision@k & Recall@k: 
according to the definition given in the lecture.
Consider top **k** retrieved items. For the purposes of precision and recall
calculation, a retrieved track is relevant to the query track if the two tracks have at least one genre in common. Allow for evaluation with different lengths of the returned lists (i.e., consider **k** as a parameter in the evaluation).

Compute the average of **Precision@k** and of **Recall@k** over all possible query tracks.

Plot Precision-Recall curve for each of the **8** evaluated systems by varying **k** in the interval **[1, 100]**.

In [47]:
def get_genres(song_id):
    list_genres = df_genres.loc[df_genres['id'] == song_id]['genre'].values[0]
    list_genres = list_genres[1:-1].split("\'")
    result = list_genres[1::2]
    return result

In [352]:
set(get_genres('ziT77Si01mOb5oZg'))

{'bells', 'easy listening', 'jazz', 'lounge', 'swing'}

In [None]:
# датасет количество True and False для каждой песни
df_num_TandF = pd.DataFrame(columns=['id', 'T', 'F'])
for ids in df_genres['id'].values:
    T = 0 #number of songs with at leat one common genre
    F = 0 #number of song with differeint genres
    genres_query = get_genres(ids)
    for rest in df_genres.loc[(df_genres['id'] != ids)]['id'].values:
        genres_rest = get_genres(rest)
        if len(set(genres_query).intersection(set(genres_rest))) > 0:
            T += 1
            continue
        else:
            F += 1
            continue
    df_num_TandF.loc[len(df_num_TandF)] = [ids, T, F]     

In [None]:
df_num_TandF

In [None]:
def prec_rec(song_id, retrieved_system, n_retrieved):
    song = song_info(song_id)
    if retrieved_system == 'random_baseline':
        retrieved_df = random_baseline(song, n_retrieved)
    elif retrieved_system == 'cos_sim_tfidf':
        retrieved_df = cos_sim_tfidf(song, n_retrieved)
    elif retrieved_system == 'cos_sim_bert':
        retrieved_df = cos_sim_bert(song, n_retrieved)
    elif retrieved_system == 'euc_sim':
        retrieved_df = euc_sim(song, n_retrieved)
    elif retrieved_system == 'cos_sim_mfcc':
        retrieved_df = cos_sim_mfcc(song, n_retrieved)
    elif retrieved_system == 'cos_sim_blf':
        retrieved_df = cos_sim_blf(song, n_retrieved)
    elif retrieved_system == 'cos_sim_ivec':
        retrieved_df = cos_sim_ivec(song, n_retrieved)
    elif retrieved_system == 'cos_sim_musicnn':
        retrieved_df = cos_sim_musicnn(song, n_retrieved)
        
    query_genres = get_genres(song_id)
    for song_res_id in retrieved_df['id'].to_list():
        res_genres = get_genres(song_res_id)
        if len(set(query_genres).intersection(set(res_genres))) > 0:
            TP += 1
        else:
            FP += 1
        TN = df_num_TandF.loc[(df_genres['id'] == song_id)][T] - TP
        FN = df_num_TandF.loc[(df_genres['id'] == song_id)][F] - FP
    prec = TP / (TP + FP)
    rec = TP / (TP + FN)
    return prec, rec

#### nDCG10 according to the following definition:
$\displaystyle DCG@10 = rel_1 + \sum \big[\frac{rel_i}{log_2(i + 1)}\big]$

$\displaystyle nDCG@10 = \frac{DCG@10}{IDCG@10}$,
where **IDCG** stands for the ideal **DCG**, i.e. the maximum value of **DCG**
obtainable for a query track. This is the value obtained when retrieving the **10** tracks that have the highest relevance for the given query, ranked in order of descending relevance.

For the relevance $rel$, use the Sørensen–Dice coefficient of the genres: 

$\displaystyle rel_i = 2 \cdot \frac{|G_{query} \cap G_i|}{|G_{query}| + |G_i|}$

Given a query labeled with genres $G_{query}$ and a track retrieved at position $i$ and labeled with genres $G_i$, this coefficient compares the number of overlapping genres, $|G_{query} \cap G_i|$, to the average number of genres of the query and of the track retrieved at position $i$, $\frac{(|G_{query}| + |G_i|)}{2}$

Compute the average of $nDCG@10$ over all possible query tracks.


In [None]:
def rel(query_song, song):
    G_query = get_genre(query_song)
    G_song = get_genre(song)
    intercept = G_query.intercept(G_song)
    return 2 * len(intercept) / (len(G_query) + len(G_song))

In [None]:
retrieved_df = random_baseline(song, 10)
for i in retrieved_df['id'].values:
    retrieved_df.loc[(retrieved_df['id'] == i)]['rel'] = rel(query_song, i)

### Beyond accuracy:
#### Genre coverage@10: 
Is calculated for a set of queries (use all queries). This evaluation criterion shows how many out of all genres present in the data (assigned to at least one track) are covered by (present in) retrieved results for all queries. Genre coverage is a proportion: **number of unique genres assigned to at least one of the top 10 retrieved tracks for at least one of the test queries** divided by the **number of unique genres in the dataset**.

In [494]:
#all genres
all_genres = set()
for i in df_genres['id'].values:
    all_genres.update(get_genres(i))

In [495]:
len(all_genres)

1112

In [521]:
def coverage(retrieved_system):
    if retrieved_system == 'random_baseline':
        all_retrieved_genres = set()
        for i in df_genres['id'].values:
            retrieved_df = random_baseline(i, n_retrieved=10)
            for i in retrieved_df['id'].values:
                all_retrieved_genres.update(get_genres(i))
        return len(all_retrieved_genres) / len(all_genres)
    
    elif retrieved_system == 'cos_sim_tfidf':
        all_retrieved_genres = set()
        for i in df_genres['id'].values:
            retrieved_df = cos_sim_tfidf(i, n_retrieved=10)
            for i in retrieved_df['id'].values:
                all_retrieved_genres.update(get_genres(i))
        return len(all_retrieved_genres) / len(all_genres)
    
    elif retrieved_system == 'cos_sim_bert':
        all_retrieved_genres = set()
        for i in df_genres['id'].values:
            retrieved_df = cos_sim_bert(i, n_retrieved=10)
            for i in retrieved_df['id'].values:
                all_retrieved_genres.update(get_genres(i))
        return len(all_retrieved_genres) / len(all_genres)
    
    elif retrieved_system == 'euc_sim':
        all_retrieved_genres = set()
        for i in df_genres['id'].values:
            retrieved_df = euc_sim(i, n_retrieved=10)
            for i in retrieved_df['id'].values:
                all_retrieved_genres.update(get_genres(i))
        return len(all_retrieved_genres) / len(all_genres)
    
    elif retrieved_system == 'cos_sim_mfcc':
        all_retrieved_genres = set()
        for i in df_genres['id'].values:
            retrieved_df = cos_sim_mfcc(i, n_retrieved=10)
            for i in retrieved_df['id'].values:
                all_retrieved_genres.update(get_genres(i))
        return len(all_retrieved_genres) / len(all_genres)
    
    elif retrieved_system == 'cos_sim_blf':
        all_retrieved_genres = set()
        for i in df_genres['id'].values:
            retrieved_df = cos_sim_blf(i, n_retrieved=10)
            for i in retrieved_df['id'].values:
                all_retrieved_genres.update(get_genres(i))
        return len(all_retrieved_genres) / len(all_genres)
    
    elif retrieved_system == 'cos_sim_ivec':
        all_retrieved_genres = set()
        for i in df_genres['id'].values:
            retrieved_df = cos_sim_ivec(i, n_retrieved=10)
            for i in retrieved_df['id'].values:
                all_retrieved_genres.update(get_genres(i))
        return len(all_retrieved_genres) / len(all_genres)
    
    elif retrieved_system == 'cos_sim_musicnn':
        all_retrieved_genres = set()
        for i in df_genres['id'].values:
            retrieved_df = cos_sim_musicnn(i, n_retrieved=10)
            for i in retrieved_df['id'].values:
                all_retrieved_genres.update(get_genres(i))
        return len(all_retrieved_genres) / len(all_genres)

In [522]:
coverage_random_baseline = coverage('random_baseline')
coverage_cos_sim_tfidf = coverage('cos_sim_tfidf')
coverage_cos_sim_bert = coverage('cos_sim_bert')
coverage_euc_sim = coverage('euc_sim')
coverage_cos_sim_mfcc = coverage('cos_sim_mfcc')
coverage_cos_sim_blf = coverage('cos_sim_blf')
coverage_cos_sim_ivec = coverage('cos_sim_ivec')
coverage_cos_sim_musicnn = coverage('cos_sim_musicnn')

In [523]:
print('Coverage value for (1) random_baseline retrival system: ', coverage_random_baseline)
print('Coverage value for (2) cos_sim_tfidf retrival system: ', coverage_cos_sim_tfidf)
print('Coverage value for (3) cos_sim_bert retrival system: ', coverage_cos_sim_bert)
print('Coverage value for (4) euc_sim retrival system: ', coverage_euc_sim)
print('Coverage value for (5) cos_sim_mfcc retrival system: ', coverage_cos_sim_mfcc)
print('Coverage value for (6) cos_sim_blf retrival system: ', coverage_cos_sim_blf)
print('Coverage value for (7) cos_sim_ivec retrival system: ', coverage_cos_sim_ivec)
print('Coverage value for (8) cos_sim_musicnn retrival system: ', coverage_cos_sim_musicnn)

Coverage value for (1) random_baseline retrival system:  1.0
Coverage value for (2) cos_sim_tfidf retrival system:  0.9820143884892086
Coverage value for (3) cos_sim_bert retrival system:  0.9568345323741008
Coverage value for (4) euc_sim retrival system:  0.9172661870503597
Coverage value for (5) cos_sim_mfcc retrival system:  0.9811151079136691
Coverage value for (6) cos_sim_blf retrival system:  0.960431654676259
Coverage value for (7) cos_sim_ivec retrival system:  1.0
Coverage value for (8) cos_sim_musicnn retrival system:  0.9973021582733813


#### Genre diversity@10: 
Shows how evenly distributed are the genres over the
top 10 retrieved tracks. For each query track first compute **genre distribution** of the corresponding returned tracks:

■ Start with a vector of zeros, with each element corresponding to a unique genre present in the data.

■ Then every retrieved track contributes to the genres it is labeled with. **Note**: a track labeled with a single genre adds **+1** to the corresponding element of the vector, while a track labeled with **n** genres contributes **+(1/n)** to each of the genres it is labeled with.

■ **Example**: Genres in the data set: [ambient, blues, country].
Computing genre distribution for the list of three tracks: (1): [country],
(2): [country, blues], (3): [country]; Resulting genre distribution: [0; 0.5;
2.5].

**Normalize the distribution**, dividing every genre count by 10 (as we are considering top 10 results for each query). Genre diversity@10 for a single query is Shannon’s entropy of the genre distribution over the retrieved tracks
for a given query track. If $G_{res} = \{g_i\} -$ normalized$(\sum g_i = 1)$ distribution of  genre occurrences in the top 10 retrieved results for a given query $(i \subset [1, N]$, where $N$ - number of known genres, i.e. 3 in the example above), Shannon’s entropy of $G_{res}$ is calculated as follows: 

$H(G_{res}) = \sum g_i \cdot log_2 g_i$

**Note!** In case of $g_i = 0$, treat $0 \cdot log_2 0 = 0$

Compute the average of genre-divesity@10 over all possible query tracks.

In [524]:
def diversity(retrieved_system):
    k_retrieved = 10
    if retrieved_system == 'random_baseline':
        entropy_sum = 0
        for i in df_genres['id'].values: 
            #loop for every possible query
            retrieved_df = random_baseline(i, n_retrieved=10)
            all_retrieved_genres = set() #all genres in retrived songs
            for i in retrieved_df['id'].values:
                all_retrieved_genres.update(get_genres(i))
            dic = {}
            for item in all_retrieved_genres:
                dic[item] = 0
            
            for ids in retrieved_df['id'].values: #values for unnormalized vector
                genres = get_genres(ids)
                for genre in genres:
                    dic[genre] += 1 / len(genres) 
            normalized_vec = np.fromiter(dic.values(), dtype=float) / k_retrieved  # normalized vector
            
            entropy = 0
            for i in range(len(normalized_vec)): #entropy
                if normalized_vec[i] == 0:
                    entropy += 0
                else:
                    entropy += normalized_vec[i] * math.log(normalized_vec[i], 2)
            entropy_sum += entropy
        avg_entropy = entropy_sum / len(df_genres['id'].values) # average entropy
        return avg_entropy

    
    elif retrieved_system == 'cos_sim_tfidf':
        entropy_sum = 0
        for i in df_genres['id'].values:
            #loop for every possible query
            retrieved_df = cos_sim_tfidf(i, n_retrieved=10)
            all_retrieved_genres = set() #all genres in retrived songs
            for i in retrieved_df['id'].values:
                all_retrieved_genres.update(get_genres(i))
            
            dic = {}
            for item in all_retrieved_genres:
                dic[item] = 0
            
            for ids in retrieved_df['id'].values: #values for unnormalized vector
                genres = get_genres(ids)
                for genre in genres:
                    dic[genre] += 1 / len(genres) 
            normalized_vec = np.fromiter(dic.values(), dtype=float) / k_retrieved  # normalized vector
            
            entropy = 0
            for i in range(len(normalized_vec)): #entropy
                if normalized_vec[i] == 0:
                    entropy += 0
                else:
                    entropy += normalized_vec[i] * math.log(normalized_vec[i], 2)
            entropy_sum += entropy
        avg_entropy = entropy_sum / len(df_genres['id'].values) # average entropy
        return avg_entropy
    
    elif retrieved_system == 'cos_sim_bert':
        entropy_sum = 0
        for i in df_genres['id'].values: 
            #loop for every possible query
            retrieved_df = cos_sim_bert(i, n_retrieved=10)
            all_retrieved_genres = set() #all genres in retrived songs
            for i in retrieved_df['id'].values:
                all_retrieved_genres.update(get_genres(i))
            
            dic = {}
            for item in all_retrieved_genres:
                dic[item] = 0
            
            for ids in retrieved_df['id'].values: #values for unnormalized vector
                genres = get_genres(ids)
                for genre in genres:
                    dic[genre] += 1 / len(genres) 
            normalized_vec = np.fromiter(dic.values(), dtype=float) / k_retrieved  # normalized vector
            
            entropy = 0
            for i in range(len(normalized_vec)): #entropy
                if normalized_vec[i] == 0:
                    entropy += 0
                else:
                    entropy += normalized_vec[i] * math.log(normalized_vec[i], 2)
            entropy_sum += entropy
        avg_entropy = entropy_sum / len(df_genres['id'].values) # average entropy
        return avg_entropy
    
    elif retrieved_system == 'euc_sim':
        entropy_sum = 0
        for i in df_genres['id'].values: 
            #loop for every possible query
            retrieved_df = euc_sim(i, n_retrieved=10)
            all_retrieved_genres = set() #all genres in retrived songs
            for i in retrieved_df['id'].values:
                all_retrieved_genres.update(get_genres(i))
            
            dic = {}
            for item in all_retrieved_genres:
                dic[item] = 0
            
            for ids in retrieved_df['id'].values: #values for unnormalized vector
                genres = get_genres(ids)
                for genre in genres:
                    dic[genre] += 1 / len(genres) 
            normalized_vec = np.fromiter(dic.values(), dtype=float) / k_retrieved  # normalized vector
            
            entropy = 0
            for i in range(len(normalized_vec)): #entropy
                if normalized_vec[i] == 0:
                    entropy += 0
                else:
                    entropy += normalized_vec[i] * math.log(normalized_vec[i], 2)
            entropy_sum += entropy
        avg_entropy = entropy_sum / len(df_genres['id'].values) # average entropy
        return avg_entropy
    
    elif retrieved_system == 'cos_sim_mfcc':
        entropy_sum = 0
        for i in df_genres['id'].values: 
            #loop for every possible query
            retrieved_df = cos_sim_mfcc(i, n_retrieved=10)
            all_retrieved_genres = set() #all genres in retrived songs
            for i in retrieved_df['id'].values:
                all_retrieved_genres.update(get_genres(i))
            
            dic = {}
            for item in all_retrieved_genres:
                dic[item] = 0
            
            for ids in retrieved_df['id'].values: #values for unnormalized vector
                genres = get_genres(ids)
                for genre in genres:
                    dic[genre] += 1 / len(genres) 
            normalized_vec = np.fromiter(dic.values(), dtype=float) / k_retrieved  # normalized vector
            
            entropy = 0
            for i in range(len(normalized_vec)): #entropy
                if normalized_vec[i] == 0:
                    entropy += 0
                else:
                    entropy += normalized_vec[i] * math.log(normalized_vec[i], 2)
            entropy_sum += entropy
        avg_entropy = entropy_sum / len(df_genres['id'].values) # average entropy
        return avg_entropy
        
    
    elif retrieved_system == 'cos_sim_blf':
        entropy_sum = 0
        for i in df_genres['id'].values: 
            #loop for every possible query
            retrieved_df = cos_sim_blf(i, n_retrieved=10)
            all_retrieved_genres = set() #all genres in retrived songs
            for i in retrieved_df['id'].values:
                all_retrieved_genres.update(get_genres(i))
            
            dic = {}
            for item in all_retrieved_genres:
                dic[item] = 0
            
            for ids in retrieved_df['id'].values: #values for unnormalized vector
                genres = get_genres(ids)
                for genre in genres:
                    dic[genre] += 1 / len(genres) 
            normalized_vec = np.fromiter(dic.values(), dtype=float) / k_retrieved  # normalized vector
            
            entropy = 0
            for i in range(len(normalized_vec)): #entropy
                if normalized_vec[i] == 0:
                    entropy += 0
                else:
                    entropy += normalized_vec[i] * math.log(normalized_vec[i], 2)
            entropy_sum += entropy
        avg_entropy = entropy_sum / len(df_genres['id'].values) # average entropy
        return avg_entropy
        
    
    elif retrieved_system == 'cos_sim_ivec':
        entropy_sum = 0
        for i in df_genres['id'].values: #loop for every possible query
            retrieved_df = cos_sim_ivec(i, n_retrieved=10)
            all_retrieved_genres = set() #all genres in retrived songs
            for i in retrieved_df['id'].values:
                all_retrieved_genres.update(get_genres(i))
            
            dic = {}
            for item in all_retrieved_genres:
                dic[item] = 0
            
            for ids in retrieved_df['id'].values: #values for unnormalized vector
                genres = get_genres(ids)
                for genre in genres:
                    dic[genre] += 1 / len(genres) 
            normalized_vec = np.fromiter(dic.values(), dtype=float) / k_retrieved  # normalized vector
            
            entropy = 0
            for i in range(len(normalized_vec)): #entropy
                if normalized_vec[i] == 0:
                    entropy += 0
                else:
                    entropy += normalized_vec[i] * math.log(normalized_vec[i], 2)
            entropy_sum += entropy
        avg_entropy = entropy_sum / len(df_genres['id'].values) # average entropy
        return avg_entropy        
    
    elif retrieved_system == 'cos_sim_musicnn':
        entropy_sum = 0
        for i in df_genres['id'].values:
            retrieved_df = cos_sim_musicnn(i, n_retrieved=10) 
            all_retrieved_genres = set() #all genres in retrived songs
            for i in retrieved_df['id'].values:
                all_retrieved_genres.update(get_genres(i))
            
            dic = {}
            for item in all_retrieved_genres:
                dic[item] = 0
            
            for ids in retrieved_df['id'].values: #values for unnormalized vector
                genres = get_genres(ids)
                for genre in genres:
                    dic[genre] += 1 / len(genres) 
            normalized_vec = np.fromiter(dic.values(), dtype=float) / k_retrieved  # normalized vector
            
            entropy = 0
            for i in range(len(normalized_vec)): #entropy
                if normalized_vec[i] == 0:
                    entropy += 0
                else:
                    entropy += normalized_vec[i] * math.log(normalized_vec[i], 2)
            entropy_sum += entropy
        avg_entropy = entropy_sum / len(df_genres['id'].values) # average entropy
        return avg_entropy

In [525]:
diversity_random_baseline = diversity('random_baseline')
diversity_cos_sim_tfidf = diversity('cos_sim_tfidf')
diversity_cos_sim_bert = diversity('cos_sim_bert')
diversity_euc_sim = diversity('euc_sim')
diversity_cos_sim_mfcc = diversity('cos_sim_mfcc')
diversity_cos_sim_blf = diversity('cos_sim_blf')
diversity_cos_sim_ivec = diversity('cos_sim_ivec')
diversity_cos_sim_musicnn = diversity('cos_sim_musicnn')

In [526]:
print('Genre diversity value for (1) random_baseline retrival system: ', diversity_random_baseline)
print('Genre diversity value for (2) cos_sim_tfidf retrival system: ', diversity_cos_sim_tfidf)
print('Genre diversity value for (3) cos_sim_bert retrival system: ', diversity_cos_sim_bert)
print('Genre diversity value for (4) euc_sim retrival system: ', diversity_euc_sim)
print('Genre diversity value for (5) cos_sim_mfcc retrival system: ', diversity_cos_sim_mfcc)
print('Genre diversity value for (6) cos_sim_blf retrival system: ', diversity_cos_sim_blf)
print('Genre diversity value for (7) cos_sim_ivec retrival system: ', diversity_cos_sim_ivec)
print('Genre diversity value for (8) cos_sim_musicnn retrival system: ', diversity_cos_sim_musicnn)

Genre diversity value for (1) random_baseline retrival system:  -5.059364321375057
Genre diversity value for (2) cos_sim_tfidf retrival system:  -4.974490672481774
Genre diversity value for (3) cos_sim_bert retrival system:  -4.84589890933647
Genre diversity value for (4) euc_sim retrival system:  -4.833300816772764
Genre diversity value for (5) cos_sim_mfcc retrival system:  -4.743527408164238
Genre diversity value for (6) cos_sim_blf retrival system:  -4.768746626585791
Genre diversity value for (7) cos_sim_ivec retrival system:  -4.9680376867863085
Genre diversity value for (8) cos_sim_musicnn retrival system:  -4.705929703814016


# Task 3. 
#  Video-based retrieval, fusion, evaluation and user interface.

## Uploading new datasets

In [50]:
df_incp = pd.read_table('ws23_exercise3/task3/id_incp_mmsr.tsv')
print(df_incp.shape)
df_incp.head(3)

(10094, 4097)


Unnamed: 0,id,max0000,max0001,max0002,max0003,max0004,max0005,max0006,max0007,max0008,...,mean2038,mean2039,mean2040,mean2041,mean2042,mean2043,mean2044,mean2045,mean2046,mean2047
0,01gyRHLquwXDlhkO,0.176325,0.029896,0.290058,0.124512,0.165575,0.057879,0.906675,0.576814,0.050436,...,0.428855,0.645226,0.187928,1.359423,0.952955,0.076844,0.385273,0.813935,0.980491,1.102469
1,01rMxQv6vhyE1oQX,0.294136,0.034675,0.099776,0.247584,0.062971,0.089765,0.439367,0.068418,0.347261,...,0.065049,1.006248,0.0,0.975929,0.099211,0.347073,0.294506,0.079472,0.015335,1.359715
2,01Yfj2T3YTwJ1Yfy,0.109848,0.130977,0.151458,0.398778,0.42905,0.169885,0.253226,0.089925,0.636247,...,0.355406,0.501278,0.413582,1.578734,0.553293,0.32779,0.084863,0.116205,0.120456,0.240038


In [51]:
df_resnet = pd.read_table('ws23_exercise3/task3/id_resnet_mmsr.tsv')
print(df_resnet.shape)
df_resnet.head(3)

(10094, 4097)


Unnamed: 0,id,max0000,max0001,max0002,max0003,max0004,max0005,max0006,max0007,max0008,...,mean2038,mean2039,mean2040,mean2041,mean2042,mean2043,mean2044,mean2045,mean2046,mean2047
0,01gyRHLquwXDlhkO,0.016343,0.029984,0.018722,0.0,0.17283,0.0,0.932023,0.047154,0.000812,...,0.00119,0.025433,0.016921,0.0,0.26387,0.006393,0.000927,0.901466,0.012855,0.885759
1,01rMxQv6vhyE1oQX,0.000348,0.0,1.073413,0.0,0.097732,0.082244,1.198784,0.061178,0.0,...,0.0,0.853174,0.180942,0.0,0.007214,0.0,0.0,0.0,0.077968,0.952947
2,01Yfj2T3YTwJ1Yfy,0.0,0.110133,0.31062,0.0,0.003017,0.136098,0.053048,0.198124,0.0,...,0.0,0.0,0.248975,0.455842,0.243351,0.0,0.00232,0.0,2.763196,0.182506


In [52]:
df_url = pd.read_table('ws23_exercise3/task3/id_url_mmsr.tsv')
print(df_url.shape)
df_url.head(3)

(10094, 2)


Unnamed: 0,id,url
0,NDroPROgWm3jBxjH,https://www.youtube.com/watch?v=gPm2s6JORc4
1,y8wp2cUBzIEYsouc,https://www.youtube.com/watch?v=RYzQvj3icjs
2,pAzEb1oXeG9TYIvM,https://www.youtube.com/watch?v=juQ2rtxKzZk


In [53]:
df_vgg19 = pd.read_table('ws23_exercise3/task3/id_vgg19_mmsr.tsv')
print(df_vgg19.shape)
df_vgg19.head(3)

(10094, 8193)


Unnamed: 0,id,max0000,max0001,max0002,max0003,max0004,max0005,max0006,max0007,max0008,...,mean4086,mean4087,mean4088,mean4089,mean4090,mean4091,mean4092,mean4093,mean4094,mean4095
0,01gyRHLquwXDlhkO,0.0,3.636418,0.0,0.21202,0.271122,0.001024,2.322615,0.0,0.0,...,0.0,1.325016,3.582322,2.84148,0.532275,0.031329,0.0,0.043304,0.0,0.028034
1,01rMxQv6vhyE1oQX,0.0,0.0,0.0,0.0,0.51692,1.288584,4.989646,0.0,0.0,...,0.0,0.492186,4.2905,0.632023,0.0,1.55641,0.0,1.644141,0.0,0.128084
2,01Yfj2T3YTwJ1Yfy,0.870669,1.899707,0.0,0.0,0.077879,0.843339,0.0,0.0,1.122094,...,0.0,1.348677,1.147605,1.845405,0.0,0.0,0.0,0.0,0.0,0.0


## Retrival systems (new three)

### **Video-based(\<similarity\>, \<feature\>):** 
Similar to Text-based(\<similarity\>, \<feature\>) and Audio-based(\<similarity\>, \<feature\>).  Choose as feature one of the video features (i.e., the representations of the videoclips obtained with one of the neural network architectures, VGG19, ResNet, or inception, and provided at the link below).

**𝑠𝑖𝑚(𝑞𝑢𝑒𝑟𝑦, 𝑡𝑎𝑟𝑔𝑒𝑡_𝑡𝑟𝑎𝑐𝑘) = <𝑠𝑖𝑚𝑖𝑙𝑎𝑟𝑖𝑡𝑦> (<video 𝑓𝑒𝑎𝑡𝑢𝑟𝑒>(𝑞𝑢𝑒𝑟𝑦), <video 𝑓𝑒𝑎𝑡𝑢𝑟𝑒>(𝑡𝑎𝑟𝑔𝑒𝑡_𝑡𝑟𝑎𝑐𝑘))**

In [54]:
def cos_sim_incp(song_id, n_retrieved):
    song_vec = df_incp.loc[df_incp['id'] == song_id] #target song vector
    df_temp = df_incp.loc[(df_incp['id'] != song_id)].copy() #make copy id because we want to add new column later, this dataset without target song
    cosine_sim = cosine_similarity(df_temp.iloc[:, 1:], song_vec.iloc[:, 1:]) #similarity between songs from dataset and target song
    df_temp['cos_sim'] = cosine_sim #add column with counted similarity
    ids = df_temp.sort_values(by='cos_sim', ascending=False).head(n_retrieved)[['id', 'cos_sim']] #take 10 the greatest similarity values, we need id and similarity value
    result = pd.merge(ids, df_names, how='left', on='id') #merge table to represent names of found songs
    return result

In [69]:
cos_sim_incp(song1_id, 10)

Unnamed: 0,id,cos_sim,artist,song,album_name
0,9ECfsfc0ChaQQ1bD,0.775037,Yellowcard,Dear Bobbie,Paper Walls
1,q5ShIM6R3OaT495a,0.764987,Future Islands,Walking Through That Door,In Evening Air
2,QfUglc6m1JIYhXRH,0.760943,Patti Smith,Break It Up,Horses (Legacy Edition)
3,Nmiv1udVTPdcCvyk,0.75862,Édith Piaf,Paris,Edith Piaf - The Best Of
4,FvqGWytde6EWaZpE,0.757958,After the Burial,Deluge,Dig Deep
5,TBH4dxkeSVvTsSnV,0.757421,Camila,Coleccionista De Canciones,Todo Cambio
6,rlYEWOpYxNrY5rHO,0.756913,Lisa Mitchell,Neopolitan Dreams,Neopolitan Dreams
7,I696LFToFrDdwsVd,0.75654,Gala,Let a Boy Cry,Come Into My Life (The Album)
8,0r6fQYlWtk4HkN1F,0.756441,Annie Lennox,Train in Vain,Medusa
9,pBS8YUMxWpvLmrfI,0.756286,Oasis,The Masterplan,(What's The Story) Morning Glory? (Deluxe Edit...


### **Early fusion:** 
from all the features provided so far (textual, audio, and video) select two and use an early fusion aggregation technique to combine them into a single feature. Then use the feature resulting from the aggregation in a retrieval similar to Video-based(\<similarity\>, \<feature\>). Motivate your methodological choices, such as the choice of the features to
combine from different modalities, the pre-processing of the features, if any is applied (standardization, normalization, PCA, …).

**𝑠𝑖𝑚(𝑞𝑢𝑒𝑟𝑦, 𝑡𝑎𝑟𝑔𝑒𝑡_𝑡𝑟𝑎𝑐𝑘) = <𝑠𝑖𝑚𝑖𝑙𝑎𝑟𝑖𝑡𝑦> (<aggregated 𝑓𝑒𝑎𝑡𝑢𝑟𝑒>(𝑞𝑢𝑒𝑟𝑦), <aggregated 𝑓𝑒𝑎𝑡𝑢𝑟𝑒>(𝑡𝑎𝑟𝑔𝑒𝑡_𝑡𝑟𝑎𝑐𝑘))**

Chose cos_sim_musicnn because it has the best presicion value

In [58]:
df_musicnn.head(3)

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,40,41,42,43,44,45,46,47,48,49
0,01rMxQv6vhyE1oQX,0.251818,0.004347,0.078437,0.056584,0.012362,0.088567,0.06189,0.171026,0.045924,...,0.055074,0.011236,0.000931,0.00109,0.007918,0.020941,0.060759,0.003712,0.040641,0.00138
1,01Yfj2T3YTwJ1Yfy,0.129031,0.001123,0.006577,0.082643,0.002042,0.095536,0.049312,0.686902,0.154331,...,0.004771,0.005219,0.000255,0.000172,0.008303,0.026122,0.002091,0.243107,0.005494,0.000828
2,01gyRHLquwXDlhkO,0.026824,0.000873,0.009361,0.313308,0.002543,0.126715,0.176702,0.053236,0.131477,...,0.02447,0.036648,8.8e-05,0.000145,0.006926,0.038333,0.005486,0.003181,0.0207,0.001349


In [59]:
df_resnet.head(3)

Unnamed: 0,id,max0000,max0001,max0002,max0003,max0004,max0005,max0006,max0007,max0008,...,mean2038,mean2039,mean2040,mean2041,mean2042,mean2043,mean2044,mean2045,mean2046,mean2047
0,01gyRHLquwXDlhkO,0.016343,0.029984,0.018722,0.0,0.17283,0.0,0.932023,0.047154,0.000812,...,0.00119,0.025433,0.016921,0.0,0.26387,0.006393,0.000927,0.901466,0.012855,0.885759
1,01rMxQv6vhyE1oQX,0.000348,0.0,1.073413,0.0,0.097732,0.082244,1.198784,0.061178,0.0,...,0.0,0.853174,0.180942,0.0,0.007214,0.0,0.0,0.0,0.077968,0.952947
2,01Yfj2T3YTwJ1Yfy,0.0,0.110133,0.31062,0.0,0.003017,0.136098,0.053048,0.198124,0.0,...,0.0,0.0,0.248975,0.455842,0.243351,0.0,0.00232,0.0,2.763196,0.182506


In [60]:
df_musicnn_resnet = pd.merge(df_musicnn, df_resnet, on='id')
df_musicnn_resnet.head(3)

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,mean2038,mean2039,mean2040,mean2041,mean2042,mean2043,mean2044,mean2045,mean2046,mean2047
0,01rMxQv6vhyE1oQX,0.251818,0.004347,0.078437,0.056584,0.012362,0.088567,0.06189,0.171026,0.045924,...,0.0,0.853174,0.180942,0.0,0.007214,0.0,0.0,0.0,0.077968,0.952947
1,01Yfj2T3YTwJ1Yfy,0.129031,0.001123,0.006577,0.082643,0.002042,0.095536,0.049312,0.686902,0.154331,...,0.0,0.0,0.248975,0.455842,0.243351,0.0,0.00232,0.0,2.763196,0.182506
2,01gyRHLquwXDlhkO,0.026824,0.000873,0.009361,0.313308,0.002543,0.126715,0.176702,0.053236,0.131477,...,0.00119,0.025433,0.016921,0.0,0.26387,0.006393,0.000927,0.901466,0.012855,0.885759


In [65]:
scaler = MinMaxScaler()
normalized_df = pd.DataFrame(scaler.fit_transform(df_musicnn_resnet.iloc[:, 1:]), columns=df_musicnn_resnet.iloc[:, 1:].columns)
normalized_df.insert(0, 'id', df_musicnn_resnet['id'])

In [66]:
normalized_df.head()

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,mean2038,mean2039,mean2040,mean2041,mean2042,mean2043,mean2044,mean2045,mean2046,mean2047
0,01rMxQv6vhyE1oQX,0.324495,0.007809,0.173111,0.0641,0.026638,0.146624,0.113437,0.179965,0.064144,...,0.0,0.102034,0.009621,0.0,0.000595,0.0,0.0,0.0,0.007938,0.072438
1,01Yfj2T3YTwJ1Yfy,0.165987,0.001995,0.012139,0.093685,0.004229,0.158335,0.090264,0.723496,0.222102,...,0.0,0.0,0.013238,0.169066,0.020081,0.0,0.000279,0.0,0.281307,0.013873
2,01gyRHLquwXDlhkO,0.034045,0.001545,0.018375,0.355566,0.005317,0.210728,0.324944,0.055861,0.188801,...,0.000177,0.003042,0.0009,0.0,0.021774,0.000536,0.000112,0.06396,0.001309,0.067331
3,02RGE9FNH65RtMS7,0.0,0.002487,0.004104,1.0,0.000531,0.109224,0.904793,0.002664,0.880187,...,0.045647,0.0732,0.058912,0.008135,0.05748,0.02336,0.048846,0.066753,0.059469,0.052715
4,02ZnlCGZEbkfCDxo,0.282779,0.021043,0.118891,0.068638,0.08202,0.263746,0.148817,0.164584,0.109324,...,0.120573,0.050111,0.020257,0.008942,0.018538,0.020898,0.012435,0.246238,0.049053,0.062737


In [67]:
def cos_sim_early_fusion(song_id, n_retrieved):
    song_vec = normalized_df.loc[normalized_df['id'] == song_id] #target song vector
    df_temp = normalized_df.loc[(normalized_df['id'] != song_id)].copy() #make copy id because we want to add new column later, this dataset without target song
    cosine_sim = cosine_similarity(df_temp.iloc[:, 1:], song_vec.iloc[:, 1:]) #similarity between songs from dataset and target song
    df_temp['cos_sim'] = cosine_sim #add column with counted similarity
    ids = df_temp.sort_values(by='cos_sim', ascending=False).head(n_retrieved)[['id', 'cos_sim']] #take 10 the greatest similarity values, we need id and similarity value
    result = pd.merge(ids, df_names, how='left', on='id') #merge table to represent names of found songs
    return result

In [68]:
cos_sim_early_fusion(song1_id, n_retrieved=10)

Unnamed: 0,id,cos_sim,artist,song,album_name
0,TRXNqsIKGCC9IE8l,0.565724,Destroyer,Priest's Knees,Destroyer's Rubies
1,Nmiv1udVTPdcCvyk,0.554673,Édith Piaf,Paris,Edith Piaf - The Best Of
2,pBS8YUMxWpvLmrfI,0.549847,Oasis,The Masterplan,(What's The Story) Morning Glory? (Deluxe Edit...
3,PMYqcZBzklxtziH3,0.549207,Underoath,Anyone Can Dig A Hole But It Takes A Real Man ...,Lost In The Sound Of Separation
4,6S5n0Nt4JXJ5HfME,0.543417,The Jesus and Mary Chain,About You,Darklands (Expanded Version)
5,nZq6qfbCyQb1W3qF,0.54006,Vanguart,Engole (Arde Mais Que Brasa em Pele Quente),Boa Parte de Mim Vai Embora
6,ZldiLYHpQmfOnCoP,0.539951,The Living End,Nowhere Town,State of Emergency
7,uPABePFt36bUYbtI,0.539867,Vanguart,O Que a Gente Podia Ser,Boa Parte de Mim Vai Embora
8,5eXfAMTBhgxWHP9B,0.532797,Tarja,Minor Heaven,My Winter Storm
9,t61gRUmCesgyaG6K,0.531924,Panda Bear,Come to Your Senses,Panda Bear Meets The Grim Reaper


### **Late fusion:** 
this retrieval system combines the **results** of two of the retrieval algorithms developed so far. Select two of the algorithms you already developed, motivate the choice of the selected algorithms (features and similarities), and of the late fusion techniques (e.g., rank or score aggregation), as well as any additional methodological choices (i.e., scaling or weighting of the ranks, or of the scores). **Hint**: use precomputed scores / retrieval results.

In [87]:
df1 = cos_sim_bert(song1_id, 10)
df1

Unnamed: 0,id,cos_sim,artist,song,album_name
0,u8bj2RyzoYZ99dWB,0.951122,Gwen Stefani,Jingle Bells,You Make It Feel Like Christmas (Deluxe Edition)
1,doTmvQlJVL1JRO4V,0.662801,Robert Johnson,Hellhound On My Trail,King Of The Delta Blues
2,SJZTstFdLSYvbRAi,0.623144,Change,The Glow of Love,The Glow Of Love
3,YzXWwWKeFMKNgkU7,0.617038,Hot Chip,"You Ride, We Ride, In My Ride",Coming On Strong
4,9ScGeeaW8XcxgePd,0.603126,Kelly Clarkson,Every Christmas,Wrapped In Red
5,zHozLx4GhJsG7xLJ,0.598624,Cyndi Lauper,Christmas Conga,Merry Christmas...Have A Nice Life
6,LArarDy0SyTJDoIZ,0.597165,Eric Clapton,Circus,Unplugged (Deluxe Edition)
7,afmSDk2caOd8CCfx,0.592372,B*Witched,Rollercoaster,C'est la Vie: The Collection
8,cItkoIyGIr9LrIJS,0.590946,Beirut,Elephant Gun,Elephant Gun
9,aYfhVF6MlwbLPm0i,0.590712,Rihanna,We Ride,A Girl Like Me


In [88]:
print(f"Var(df1) = {df1['cos_sim'].mean()}")
print(f"Var(df1) = {df1['cos_sim'].var()}")

Var(df1) = 0.6427050129175478
Var(df1) = 0.01222947992634781


In [73]:
df2 = cos_sim_incp(song1_id, 10)
df2

Unnamed: 0,id,cos_sim,artist,song,album_name
0,9ECfsfc0ChaQQ1bD,0.775037,Yellowcard,Dear Bobbie,Paper Walls
1,q5ShIM6R3OaT495a,0.764987,Future Islands,Walking Through That Door,In Evening Air
2,QfUglc6m1JIYhXRH,0.760943,Patti Smith,Break It Up,Horses (Legacy Edition)
3,Nmiv1udVTPdcCvyk,0.75862,Édith Piaf,Paris,Edith Piaf - The Best Of
4,FvqGWytde6EWaZpE,0.757958,After the Burial,Deluge,Dig Deep
5,TBH4dxkeSVvTsSnV,0.757421,Camila,Coleccionista De Canciones,Todo Cambio
6,rlYEWOpYxNrY5rHO,0.756913,Lisa Mitchell,Neopolitan Dreams,Neopolitan Dreams
7,I696LFToFrDdwsVd,0.75654,Gala,Let a Boy Cry,Come Into My Life (The Album)
8,0r6fQYlWtk4HkN1F,0.756441,Annie Lennox,Train in Vain,Medusa
9,pBS8YUMxWpvLmrfI,0.756286,Oasis,The Masterplan,(What's The Story) Morning Glory? (Deluxe Edit...


In [83]:
print(f"Var(df2) = {df2['cos_sim'].mean()}")
print(f"Var(df2) = {df2['cos_sim'].var()}")

Var(df2) = 0.7601145712620875
Var(df2) = 3.4715176579310075e-05


In [128]:
def cos_sim_late_fusion(song_id, n_retrieved, alpha=0.5):
    # Get results from the first retrieval system
    result1 = cos_sim_bert(song_id, n_retrieved=10)
    #result1 = result1.rename(columns={'cos_sim':"cos_sim_bert"})
    # Get results from the second retrieval system
    
    result2 = cos_sim_incp(song_id, n_retrieved=10)
    #result2 = result2.rename(columns={'cos_sim':"cos_sim_incp"})
    
    # Merge the two result sets based on song ID
    merged_result = pd.merge(result1, result2, on='id', how='outer', suffixes=('_bert', '_incp'))
    merged_result['cos_sim_bert'] = merged_result['cos_sim_bert'].fillna(0)
    merged_result['cos_sim_incp'] = merged_result['cos_sim_incp'].fillna(0)
    
    # Calculate a combined score using a weighted sum of the cosine similarity scores
    merged_result['combined_score'] = alpha * merged_result['cos_sim_bert'] + (1 - alpha) * merged_result['cos_sim_incp']

    # Rank the songs based on the combined score
    merged_result = merged_result.sort_values(by='combined_score', ascending=False).head(n_retrieved)
    pd.merge(merged_result[['id', 'combined_score']], df_names, how='left', on='id')
    return pd.merge(merged_result[['id', 'combined_score']], df_names, how='left', on='id')

In this example, alpha is a parameter that determines the weight given to the first retrieval system (cos_sim_bert) in the late fusion. You can adjust this parameter based on the performance of each individual system. If one system is more reliable or accurate, you may assign it a higher weight.

In [131]:
cos_sim_late_fusion(song1_id, n_retrieved=10, alpha=0.55)

Unnamed: 0,id,combined_score,artist,song,album_name
0,u8bj2RyzoYZ99dWB,0.523117,Gwen Stefani,Jingle Bells,You Make It Feel Like Christmas (Deluxe Edition)
1,doTmvQlJVL1JRO4V,0.364541,Robert Johnson,Hellhound On My Trail,King Of The Delta Blues
2,9ECfsfc0ChaQQ1bD,0.348767,Yellowcard,Dear Bobbie,Paper Walls
3,q5ShIM6R3OaT495a,0.344244,Future Islands,Walking Through That Door,In Evening Air
4,SJZTstFdLSYvbRAi,0.342729,Change,The Glow of Love,The Glow Of Love
5,QfUglc6m1JIYhXRH,0.342424,Patti Smith,Break It Up,Horses (Legacy Edition)
6,Nmiv1udVTPdcCvyk,0.341379,Édith Piaf,Paris,Edith Piaf - The Best Of
7,FvqGWytde6EWaZpE,0.341081,After the Burial,Deluge,Dig Deep
8,TBH4dxkeSVvTsSnV,0.34084,Camila,Coleccionista De Canciones,Todo Cambio
9,rlYEWOpYxNrY5rHO,0.340611,Lisa Mitchell,Neopolitan Dreams,Neopolitan Dreams
