# Recommender Systems

For audio files, recommendations are computed with scikit-learn's `cosine_similarity` function.

In [1]:
# Libraries
import IPython.display as ipd
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing
import os
general_path = '../Data'


# Load the extracted feature table; rows are keyed by the 'name_v' column
data = pd.read_csv(f'{general_path}/extraction.csv', index_col='name_v')

# Set the genre column aside as a one-column frame (it shares the 'name_v' index)
labels = data.loc[:, ['genre']]

# Keep only the feature columns by removing the label / bookkeeping columns
data = data.drop(['length', 'genre', 'name', 'filedir'], axis=1)
data.head()

# Standardise every feature column (sklearn's scale defaults: zero mean, unit variance)
data_scaled = preprocessing.scale(data)
print('Scaled data type:', type(data_scaled))

FileNotFoundError: [Errno 2] No such file or directory: './Data/extraction.csv'

### Cosine similarity

Calculates the *pairwise cosine similarity* for each pair of entries in the data. This results in an N x N matrix, where N is the number of rows in the data (9981 x 9981 in the output below), with redundancy in the information, as item A's similarity to item B == item B's similarity to item A.

In [36]:
# Pairwise cosine similarity between all rows of the scaled feature matrix
similarity = cosine_similarity(data_scaled)
print("Similarity shape:", similarity.shape)

# Positionally indexed version of the similarity matrix
sim_df_labels = pd.DataFrame(similarity)

# Same values with both the rows and the columns keyed by the labels' index;
# copy=True keeps this frame independent of the underlying array
sim_df_names = pd.DataFrame(
    similarity,
    index=labels.index,
    columns=labels.index,
    copy=True,
)

sim_df_names.head()

Similarity shape: (9981, 9981)


name_v,pop.00027.0,pop.00027.1,pop.00027.2,pop.00027.3,pop.00027.4,pop.00027.5,pop.00027.6,pop.00027.7,pop.00027.8,pop.00027.9,...,jazz.00033.0,jazz.00033.1,jazz.00033.2,jazz.00033.3,jazz.00033.4,jazz.00033.5,jazz.00033.6,jazz.00033.7,jazz.00033.8,jazz.00033.9
name_v,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
pop.00027.0,1.0,0.807001,0.868782,0.76885,0.551887,0.714245,0.545401,0.531134,0.377966,0.306697,...,-0.155334,-0.146826,-0.232629,-0.351097,-0.194513,-0.239168,-0.262043,-0.281815,0.020984,-0.280401
pop.00027.1,0.807001,1.0,0.777653,0.704576,0.4995,0.679036,0.50323,0.459679,0.400605,0.245827,...,-0.297026,-0.244981,-0.360549,-0.406724,-0.221051,-0.318431,-0.319929,-0.386375,-0.124074,-0.381323
pop.00027.2,0.868782,0.777653,1.0,0.805213,0.593357,0.721879,0.641104,0.550327,0.382852,0.303111,...,-0.224905,-0.200519,-0.272655,-0.412025,-0.239069,-0.224884,-0.302504,-0.282898,-0.072536,-0.286256
pop.00027.3,0.76885,0.704576,0.805213,1.0,0.552834,0.614728,0.598119,0.663092,0.31681,0.303914,...,-0.244933,-0.212117,-0.312805,-0.327752,-0.260667,-0.249317,-0.286031,-0.319162,-0.024787,-0.304864
pop.00027.4,0.551887,0.4995,0.593357,0.552834,1.0,0.805979,0.919788,0.57306,0.48237,0.47051,...,0.060258,0.011952,0.064921,-0.116346,-0.048433,-0.031729,-0.040956,-0.005316,-0.151162,-0.004279


### Song similarity scoring

`find_similar_songs()` is a helper function that takes the name of a song and the part (segment suffix) to query, and returns the top 5 best matches for it.

In [117]:

def find_similar_songs(name, part, top_n=5, sim_df=None):
    """Print and return the entries most similar to one segment of a song.

    Parameters
    ----------
    name : str
        Song identifier without the segment suffix, e.g. ``'rock.00013'``.
    part : str
        Segment suffix including its leading dot, e.g. ``'.1'``.
        ``name + part`` must be a column label of the similarity frame.
    top_n : int, default 5
        Number of best matches to print and return.
    sim_df : pandas.DataFrame, optional
        Similarity frame whose rows and columns carry the same labels.
        When omitted, the notebook-level ``sim_df_names`` is used.

    Returns
    -------
    pandas.Series
        The ``top_n`` highest similarity scores, indexed by entry label,
        in descending order.

    Raises
    ------
    KeyError
        If ``name + part`` is not a column of the similarity frame.
    """
    if sim_df is None:
        sim_df = sim_df_names

    # Rank every entry by its similarity to the requested segment
    series = sim_df[name + part].sort_values(ascending=False)

    # Exclude the query song's own segments (labels '<name>.0' .. '<name>.9'):
    # they would otherwise dominate the ranking. Not every song has all ten
    # segments, so labels that are absent are skipped instead of raising.
    own_segments = [name + "." + str(i) for i in range(0, 10)]
    series = series.drop(own_segments, errors="ignore")

    # Display and return the best matches
    top_matches = series.head(top_n)
    print("\n*******\nSimilar songs to ", name)
    print(top_matches)
    return top_matches

### Find Similar Song:

Insert Songname below:

In [38]:
# Choose the song to query: genre, track id within that genre, and segment suffix
genre = 'rock'
filename = f"{genre}.00013"
part = ".1"

In [118]:
# Look up the best matches for the configured song segment, then play the original track.
# Example query noted by the author: pop.00019 - Britney Spears "Hit me baby one more time"
similar_song = find_similar_songs(filename, part)
print(f"Playing original Song: {filename}")

ipd.Audio(filename=f'{general_path}/genres_original/{genre}/{filename}.wav')


*******
Similar songs to  rock.00013
name_v
reggae.00008.3     0.586455
hiphop.00018.0     0.583528
reggae.00008.1     0.579721
country.00059.0    0.566385
country.00059.1    0.562132
Name: rock.00013.1, dtype: float64
Playing original Song: rock.00013


### Play Similar Song:
Choose the index of the match that should be played (1 to 5):

In [120]:
index = 1  # 1-based position within the matches stored in `similar_song`

# Stop on positions outside the returned matches: 0 would silently wrap
# around to the last match, and larger values would fail with an IndexError.
if not 1 <= index <= len(similar_song):
    raise ValueError(
        f"Index out of bounds, please choose a value from 1 to {len(similar_song)}!"
    )

chosen_match = similar_song.index[index - 1]
print("Playing: " + chosen_match)

# Match labels have the form '<genre>.<track>.<segment>'. Both the genre and
# the track number must come from the chosen match so the right file is played.
song_genre = chosen_match.split('.')
songname = chosen_match.split('.')
ipd.Audio(f'{general_path}/genres_original/{song_genre[0]}/{song_genre[0]}.{songname[1]}.wav')

Playing: hiphop.00018.0
