In [2]:
import pandas as pd
import numpy as np

In [3]:
def _split_slash_field(value):
    """Split a '/'-separated CSV field into a list of strings.

    Missing values (NaN/None, as produced by ``pd.read_csv`` for empty
    fields) become an empty list instead of raising or yielding ['nan'].
    """
    if pd.isna(value):
        return []
    # str() because read_csv may parse a lone weight such as "0.7" as a float.
    return str(value).split('/')


def _min_max_scale(column):
    """Rescale a numeric Series to [0, 1]; a constant column maps to 0.0."""
    low = column.min()
    span = column.max() - low
    if span == 0:
        # Avoid 0/0 -> NaN when every value is identical (e.g. one-row file).
        return (column - low).astype(float)
    return (column - low) / span


def load_data(filename):
    """Read the song CSV and prepare it for distance computation.

    Parameters
    ----------
    filename : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with
        * 'similar', 'artist_terms' and 'artist_terms_weights' turned from
          '/'-separated strings into Python lists of strings (missing
          entries become empty lists), and
        * 'tempo' and 'loudness' min-max scaled to [0, 1].
    """
    data = pd.read_csv(filename)

    for list_column in ('similar', 'artist_terms', 'artist_terms_weights'):
        data[list_column] = data[list_column].apply(_split_slash_field)

    for numeric_column in ('tempo', 'loudness'):
        data[numeric_column] = _min_max_scale(data[numeric_column])

    return data

In [4]:
def distance(song1, song2, alphas=(1, 1, 1, 1, 1)):
    """
    Weighted dissimilarity score between two songs.

    song1, song2 : sequences of nine fields, in this order:
        [artist, title, album, similar, hottness, terms, terms-weights, loudness, tempo]
        * similar       : list of similar-artist identifiers
        * terms         : list of artist terms (empty strings are ignored)
        * terms-weights : list parallel to `terms`; entries are passed to float()
        * hottness, loudness, tempo : numbers
    alphas : sequence of five weights
        (alpha_hot, alpha_loud, alpha_tempo, alpha_similar, alpha_terms)

    Returns a number built from three parts:
        + weighted absolute differences of hottness, loudness and tempo
        + alpha_similar * (1 - shared_similar_artists / 100)
        - alpha_terms * sum of the mean weight of every shared term
    The result can be negative because the term part is subtracted.
    """
    alpha_hot, alpha_loud, alpha_tempo, alpha_similar, alpha_terms = alphas
    _, _, _, similar1, hot1, terms1, weights1, loud1, tempo1 = song1
    _, _, _, similar2, hot2, terms2, weights2, loud2, tempo2 = song2

    score = 0

    score += alpha_hot*abs(hot1-hot2) + alpha_loud*abs(loud1-loud2) + alpha_tempo*abs(tempo1-tempo2)

    # Count entries of similar1 that also occur in similar2 (set => O(1) lookups).
    # NOTE(review): the divisor 100 presumably is the number of similar artists
    # stored per song - confirm against the dataset.
    similar2_lookup = set(similar2)
    shared_similar = sum(1 for singer in similar1 if singer in similar2_lookup)
    score += alpha_similar*(1 - shared_similar/100)

    # Map each term to the position of its FIRST occurrence, which is what
    # list.index() would return, so duplicated terms resolve identically.
    first_pos1, first_pos2 = {}, {}
    for pos, term in enumerate(terms1):
        first_pos1.setdefault(term, pos)
    for pos, term in enumerate(terms2):
        first_pos2.setdefault(term, pos)

    # Iterate in terms1 order (duplicates included) to keep the summation
    # order, and hence the floating-point result, unchanged.
    shared_terms = [term for term in terms1 if term in first_pos2 and term != '']
    term_overlap = sum([
        0.5*(float(weights1[first_pos1[term]]) + float(weights2[first_pos2[term]]))
        for term in shared_terms
    ])
    score -= alpha_terms*term_overlap

    return score

In [5]:
def build_distances(df: pd.DataFrame):
    """Compute the full pairwise distance matrix for the songs in `df`.

    Parameters
    ----------
    df : DataFrame whose rows are passed positionally to `distance`; each
        row must therefore unpack into the nine fields `distance` expects.

    Returns
    -------
    list[list]
        M x M nested list (M = len(df)) where entry [j][i] is
        distance(row j, row i). Both triangles and the diagonal are
        computed explicitly; no symmetry is assumed.
    """
    # Materialise each row once: df.iloc[k] builds a new Series on every call,
    # which the nested loop would otherwise repeat 2 * M**2 times.
    rows = [df.iloc[k] for k in range(len(df))]
    return [[distance(row_j, row_i) for row_i in rows] for row_j in rows]