In [1]:
import pandas as pd

df_a = pd.read_csv('data/TopArtists.csv')
df_t = pd.read_csv('data/TopTracks.csv')
df_g = pd.read_csv('data/TopGenres.csv')
df_m = pd.read_csv('data/MusicFeatures.csv')

In [2]:
df_a['user_id'].unique()

array([12153521253, 12120382831])

In [3]:
u1_a = df_a.loc[df_a['user_id'] == 12153521253]
u1_t = df_t.loc[df_t['user_id'] == 12153521253]
u1_g = df_g.loc[df_g['user_id'] == 12153521253]
u1_m = df_m.loc[df_m['user_id'] == 12153521253]

In [4]:
u2_a = df_a.loc[df_a['user_id'] == 12120382831]
u2_t = df_t.loc[df_t['user_id'] == 12120382831]
u2_g = df_g.loc[df_g['user_id'] == 12120382831]
u2_m = df_m.loc[df_m['user_id'] == 12120382831]

In [5]:
def top_to_dict(top_df):
    top_dict = {}
    top_dict['Short'] = top_df.loc[top_df['timeframe'] == 'Short'].to_dict('records')
    top_dict['Medium'] = top_df.loc[top_df['timeframe'] == 'Medium'].to_dict('records')
    top_dict['Long'] = top_df.loc[top_df['timeframe'] == 'Long'].to_dict('records')
    return top_dict

In [85]:
def get_artist_similarity(u1, u2, timeframe='Long'):
    df1 = u1.loc[u1['timeframe'] == timeframe, ['artist_id', 'artist', 'rank']]
    df2 = u2.loc[u2['timeframe'] == timeframe, ['artist_id', 'artist', 'rank']]
    df = df1.merge(df2, on=['artist_id', 'artist'], how='outer').fillna(0)
    df['base'] = calculate_score(df[df[['rank_x', 'rank_y']] > 0].min(axis=1))
    df.loc[(df['rank_x'] != 0) & (df['rank_y'] != 0), 'score'] = calculate_score(df[['rank_x', 'rank_y']].max(axis=1))
    df['score'] = df['score'].fillna(0)
    df = df.rename(columns={'rank_x': u1['user_id'].unique()[0], 'rank_y': u2['user_id'].unique()[0]})
    return df

def get_track_similarity(u1, u2, timeframe='Long'):
    df1 = u1.loc[u1['timeframe'] == timeframe, ['track_id', 'track', 'rank']]
    df2 = u2.loc[u2['timeframe'] == timeframe, ['track_id', 'track', 'rank']]
    df = df1.merge(df2, on=['track_id', 'track'], how='outer').fillna(0)
    df['base'] = calculate_score(df[df[['rank_x', 'rank_y']] > 0].min(axis=1))
    df.loc[(df['rank_x'] != 0) & (df['rank_y'] != 0), 'score'] = calculate_score(df[['rank_x', 'rank_y']].max(axis=1))
    df['score'] = df['score'].fillna(0)
    df = df.rename(columns={'rank_x': u1['user_id'].unique()[0], 'rank_y': u2['user_id'].unique()[0]})
    return df

def get_genre_similarity(u1, u2, timeframe='Long'):
    df1 = u1.loc[u1['timeframe'] == timeframe, ['genre', 'rank']]
    df2 = u2.loc[u2['timeframe'] == timeframe, ['genre', 'rank']]
    df = df1.merge(df2, on='genre', how='outer').fillna(0)
    df['base'] = calculate_score(df[df[['rank_x', 'rank_y']] > 0].min(axis=1))
    df.loc[(df['rank_x'] != 0) & (df['rank_y'] != 0), 'score'] = calculate_score(df[df[['rank_x', 'rank_y']] > 0].max(axis=1))
    df['score'] = df['score'].fillna(0)
    df = df.rename(columns={'rank_x': u1['user_id'].unique()[0], 'rank_y': u2['user_id'].unique()[0]})
    return df

def calculate_similarity(df):
    return round(df.sum()['score'] / df.sum()['base'], 4)

def calculate_feature_similarity(u1, u2, timeframe='Long'):
    features1 = u1.loc[u1['timeframe'] == timeframe].drop(columns=['user_id', 'timeframe']).values.tolist()[0]
    features2 = u2.loc[u2['timeframe'] == timeframe].drop(columns=['user_id', 'timeframe']).values.tolist()[0]
    scores = []
    for i in range(len(features1)):
        f1 = abs(features1[i])
        f2 = abs(features2[i])
        scores.append(min(f1, f2) / max(f1, f2))
    return round(sum(scores) / len(scores), 4)

def calculate_score(rank, weight=16, shift=4):
    return weight / ((0.1 * rank + shift) ** 2) 

In [121]:
tf_weights = {'Short': 3, 'Medium': 2, 'Long': 1}
mu_weights = {'artist': 4, 'track': 1, 'genre': 8, 'feature': 2}
final_score = 0

for timeframe in ['Short', 'Medium', 'Long']:
    tf_score = 0
    # Artist
    df_artist = get_artist_similarity(u1_a, u2_a, timeframe)
    tf_score += mu_weights['artist'] * calculate_similarity(df_artist)
    # Track
    df_track = get_track_similarity(u1_t, u2_t, timeframe)
    tf_score += mu_weights['track'] * calculate_similarity(df_track)
    # Genre
    df_genre = get_genre_similarity(u1_g, u2_g, timeframe)
    tf_score += mu_weights['genre'] * calculate_similarity(df_genre)
    # Features
    tf_score += mu_weights['feature'] * calculate_feature_similarity(u1_m, u2_m)
    # Timeframe overall score
    tf_score /= sum(mu_weights.values())
    print(tf_score)
    final_score += tf_weights[timeframe] * tf_score

final_score /= sum(tf_weights.values())
final_score

0.22452
0.26435333333333333
0.3034


0.2509444444444444

In [123]:
df_a = get_artist_similarity(u1_a, u2_a, 'Short')
df_a.loc[df_a['score'] > 0]

Unnamed: 0,artist_id,artist,12153521253,12120382831,base,score
3,3mIj9lX2MWuHmhNCA7LSCW,The 1975,4.0,1.0,0.951814,0.826446
7,77SW9BnxLY8rJ0RciFqkHh,The Neighbourhood,8.0,34.0,0.694444,0.292184
8,1Xyo4u8uXC1ZmMpatF05PJ,The Weeknd,9.0,24.0,0.666389,0.390625
10,5cIc3SBFuBLVxJz58W2tU9,Oh Wonder,11.0,29.0,0.615148,0.336064
11,2h93pZq0e7k5yf4dywlkpM,Frank Ocean,12.0,11.0,0.615148,0.591716
13,5K4W6rqBFWDnAN6FQUkS6x,Kanye West,14.0,8.0,0.694444,0.548697
17,1Bl6wpkWCQ4KVgnASpvzzA,BROCKHAMPTON,18.0,40.0,0.475624,0.25


In [124]:
df_g = get_genre_similarity(u1_g, u2_g, 'Short')
df_g.loc[df_g['score'] > 0]

Unnamed: 0,genre,12153521253,12120382831,base,score
0,Pop,1.0,1.0,0.951814,0.951814
1,Electropop,2.0,16.0,0.907029,0.510204
2,Modern Rock,3.0,10.0,0.865333,0.64
6,Modern Alternative Rock,7.0,9.0,0.72431,0.666389
10,Nu Gaze,11.0,11.0,0.615148,0.615148
11,Rock,12.0,12.0,0.591716,0.591716
12,Hip Hop,13.0,8.0,0.694444,0.569598
13,Rap,14.0,4.0,0.826446,0.548697
25,Alternative R&B,26.0,23.0,0.403124,0.367309
26,Lgbtq+ Hip Hop,27.0,39.0,0.356427,0.256369
