In [1]:
import pandas as pd
from secrets import localhost_db, postgres_db
from sqlalchemy import create_engine
from queries import *

df_a = pd.read_csv('data/TopArtists.csv')
df_t = pd.read_csv('data/TopTracks.csv')
df_g = pd.read_csv('data/TopGenres.csv')
df_m = pd.read_csv('data/MusicFeatures.csv')

In [2]:
df_a['user_id'].unique()

array([12153521253, 12120382831])

In [3]:
u1_a = df_a.loc[df_a['user_id'] == 12153521253]
u1_t = df_t.loc[df_t['user_id'] == 12153521253]
u1_g = df_g.loc[df_g['user_id'] == 12153521253]
u1_m = df_m.loc[df_m['user_id'] == 12153521253]

In [4]:
u2_a = df_a.loc[df_a['user_id'] == 12120382831]
u2_t = df_t.loc[df_t['user_id'] == 12120382831]
u2_g = df_g.loc[df_g['user_id'] == 12120382831]
u2_m = df_m.loc[df_m['user_id'] == 12120382831]

In [5]:
def top_to_dict(top_df):
    top_dict = {}
    top_dict['Short'] = top_df.loc[top_df['timeframe'] == 'Short'].to_dict('records')
    top_dict['Medium'] = top_df.loc[top_df['timeframe'] == 'Medium'].to_dict('records')
    top_dict['Long'] = top_df.loc[top_df['timeframe'] == 'Long'].to_dict('records')
    return top_dict

In [10]:
def get_artist_similarity(u1, u2, timeframe='Long'):
    df1 = u1.loc[u1['timeframe'] == timeframe, ['artist_id', 'artist', 'rank']]
    df2 = u2.loc[u2['timeframe'] == timeframe, ['artist_id', 'artist', 'rank']]
    df = df1.merge(df2, on=['artist_id', 'artist'], how='outer').fillna(0)
    df['base'] = calculate_score(df[df[['rank_x', 'rank_y']] > 0].min(axis=1))
    df.loc[(df['rank_x'] != 0) & (df['rank_y'] != 0), 'score'] = calculate_score(df[['rank_x', 'rank_y']].max(axis=1))
    df['score'] = df['score'].fillna(0)
    df = df.rename(columns={'rank_x': u1['user_id'].unique()[0], 'rank_y': u2['user_id'].unique()[0]})
    return df

def get_track_similarity(u1, u2, timeframe='Long'):
    df1 = u1.loc[u1['timeframe'] == timeframe, ['track_id', 'track', 'rank']]
    df2 = u2.loc[u2['timeframe'] == timeframe, ['track_id', 'track', 'rank']]
    df = df1.merge(df2, on=['track_id', 'track'], how='outer').fillna(0)
    df['base'] = calculate_score(df[df[['rank_x', 'rank_y']] > 0].min(axis=1))
    df.loc[(df['rank_x'] != 0) & (df['rank_y'] != 0), 'score'] = calculate_score(df[['rank_x', 'rank_y']].max(axis=1))
    df['score'] = df['score'].fillna(0)
    df = df.rename(columns={'rank_x': u1['user_id'].unique()[0], 'rank_y': u2['user_id'].unique()[0]})
    return df

def get_genre_similarity(u1, u2, timeframe='Long'):
    df1 = u1.loc[u1['timeframe'] == timeframe, ['genre', 'rank']]
    df2 = u2.loc[u2['timeframe'] == timeframe, ['genre', 'rank']]
    df = df1.merge(df2, on='genre', how='outer').fillna(0)
    df['base'] = calculate_score(df[df[['rank_x', 'rank_y']] > 0].min(axis=1))
    df.loc[(df['rank_x'] != 0) & (df['rank_y'] != 0), 'score'] = calculate_score(df[df[['rank_x', 'rank_y']] > 0].max(axis=1))
    df['score'] = df['score'].fillna(0)
    df = df.rename(columns={'rank_x': u1['user_id'].unique()[0], 'rank_y': u2['user_id'].unique()[0]})
    return df

def calculate_similarity(df):
    return round(df.sum()['score'] / df.sum()['base'], 4)

def calculate_feature_similarity(u1, u2, timeframe='Long'):
    features1 = u1.loc[u1['timeframe'] == timeframe].drop(columns=['user_id', 'timeframe']).values.tolist()[0]
    features2 = u2.loc[u2['timeframe'] == timeframe].drop(columns=['user_id', 'timeframe']).values.tolist()[0]
    scores = []
    for i in range(len(features1)):
        f1 = abs(features1[i])
        f2 = abs(features2[i])
        scores.append(min(f1, f2) / max(f1, f2))
    return round(sum(scores) / len(scores), 4)

def calculate_score(rank, weight=16, shift=4):
    return weight / ((0.1 * rank + shift) ** 2) 

In [121]:
tf_weights = {'Short': 3, 'Medium': 2, 'Long': 1}
mu_weights = {'artist': 4, 'track': 1, 'genre': 8, 'feature': 2}
final_score = 0

for timeframe in ['Short', 'Medium', 'Long']:
    tf_score = 0
    # Artist
    df_artist = get_artist_similarity(u1_a, u2_a, timeframe)
    tf_score += mu_weights['artist'] * calculate_similarity(df_artist)
    # Track
    df_track = get_track_similarity(u1_t, u2_t, timeframe)
    tf_score += mu_weights['track'] * calculate_similarity(df_track)
    # Genre
    df_genre = get_genre_similarity(u1_g, u2_g, timeframe)
    tf_score += mu_weights['genre'] * calculate_similarity(df_genre)
    # Features
    tf_score += mu_weights['feature'] * calculate_feature_similarity(u1_m, u2_m)
    # Timeframe overall score
    tf_score /= sum(mu_weights.values())
    print(tf_score)
    final_score += tf_weights[timeframe] * tf_score

final_score /= sum(tf_weights.values())
final_score

0.22452
0.26435333333333333
0.3034


0.2509444444444444

In [123]:
df_a = get_artist_similarity(u1_a, u2_a, 'Short')
df_a.loc[df_a['score'] > 0]

Unnamed: 0,artist_id,artist,12153521253,12120382831,base,score
3,3mIj9lX2MWuHmhNCA7LSCW,The 1975,4.0,1.0,0.951814,0.826446
7,77SW9BnxLY8rJ0RciFqkHh,The Neighbourhood,8.0,34.0,0.694444,0.292184
8,1Xyo4u8uXC1ZmMpatF05PJ,The Weeknd,9.0,24.0,0.666389,0.390625
10,5cIc3SBFuBLVxJz58W2tU9,Oh Wonder,11.0,29.0,0.615148,0.336064
11,2h93pZq0e7k5yf4dywlkpM,Frank Ocean,12.0,11.0,0.615148,0.591716
13,5K4W6rqBFWDnAN6FQUkS6x,Kanye West,14.0,8.0,0.694444,0.548697
17,1Bl6wpkWCQ4KVgnASpvzzA,BROCKHAMPTON,18.0,40.0,0.475624,0.25


In [124]:
df_g = get_genre_similarity(u1_g, u2_g, 'Short')
df_g.loc[df_g['score'] > 0]

Unnamed: 0,genre,12153521253,12120382831,base,score
0,Pop,1.0,1.0,0.951814,0.951814
1,Electropop,2.0,16.0,0.907029,0.510204
2,Modern Rock,3.0,10.0,0.865333,0.64
6,Modern Alternative Rock,7.0,9.0,0.72431,0.666389
10,Nu Gaze,11.0,11.0,0.615148,0.615148
11,Rock,12.0,12.0,0.591716,0.591716
12,Hip Hop,13.0,8.0,0.694444,0.569598
13,Rap,14.0,4.0,0.826446,0.548697
25,Alternative R&B,26.0,23.0,0.403124,0.367309
26,Lgbtq+ Hip Hop,27.0,39.0,0.356427,0.256369


### Extract data from PROD and update to DEV

In [8]:
engine = create_engine(postgres_db)
user = pd.read_sql('select * from "Users"', engine)
user_profiles = pd.read_sql('select * from "UserProfiles"', engine)
top_artists = pd.read_sql('select * from "TopArtists"', engine)
top_tracks = pd.read_sql('select * from "TopTracks"', engine)
top_genres = pd.read_sql('select * from "TopGenres"', engine)
music_features = pd.read_sql('select * from "MusicFeatures"', engine)
artists = pd.read_sql('select * from "Artists"', engine)
tracks = pd.read_sql('select * from "Tracks"', engine)
engine.dispose()

In [9]:
engine = create_engine(localhost_db)
user.to_sql('Users', engine, index=False, if_exists='replace')
user_profiles.to_sql('UserProfiles', engine, index=False, if_exists='replace')
top_artists[['user_id', 'rank', 'artist_id', 'timeframe']].to_sql('TopArtists', engine, index=False, if_exists='replace')
top_tracks[['user_id', 'rank', 'track_id', 'timeframe']].to_sql('TopTracks', engine, index=False, if_exists='replace')
top_genres.to_sql('TopGenres', engine, index=False, if_exists='replace')
music_features.to_sql('MusicFeatures', engine, index=False, if_exists='replace')
artists.to_sql('Artists', engine, index=False, if_exists='replace')
tracks.to_sql('Tracks', engine, index=False, if_exists='replace')
engine.dispose()

In [13]:
def compare_users(u1, u2):
    # Configs
    tf_weights = {'Short': 6, 'Medium': 5, 'Long': 4}
    mu_weights = {'artist': 4, 'track': 1, 'genre': 8, 'feature': 2}
    # Get data
    engine = create_engine(localhost_db)
    users = pd.read_sql(users2_query, engine, params={'user_ids': (u1, u2)})
    df_a = pd.read_sql(top_artists2_query, engine, params={'user_ids': (u1, u2)})
    df_t = pd.read_sql(top_tracks2_query, engine, params={'user_ids': (u1, u2)})
    df_g = pd.read_sql(top_genres2_query, engine, params={'user_ids': (u1, u2)})
    df_m = pd.read_sql(music_features2_query, engine, params={'user_ids': (u1, u2)})
    # User 1
    u1_a = df_a.loc[df_a['user_id'] == u1]
    u1_t = df_t.loc[df_t['user_id'] == u1]
    u1_g = df_g.loc[df_g['user_id'] == u1]
    u1_m = df_m.loc[df_m['user_id'] == u1]
    # User 2
    u2_a = df_a.loc[df_a['user_id'] == u2]
    u2_t = df_t.loc[df_t['user_id'] == u2]
    u2_g = df_g.loc[df_g['user_id'] == u2]
    u2_m = df_m.loc[df_m['user_id'] == u2]
    
    final_score = 0
    similar_artists = pd.DataFrame()
    similar_tracks = pd.DataFrame()
    similar_genres = pd.DataFrame()
    
    name1 = users.loc[users['user_id'] == u1]['display_name'].unique().item()
    name2 = users.loc[users['user_id'] == u2]['display_name'].unique().item()
    print('Comparing {} and {}...'.format(name1, name2))

    for timeframe in ['Short', 'Medium', 'Long']:
        tf_score = 0
        # Artist
        df_artist = get_artist_similarity(u1_a, u2_a, timeframe)
        tf_score += mu_weights['artist'] * calculate_similarity(df_artist)
        similar_artists = similar_artists.append(df_artist.loc[df_artist['score'] > 0])
        # Track
        df_track = get_track_similarity(u1_t, u2_t, timeframe)
        tf_score += mu_weights['track'] * calculate_similarity(df_track)
        similar_tracks = similar_tracks.append(df_track.loc[df_track['score'] > 0])
        # Genre
        df_genre = get_genre_similarity(u1_g, u2_g, timeframe)
        tf_score += mu_weights['genre'] * calculate_similarity(df_genre)
        similar_genres = similar_genres.append(df_genre.loc[df_genre['score'] > 0])
        # Features
        tf_score += mu_weights['feature'] * calculate_feature_similarity(u1_m, u2_m)
        # Timeframe overall score
        tf_score /= sum(mu_weights.values())
        print('{} term music taste similarity: {:.2f}'.format(timeframe, tf_score * 100))
        final_score += tf_weights[timeframe] * tf_score

    final_score /= sum(tf_weights.values())
    print('Overall music taste similarity: {:.2f}'.format(final_score * 100))
    
    return final_score, similar_artists, similar_tracks, similar_genres


def get_artist_similarity(u1, u2, timeframe='Long'):
    df1 = u1.loc[u1['timeframe'] == timeframe]
    df2 = u2.loc[u2['timeframe'] == timeframe]
    df = df1.merge(df2, on=['artist_id', 'timeframe'], how='outer').fillna(0)
    df['base'] = calculate_score(df[df[['rank_x', 'rank_y']] > 0].min(axis=1))
    df.loc[(df['rank_x'] != 0) & (df['rank_y'] != 0), 'score'] = calculate_score(df[['rank_x', 'rank_y']].max(axis=1))
    df['score'] = df['score'].fillna(0)
    # df = df.rename(columns={'rank_x': u1['user_id'].unique()[0], 'rank_y': u2['user_id'].unique()[0]})
    return df

def get_track_similarity(u1, u2, timeframe='Long'):
    df1 = u1.loc[u1['timeframe'] == timeframe]
    df2 = u2.loc[u2['timeframe'] == timeframe]
    df = df1.merge(df2, on=['track_id', 'timeframe'], how='outer').fillna(0)
    df['base'] = calculate_score(df[df[['rank_x', 'rank_y']] > 0].min(axis=1))
    df.loc[(df['rank_x'] != 0) & (df['rank_y'] != 0), 'score'] = calculate_score(df[['rank_x', 'rank_y']].max(axis=1))
    df['score'] = df['score'].fillna(0)
    # df = df.rename(columns={'rank_x': u1['user_id'].unique()[0], 'rank_y': u2['user_id'].unique()[0]})
    return df

def get_genre_similarity(u1, u2, timeframe='Long'):
    df1 = u1.loc[u1['timeframe'] == timeframe]
    df2 = u2.loc[u2['timeframe'] == timeframe]
    df = df1.merge(df2, on=['genre', 'timeframe'], how='outer').fillna(0)
    df['base'] = calculate_score(df[df[['rank_x', 'rank_y']] > 0].min(axis=1))
    df.loc[(df['rank_x'] != 0) & (df['rank_y'] != 0), 'score'] = calculate_score(df[df[['rank_x', 'rank_y']] > 0].max(axis=1))
    df['score'] = df['score'].fillna(0)
    # df = df.rename(columns={'rank_x': u1['user_id'].unique()[0], 'rank_y': u2['user_id'].unique()[0]})
    return df

def calculate_similarity(df):
    return round(df.sum()['score'] / df.sum()['base'], 4)

def calculate_feature_similarity(u1, u2, timeframe='Long'):
    features1 = u1.loc[u1['timeframe'] == timeframe].drop(columns=['user_id', 'timeframe']).values.tolist()[0]
    features2 = u2.loc[u2['timeframe'] == timeframe].drop(columns=['user_id', 'timeframe']).values.tolist()[0]
    scores = []
    for i in range(len(features1)):
        f1 = abs(features1[i])
        f2 = abs(features2[i])
        scores.append(min(f1, f2) / max(f1, f2))
    return round(sum(scores) / len(scores), 4)

def calculate_score(rank, weight=16, shift=4):
    return weight / ((0.1 * rank + shift) ** 2) 

In [14]:
user_ids = ['12120382831', '12153521253', '1279967390', '12179805550']

for i in range(len(user_ids)):
    for j in range(i+1, len(user_ids)):
        user_id1 = user_ids[i]
        user_id2 = user_ids[j]
        compare_users(user_id1, user_id2)
        print()

Comparing Bin Xuan Kong and Thivya Dharishinie...
Short term music taste similarity: 22.45
Medium term music taste similarity: 26.44
Long term music taste similarity: 30.34
Overall music taste similarity: 25.88

Comparing Bin Xuan Kong and Jae Sheng Ang...
Short term music taste similarity: 18.56
Medium term music taste similarity: 28.53
Long term music taste similarity: 28.14
Overall music taste similarity: 24.44

Comparing Bin Xuan Kong and Clement Tan...
Short term music taste similarity: 26.68
Medium term music taste similarity: 28.31
Long term music taste similarity: 28.86
Overall music taste similarity: 27.80

Comparing Thivya Dharishinie and Jae Sheng Ang...
Short term music taste similarity: 17.09
Medium term music taste similarity: 19.92
Long term music taste similarity: 26.58
Overall music taste similarity: 20.56

Comparing Thivya Dharishinie and Clement Tan...
Short term music taste similarity: 15.33
Medium term music taste similarity: 22.58
Long term music taste similarity:

In [15]:
s, a, t, g = compare_users(user_ids[0], user_ids[1])

Comparing Bin Xuan Kong and Thivya Dharishinie...
Short term music taste similarity: 22.45
Medium term music taste similarity: 26.44
Long term music taste similarity: 30.34
Overall music taste similarity: 25.88


In [16]:
a.head()

Unnamed: 0,user_id_x,rank_x,artist_id,timeframe,user_id_y,rank_y,base,score
0,12120382831,1.0,3mIj9lX2MWuHmhNCA7LSCW,Short,12153521253,4.0,0.951814,0.826446
7,12120382831,8.0,5K4W6rqBFWDnAN6FQUkS6x,Short,12153521253,14.0,0.694444,0.548697
10,12120382831,11.0,2h93pZq0e7k5yf4dywlkpM,Short,12153521253,12.0,0.615148,0.591716
23,12120382831,24.0,1Xyo4u8uXC1ZmMpatF05PJ,Short,12153521253,9.0,0.666389,0.390625
28,12120382831,29.0,5cIc3SBFuBLVxJz58W2tU9,Short,12153521253,11.0,0.615148,0.336064


In [17]:
def get_similar_artists(df_a):
    engine = create_engine(localhost_db)
    artists = pd.read_sql_query(similar_artists_query, engine, params={'artist_ids': tuple(df_a['artist_id'].tolist())})
    engine.dispose()
    df = df_a.merge(artists, on=['artist_id'])
    return df

def get_similar_tracks(df_t):
    engine = create_engine(localhost_db)
    tracks = pd.read_sql_query(similar_tracks_query, engine, params={'track_ids': tuple(df_t['track_id'].tolist())})
    engine.dispose()
    df = df_t.merge(tracks, on=['track_id'])
    return df

In [18]:
get_similar_artists(a)

Unnamed: 0,user_id_x,rank_x,artist_id,timeframe,user_id_y,rank_y,base,score,artist,artist_url,artist_image
0,12120382831,1.0,3mIj9lX2MWuHmhNCA7LSCW,Short,12153521253,4.0,0.951814,0.826446,The 1975,https://open.spotify.com/artist/3mIj9lX2MWuHmh...,https://i.scdn.co/image/94dd2feca73bdfb7e1c127...
1,12120382831,2.0,3mIj9lX2MWuHmhNCA7LSCW,Medium,12153521253,2.0,0.907029,0.907029,The 1975,https://open.spotify.com/artist/3mIj9lX2MWuHmh...,https://i.scdn.co/image/94dd2feca73bdfb7e1c127...
2,12120382831,20.0,3mIj9lX2MWuHmhNCA7LSCW,Long,12153521253,19.0,0.459638,0.444444,The 1975,https://open.spotify.com/artist/3mIj9lX2MWuHmh...,https://i.scdn.co/image/94dd2feca73bdfb7e1c127...
3,12120382831,8.0,5K4W6rqBFWDnAN6FQUkS6x,Short,12153521253,14.0,0.694444,0.548697,Kanye West,https://open.spotify.com/artist/5K4W6rqBFWDnAN...,https://i.scdn.co/image/bd1c6fdf3705cf9b7d0c8a...
4,12120382831,5.0,5K4W6rqBFWDnAN6FQUkS6x,Medium,12153521253,35.0,0.790123,0.284444,Kanye West,https://open.spotify.com/artist/5K4W6rqBFWDnAN...,https://i.scdn.co/image/bd1c6fdf3705cf9b7d0c8a...
5,12120382831,1.0,5K4W6rqBFWDnAN6FQUkS6x,Long,12153521253,6.0,0.951814,0.756144,Kanye West,https://open.spotify.com/artist/5K4W6rqBFWDnAN...,https://i.scdn.co/image/bd1c6fdf3705cf9b7d0c8a...
6,12120382831,11.0,2h93pZq0e7k5yf4dywlkpM,Short,12153521253,12.0,0.615148,0.591716,Frank Ocean,https://open.spotify.com/artist/2h93pZq0e7k5yf...,https://i.scdn.co/image/7db34c8aace6feb91f3860...
7,12120382831,8.0,2h93pZq0e7k5yf4dywlkpM,Medium,12153521253,8.0,0.694444,0.694444,Frank Ocean,https://open.spotify.com/artist/2h93pZq0e7k5yf...,https://i.scdn.co/image/7db34c8aace6feb91f3860...
8,12120382831,21.0,2h93pZq0e7k5yf4dywlkpM,Long,12153521253,9.0,0.666389,0.429992,Frank Ocean,https://open.spotify.com/artist/2h93pZq0e7k5yf...,https://i.scdn.co/image/7db34c8aace6feb91f3860...
9,12120382831,24.0,1Xyo4u8uXC1ZmMpatF05PJ,Short,12153521253,9.0,0.666389,0.390625,The Weeknd,https://open.spotify.com/artist/1Xyo4u8uXC1ZmM...,https://i.scdn.co/image/d9a875c37277c35b94c60c...


In [19]:
get_similar_tracks(t)

Unnamed: 0,user_id_x,rank_x,track_id,timeframe,user_id_y,rank_y,base,score,track,artists,album,track_url,album_image
0,12120382831,20.0,73jVPicY2G9YHmzgjk69ae,Medium,12153521253,32.0,0.444444,0.308642,Robbers,The 1975,The 1975,https://open.spotify.com/track/73jVPicY2G9YHmz...,https://i.scdn.co/image/ab67616d0000b27304f21e...


In [20]:
g

Unnamed: 0,user_id_x,rank_x,genre,points_x,timeframe,user_id_y,rank_y,points_y,base,score
0,12120382831,1.0,Pop,4.041452,Short,12153521253,1.0,5.230431,0.951814,0.951814
3,12120382831,4.0,Rap,2.002428,Short,12153521253,14.0,0.574837,0.826446,0.548697
7,12120382831,8.0,Hip Hop,1.140220,Short,12153521253,13.0,0.617077,0.694444,0.569598
8,12120382831,9.0,Modern Alternative Rock,1.123967,Short,12153521253,7.0,1.048123,0.724310,0.666389
9,12120382831,10.0,Modern Rock,1.123967,Short,12153521253,3.0,1.617012,0.865333,0.640000
...,...,...,...,...,...,...,...,...,...,...
45,12120382831,46.0,Modern Alternative Rock,0.142222,Long,12153521253,41.0,0.152200,0.243865,0.216333
46,12120382831,47.0,Modern Rock,0.142222,Long,12153521253,29.0,0.347774,0.336064,0.211389
47,12120382831,48.0,Nu Gaze,0.142222,Long,12153521253,42.0,0.152200,0.237954,0.206612
48,12120382831,49.0,Rock,0.142222,Long,12153521253,43.0,0.152200,0.232254,0.201995
