In [11]:
import pandas as pd
from secrets import localhost_db, postgres_db, localhost_db2
from sqlalchemy import create_engine
from queries import *

df_a = pd.read_csv('data/TopArtists.csv')
df_t = pd.read_csv('data/TopTracks.csv')
df_g = pd.read_csv('data/TopGenres.csv')
df_m = pd.read_csv('data/MusicFeatures.csv')

In [12]:
df_a['user_id'].unique()

array([12153521253, 12120382831])

In [13]:
u1_a = df_a.loc[df_a['user_id'] == 12153521253]
u1_t = df_t.loc[df_t['user_id'] == 12153521253]
u1_g = df_g.loc[df_g['user_id'] == 12153521253]
u1_m = df_m.loc[df_m['user_id'] == 12153521253]

In [14]:
u2_a = df_a.loc[df_a['user_id'] == 12120382831]
u2_t = df_t.loc[df_t['user_id'] == 12120382831]
u2_g = df_g.loc[df_g['user_id'] == 12120382831]
u2_m = df_m.loc[df_m['user_id'] == 12120382831]

In [15]:
def top_to_dict(top_df):
    top_dict = {}
    top_dict['Short'] = top_df.loc[top_df['timeframe'] == 'Short'].to_dict('records')
    top_dict['Medium'] = top_df.loc[top_df['timeframe'] == 'Medium'].to_dict('records')
    top_dict['Long'] = top_df.loc[top_df['timeframe'] == 'Long'].to_dict('records')
    return top_dict

In [16]:
def get_artist_similarity(u1, u2, timeframe='Long'):
    df1 = u1.loc[u1['timeframe'] == timeframe, ['artist_id', 'artist', 'rank']]
    df2 = u2.loc[u2['timeframe'] == timeframe, ['artist_id', 'artist', 'rank']]
    df = df1.merge(df2, on=['artist_id', 'artist'], how='outer').fillna(0)
    df['base'] = calculate_score(df[df[['rank_x', 'rank_y']] > 0].min(axis=1))
    df.loc[(df['rank_x'] != 0) & (df['rank_y'] != 0), 'score'] = calculate_score(df[['rank_x', 'rank_y']].max(axis=1))
    df['score'] = df['score'].fillna(0)
    df = df.rename(columns={'rank_x': u1['user_id'].unique()[0], 'rank_y': u2['user_id'].unique()[0]})
    return df

def get_track_similarity(u1, u2, timeframe='Long'):
    df1 = u1.loc[u1['timeframe'] == timeframe, ['track_id', 'track', 'rank']]
    df2 = u2.loc[u2['timeframe'] == timeframe, ['track_id', 'track', 'rank']]
    df = df1.merge(df2, on=['track_id', 'track'], how='outer').fillna(0)
    df['base'] = calculate_score(df[df[['rank_x', 'rank_y']] > 0].min(axis=1))
    df.loc[(df['rank_x'] != 0) & (df['rank_y'] != 0), 'score'] = calculate_score(df[['rank_x', 'rank_y']].max(axis=1))
    df['score'] = df['score'].fillna(0)
    df = df.rename(columns={'rank_x': u1['user_id'].unique()[0], 'rank_y': u2['user_id'].unique()[0]})
    return df

def get_genre_similarity(u1, u2, timeframe='Long'):
    df1 = u1.loc[u1['timeframe'] == timeframe, ['genre', 'rank']]
    df2 = u2.loc[u2['timeframe'] == timeframe, ['genre', 'rank']]
    df = df1.merge(df2, on='genre', how='outer').fillna(0)
    df['base'] = calculate_score(df[df[['rank_x', 'rank_y']] > 0].min(axis=1))
    df.loc[(df['rank_x'] != 0) & (df['rank_y'] != 0), 'score'] = calculate_score(df[df[['rank_x', 'rank_y']] > 0].max(axis=1))
    df['score'] = df['score'].fillna(0)
    df = df.rename(columns={'rank_x': u1['user_id'].unique()[0], 'rank_y': u2['user_id'].unique()[0]})
    return df

def calculate_similarity(df):
    return round(df.sum()['score'] / df.sum()['base'], 4)

def calculate_feature_similarity(u1, u2, timeframe='Long'):
    features1 = u1.loc[u1['timeframe'] == timeframe].drop(columns=['user_id', 'timeframe']).values.tolist()[0]
    features2 = u2.loc[u2['timeframe'] == timeframe].drop(columns=['user_id', 'timeframe']).values.tolist()[0]
    scores = []
    for i in range(len(features1)):
        f1 = abs(features1[i])
        f2 = abs(features2[i])
        scores.append(min(f1, f2) / max(f1, f2))
    return round(sum(scores) / len(scores), 4)

def calculate_score(rank, weight=16, shift=4):
    return weight / ((0.1 * rank + shift) ** 2) 

In [17]:
tf_weights = {'Short': 3, 'Medium': 2, 'Long': 1}
mu_weights = {'artist': 4, 'track': 1, 'genre': 8, 'feature': 2}
final_score = 0

for timeframe in ['Short', 'Medium', 'Long']:
    tf_score = 0
    # Artist
    df_artist = get_artist_similarity(u1_a, u2_a, timeframe)
    tf_score += mu_weights['artist'] * calculate_similarity(df_artist)
    # Track
    df_track = get_track_similarity(u1_t, u2_t, timeframe)
    tf_score += mu_weights['track'] * calculate_similarity(df_track)
    # Genre
    df_genre = get_genre_similarity(u1_g, u2_g, timeframe)
    tf_score += mu_weights['genre'] * calculate_similarity(df_genre)
    # Features
    tf_score += mu_weights['feature'] * calculate_feature_similarity(u1_m, u2_m)
    # Timeframe overall score
    tf_score /= sum(mu_weights.values())
    print(tf_score)
    final_score += tf_weights[timeframe] * tf_score

final_score /= sum(tf_weights.values())
final_score

0.22452
0.26435333333333333
0.3034


0.2509444444444444

In [18]:
df_a = get_artist_similarity(u1_a, u2_a, 'Short')
df_a.loc[df_a['score'] > 0]

Unnamed: 0,artist_id,artist,12153521253,12120382831,base,score
3,3mIj9lX2MWuHmhNCA7LSCW,The 1975,4.0,1.0,0.951814,0.826446
7,77SW9BnxLY8rJ0RciFqkHh,The Neighbourhood,8.0,34.0,0.694444,0.292184
8,1Xyo4u8uXC1ZmMpatF05PJ,The Weeknd,9.0,24.0,0.666389,0.390625
10,5cIc3SBFuBLVxJz58W2tU9,Oh Wonder,11.0,29.0,0.615148,0.336064
11,2h93pZq0e7k5yf4dywlkpM,Frank Ocean,12.0,11.0,0.615148,0.591716
13,5K4W6rqBFWDnAN6FQUkS6x,Kanye West,14.0,8.0,0.694444,0.548697
17,1Bl6wpkWCQ4KVgnASpvzzA,BROCKHAMPTON,18.0,40.0,0.475624,0.25


In [19]:
df_g = get_genre_similarity(u1_g, u2_g, 'Short')
df_g.loc[df_g['score'] > 0]

Unnamed: 0,genre,12153521253,12120382831,base,score
0,Pop,1.0,1.0,0.951814,0.951814
1,Electropop,2.0,16.0,0.907029,0.510204
2,Modern Rock,3.0,10.0,0.865333,0.64
6,Modern Alternative Rock,7.0,9.0,0.72431,0.666389
10,Nu Gaze,11.0,11.0,0.615148,0.615148
11,Rock,12.0,12.0,0.591716,0.591716
12,Hip Hop,13.0,8.0,0.694444,0.569598
13,Rap,14.0,4.0,0.826446,0.548697
25,Alternative R&B,26.0,23.0,0.403124,0.367309
26,Lgbtq+ Hip Hop,27.0,39.0,0.356427,0.256369


### Extract data from PROD and update to DEV

In [20]:
"""engine = create_engine(postgres_db)
user = pd.read_sql('select * from "Users"', engine)
user_profiles = pd.read_sql('select * from "UserProfiles"', engine)
top_artists = pd.read_sql('select * from "TopArtists"', engine)
top_tracks = pd.read_sql('select * from "TopTracks"', engine)
top_genres = pd.read_sql('select * from "TopGenres"', engine)
music_features = pd.read_sql('select * from "MusicFeatures"', engine)
artists = pd.read_sql('select * from "Artists"', engine)
tracks = pd.read_sql('select * from "Tracks"', engine)
engine.dispose()"""

'engine = create_engine(postgres_db)\nuser = pd.read_sql(\'select * from "Users"\', engine)\nuser_profiles = pd.read_sql(\'select * from "UserProfiles"\', engine)\ntop_artists = pd.read_sql(\'select * from "TopArtists"\', engine)\ntop_tracks = pd.read_sql(\'select * from "TopTracks"\', engine)\ntop_genres = pd.read_sql(\'select * from "TopGenres"\', engine)\nmusic_features = pd.read_sql(\'select * from "MusicFeatures"\', engine)\nartists = pd.read_sql(\'select * from "Artists"\', engine)\ntracks = pd.read_sql(\'select * from "Tracks"\', engine)\nengine.dispose()'

In [21]:
"""engine = create_engine(localhost_db)
user.to_sql('Users', engine, index=False, if_exists='replace')
user_profiles.to_sql('UserProfiles', engine, index=False, if_exists='replace')
top_artists[['user_id', 'rank', 'artist_id', 'timeframe']].to_sql('TopArtists', engine, index=False, if_exists='replace')
top_tracks[['user_id', 'rank', 'track_id', 'timeframe']].to_sql('TopTracks', engine, index=False, if_exists='replace')
top_genres.to_sql('TopGenres', engine, index=False, if_exists='replace')
music_features.to_sql('MusicFeatures', engine, index=False, if_exists='replace')
artists.to_sql('Artists', engine, index=False, if_exists='replace')
tracks.to_sql('Tracks', engine, index=False, if_exists='replace')
engine.dispose()"""

"engine = create_engine(localhost_db)\nuser.to_sql('Users', engine, index=False, if_exists='replace')\nuser_profiles.to_sql('UserProfiles', engine, index=False, if_exists='replace')\ntop_artists[['user_id', 'rank', 'artist_id', 'timeframe']].to_sql('TopArtists', engine, index=False, if_exists='replace')\ntop_tracks[['user_id', 'rank', 'track_id', 'timeframe']].to_sql('TopTracks', engine, index=False, if_exists='replace')\ntop_genres.to_sql('TopGenres', engine, index=False, if_exists='replace')\nmusic_features.to_sql('MusicFeatures', engine, index=False, if_exists='replace')\nartists.to_sql('Artists', engine, index=False, if_exists='replace')\ntracks.to_sql('Tracks', engine, index=False, if_exists='replace')\nengine.dispose()"

In [22]:
def compare_users(u1, u2):
    # Configs
    tf_weights = {'Short': 6, 'Medium': 5, 'Long': 4}
    mu_weights = {'artist': 4, 'track': 1, 'genre': 8, 'feature': 2}
    # Get data
    engine = create_engine(localhost_db)
    users = pd.read_sql(users2_query, engine, params={'user_ids': (u1, u2)})
    df_a = pd.read_sql(top_artists2_query, engine, params={'user_ids': (u1, u2)})
    df_t = pd.read_sql(top_tracks2_query, engine, params={'user_ids': (u1, u2)})
    df_g = pd.read_sql(top_genres2_query, engine, params={'user_ids': (u1, u2)})
    df_m = pd.read_sql(music_features2_query, engine, params={'user_ids': (u1, u2)})
    # User 1
    u1_a = df_a.loc[df_a['user_id'] == u1]
    u1_t = df_t.loc[df_t['user_id'] == u1]
    u1_g = df_g.loc[df_g['user_id'] == u1]
    u1_m = df_m.loc[df_m['user_id'] == u1]
    # User 2
    u2_a = df_a.loc[df_a['user_id'] == u2]
    u2_t = df_t.loc[df_t['user_id'] == u2]
    u2_g = df_g.loc[df_g['user_id'] == u2]
    u2_m = df_m.loc[df_m['user_id'] == u2]
    
    final_score = 0
    similar_artists = pd.DataFrame()
    similar_tracks = pd.DataFrame()
    similar_genres = pd.DataFrame()
    
    name1 = users.loc[users['user_id'] == u1]['display_name'].unique().item()
    name2 = users.loc[users['user_id'] == u2]['display_name'].unique().item()
    print('Comparing {} and {}...'.format(name1, name2))

    for timeframe in ['Short', 'Medium', 'Long']:
        tf_score = 0
        # Artist
        df_artist = get_artist_similarity(u1_a, u2_a, timeframe)
        tf_score += mu_weights['artist'] * calculate_similarity(df_artist)
        df_artist = df_artist.loc[df_artist['score'] > 0]
        df_artist['rank'] = df_artist.reset_index().index + 1
        similar_artists = similar_artists.append(df_artist)
        # Track
        df_track = get_track_similarity(u1_t, u2_t, timeframe)
        tf_score += mu_weights['track'] * calculate_similarity(df_track)
        similar_tracks = similar_tracks.append(df_track.loc[df_track['score'] > 0])
        # Genre
        df_genre = get_genre_similarity(u1_g, u2_g, timeframe)
        tf_score += mu_weights['genre'] * calculate_similarity(df_genre)
        similar_genres = similar_genres.append(df_genre.loc[df_genre['score'] > 0])
        # Features
        tf_score += mu_weights['feature'] * calculate_feature_similarity(u1_m, u2_m)
        # Timeframe overall score
        tf_score /= sum(mu_weights.values())
        print('{} term music taste similarity: {:.2f}'.format(timeframe, tf_score * 100))
        final_score += tf_weights[timeframe] * tf_score

    final_score /= sum(tf_weights.values())
    print('Overall music taste similarity: {:.2f}'.format(final_score * 100))
    
    return final_score, users, similar_artists, similar_tracks, similar_genres


def get_artist_similarity(u1, u2, timeframe='Long'):
    df1 = u1.loc[u1['timeframe'] == timeframe]
    df2 = u2.loc[u2['timeframe'] == timeframe]
    df = df1.merge(df2, on=['artist_id', 'timeframe'], how='outer').fillna(0)
    df['base'] = calculate_score(df[df[['rank_x', 'rank_y']] > 0].min(axis=1))
    df.loc[(df['rank_x'] != 0) & (df['rank_y'] != 0), 'score'] = calculate_score(df[['rank_x', 'rank_y']].max(axis=1))
    df['score'] = df['score'].fillna(0)
    # df = df.rename(columns={'rank_x': u1['user_id'].unique()[0], 'rank_y': u2['user_id'].unique()[0]})
    return df

def get_track_similarity(u1, u2, timeframe='Long'):
    df1 = u1.loc[u1['timeframe'] == timeframe]
    df2 = u2.loc[u2['timeframe'] == timeframe]
    df = df1.merge(df2, on=['track_id', 'timeframe'], how='outer').fillna(0)
    df['base'] = calculate_score(df[df[['rank_x', 'rank_y']] > 0].min(axis=1))
    df.loc[(df['rank_x'] != 0) & (df['rank_y'] != 0), 'score'] = calculate_score(df[['rank_x', 'rank_y']].max(axis=1))
    df['score'] = df['score'].fillna(0)
    # df = df.rename(columns={'rank_x': u1['user_id'].unique()[0], 'rank_y': u2['user_id'].unique()[0]})
    return df

def get_genre_similarity(u1, u2, timeframe='Long'):
    df1 = u1.loc[u1['timeframe'] == timeframe]
    df2 = u2.loc[u2['timeframe'] == timeframe]
    df = df1.merge(df2, on=['genre', 'timeframe'], how='outer').fillna(0)
    df['base'] = calculate_score(df[df[['rank_x', 'rank_y']] > 0].min(axis=1))
    df.loc[(df['rank_x'] != 0) & (df['rank_y'] != 0), 'score'] = calculate_score(df[df[['rank_x', 'rank_y']] > 0].max(axis=1))
    df['score'] = df['score'].fillna(0)
    # df = df.rename(columns={'rank_x': u1['user_id'].unique()[0], 'rank_y': u2['user_id'].unique()[0]})
    return df

def calculate_similarity(df):
    return round(df.sum()['score'] / df.sum()['base'], 4)

def calculate_feature_similarity(u1, u2, timeframe='Long'):
    features1 = u1.loc[u1['timeframe'] == timeframe].drop(columns=['user_id', 'timeframe']).values.tolist()[0]
    features2 = u2.loc[u2['timeframe'] == timeframe].drop(columns=['user_id', 'timeframe']).values.tolist()[0]
    scores = []
    for i in range(len(features1)):
        f1 = abs(features1[i])
        f2 = abs(features2[i])
        scores.append(min(f1, f2) / max(f1, f2))
    return round(sum(scores) / len(scores), 4)

def calculate_score(rank, weight=16, shift=4):
    return weight / ((0.1 * rank + shift) ** 2) 

In [23]:
user_ids = ['12120382831', '12153521253', '1279967390', '12179805550']

for i in range(len(user_ids)):
    for j in range(i+1, len(user_ids)):
        user_id1 = user_ids[i]
        user_id2 = user_ids[j]
        compare_users(user_id1, user_id2)
        print()

ProgrammingError: (psycopg2.errors.UndefinedTable) relation "TopFeatures" does not exist
LINE 3: FROM "TopFeatures" tf
             ^

[SQL: 
SELECT *
FROM "TopFeatures" tf
WHERE tf.user_id in %(user_ids)s
]
[parameters: {'user_ids': ('12120382831', '12153521253')}]
(Background on this error at: http://sqlalche.me/e/13/f405)

In [None]:
s, u, a, t, g = compare_users(user_ids[0], user_ids[1])

In [None]:
a.head()

In [None]:
def get_similar_artists(df_a):
    engine = create_engine(localhost_db)
    artists = pd.read_sql_query(similar_artists_query, engine, params={'artist_ids': tuple(df_a['artist_id'].tolist())})
    engine.dispose()
    df = df_a.merge(artists, on=['artist_id'])
    return df.sort_values(['timeframe', 'rank'])

def get_similar_tracks(df_t):
    engine = create_engine(localhost_db)
    tracks = pd.read_sql_query(similar_tracks_query, engine, params={'track_ids': tuple(df_t['track_id'].tolist())})
    engine.dispose()
    df = df_t.merge(tracks, on=['track_id'])
    return df.sort_values(['timeframe', 'score'])

In [None]:
df = get_similar_artists(a)
df.head()

In [None]:
x = df.loc[df['timeframe'] == 'Short']
x

In [None]:
x['rank'] = x.reset_index().index + 1
x

In [None]:
get_similar_tracks(t)

In [None]:
g

In [None]:
DATABASE_URL = localhost_db
code = 'good-plane-60'

ADJECTIVES = ['good', 'bad', 'new', 'old', 'first', 'last', 'long', 'short', 'little', 'big', \
              'right', 'wrong', 'high', 'short', 'large', 'small', 'different', 'same', 'best', 'worst', \
              'easy', 'difficult', 'soft', 'hard', 'major', 'minor', 'public', 'private', 'real', 'fake', \
              'red', 'orange', 'yellow', 'green', 'blue', 'purple', 'black', 'white', 'grey', 'brown', \
              'pink', 'violet', 'indigo', 'silver', 'gold', 'teal', 'lime', 'maroon', 'olive', 'cyan']

NOUNS = ['time', 'year', 'month', 'day', 'week', 'thing', 'man', 'woman', 'boy', 'girl', \
         'world', 'life', 'eye', 'nose', 'ear', 'mouth', 'hair', 'hand', 'foot', 'leg', \
         'car', 'bus', 'bike', 'train', 'plane', 'boat', 'ship', 'tank', 'truck', 'taxi', \
         'dog', 'cat', 'mouse', 'cow', 'goat', 'horse', 'deer', 'rabbit', 'bird', 'monkey', \
         'bee', 'bear', 'chicken', 'fox', 'panda', 'frog', 'tiger', 'lion', 'duck', 'wolf']

code_parts = code.split('-')
if len(code_parts) != 3 or code_parts[0] not in ADJECTIVES or code_parts[1] not in NOUNS or \
    not (0 < int(code_parts[2]) < 100):
    print('con 1 fail')
engine = create_engine(DATABASE_URL)
df_u = pd.read_sql('SELECT user_id FROM "UserProfiles" WHERE code = %(code)s', engine, params={'code': code})
if len(df_u) != 1:
    print('con 2 fail')
df_u['user_id'].item()