In [2]:
import random

import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from sqlalchemy import create_engine, text

# Project-local module that shares its name with the stdlib `secrets`.
from secrets import spotify_secrets, localhost_db2, postgres_db
# Kept last, as in the original, so the star import shadows the same names it always did.
from queries import *

In [54]:
# Spotify time-range names mapped to the integer timeframe code stored on each row.
RANGES = {
    'short_term': 0,
    'medium_term': 1,
    'long_term': 2,
}
# Number of items requested per top-artists / top-tracks call.
LIMIT = 50

def get_top_artists_df(sp):
    """Build a DataFrame of the current user's top artists for every time range.

    Args:
        sp: Authenticated client exposing ``me()`` and
            ``current_user_top_artists(time_range=..., limit=...)``.

    Returns:
        pandas.DataFrame with one row per (timeframe, rank). The columns are
        always present, even when the API returns no artists at all.
        ``artist_image`` is ``None`` for artists that have no images.
    """
    columns = ['user_id', 'rank', 'artist_id', 'timeframe', 'artist', 'genres',
               'artist_url', 'artist_image', 'popularity']
    user_id = sp.me()['id']
    rows = []
    for time_range, timeframe in RANGES.items():
        top_artists = sp.current_user_top_artists(time_range=time_range, limit=LIMIT)
        for rank, artist in enumerate(top_artists['items'], start=1):
            # The image list can be empty; indexing [0] blindly would raise.
            images = artist.get('images') or []
            rows.append({
                'user_id': user_id,
                'rank': rank,
                'artist_id': artist['id'],
                'timeframe': timeframe,
                'artist': artist['name'],
                'genres': "; ".join(artist['genres']),
                'artist_url': artist['external_urls']['spotify'],
                'artist_image': images[0]['url'] if images else None,
                'popularity': artist['popularity'],
            })
    # Explicit columns keep the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

def get_top_tracks_df(sp):
    """Build a DataFrame of the current user's top tracks for every time range.

    Args:
        sp: Authenticated client exposing ``me()`` and
            ``current_user_top_tracks(time_range=..., limit=...)``.

    Returns:
        pandas.DataFrame with one row per (timeframe, rank). The columns are
        always present, even when the API returns no tracks at all.
        ``album_image`` is ``None`` for albums that have no images.
    """
    columns = ['user_id', 'rank', 'track_id', 'timeframe', 'track', 'artists',
               'album', 'album_image', 'release_date', 'track_url']
    user_id = sp.me()['id']
    rows = []
    for time_range, timeframe in RANGES.items():
        top_tracks = sp.current_user_top_tracks(time_range=time_range, limit=LIMIT)
        for rank, track in enumerate(top_tracks['items'], start=1):
            album = track['album']
            # The image list can be empty; indexing [0] blindly would raise.
            images = album.get('images') or []
            rows.append({
                'user_id': user_id,
                'rank': rank,
                'track_id': track['id'],
                'timeframe': timeframe,
                'track': track['name'],
                'artists': "; ".join(a['name'] for a in track['artists']),
                'album': album['name'],
                'album_image': images[0]['url'] if images else None,
                'release_date': album['release_date'],
                'track_url': track['external_urls']['spotify'],
            })
    # Explicit columns keep the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

def top_to_dict(top_df, shuffle=False, timeframes=(0, 1, 2)):
    """Group top-item rows by timeframe code.

    Args:
        top_df: DataFrame with a ``timeframe`` column.
        shuffle: When True, shuffle each timeframe's records in place with
            the ``random`` module.
        timeframes: Timeframe codes to include as keys; defaults to the three
            codes the original implementation hard-coded.

    Returns:
        dict mapping each timeframe code to a list of row dicts (possibly
        empty). A frame without a ``timeframe`` column yields empty lists.
    """
    if 'timeframe' not in top_df.columns:
        # A column-less empty frame: nothing to group.
        return {tf: [] for tf in timeframes}

    top_dict = {}
    for tf in timeframes:
        records = top_df.loc[top_df['timeframe'] == tf].to_dict('records')
        if shuffle:
            random.shuffle(records)
        top_dict[tf] = records
    return top_dict

In [None]:
# Target database for this session; `localhost_db2` comes from the secrets import.
DATABASE_URL = localhost_db2
# Spotify client authorised via OAuth with the "user-top-read" scope. Client
# credentials are read from `spotify_secrets`; the redirect URI is a local callback.
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id=spotify_secrets["Client Id"],
                                               client_secret=spotify_secrets["Client Secret"],
                                               redirect_uri="http://localhost:8892/callback",
                                               scope="user-top-read"))

In [None]:
# Fetch the authenticated user's top tracks for every time range and preview the first rows.
df_tt = get_top_tracks_df(sp)
df_tt.head()

In [None]:
def get_music_features_df(sp, top_tracks):
    """Fetch audio features for top tracks and summarise them per timeframe.

    Args:
        sp: Client exposing ``audio_features(list_of_track_ids)``, returning
            a list of feature dicts. ``None`` entries are ignored.
        top_tracks: dict mapping timeframe code -> list of track records,
            each holding at least ``'track_id'`` and ``'user_id'``.

    Returns:
        Tuple ``(df_feature, df_music)``:
          * ``df_feature``: one row per timeframe that produced features,
            holding the mean of each audio feature plus ``user_id`` and
            ``timeframe``.
          * ``df_music``: de-duplicated per-track features with ``id``
            renamed to ``track_id``.
        Both frames keep their columns when nothing could be fetched.
    """
    audio_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                      'instrumentalness', 'liveness', 'valence', 'tempo']
    feature_columns = ['user_id'] + audio_features + ['timeframe']
    music_columns = ['id'] + audio_features + ['key', 'mode', 'duration_ms', 'time_signature']

    # Take the user id from the first non-empty timeframe rather than assuming
    # timeframe 2 is populated. Presumably every record carries the same
    # user_id -- verify against the caller.
    user_id = next((tracks[0]['user_id'] for tracks in top_tracks.values() if tracks), None)

    feature_rows = []
    music_frames = []
    for timeframe in RANGES.values():
        track_ids = [t['track_id'] for t in top_tracks.get(timeframe, [])]
        if not track_ids:
            # Nothing to request for this timeframe; skip the API call.
            continue
        fetched = sp.audio_features(track_ids) or []
        # Drop missing entries so one unavailable track does not discard the
        # whole timeframe.
        all_features = [f for f in fetched if f is not None]
        if not all_features:
            continue
        row = {'user_id': user_id, 'timeframe': timeframe}
        for f in audio_features:
            row[f] = sum(a[f] for a in all_features) / len(all_features)
        feature_rows.append(row)
        music_frames.append(pd.DataFrame(all_features))

    df_feature = pd.DataFrame(feature_rows, columns=feature_columns)
    if music_frames:
        # DataFrame.append was removed in pandas 2.0; concatenate once instead.
        df_music = pd.concat(music_frames)[music_columns]
    else:
        df_music = pd.DataFrame(columns=music_columns)
    df_music = df_music.rename(columns={'id': 'track_id'}).drop_duplicates()
    return df_feature, df_music

In [None]:
# Group the top-track rows by timeframe, then fetch per-timeframe feature
# averages (df_f) and per-track features (df_m); preview the per-track frame.
top_tracks = top_to_dict(df_tt)
df_f, df_m = get_music_features_df(sp, top_tracks)
df_m.head()

In [None]:
# Summary statistics for the columns of the per-track feature frame.
df_m.describe()

In [None]:
# Open an engine on DATABASE_URL and write the per-track features to "MusicFeatures".
# NOTE(review): `if_exists` is not passed, so pandas' default applies; re-running
# this cell against an existing table presumably fails -- confirm before re-running.
engine = create_engine(DATABASE_URL)
df_m.to_sql('MusicFeatures', engine, index=False)

In [None]:
# Read up to 50 rows from the "Tracks" table and preview them.
df_t = pd.read_sql('select * from "Tracks" limit 50', engine)
df_t.head()

In [None]:
# Audio-feature columns kept from the Spotify response, in output order.
audio_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
                  'liveness', 'valence', 'tempo']

# Missing entries (None) would break DataFrame construction from a list of
# dicts, so drop them before building the frame.
all_features = [f for f in sp.audio_features(df_t['track_id'].tolist()) if f is not None]
# Passing `columns` selects and orders the fields, and keeps the schema when
# nothing was returned.
df_music = pd.DataFrame(
    all_features,
    columns=['id'] + audio_features + ['key', 'mode', 'duration_ms', 'time_signature'],
)
df_music = df_music.rename(columns={'id': 'track_id'}).drop_duplicates()
df_music.head()

In [None]:
# Stage the fetched features in "TempFeatures", replacing any existing table of that name.
df_music.to_sql('TempFeatures', engine, index=False, if_exists='replace')

In [None]:
# Copy every staged row from "TempFeatures" into "MusicFeatures"; rows whose
# track_id already exists are left untouched (ON CONFLICT ... DO NOTHING).
# NOTE(review): ON CONFLICT (track_id) presumably needs a unique constraint on
# "MusicFeatures".track_id -- confirm the table schema.
features_insert_query = """
INSERT INTO "MusicFeatures" (track_id, danceability, energy, loudness, speechiness, acousticness, instrumentalness,
    liveness, valence, tempo, key, mode, duration_ms, time_signature)
SELECT tf.track_id, tf.danceability, tf.energy, tf.loudness, tf.speechiness, tf.acousticness, tf.instrumentalness,
    tf.liveness, tf.valence, tf.tempo, tf.key, tf.mode, tf.duration_ms, tf.time_signature
FROM "TempFeatures" tf
ON CONFLICT (track_id) DO NOTHING
"""

In [None]:
# Engine.execute() was removed in SQLAlchemy 2.0. Run the insert on a
# connection inside an explicit transaction; engine.begin() commits on success
# and rolls back on error. text() wraps the raw SQL string.
with engine.begin() as conn:
    conn.execute(text(features_insert_query))

In [None]:
# Load every row of the "Users" table and preview it.
df_users = pd.read_sql('select * from "Users"', engine)
df_users.head()

In [None]:
# Audio-feature columns kept from the Spotify response, in output order.
audio_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
                  'liveness', 'valence', 'tempo']
# Track ids sent per audio_features call (same batch size as the original slicing).
BATCH_SIZE = 50

# Collect plain records and build the frame once; DataFrame.append was removed
# in pandas 2.0 and growing a frame inside a loop is quadratic.
feature_records = []
for u in df_users['user_id']:
    df_user_tracks = pd.read_sql('select * from "TopTracks" where user_id = %(user_id)s', engine,
                                 params={'user_id': u})
    track_ids = df_user_tracks['track_id'].unique().tolist()
    # range() yields no batch for an empty list and no trailing empty batch when
    # the count is an exact multiple of BATCH_SIZE, so an empty id list is never sent.
    for start in range(0, len(track_ids), BATCH_SIZE):
        batch = track_ids[start:start + BATCH_SIZE]
        # Drop missing entries (None) before building the frame.
        feature_records.extend(f for f in sp.audio_features(batch) if f is not None)

df_music = pd.DataFrame(
    feature_records,
    columns=['id'] + audio_features + ['key', 'mode', 'duration_ms', 'time_signature'],
)
df_music = df_music.rename(columns={'id': 'track_id'}).drop_duplicates()

In [None]:
# Display the combined per-track feature frame (this renders the whole frame, not a preview).
df_music

In [None]:
# Stage the combined features in "TempFeatures", replacing any existing table of that name.
df_music.to_sql('TempFeatures', engine, index=False, if_exists='replace')

In [None]:
# Copy every staged row from "TempFeatures" into "MusicFeatures", skipping
# track_ids that already exist. This statement repeats the definition from an
# earlier cell of this notebook.
features_insert_query = """
INSERT INTO "MusicFeatures" (track_id, danceability, energy, loudness, speechiness, acousticness, instrumentalness,
    liveness, valence, tempo, key, mode, duration_ms, time_signature)
SELECT tf.track_id, tf.danceability, tf.energy, tf.loudness, tf.speechiness, tf.acousticness, tf.instrumentalness,
    tf.liveness, tf.valence, tf.tempo, tf.key, tf.mode, tf.duration_ms, tf.time_signature
FROM "TempFeatures" tf
ON CONFLICT (track_id) DO NOTHING
"""

In [None]:
# Engine.execute() was removed in SQLAlchemy 2.0. Run the insert on a
# connection inside an explicit transaction; engine.begin() commits on success
# and rolls back on error. text() wraps the raw SQL string.
with engine.begin() as conn:
    conn.execute(text(features_insert_query))

In [3]:
def get_user_profile(user_id):
    """Look up one user's profile row.

    Runs ``user_query`` against ``DATABASE_URL`` with ``user_id`` bound as a
    query parameter.

    Args:
        user_id: Value bound to the ``user_id`` parameter of ``user_query``.

    Returns:
        The first matching row as a dict, or ``None`` when no row matches or
        the lookup fails (best-effort, as in the original).
    """
    engine = None
    try:
        engine = create_engine(DATABASE_URL)
        df_user = pd.read_sql_query(user_query, engine, params={'user_id': user_id})
        records = df_user.to_dict('records')
        # No matching user is an expected outcome, not an error.
        return records[0] if records else None
    except Exception:
        # Best-effort lookup: callers receive None on any database failure.
        # Unlike a bare `except`, KeyboardInterrupt/SystemExit still propagate.
        return None
    finally:
        # Release pooled connections on the failure and empty-result paths too.
        if engine is not None:
            engine.dispose()

def get_user_top(user_id):
    """Load a user's top artists, tracks, genres and music features.

    Args:
        user_id: Passed through to each of the four query helpers.

    Returns:
        Tuple ``(df_a, df_t, df_g, df_m)`` holding the results of
        ``get_top_artists``, ``get_top_tracks``, ``get_top_genres`` and
        ``get_music_features``, in that order.

    Raises:
        Whatever a query helper raises; the engine is disposed first.
    """
    engine = create_engine(DATABASE_URL)
    try:
        df_a = get_top_artists(user_id, engine)
        df_t = get_top_tracks(user_id, engine)
        df_g = get_top_genres(user_id, engine)
        df_m = get_music_features(user_id, engine)
    finally:
        # Dispose even when a query fails so connections are not leaked.
        engine.dispose()
    return df_a, df_t, df_g, df_m

def get_top_artists(user_id, engine):
    """Run ``top_artists_query`` for ``user_id`` on ``engine`` and return the rows as a DataFrame."""
    query_params = {'user_id': user_id}
    return pd.read_sql_query(top_artists_query, engine, params=query_params)

def get_top_tracks(user_id, engine):
    """Run ``top_tracks_query`` for ``user_id`` on ``engine`` and return the rows as a DataFrame."""
    query_params = {'user_id': user_id}
    return pd.read_sql_query(top_tracks_query, engine, params=query_params)

def get_top_genres(user_id, engine):
    """Run ``top_genres_query`` for ``user_id`` on ``engine`` and return the rows as a DataFrame."""
    query_params = {'user_id': user_id}
    return pd.read_sql_query(top_genres_query, engine, params=query_params)

def get_music_features(user_id, engine):
    """Run ``music_features_query`` for ``user_id`` on ``engine`` and return the rows as a DataFrame."""
    query_params = {'user_id': user_id}
    return pd.read_sql_query(music_features_query, engine, params=query_params)

In [4]:
# Point at the local database, load all four "top" frames for one hard-coded
# user id, and preview that user's music-feature rows.
DATABASE_URL = localhost_db2
df_a, df_t, df_g, df_m = get_user_top('12120382831')
df_m.head()

Unnamed: 0,user_id,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,timeframe
0,12120382831,0.55864,0.55286,-7.4894,0.081956,0.329089,0.078903,0.155022,0.36188,119.1419,0
1,12120382831,0.51484,0.64952,-6.5105,0.06617,0.234219,0.038204,0.157544,0.397502,125.81876,1
2,12120382831,0.65416,0.6096,-6.82908,0.143112,0.235778,0.034798,0.163644,0.42572,126.49528,2


In [62]:
# Build {timeframe: feature-row dict} from df_m, converting the features listed
# in `is_percent` to whole-number percentages.
top_dict = {}
# Features that get multiplied by 100 and rounded below.
is_percent = ['danceability', 'energy', 'acousticness', 'instrumentalness', 'liveness', 'valence']

for i in range(3):
    # First row for timeframe i; the [0] raises IndexError if df_m has no such row.
    temp_dict = df_m.loc[df_m['timeframe'] == i].to_dict('records')[0]
    for p in is_percent:
        temp_dict[p] = round(temp_dict[p] *100)
    top_dict[i] = temp_dict

top_dict

{0: {'user_id': '12120382831',
  'danceability': 56,
  'energy': 55,
  'loudness': -7.489400000000001,
  'speechiness': 0.08195599999999999,
  'acousticness': 33,
  'instrumentalness': 8,
  'liveness': 16,
  'valence': 36,
  'tempo': 119.1419,
  'timeframe': 0},
 1: {'user_id': '12120382831',
  'danceability': 51,
  'energy': 65,
  'loudness': -6.5105,
  'speechiness': 0.06617,
  'acousticness': 23,
  'instrumentalness': 4,
  'liveness': 16,
  'valence': 40,
  'tempo': 125.81875999999998,
  'timeframe': 1},
 2: {'user_id': '12120382831',
  'danceability': 65,
  'energy': 61,
  'loudness': -6.82908,
  'speechiness': 0.14311199999999996,
  'acousticness': 24,
  'instrumentalness': 3,
  'liveness': 16,
  'valence': 43,
  'tempo': 126.49527999999998,
  'timeframe': 2}}

In [9]:
# One user's top tracks joined with track metadata ("Tracks") and audio features
# ("MusicFeatures"), ordered by timeframe and then rank. Assigning this name
# replaces any `top_tracks_query` already in the namespace.
top_tracks_query = """
SELECT tt.rank, tt.track_id, tt.timeframe, t.track, t.artists, t.album, t.album_image, t.release_date, t.track_url,
       mf.danceability, mf.energy, mf.loudness, mf.acousticness, mf.instrumentalness, mf.liveness, mf.valence, mf.tempo
FROM "TopTracks" tt
JOIN "Tracks" t
ON tt.track_id = t.track_id
JOIN "MusicFeatures" mf
ON tt.track_id = mf.track_id
WHERE tt.user_id = %(user_id)s
ORDER BY tt.timeframe, tt.rank
"""

# Run the query for one hard-coded user id and preview the result.
# The engine created here is not disposed in this cell.
engine = create_engine(DATABASE_URL)
df_t = pd.read_sql_query(top_tracks_query, engine, params={'user_id': '12120382831'})
df_t.head()

Unnamed: 0,rank,track_id,timeframe,track,artists,album,album_image,release_date,track_url,danceability,energy,loudness,acousticness,instrumentalness,liveness,valence,tempo
0,1,2Kerz9H9IejzeIpjhDJoYG,0,Love,Lana Del Rey,Lust For Life,https://i.scdn.co/image/ab67616d0000b27395e2fd...,2017-07-21,https://open.spotify.com/track/2Kerz9H9IejzeIp...,0.527,0.366,-10.943,0.487,0.0023,0.11,0.234,98.994
1,2,73jVPicY2G9YHmzgjk69ae,0,Robbers,The 1975,The 1975,https://i.scdn.co/image/ab67616d0000b27304f21e...,2013-01-01,https://open.spotify.com/track/73jVPicY2G9YHmz...,0.621,0.692,-6.858,0.000363,1.7e-05,0.335,0.381,99.806
2,3,2p8IUWQDrpjuFltbdgLOag,0,After Hours,The Weeknd,After Hours,https://i.scdn.co/image/ab67616d0000b2738863bc...,2020-03-20,https://open.spotify.com/track/2p8IUWQDrpjuFlt...,0.664,0.572,-6.099,0.0811,0.00604,0.121,0.143,108.959
3,4,6ilc4vQcwMPlvAHFfsTGng,0,Sweet,Cigarettes After Sex,Cigarettes After Sex,https://i.scdn.co/image/ab67616d0000b27394d280...,2017-06-09,https://open.spotify.com/track/6ilc4vQcwMPlvAH...,0.45,0.511,-9.073,0.353,0.766,0.139,0.115,96.563
4,5,6Vigp41BietH0WoFZ52JI5,0,All We Do,Oh Wonder,Oh Wonder,https://i.scdn.co/image/ab67616d0000b2737cc94e...,2015-09-04,https://open.spotify.com/track/6Vigp41BietH0Wo...,0.59,0.242,-11.724,0.978,0.000124,0.0906,0.366,126.721


In [51]:
# Timeframe code -> weight. Only the keys are used by get_most_features.
TF_WEIGHTS = {0: 3, 1: 2, 2: 1}

def get_most_features(df_t):
    """Find, per timeframe, the tracks with the lowest and highest value of each audio feature.

    Args:
        df_t: DataFrame with a ``timeframe`` column, the audio-feature columns
            listed in ``features`` below, and the descriptive columns listed
            in ``to_keep``.

    Returns:
        dict ``{timeframe: {feature: {'min': track, 'max': track}}}`` where
        ``track`` is a dict of the ``to_keep`` columns. Both entries are
        ``None`` when the timeframe has no rows, or only NaN values, for that
        feature (previously this raised).
    """
    to_keep = ['track_id', 'track', 'artists', 'album', 'album_image', 'track_url']
    features = ['danceability', 'energy', 'loudness', 'acousticness', 'instrumentalness',
                'liveness', 'valence', 'tempo']
    overall_dict = {}
    for tf in TF_WEIGHTS:
        df = df_t.loc[df_t['timeframe'] == tf]
        this_dict = {}
        for feat in features:
            # idxmin/idxmax raise on an empty or all-NaN series, so guard first.
            values = df[feat].dropna()
            if values.empty:
                this_dict[feat] = {'min': None, 'max': None}
                continue
            this_dict[feat] = {
                'min': df.loc[values.idxmin(), to_keep].to_dict(),
                'max': df.loc[values.idxmax(), to_keep].to_dict(),
            }
        overall_dict[tf] = this_dict
    return overall_dict

In [45]:
# Scratch check of the nested {feature: {'min': ..., 'max': ...}} layout,
# filled with placeholder values 0 and 1.
this_dict = {'danceability': {}, 'energy': {}, 'loudness': {}, 'acousticness': {}, 'instrumentalness': {}, \
                     'liveness': {}, 'valence': {}, 'tempo': {}}
for feat in this_dict.keys():
        this_dict[feat]['min'] = 0
        this_dict[feat]['max'] = 1
this_dict

{'danceability': {'min': 0, 'max': 1},
 'energy': {'min': 0, 'max': 1},
 'loudness': {'min': 0, 'max': 1},
 'acousticness': {'min': 0, 'max': 1},
 'instrumentalness': {'min': 0, 'max': 1},
 'liveness': {'min': 0, 'max': 1},
 'valence': {'min': 0, 'max': 1},
 'tempo': {'min': 0, 'max': 1}}

In [52]:
# Show the per-timeframe min/max track for every audio feature in df_t.
get_most_features(df_t)

{0: {'danceability': {'min': {'track_id': '2rtGaCAeYtmcIvuZsvgTf6',
    'track': 'How to Disappear Completely',
    'artists': 'Radiohead',
    'album': 'Kid A',
    'album_image': 'https://i.scdn.co/image/ab67616d0000b273674c2b8b77e1e9259a2fcb87',
    'track_url': 'https://open.spotify.com/track/2rtGaCAeYtmcIvuZsvgTf6'},
   'max': {'track_id': '7m9OqQk4RVRkw9JJdeAw96',
    'track': 'Jocelyn Flores',
    'artists': 'XXXTENTACION',
    'album': '17',
    'album_image': 'https://i.scdn.co/image/ab67616d0000b273203c89bd4391468eea4cc3f5',
    'track_url': 'https://open.spotify.com/track/7m9OqQk4RVRkw9JJdeAw96'}},
  'energy': {'min': {'track_id': '5GUYJTQap5F3RDQiCOJhrS',
    'track': 'Self Control',
    'artists': 'Frank Ocean',
    'album': 'Blonde',
    'album_image': 'https://i.scdn.co/image/ab67616d0000b273c5649add07ed3720be9d5526',
    'track_url': 'https://open.spotify.com/track/5GUYJTQap5F3RDQiCOJhrS'},
   'max': {'track_id': '0w2kfnU1PFKxjmZFQ1J1X8',
    'track': 'Overthinking',
  