In [1]:
import os
import time
import pickle
import pandas as pd
import numpy as np

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util

from matplotlib import pyplot as plt
import seaborn as sns
sns.set()

import random
random.seed(1333)

from dotenv import load_dotenv
load_dotenv()


True

## authenticating the spotify client
You can get your client ID and secret from http://developers.spotify.com/

In [2]:
# Credentials are read from environment variables so no secret lives in the notebook.
cid = os.environ["SPOTIFY_CID"]
secret = os.environ["SPOTIFY_SECRET"]
username = os.environ["SPOTIFY_USERNAME"]
scope = 'user-library-read playlist-modify-public playlist-read-private user-read-currently-playing user-read-recently-played user-top-read'
redirect_uri = 'http://localhost:8000/connect'

# Run the user-authorization flow and obtain an access token for `username`.
token = util.prompt_for_user_token(username, scope, cid, secret, redirect_uri)

# prompt_for_user_token may return None when authorization does not succeed;
# fail here instead of building a client that only errors on its first request.
if not token:
    raise RuntimeError("Spotify authorization failed: no access token was returned")

sp = spotipy.Spotify(auth=token)

## trying out endpoints

### top tracks 

In [3]:
# Ask Spotify for the current user's top 20 tracks over the medium-term window.
top_tracks_data = sp.current_user_top_tracks(limit=20, offset=0, time_range='medium_term')

# Build the frame once from a list of records: DataFrame.append was removed in
# pandas 2.0 and copied the whole frame on every iteration.
top_tracks_df = pd.DataFrame(
    [
        {"track_id": item["id"], "track_name": item["name"]}
        for item in top_tracks_data["items"]
    ],
    columns=["track_id", "track_name"],
)

top_tracks_df.head(5)

Unnamed: 0,track_id,track_name
0,3fnqNxjN7o0tJe7zOQZV68,Catastrophist
1,7kF76Dlhew7jaUxaBIZpIQ,Nomad
2,3kBD2xHIqKWXjLAGidDTSz,Benz Truck (гелик)
3,3mpZzg6fdRM1uUlkLZafVH,Cross Off
4,5DJTYOAHZEKjNqXpOwnomi,None Shall Pass


### Audio features

In [4]:
# Fetch the audio features of one track and load the response into a frame.
track = pd.DataFrame(sp.audio_features("1Mxhti2c2uN1hhjN9kxLqY"))

# Keep only the feature columns; identifiers and URLs are removed.
track_features = track.drop(
    columns=["type", "id", "uri", "track_href", "analysis_url"]
)

track_features

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0.392,0.95,8,-3.52,0,0.0922,9.5e-05,0.000252,0.188,0.631,172.963,206373,4


## Getting playlists for a variety of genres

In [23]:
# Subgenre names taken from http://everynoise.com/everynoise1d.cgi?scope=all
# Maps each top-level genre to its subgenres.
generes = {
    'pop': [
        'pop', 'dance pop', 'post-teen pop', 'electropop', 'tropical house',
    ],
    'hip hop': [
        'hip hop', 'rap', 'gangster rap', 'southern hip hop', 'hardcore hip hop',
        'conscious hip hop', 'east coast hip hop', 'alternative hip hop',
    ],
    'rock': [
        'rock', 'classic rock', 'permanent wave', 'hard rock', 'modern rock',
        'alternative rock',
    ],
    'edm': [
        'edm', 'electro house', 'big room', 'pop edm', 'progressive electro house',
    ],
    'latin': [
        'tropical', 'latin pop', 'reggaeton', 'latin hip hop',
    ],
    'metal': [
        'metal', 'speed metal', 'hard rock', 'death metal', 'thrash metal',
        'power metal',
    ],
}

# Flatten to (genre, subgenre) pairs, keeping only the first three subgenres
# listed for each genre.
genres_subgenres = [
    (genre, subgenre)
    for genre, subgenres in generes.items()
    for subgenre in subgenres[:3]
]
genres_subgenres

[('pop', 'pop'),
 ('pop', 'dance pop'),
 ('pop', 'post-teen pop'),
 ('hip hop', 'hip hop'),
 ('hip hop', 'rap'),
 ('hip hop', 'gangster rap'),
 ('rock', 'rock'),
 ('rock', 'classic rock'),
 ('rock', 'permanent wave'),
 ('edm', 'edm'),
 ('edm', 'electro house'),
 ('edm', 'big room'),
 ('latin', 'tropical'),
 ('latin', 'latin pop'),
 ('latin', 'reggaeton'),
 ('metal', 'metal'),
 ('metal', 'speed metal'),
 ('metal', 'hard rock')]

In [24]:
def build_playlists():
    """Search Spotify for playlists matching every (genre, subgenre) pair.

    Uses the notebook-level ``genres_subgenres`` list and the authenticated
    client ``sp``. For each pair, the first two playlist search results for
    the subgenre (market ``"US"``) are recorded.

    Returns:
        pandas.DataFrame with the columns ``playlist_id``, ``playlist_name``,
        ``playlist_genre`` and ``playlist_subgenre``. A playlist returned for
        several subgenres is kept only once (first occurrence). The frame is
        empty, but still has these columns, when nothing was found.
    """
    columns = ["playlist_id", "playlist_name", "playlist_genre", "playlist_subgenre"]

    # Collect plain records and build the frame once instead of concatenating
    # a growing DataFrame inside the loop.
    records = []
    for genre, subgenre in genres_subgenres:
        response = sp.search(subgenre, limit=2, offset=0, type='playlist', market="US")

        for item in response["playlists"]["items"]:
            # Result lists can contain null placeholders; skip them rather
            # than fail on item["id"].
            if not item:
                continue
            records.append({
                "playlist_id": item["id"],
                "playlist_name": item["name"],
                "playlist_genre": genre,
                "playlist_subgenre": subgenre,
            })

    playlists = pd.DataFrame(records, columns=columns)
    return playlists.drop_duplicates(subset=["playlist_id"])

# Building dataframes from tracks, artists, and features

In [25]:
def build_dataframes(playlists):
    """Collect tracks, artists and audio features for every playlist.

    Uses the notebook-level authenticated client ``sp``. Each playlist is
    requested once with ``sp.playlist_tracks(playlist_id)``; only the items in
    that single response are processed (no further pages are requested).

    Args:
        playlists: DataFrame with a ``playlist_id`` column.

    Returns:
        A ``(tracks, artists, features)`` tuple of DataFrames:
        * tracks   - one row per playlist item: ``playlist_id``, ``track_id``,
                     ``track_name``, ``track_popularity``.
        * artists  - one row per (track, artist): ``track_id``, ``artist_id``,
                     ``artist_name``.
        * features - rows returned by ``sp.audio_features`` for the tracks of
                     each playlist, concatenated (a track present in several
                     playlists therefore appears more than once).
    """
    import warnings  # local import keeps this cell self-contained

    # Accumulate plain records and build each frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    track_rows = []
    artist_rows = []
    feature_frames = []

    for playlist_id in playlists["playlist_id"]:
        # One request serves both the track and the artist tables.
        items = sp.playlist_tracks(playlist_id)["items"]

        playlist_track_ids = []
        for item in items:
            track = item.get("track") if item else None
            # Items without track data cannot be described; skip them.
            if not track:
                continue

            track_id = track["id"]
            track_rows.append({
                "playlist_id": playlist_id,
                "track_id": track_id,
                "track_name": track["name"],
                "track_popularity": track["popularity"],
            })

            for artist in track["artists"]:
                artist_rows.append({
                    "track_id": track_id,
                    "artist_id": artist["id"],
                    "artist_name": artist["name"],
                })

            # Only tracks with an id can be looked up for audio features.
            if track_id is not None and track_id not in playlist_track_ids:
                playlist_track_ids.append(track_id)

        if not playlist_track_ids:
            continue

        # Best effort: a failing feature lookup must not abort the whole run,
        # but it should be visible (and must not swallow KeyboardInterrupt).
        try:
            audio = sp.audio_features(playlist_track_ids)
        except Exception as exc:
            warnings.warn(
                f"Could not fetch audio features for playlist {playlist_id}: {exc}"
            )
            continue

        # Drop empty entries so that one track without features does not
        # discard the features of every other track in the playlist.
        feature_rows = [row for row in (audio or []) if row]
        if feature_rows:
            feature_frames.append(pd.DataFrame(feature_rows))

    tracks = pd.DataFrame(
        track_rows,
        columns=["playlist_id", "track_id", "track_name", "track_popularity"],
    )
    artists = pd.DataFrame(
        artist_rows,
        columns=["track_id", "artist_id", "artist_name"],
    )
    features = pd.concat(feature_frames) if feature_frames else pd.DataFrame()

    return tracks, artists, features


## Merging our dataframes on track_id

In [26]:
def merge_dataframes(tracks,playlists,features):
    """Join tracks with their playlist metadata and audio features.

    Args:
        tracks: DataFrame with at least ``playlist_id`` and ``track_id``.
        playlists: DataFrame keyed by ``playlist_id``.
        features: DataFrame of audio features keyed by ``id`` (the track id);
            may be empty.

    Returns:
        A new DataFrame with one row per row of ``tracks`` (given unique
        ``playlist_id`` values in ``playlists``), carrying the playlist
        columns and the feature columns. The identifier/URL columns ``id``,
        ``uri``, ``track_href``, ``analysis_url`` and ``type`` are removed.
    """
    songs = tracks.merge(playlists, how="left", on="playlist_id")

    # An empty features frame has no "id" column to join on; skip the join.
    if "id" in features.columns:
        # A track found in several playlists has several identical feature
        # rows; without de-duplication the left join multiplies song rows.
        unique_features = features.drop_duplicates(subset="id")
        songs = songs.merge(unique_features, how="left", left_on="track_id", right_on="id")

    # errors="ignore" covers the case where the feature columns are absent.
    return songs.drop(
        columns=["id", "uri", "track_href", "analysis_url", "type"],
        errors="ignore",
    )

# Building everything, or loading from pickle/mongo

In [27]:
# Rebuild the dataset from the Spotify API only when no cached "songs.pkl"
# exists in the current working directory; otherwise reuse the cached frame.
# NOTE: unpickling runs arbitrary code, so only load a cache file you created yourself.
if "songs.pkl" not in os.listdir():
    playlists = build_playlists()
    tracks, artists, features = build_dataframes(playlists)
    songs = merge_dataframes(tracks, playlists, features)

    # Persist the result so later runs can skip the API calls.
    with open("songs.pkl", "wb") as cache_file:
        pickle.dump(songs, cache_file)
else:
    with open("songs.pkl", "rb") as cache_file:
        songs = pickle.load(cache_file)


songs.sample(3)

Unnamed: 0,playlist_id,track_id,track_name,track_popularity,playlist_name,playlist_genre,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
259,37i9dQZF1DWZQaaqNMbbXa,0ZXdzaT1k688dkpNeEgQiV,I Don't Know Why,77,Dance Pop,pop,dance pop,0.499,0.898,8.0,-4.181,0.0,0.124,0.124,7e-06,0.0832,0.618,120.04,207668.0,4.0
595,37i9dQZF1DWY6tYEFs22tT,4Hpib09wXgD84w4uwTPnYD,Tap In,82,Hip-Hop Central,hip hop,hip hop,0.954,0.696,10.0,-7.117,0.0,0.204,0.00576,0.000498,0.0754,0.432,100.036,139413.0,4.0
624,2ZuNK8V8pJMnBRiNxBcGN3,0c7ec19W1r6eqpqOsbOjkt,Just A Lil Bit,0,Crusing hip-hop,hip hop,hip hop,0.697,0.626,10.0,-8.957,0.0,0.338,0.041,0.0044,0.333,0.505,96.588,237707.0,4.0


## I used MongoDB since I needed to query this data as the user taste

In [28]:
import pymongo

# Connect to the local MongoDB instance and select the target collection.
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["spotify"]
collection = db["user_taste"]

# One document holding every column of `songs` as a list, tagged with a
# "type" marker and the time of this update.
features_dict = {
    **songs.to_dict("list"),
    "type": "all",
    "last_update": time.time(),
}

# Upsert: replace the fields of the existing "all" document, or create it.
x = collection.update_one(
    {"type": "all"},
    {"$set": features_dict},
    upsert=True,
)

