## Dependencies

In [1]:
import ast
import itertools
import json
import os
import re 
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow
import spotipy
import spotipy.util as util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth

warnings.filterwarnings("ignore")

## 1. Data Exploration/Preparation

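Download the datasets from Kaggle (the cells below expect `data_o.csv` and `data_by_artist_o.csv` one directory above this notebook):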
https://www.kaggle.com/yamaerenay/spotify-dataset-19212020-160k-tracks

In [2]:
# Track table; later cells use its 'id', 'name', 'artists', 'release_date' and 'popularity' columns.
spotify_df = pd.read_csv(filepath_or_buffer='../data_o.csv')
# Per-artist table; later cells use its 'artists' and 'genres' columns.
data_w_genre = pd.read_csv(filepath_or_buffer='../data_by_artist_o.csv')

In [3]:
# Build 'genres_upd': each artist's genres as a list with spaces replaced by underscores,
# so that a multi-word genre stays a single token later on.
# Assumes 'genres' holds the text of a Python list literal (e.g. "['dance pop', 'pop']"),
# which the quote-matching regex this replaces already relied on -- TODO confirm on new data.
# The literal is parsed rather than regex-matched because a list containing a double-quoted
# entry such as ["children's music", 'kids'] made the old pattern capture text across
# entries and lose the remaining genres.
data_w_genre['genres_upd'] = data_w_genre['genres'].apply(
    lambda x: [genre.replace(' ', '_') for genre in ast.literal_eval(x)]
)

In [4]:
# Legacy quote-matching parses of the stringified artist list (single-quoted entries in v1,
# double-quoted entries in v2). They are kept unchanged because a later cell still selects
# 'artists_upd_v1'.
spotify_df['artists_upd_v1'] = spotify_df['artists'].apply(lambda x: re.findall(r"'([^']*)'", x))
spotify_df['artists_upd_v2'] = spotify_df['artists'].apply(lambda x: re.findall('\"(.*?)\"',x))

# Canonical artist list. The text is parsed as a Python list literal instead of choosing
# between the two regex results: for a list that mixes quote styles, e.g.
# ['A', "B's C", 'D'], the single-quote regex returned a non-empty but wrong result, so the
# double-quote fallback never ran and the artists could not be matched to their genres.
# Assumes 'artists' holds the text of a Python list literal -- TODO confirm on new data.
spotify_df['artists_upd'] = spotify_df['artists'].apply(ast.literal_eval)

# Custom song identifier (first artist + track name): the same song can appear several
# times under different Spotify ids, so 'id' alone does not identify a song.
spotify_df['artists_song'] = spotify_df.apply(lambda row: row['artists_upd'][0]+row['name'],axis = 1)

spotify_df.sort_values(['artists_song','release_date'], ascending = False, inplace = True)

# De-duplication on the custom identifier is currently disabled:
# spotify_df.drop_duplicates('artists_song',inplace = True)

# One row per (track id, artist) pair so each artist can be joined to the artist table.
artists_exploded = spotify_df[['artists_upd','id']].explode('artists_upd')

# Left join keeps artists without a match; those rows have a null 'genres_upd' and are
# filtered out on the next line.
artists_exploded_enriched = artists_exploded.merge(data_w_genre, how = 'left', left_on = 'artists_upd',right_on = 'artists')
artists_exploded_enriched_nonnull = artists_exploded_enriched[~artists_exploded_enriched.genres_upd.isnull()]

In [5]:
# Collect, per track id, the genre lists of every artist that matched the artist table.
artists_genres_consolidated = (
    artists_exploded_enriched_nonnull
    .groupby('id')['genres_upd']
    .apply(list)
    .reset_index()
)

# Flatten the list of lists and de-duplicate it into one genre list per track.
artists_genres_consolidated['consolidates_genre_lists'] = artists_genres_consolidated['genres_upd'].apply(
    lambda genre_lists: list(set(itertools.chain.from_iterable(genre_lists)))
)

# Attach the consolidated genres to the track table; tracks without any match get NaN here.
spotify_df = spotify_df.merge(
    right=artists_genres_consolidated[['id', 'consolidates_genre_lists']],
    how='left',
    on='id',
)

# Release year as a string: everything before the first '-' of 'release_date'.
spotify_df['year'] = spotify_df['release_date'].apply(lambda date_text: date_text.partition('-')[0])

# Names of all float64 columns; these are the ones scaled in create_feature_set.
float_cols = spotify_df.dtypes.loc[lambda kinds: kinds == 'float64'].index.values

ohe_cols = 'popularity'

# Bucket popularity into bins that are 5 points wide.
spotify_df['popularity_red'] = spotify_df['popularity'].apply(lambda score: int(score / 5))

# The TF-IDF step joins each genre list into a string, so replace every non-list value
# (the NaN left by the merge above) with an empty list.
spotify_df['consolidates_genre_lists'] = spotify_df['consolidates_genre_lists'].apply(
    lambda genres: [] if not isinstance(genres, list) else genres
)

In [6]:
#simple helper that builds one-hot encoded (OHE) features for a single column
#it is called by create_feature_set further down
def ohe_prep(df, column, new_name): 
    """ 
    Build one-hot encoded indicator columns for a single column of a dataframe.

    Parameters: 
        df (pandas dataframe): Spotify Dataframe
        column (str): name of the column whose distinct values become indicator columns
        new_name (str): prefix for the generated columns, which are named '<new_name>|<value>'
        
    Returns: 
        tf_df: dataframe of indicator columns with a fresh 0..n-1 index
    """
    # Positional index so the result lines up with the other feature blocks on concat.
    dummies = pd.get_dummies(df[column]).reset_index(drop=True)
    dummies.columns = ["|".join((new_name, str(level))) for level in dummies.columns]
    return dummies


In [7]:
#function to build entire feature set
def create_feature_set(df, float_cols, year_weight=0.5, popularity_weight=0.15, float_weight=0.2):
    """ 
    Process spotify df to create a final set of features that will be used to generate recommendations

    The result concatenates four blocks side by side: TF-IDF weights of the genre tokens
    (unweighted), min-max scaled float columns, one-hot popularity buckets and one-hot
    release years, each of the last three multiplied by its weight.

    Parameters: 
        df (pandas dataframe): Spotify Dataframe; must contain 'consolidates_genre_lists'
            (a list of genre strings per row), 'year', 'popularity_red', 'id' and `float_cols`
        float_cols (list(str)): List of float columns that will be scaled 
        year_weight (float): multiplier for the one-hot year columns (default 0.5)
        popularity_weight (float): multiplier for the one-hot popularity columns (default 0.15)
        float_weight (float): multiplier for the scaled float columns (default 0.2)
        
    Returns: 
        final: final set of features, one row per row of `df`, plus an 'id' column
    """
    
    #tfidf genre lists: each row's genres are joined into one space-separated document
    tfidf = TfidfVectorizer()
    tfidf_matrix =  tfidf.fit_transform(df['consolidates_genre_lists'].apply(lambda x: " ".join(x)))
    # get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is its
    # replacement (available since 1.0). Fall back to the old name on older installs.
    if hasattr(tfidf, 'get_feature_names_out'):
        genre_terms = tfidf.get_feature_names_out()
    else:
        genre_terms = tfidf.get_feature_names()
    genre_df = pd.DataFrame(tfidf_matrix.toarray())
    genre_df.columns = ['genre' + "|" + str(term) for term in genre_terms]
    genre_df.reset_index(drop = True, inplace=True)

    # one-hot blocks (an 'explicit' one-hot block was tried here and is disabled)
    year_ohe = ohe_prep(df, 'year','year') * year_weight
    popularity_ohe = ohe_prep(df, 'popularity_red','pop') * popularity_weight

    #scale float columns to [0, 1] before weighting
    floats = df[float_cols].reset_index(drop = True)
    scaler = MinMaxScaler()
    floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns = floats.columns) * float_weight

    #concatenate all features; every block has a fresh 0..n-1 index so rows align by position
    final = pd.concat([genre_df, floats_scaled, popularity_ohe, year_ohe], axis = 1)
     
    #add song id (.values so the ids are assigned by position, not by df's index labels)
    final['id']=df['id'].values
    
    return final

In [8]:
def remove_feat(s):
    """ 
    Cut a featured-artist credit off the end of a track title.

    Parameters: 
        s (str): track title, e.g. "Song (feat. Artist)"
        
    Returns: 
        str: `s` truncated one character before the first standalone "feat" / "featuring"
             word (so "Song feat. X" becomes "Song"); `s` unchanged when there is no such
             word or when nothing precedes it
    """
    # Match "feat"/"featuring" only as a whole word: a plain substring test also hit
    # titles such as "Defeated" and truncated them.
    marker = re.search(r"\bfeat(?:uring)?\b", s)
    # A marker at position 0 has no title in front of it; slicing with start - 1 == -1
    # would chop the last character instead, so leave the string alone.
    if marker is None or marker.start() == 0:
        return s
    return s[:marker.start() - 1]
# Overwrite the track names with their featured-artist credit removed.
spotify_df['name'] = spotify_df['name'].map(remove_feat)

In [9]:
# Spotify Web API client (client-credentials flow).
# The credentials are read from the environment instead of being written into the notebook;
# set SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before starting the kernel. A missing
# variable raises KeyError naming the variable.
# SECURITY: the key pair that used to be hard-coded in this cell has been exposed in source
# and should be rotated in the Spotify developer dashboard.
client_id = os.environ['SPOTIPY_CLIENT_ID']
client_secret = os.environ['SPOTIPY_CLIENT_SECRET']
auth_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(auth_manager=auth_manager)

In [15]:
# spotify_df = pd.read_feather("spotify_df.feather")
# spotify_df_low = pd.read_feather("spotify_df_low.feather")

# spotify_df_low.head()

# NOTE(review): this cell was executed as In [15], i.e. after the cell below that defines
# complete_feature_set_low; on a fresh top-to-bottom run the name is not defined yet here.
complete_feature_set_low.shape

(688, 377)

In [11]:
# Feature matrix for every track.
complete_feature_set = create_feature_set(df=spotify_df, float_cols=float_cols)

# Despite the "_low" suffix, this subset drops every row whose popularity is below 70.
spotify_df_low = spotify_df.drop(index=spotify_df.loc[spotify_df['popularity'] < 70].index)
complete_feature_set_low = create_feature_set(df=spotify_df_low, float_cols=float_cols)

In [12]:
# Look up each track through the Spotify client (one request per row) and store the URL of
# the second entry in its album's image list.
spotify_df_low['url'] = spotify_df_low['id'].map(
    lambda track_id: sp.track(track_id)['album']['images'][1]['url']
)
# Keep only the columns that are exported for the subset.
spotify_df_low = spotify_df_low.loc[:, ['artists','id','name','artists_upd_v1', 'artists_song', 'url']]

In [13]:
# Persist the prepared tables as feather files in the working directory.
spotify_df.to_feather(path='spotify_df.feather')
# The subset is written after reset_index(), which turns its index into an 'index' column.
spotify_df_low.reset_index().to_feather(path='spotify_df_low.feather')
complete_feature_set.to_feather(path='complete_feature_set.feather')

In [16]:
def compress_feature_set(complete_feature_set_low, top_n=100):
    """ 
    Collapse the per-genre columns of a feature set into per-word columns.

    Every column named 'genre|<w1>_<w2>_...' is split on '_' and its values are added to
    one column per word, so genres that share a word contribute to the same column. Only
    the `top_n` words that occur in the most genre column names are kept. Columns whose
    name does not start with 'genre|' are passed through unchanged.

    Parameters: 
        complete_feature_set_low (pandas dataframe): feature set whose genre columns are
            named 'genre|<token>' (as produced by create_feature_set)
        top_n (int or None): number of most frequent genre words to keep; None keeps every
            word (default 100)
        
    Returns: 
        cfs_final: the kept word columns, in order of first appearance, followed by the
            non-genre columns. A genre word equal to a non-genre column name would produce
            a duplicate column label.
    """
    # split the column names into genre columns and everything else
    genre_cols = [col for col in complete_feature_set_low.columns if col.split('|')[0] == 'genre']
    cfs_other = complete_feature_set_low.drop(genre_cols, axis=1)

    # accumulate one summed column per word; the dict keeps first-appearance order
    word_sums = {}
    word_occurrences = []
    for col in genre_cols:
        for word in col.split('|')[1].split('_'):
            word_occurrences.append(word)
            # explicit membership test instead of a bare except, so a genuine error in
            # the addition is raised rather than silently resetting the running total
            if word in word_sums:
                word_sums[word] = word_sums[word] + complete_feature_set_low[col]
            else:
                word_sums[word] = complete_feature_set_low[col]

    # build the frame once instead of inserting columns one at a time
    new_cfs = pd.DataFrame(word_sums)

    # drop the words that rank below top_n by number of occurrences in genre column names
    if top_n is not None and word_occurrences:
        ranked_words = list(pd.Series(word_occurrences).value_counts().index)
        new_cfs = new_cfs.drop(ranked_words[top_n:], axis=1)

    cfs_final = pd.concat([new_cfs, cfs_other], axis=1)
    return cfs_final

In [17]:
# Compress the genre columns of the popularity-filtered feature set and persist the result.
cfs_final = compress_feature_set(complete_feature_set_low=complete_feature_set_low)

cfs_final.to_feather(path='cfs_final.feather')

In [16]:
# spotify_df[spotify_df['id'] == '1i1fxkWeaMmKEB4T7zqbzK']['artists_upd_v1'].iloc[0]

In [17]:
# id_list= ['7qiZfU4dY1lWllzX7mPBI3','1i1fxkWeaMmKEB4T7zqbzK','0e7ipj03S05BNilyu5bRzt','0VjIjW4GlUZAMYd2vXMi3b','2Fxmhks0bxGSBdJ92vM42m','0TK2YIli7K1leLovkQiNik','3KkXRkHbMCARz0aVfEt68P','1rfofaqEpACxVEHIZBJe6W','0pqnGHJpmpxLKifKRmU6WP']

In [24]:
# spotify_df.sort_values("popularity",ascending=False)[['name','artists','popularity']].head(30)

In [19]:
# (rows, columns) of the compressed feature set returned by compress_feature_set.
cfs_final.shape

(688, 163)