# Goal of the Notebook

In this notebook, we query the Spotify API for the tracks of the cleaned 1k dataset. The data collected here (Spotify track IDs, artist genres and audio features) will be used for the content-based part.

In [1]:
import pandas as pd
import dask.dataframe as dd
import tarfile
from dask.delayed import delayed
import scipy
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt

from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
from surprise import accuracy

from sklearn.manifold import TSNE

from sklearn.decomposition import PCA

from sklearn.neighbors import NearestNeighbors

## Load Cleaned Data

In [2]:
# Read the cleaned track table produced upstream (path is relative to the notebook).
famous_tracks = pd.read_csv(filepath_or_buffer='data/famous_tracks.csv')

---
## Collecting spotify data from track ids

Set up the spotipy client

In [3]:
import os

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# SECURITY: API credentials must never be committed in a notebook. They are
# read from the environment instead; a missing variable fails fast with a
# KeyError naming it. Export both variables before starting the kernel:
#   SPOTIPY_CLIENT_ID, SPOTIPY_CLIENT_SECRET
# NOTE(review): the client id/secret previously hardcoded in this cell are
# exposed in the file's history and should be revoked in the Spotify dashboard.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=os.environ["SPOTIPY_CLIENT_ID"],
        client_secret=os.environ["SPOTIPY_CLIENT_SECRET"],
    )
)



### Function to apply to the whole dataset to get the Spotify track ID and artist ID, as well as the genres of the artist who made the song.

In [4]:
import re
import string
from unidecode import unidecode

In [5]:
def spotify_id_from_info(row, verbose=True, normalizer=None):
    """Look up a track on Spotify from its artist name and track name.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``"artist-name"`` and ``"track-name"``
        (for example a row handed over by ``DataFrame.apply(..., axis=1)``).
    verbose : bool, default True
        When true, print a message for every lookup that fails.
    normalizer : callable, optional
        If given, it is applied to both names before the search query is
        built. Errors raised by the normalizer itself are not caught.

    Returns
    -------
    tuple
        ``(track_id, artist_id, genres)`` taken from the first search hit,
        where ``genres`` is a ``set`` built from the genre list of the
        track's first artist. ``(None, None, None)`` if the lookup fails for
        any reason (no hit, unexpected response shape, client error).
    """
    artist_name = row["artist-name"]
    track_name = row["track-name"]

    if normalizer:
        artist_name = normalizer(artist_name)
        track_name = normalizer(track_name)

    search_query = f'artist:{artist_name} track:{track_name}'

    try:
        results = sp.search(search_query, limit=1)

        # An empty result list raises IndexError here and is reported below
        # like every other failed lookup.
        first_hit = results['tracks']['items'][0]
        track_id = first_hit['id']
        artist_id = first_hit['artists'][0]['id']

        genres = set(sp.artist(artist_id)['genres'])
    except Exception:
        # Deliberately best-effort: one failed lookup must not abort a run
        # over the whole dataset. `Exception` (rather than a bare `except`)
        # keeps KeyboardInterrupt/SystemExit propagating, so a long-running
        # `apply` can still be stopped from the notebook.
        if verbose:
            print(f'Could not find information for "{search_query}"')
        return None, None, None

    return track_id, artist_id, genres

#### Applying it to recover the spotify ids

In [6]:
# get ids without normalizer
# spotify_ids  = famous_tracks.reset_index().apply(spotify_id_from_info,axis=1)

# add back spotify ids to our new dfs
# spot_df = pd.DataFrame(list(spotify_ids), columns=['spotify-track-id', 'spotify-artist-id', 'genres'])

# set back values
# famous_tracks[['spotify-track-id', 'spotify-artist-id', 'genres']]     = spot_df.values

# test_df = famous_tracks

#### Applying it once again, this time normalizing the names (transliterating non-ASCII characters, stripping parenthesised text and punctuation) to recover some more Spotify IDs

In [7]:
# na_df = test_df[test_df['spotify-artist-id'].isna()]

# translator = str.maketrans('', '', string.punctuation)
# normalizer = lambda s : unidecode(re.sub(r'\(([^\)]+)\)', '', s)).translate(translator).strip()

# spotify_id_normalizer = lambda x : spotify_id_from_info(x,normalizer=normalizer)

# spotify_ids_normalized = na_df.reset_index().apply(spotify_id_normalizer, axis=1)

# spot_norm_df = pd.DataFrame(list(spotify_ids_normalized), columns=['spotify-track-id', 'spotify-artist-id', 'genres'])

# na_df[['spotify-track-id', 'spotify-artist-id', 'genres']] = spot_norm_df.values

# test_df = test_df[['artist-name', 'plays', 'track-name', 'musicbrainz-artist-id', 'spotify-id', 'spotify-artist-id', 'genres']]

#### Merging back the lists of spotify ids recovered

In [8]:
# idreducer = lambda x : x['spotify-id'] if x['spotify-id'] else x['spotify-idnorm']
# artistreducer = lambda x : x['spotify-artist-id'] if x['spotify-artist-id'] else x['spotify-artist-idnorm']
# genrereducer = lambda x : x['genres'] if x['genres'] else x['genresnorm']

# test_df['spotify-id']        = test_df.join(na_df, rsuffix="norm")[['spotify-id', 'spotify-idnorm']].apply(idreducer, axis=1)
# test_df['spotify-artist-id'] = test_df.join(na_df, rsuffix="norm")[['spotify-artist-id', 'spotify-artist-idnorm']].apply(artistreducer, axis=1)
# test_df['genres']            = test_df.join(na_df, rsuffix="norm")[['genres', 'genresnorm']].apply(genrereducer, axis=1)

# test_df = test_df[['artist-name', 'plays', 'track-name', 'musicbrainz-artist-id', 'spotify-id', 'spotify-artist-id', 'genres']]
# test_df = test_df.dropna()

### Obtaining the Spotify audio features from the id and joining back into the dataframe

In [9]:
# features = test_df['spotify-id'].apply(lambda x : sp.audio_features([x]))

# features_df = pd.DataFrame([x[0] for x in features.values])
# features_df = features_df.set_index(features.index)

# joint_df = features_df.join(test_df)

#### Saving Audio Features for later

In [10]:
#joint_df.to_pickle('data/features.pkl')

In [11]:
#joint_df.to_csv('data/features.csv')

#### Recovering genres per track, and saving it as well

In [12]:
# test = set()
# def f(x):
#     global test
#     test = test.union(x)
#     return x
# recovered.genres.apply(f)

In [13]:
# recovered = pd.read_pickle('data/features.pkl')
# recovered.head()

## Create User/Track DataSet

In [14]:
# Filter out non-famous tracks
# df_1kfamous  = df_1k.dropna()[df_1k.dropna()['musicbrainz-track-id'].isin(list(famous_tracks.index))]
#Filter out tracks not in spotify result
# df_1kfamous = df_1kfamous[df_1kfamous['musicbrainz-track-id'].isin(list(recovered.index))]
# df_1kfamous = df_1kfamous.groupby(['userid', 'musicbrainz-artist-id','musicbrainz-track-id']).agg((lambda x : x.iloc[0], lambda x : len(x)))
# df_1kfamous.columns = ['timestamp','plays','artist-name','tnorm1', 'track-name','tnorm2']
# df_1kfamous = df_1kfamous.reset_index().drop(['tnorm1', 'tnorm2'],axis=1)
# df_1kfamous.rename(columns = {'userid':'user-id','musicbrainz-track-id':'track-id'}, inplace = True)
# df_1kfamous

In [15]:
#df_1kfamous.to_csv('data/df_1ktfamous.csv')