In [10]:
import pandas as pd

import re
import json
from tqdm import tqdm

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [3]:
# Read the raw track listing from disk into `df` and preview its first rows.
path = './data/raw_data.csv'
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name,name
0,0,0,AronChupa,spotify:track:66U0ASk1VHZsqIkpMjKX3B,spotify:artist:5vCOdeiQt9LyzdI87kt5Sh,Little Swing,spotify:album:4S5MLjwRSi0NJ5nikflYnZ,163809,Little Swing,Party
1,1,1,AronChupa,spotify:track:5MhsZlmKJG6X5kTHkdwC4B,spotify:artist:5vCOdeiQt9LyzdI87kt5Sh,I'm an Albatraoz,spotify:album:1qHVYbxQ6IS8YRviorKDJI,166848,I'm an Albatraoz,Party
2,2,2,Lorde,spotify:track:0GZoB8h0kqXn7XFm4Sj06k,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,Yellow Flicker Beat - From The Hunger Games: M...,spotify:album:4UEPxQx0cTcYNsE0n32MHV,232506,Yellow Flicker Beat,Party
3,3,3,Lorde,spotify:track:35kahykNu00FPysz3C2euR,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,White Teeth Teens,spotify:album:0rmhjUgoVa17LZuS8xWQ3v,216600,Pure Heroine,Party
4,4,4,Lorde,spotify:track:3G6hD9B2ZHOsgf4WfNu7X1,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,Team,spotify:album:0rmhjUgoVa17LZuS8xWQ3v,193058,Pure Heroine,Party


In [6]:
# Reduce every track_uri to its bare id: keep only the trailing run of
# word characters (e.g. "spotify:track:<id>" becomes "<id>").
def _trailing_word(text):
    """Return the last word-like sequence of characters at the end of `text`."""
    return re.findall(r'\w+$', text)[0]


df['track_uri'] = df['track_uri'].apply(_trailing_word)
df['track_uri']

0         66U0ASk1VHZsqIkpMjKX3B
1         5MhsZlmKJG6X5kTHkdwC4B
2         0GZoB8h0kqXn7XFm4Sj06k
3         35kahykNu00FPysz3C2euR
4         3G6hD9B2ZHOsgf4WfNu7X1
                   ...          
280995    38griAVM808crjbFp9gcPD
280996    1JClFT74TYSXlzpagbmj0S
280997    4InLm5a9Qtkru6YxEjM4Qc
280998    4hdog9vyyqG9pcppG2Izek
280999    0NiXXAI876aGImAd6rTj8w
Name: track_uri, Length: 281000, dtype: object

In [11]:
# Read the Spotify API credentials from a local JSON file.
# If the file is missing, malformed, or lacks a required key, report the
# problem and re-raise. Carrying on would either fail further down with an
# unrelated NameError or, on a kernel that already ran this cell, silently
# reuse credentials left over from an earlier run.
try:
    with open('config.json', 'r') as file:
        config_data = json.load(file)
    client_id = config_data['client_id']
    client_secret = config_data['client_secret']
except FileNotFoundError:
    print('Config file not found.')
    raise
except json.JSONDecodeError:
    print('Error decoding JSON in the config file.')
    raise
except KeyError as missing_key:
    print(f'Config file is missing required key: {missing_key}')
    raise

# authentication without user
client_credentials_manager = SpotifyClientCredentials(
    client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [17]:
def extract_track_features(uri):
    """Collect audio features plus popularity/genre metadata for one track.

    Uses the module-level Spotify client ``sp``. Each remote resource is
    requested exactly once: the audio features, the track, and the track's
    first-listed artist.

    Parameters
    ----------
    uri : str
        Track identifier, passed unchanged to ``sp.audio_features`` and
        ``sp.track``.

    Returns
    -------
    dict
        The audio-features dict returned by the client, extended in place
        with three extra keys:

        * ``artist_popularity`` - popularity of the first-listed artist
        * ``genres`` - that artist's genres joined by single spaces, with
          spaces inside a genre name replaced by underscores, or
          ``'unknown'`` when the artist has no genres
        * ``track_popularity`` - popularity of the track itself

    Raises
    ------
    TypeError
        If the client returns ``None`` instead of a feature dict for the
        track (there is nothing to extend in that case).
    """
    # audio features
    features = sp.audio_features(uri)[0]
    if features is None:
        # Without this guard the failure would surface below as an opaque
        # "'NoneType' object does not support item assignment".
        raise TypeError(f'No audio features returned for track {uri!r}')

    # Fetch the track and its first-listed artist once each and reuse the
    # responses, instead of issuing the same request twice per resource.
    track = sp.track(uri)
    artist = sp.artist(track['artists'][0]['id'])
    artist_genres = artist['genres']

    # add extra features
    features['artist_popularity'] = artist['popularity']
    if artist_genres:
        # replace space with underscore so each genre stays a single token
        features['genres'] = " ".join(
            genre.replace(' ', '_') for genre in artist_genres)
    else:
        features['genres'] = 'unknown'
    features['track_popularity'] = track['popularity']

    return features

In [16]:
# Only the first 30,000 unique track ids are processed, split into three
# batches of 10,000 each (a deliberately cautious limit).
# The unique ids are computed once and sliced, rather than re-scanning the
# whole column for every batch.
unique_track_uris = df['track_uri'].unique()
first_part = unique_track_uris[:10000]
second_part = unique_track_uris[10000:20000]
third_part = unique_track_uris[20000:30000]

# accumulator for the extracted per-track features
features = []