In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datetime import datetime, timezone

"Libraries to help with jupyter notebook usage"
# Increases jupyter notebook display width
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

from IPython.display import Image # Helps display images in notebook

In [2]:
import sys
"Appends both parent and grandparent dir to current path, to allow importing"
sys.path.append('..')
sys.path.append('../..')

from jupyternotebook_utils import *
from utils import *
from data_processing import *
from spotify_data import *
from spotify_utils import *

In [3]:
spotify_scopes = "playlist-read-private playlist-modify-private playlist-modify-public user-read-recently-played user-top-read user-library-modify"
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id=client_id,
                                               client_secret=client_secret,
                                               redirect_uri="http://127.0.0.1:8000/callback",
                                               scope=spotify_scopes))

selected_columns = ['danceability', 'energy', 'key', 'loudness', 'speechiness',
                   'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                   'duration_ms', 'date_added', 'date_released']

In [4]:
user_playlists = get_user_playlists(sp)

user_playlists

[{'playlist_name': 'COLDPLAY 2024',
  'playlist_uri': 'spotify:playlist:6GmE6FZnc31SRHEfS2tNii',
  'playlist_href': 'https://open.spotify.com/playlist/6GmE6FZnc31SRHEfS2tNii',
  'playlist_track_count': 1},
 {'playlist_name': '//lo-fi beats 🎶🌕',
  'playlist_uri': 'spotify:playlist:3p4ZD7MTwrL4OaiEN1dFfu',
  'playlist_href': 'https://open.spotify.com/playlist/3p4ZD7MTwrL4OaiEN1dFfu',
  'playlist_track_count': 425},
 {'playlist_name': "//vibin' 🎺",
  'playlist_uri': 'spotify:playlist:4YB4UFkYYJhy3iNaHZoGXa',
  'playlist_href': 'https://open.spotify.com/playlist/4YB4UFkYYJhy3iNaHZoGXa',
  'playlist_track_count': 91},
 {'playlist_name': '//productive mario-ing',
  'playlist_uri': 'spotify:playlist:51yNcvKUbFtC6WwJi4KHH5',
  'playlist_href': 'https://open.spotify.com/playlist/51yNcvKUbFtC6WwJi4KHH5',
  'playlist_track_count': 1306},
 {'playlist_name': '//du zi da zi 🏋🏻',
  'playlist_uri': 'spotify:playlist:1f2neve1uJPOCfkahOkdEf',
  'playlist_href': 'https://open.spotify.com/playlist/1f2neve

In [5]:
target_playlist = 'spotify:playlist:1f2neve1uJPOCfkahOkdEf'
        
playlist_audio_features = get_user_playlist_audio_features(sp, target_playlist)

for playlist in user_playlists:
    if playlist['playlist_uri'] == target_playlist:
        target_playlist_name = playlist['playlist_name']
        status = f'Playlist "{target_playlist_name}" audio features succesfully retrieved.'
        print(status)

Playlist "//windin' down  💨🍃" audio features succesfully retrieved.


In [6]:
playlist_details = get_playlist_tracks(sp, target_playlist)['tracks']

status = f'Playlist "{target_playlist_name}" details succesfully retrieved.'
print(status)

Playlist "//windin' down  💨🍃" details succesfully retrieved.


In [7]:
combined_array = []
for audio_feature_entry in playlist_audio_features:
    for track_info_entry in playlist_details:
        if audio_feature_entry['id'] == track_info_entry['track_idx']:
            combined_entry = {**audio_feature_entry, **track_info_entry}
            combined_array.append(combined_entry)
            break

status = f'Combined array containing details and audio features of "{target_playlist_name}" succesfully created.'
print(status)

Combined array containing details and audio features of "//windin' down  💨🍃" succesfully created.


In [8]:
df = pd.DataFrame(combined_array)
df = df[selected_columns]

before_df = df.copy()
after_df = df.copy()

status = f'Multiple copies of df succesfully created.'
print(status)

Multiple copies of df succesfully created.


In [9]:
# Step 1: Min-max normalization for selected features
input_data_ranges = {
    'loudness': (-60, 0),
    'tempo': (0, 250),
    'key': (-1, 11)
}
for feature, (feature_min, feature_max) in input_data_ranges.items():
    after_df[feature] = (after_df[feature] - feature_min) / (feature_max - feature_min)

# Step 2: Keep selected audio features
selected_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'liveness', 'valence', 'tempo', 'date_added']
filtered_data = after_df[selected_features].copy()

# Step 3: Cube root transformation for selected features
cubed_features = ['speechiness', 'acousticness', 'liveness']
for feature in cubed_features:
    filtered_data[feature] = np.cbrt(filtered_data[feature])

# Step 4: Logarithmic transformation for selected features
log_features = ['energy', 'loudness']
for feature in log_features:
    filtered_data[feature] = np.log1p(filtered_data[feature])

# Step 5: IQR-filtering based on thresholds
lower_thresholds =  {'danceability': 0.16850000000000004, 'energy': 0.14936944969853527, 'loudness': 0.5734535346157168, 
                     'speechiness': 0.11183139278235099, 'acousticness': -0.35285124201451823, 'liveness': 0.20276263879283324, 
                     'valence': -0.24249999999999988, 'tempo': 0.16009000000000007}
upper_thresholds = {'danceability': 1.0525, 'energy': 0.8605850164229211, 'loudness': 0.698995265917933, 
                    'speechiness': 0.6924946680943335, 'acousticness': 1.280417950271011, 'liveness': 0.8754111662234604, 
                    'valence': 1.2494999999999998, 'tempo': 0.7998179999999999}

for feature in selected_features:
    if feature != 'date_added':
        filtered_data[feature] = np.where(
            (filtered_data[feature] >= lower_thresholds[feature]) &
            (filtered_data[feature] <= upper_thresholds[feature]),
            filtered_data[feature],
            np.nan
        )

# Step 6: Min-max Normalization
feature_min = {'danceability': 0.169, 'energy': 0.1501426584297195, 'loudness': 0.5734623376139655, 
               'speechiness': 0.27977873676275317, 'acousticness': 0.0, 'liveness': 0.21074564860592623, 
               'valence': 0.0, 'tempo': 0.164644}

feature_max = {'danceability': 0.991, 'energy': 0.6931471805599453, 'loudness': 0.6983917371326527, 
               'speechiness': 0.692435557262704, 'acousticness': 0.9986648849277057, 'liveness': 0.8750340122833274, 
               'valence': 1.0, 'tempo': 0.799812}

for feature in selected_features:
    if feature in feature_min:
        filtered_data[feature] = (filtered_data[feature] - feature_min[feature]) / (feature_max[feature] - feature_min[feature])
    
# Step 7: Timestamp Normalization
filtered_data['date_added'] = pd.to_datetime(filtered_data['date_added']).dt.tz_localize(None)  # Convert to tz-naive
current_date = datetime.datetime.now()

# Calculate number of days since date_added
filtered_data['days_since_added'] = (current_date - filtered_data['date_added']).dt.days

multipliers = filtered_data['days_since_added'].apply(
    lambda days: 1 if days <= 180 else (1 if days <= 365 else (1 if days <= 1095 else 1))
#     lambda days: 1 if days <= 180 else (0.75 if days <= 365 else (0.5 if days <= 1095 else 0.25))
)

for feature in selected_features:
    if feature != 'date_added':
        filtered_data[feature] *= multipliers
    
# Step 8: Calculate playlist vector
columns_to_drop = ['date_added', 'days_since_added']

filtered_data = filtered_data.drop(columns=columns_to_drop)
playlist_vector = filtered_data.mean()
playlist_vector = (playlist_vector / playlist_vector.sum()).round(4)

playlist_vector = playlist_vector.sort_values(ascending=False)

columns_to_drop = ['date_added']
# Printing Playlist Vector and Playlist Audio Features
average_before = before_df[selected_features].drop(columns=columns_to_drop).mean()
playlist_audio_features_array = average_before.to_numpy().round(4)

print("Playlist Audio Features:")
print(average_before)
print()
print([int(value * 10000) / 10000 for value in playlist_audio_features_array])
print()

average_after = filtered_data.mean()
playlist_audio_features_array = average_after.to_numpy().round(4)

print("Normalized Playlist Audio Features:")
print(average_before)
print()
print([int(value * 10000) / 10000 for value in playlist_audio_features_array])
print()

print("Playlist Vector:")
print(playlist_vector)

Playlist Audio Features:
danceability      0.506853
energy            0.276676
loudness        -12.927265
speechiness       0.066879
acousticness      0.863147
liveness          0.167350
valence           0.287471
tempo           110.181353
dtype: float64

[0.5069, 0.2767, -12.9273, 0.0669, 0.8631, 0.1674, 0.2875, 110.1814]

Normalized Playlist Audio Features:
danceability      0.506853
energy            0.276676
loudness        -12.927265
speechiness       0.066879
acousticness      0.863147
liveness          0.167350
valence           0.287471
tempo           110.181353
dtype: float64

[0.411, 0.1985, 0.2127, 0.1885, 0.9513, 0.465, 0.2875, 0.4347]

Playlist Vector:
acousticness    0.3021
liveness        0.1477
tempo           0.1380
danceability    0.1305
valence         0.0913
loudness        0.0675
energy          0.0630
speechiness     0.0599
dtype: float64


In [10]:
track_uri = 'spotify:track:1LeSp4o3CeNhJz3kCWgi6E'
# play_track_in_browser(track_uri)