In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances_argmin_min
from yellowbrick.cluster import KElbowVisualizer

In [None]:
# Find data on the internet about currently popular songs
# Billboard weekly Top 100 of "hot" songs

def get_billboard_top100():
    """Scrape the current Billboard Hot 100 chart into a DataFrame.

    Returns:
        pandas.DataFrame: one row per chart entry with the columns 'Song'
        and 'Artist'. 'Artist' is None for an entry that has a title but
        no <span> element.

    Raises:
        requests.HTTPError: if Billboard answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within the timeout.
    """
    # Fetch the chart page. A timeout keeps the notebook from hanging forever,
    # and raise_for_status() fails loudly instead of silently parsing an
    # error page into an empty chart.
    r = requests.get('https://www.billboard.com/charts/hot-100', timeout=10)
    r.raise_for_status()

    # Parsing the html code
    soup = BeautifulSoup(r.content, 'html.parser')

    # Candidate row containers; only some of them carry a song title
    chart_entries = soup.find_all('li', class_="lrv-u-width-100p")

    songs = []
    artists = []

    # Read title and artist from the SAME entry. Collecting them in two
    # independent passes (every <h3> vs. every second entry) lets the lists
    # drift apart or end up with different lengths as soon as the page
    # layout deviates from a strict title-row / detail-row alternation.
    for entry in chart_entries:
        title = entry.find('h3')
        if title is None:
            continue  # not a title row
        artist = entry.find('span')
        songs.append(title.get_text(strip=True))
        artists.append(artist.get_text(strip=True) if artist is not None else None)

    # Create a pandas dataframe with the song and artist data
    return pd.DataFrame({'Song': songs, 'Artist': artists})

In [None]:
# Scrape the chart once; df_billboard is read again by recommend_song()
df_billboard = get_billboard_top100()
df_billboard

In [None]:
# Access Spotify to fetch songs and their audio features

# Credentials are read from the environment instead of being hardcoded in the
# notebook: called without arguments, SpotifyClientCredentials picks up the
# SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET environment variables. Export
# both before starting the kernel.
# NOTE(review): the key pair that used to be committed in this cell should be
# rotated in the Spotify developer dashboard.
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials())

In [None]:
# Accumulator for per-track audio-feature records (one dict per track);
# filled by the album-crawling loop and later turned into a DataFrame
song_data = []

In [None]:
# Collect artists from the playlist “Top Hits 2023”
# playlist_items replaces the deprecated user_playlist_tracks; restricting
# additional_types to tracks keeps podcast episodes out of the result.
playlist = sp.playlist_items("5SlUEUKY4HempRqVNnosXx", additional_types=("track",))
tracks = list(playlist['items'])

# Follow the paging links until every page has been read
while playlist['next']:
    playlist = sp.next(playlist)
    tracks.extend(playlist['items'])

# A playlist item's 'track' may be None (e.g. an entry that is no longer
# available), and an artist shows up once per track they appear on. Skip the
# former and de-duplicate the latter (dict.fromkeys keeps first-seen order) so
# the album crawl does not repeat the same API calls for one artist.
artist_ids = list(dict.fromkeys(
    artist_info['id']
    for item in tracks
    if item.get('track')
    for artist_info in item['track']['artists']
    if artist_info.get('id')
))
artist_ids

In [None]:
# Number of artist ids collected from the playlist
len(artist_ids)

In [None]:
# Iterate over the artists and collect the audio features of their album tracks.
# Start from an empty list so re-running this cell does not append a second
# copy of every record.
song_data = []

# Audio-feature fields copied into each record, in output column order
audio_feature_names = (
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
)

for artist_id in artist_ids:
    # Only the first page of albums / album tracks is read (no paging here)
    albums = sp.artist_albums(artist_id, album_type='album')
    for album in albums['items']:
        album_tracks = [
            album_track
            for album_track in sp.album_tracks(album['id'])['items']
            if album_track.get('id')
        ]
        if not album_tracks:
            continue

        # One batched request per album instead of one request per track.
        # audio_features answers with one entry per requested id, in the same
        # order, using None for tracks that have no audio analysis.
        audio_features = sp.audio_features([t['id'] for t in album_tracks]) or []
        for album_track, features in zip(album_tracks, audio_features):
            if features is None:
                continue
            feature_data = {
                'track': album_track['name'],
                'artist': album_track['artists'][0]['name'],
                'track_id': album_track['id'],
            }
            feature_data.update(
                {name: features.get(name) for name in audio_feature_names}
            )
            song_data.append(feature_data)

# Show how many records were collected rather than dumping every dict
len(song_data)

In [None]:
# Build the song table from the collected records, discarding exact duplicate
# rows and renumbering the index from 0
df = pd.DataFrame(song_data).drop_duplicates().reset_index(drop=True)

# Persist the table to disk as CSV (without the index column)
df.to_csv('song_data.csv', index=False)
df

In [None]:
# # Load song_data
# df = pd.read_csv('song_data.csv')
# df

In [None]:
# Audio-feature columns that feed the clustering model
feature_columns = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]
x = df[feature_columns]

# Standardize every feature; the fitted scaler is kept so that new songs can
# be transformed with the same parameters later on
scaler = StandardScaler()
x_prep = scaler.fit(x).transform(x)

In [None]:
# Choosing best K
# Check elbow with YellowBrick

# k=(2, 20) makes the visualizer try k = 2..19, the same range as K below.
# n_init is passed explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so relying on the default makes the curve depend on
# the installed version (and emits a FutureWarning on 1.2/1.3).
visualizer = KElbowVisualizer(KMeans(n_clusters=3, n_init=10, random_state=42), k=(2, 20))

visualizer.fit(x_prep)    # Fit the data to the visualizer
visualizer.show()    # Draw the figure; poof() is deprecated in favour of show()

# Raw inertia per k, to read the exact numbers behind the elbow plot
K = range(2, 20)
inertia = []

for k in K:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(x_prep)
    inertia.append(kmeans.inertia_)

inertia

In [None]:
# Train and predict with K=8
# n_init is pinned because scikit-learn changed its default from 10 to 'auto'
# in version 1.4; without it the cluster assignment depends on the installed
# version even with a fixed random_state.
kmeans = KMeans(n_clusters=8, n_init=10, random_state=42)
clusters = kmeans.fit_predict(x_prep)

In [None]:
# Scaled features side by side with each song's title, artist and the
# cluster it was assigned to
scaled_df = pd.DataFrame(x_prep, columns=x.columns).assign(
    track=df['track'],
    artist=df['artist'],
    cluster=clusters,
)
scaled_df

In [None]:
# Analyze results: rows per (cluster, artist), largest groups first within
# each cluster, so the dominant artists of a cluster show up at the top.
# After count() the 'key' column holds the number of non-null 'key' values
# in the group.
(
    scaled_df
    .groupby(['cluster', 'artist'], as_index=False)
    .count()
    .sort_values(['cluster', 'key'], ascending=[True, False])
    .loc[:, ['artist', 'cluster', 'key']]
    .reset_index(drop=True)
)

In [None]:
# Toxic - Britney Spears
# Yellow Submarine - The Beatles
# Stairway to Heaven - Led Zeppelin
# Wonderwall - Oasis
# It's my life - Bon Jovi
# Californication - Red Hot Chilli Peppers
# Helena - My Chemical Romance

song_name = input('Choose a song: ')

# Search for the song in spotipy and get the audio features
results = sp.search(q=f'track:{song_name}', limit=1)
items = results['tracks']['items']
if not items:
    # Fail with a readable message instead of an IndexError on items[0]
    raise ValueError(f'No Spotify track found for "{song_name}"')
track_id = items[0]['id']

audio_features = sp.audio_features(track_id)
if not audio_features or audio_features[0] is None:
    raise ValueError(f'Spotify has no audio features for "{song_name}"')

# Converting it into a dataframe
df_ = pd.DataFrame(audio_features)
new_features = df_[x.columns]

# Scaling the audio features with the scaler fitted on the training data
scaled_x = scaler.transform(new_features)

# Assign the cluster to the song
cluster = kmeans.predict(scaled_x)

# Take another close song in the same cluster to recommend
filtered_df = scaled_df[scaled_df['cluster'] == cluster[0]][x.columns]
closest, _ = pairwise_distances_argmin_min(scaled_x, filtered_df)

# `closest` holds a POSITION inside filtered_df, not an index label of
# scaled_df. Translate it back to the label first; using it directly with
# scaled_df.loc picks an unrelated row of the unfiltered table.
closest_label = filtered_df.index[closest[0]]
scaled_df.loc[closest_label, 'track'], scaled_df.loc[closest_label, 'artist']


In [None]:
# Look up the rows of the track called 'epiphany'
scaled_df.loc[scaled_df['track'] == 'epiphany']

#### Next Steps:

In [None]:
def recommend_song():
    """Ask for a song title and recommend another song.

    If the title is on the scraped Billboard chart (`df_billboard`), a random
    other chart song is recommended. Otherwise the title is looked up on
    Spotify, its audio features are scaled with `scaler`, assigned to a
    cluster by `kmeans`, and the nearest song of that cluster in `scaled_df`
    is recommended.

    Returns:
        str: "<song> - <artist>" of the recommendation, or a short message
        when Spotify has no matching track or no audio features for it.
    """
    # Get song name
    song_name = input('Choose a song: ').strip()

    # Check if the input song is in df_billboard (case-insensitive, since the
    # title is typed by hand)
    on_chart = df_billboard['Song'].str.lower() == song_name.lower()
    if on_chart.any():
        # Recommend a different hot song than the one that was typed in;
        # fall back to the whole chart if nothing else is left
        candidates = df_billboard[~on_chart]
        if candidates.empty:
            candidates = df_billboard
        recommended_song = candidates.sample(1)
        print('\n[RECOMMENDED SONG]')
        return f"{recommended_song['Song'].values[0]} - {recommended_song['Artist'].values[0]}"

    results = sp.search(q=f'track:{song_name}', limit=1)
    items = results['tracks']['items']
    if not items:
        # Avoid an IndexError on items[0] when the search comes back empty
        return f'No Spotify track found for "{song_name}"'
    track_id = items[0]['id']

    # Get song features with the obtained id
    audio_features = sp.audio_features(track_id)
    if not audio_features or audio_features[0] is None:
        return f'Spotify has no audio features for "{song_name}"'

    # Create dataframe
    df_ = pd.DataFrame(audio_features)
    new_features = df_[x.columns]

    # Scale features
    scaled_x = scaler.transform(new_features)

    # Predict cluster
    cluster = kmeans.predict(scaled_x)

    # Filter dataset to predicted cluster
    filtered_df = scaled_df[scaled_df['cluster'] == cluster[0]][x.columns]

    # Get closest song from filtered dataset
    closest, _ = pairwise_distances_argmin_min(scaled_x, filtered_df)

    # `closest` is a POSITION inside filtered_df; convert it to the matching
    # index label before looking the song up in the unfiltered scaled_df,
    # otherwise an unrelated row is returned.
    closest_label = filtered_df.index[closest[0]]

    # Return it in a readable way
    print('\n[RECOMMENDED SONG]')
    return f"{scaled_df.loc[closest_label, 'track']} - {scaled_df.loc[closest_label, 'artist']}"


In [None]:
# Run the interactive recommender (prompts for a song title)
recommend_song()