In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances_argmin_min

In [2]:
# Find data on the internet about currently popular songs
# Billboard weekly Top 100 of "hot" songs

def get_billboard_top100():
    # Getting the html code of the web page
    r = requests.get('https://www.billboard.com/charts/hot-100')

    # Parsing the html code
    soup = BeautifulSoup(r.content, 'html.parser')

    # Find all the chart results rows
    chart_entries = soup.find_all('li', class_="lrv-u-width-100p")

    # Initialize lists to store the song and artist names
    songs = []
    artists = []

    # Extract song titles
    for chart in chart_entries:
        song = chart.find('h3')
        if song is not None:
            songs.append(song.get_text(strip=True))

    # Extract artists
    for i in range(0, len(chart_entries), 2):
        chart = chart_entries[i]
        artist = chart.find('span')
        artists.append(artist.get_text(strip=True))

    # Create a pandas dataframe with the song and artist data
    data = {'Song': songs, 'Artist': artists}
    df_billboard = pd.DataFrame(data)

    return df_billboard

In [3]:
get_billboard_top100()

Unnamed: 0,Song,Artist
0,Last Night,Morgan Wallen
1,Fast Car,Luke Combs
2,Calm Down,Rema & Selena Gomez
3,Flowers,Miley Cyrus
4,All My Life,Lil Durk Featuring J. Cole
...,...,...
95,"Angel, Pt. 1","Kodak Black, NLE Choppa, Jimin, JVKE & Muni Long"
96,Girl In Mine,Parmalee
97,Moonlight,Kali Uchis
98,Classy 101,Feid x Young Miko


In [7]:
# Access spotify to get own songs

# Initialize SpotiPy with user credentials
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id="357f579faa3d4d9d9b11102c1c03a0d3", client_secret="b9efb6a82d7841de8c86f8a01cb57515"))

# List to store track features
song_data = []

# Collect artists ids from the playlist “Hot Hits 2023”
playlist = sp.user_playlist_tracks("spotify", "19gL703KNO8dMs7wMg9ZLX")
tracks = playlist['items']
    
while playlist['next']:
    playlist = sp.next(playlist)
    tracks.extend(playlist['items'])
        
artists_ids = [artist_info['id'] for track in tracks for artist_info in track['track']['artists']]
artists_ids

Enter your client ID: 357f579faa3d4d9d9b11102c1c03a0d3
Enter your client secret: b9efb6a82d7841de8c86f8a01cb57515


['2wY79sveU1sp5g7SokKOiI',
 '3Xt3RrJMFv5SZkCfUE8C1J',
 '4bL2B6hmLlMWnUEZnorEtG',
 '4e0nWw2r4BoQSKPQ2zpU13',
 '4yvcSjfu4PC0CYQyLy4wSq',
 '4NSzuIc0eGOftqr0tEOhJk',
 '1Xyo4u8uXC1ZmMpatF05PJ',
 '53XhwfbYqKCa1cC15pYq2q',
 '0u6GtibW46tFX7koQ6uNJZ',
 '28y6CyJNkGNjJQKrlx4AmN',
 '5Pwc4xIPtQLFEnJriah9YJ',
 '43BxCL6t4c73BQnIJtry5v',
 '45ruzGUmIr8WLjLOPJ9mGU',
 '0hCNtLu0JehylgoiP8L4Gh',
 '0u6GtibW46tFX7koQ6uNJZ',
 '5Wg2b4Mp42gicxEeDNawf7',
 '5gIKdggxXtjoJ8jsHv2ypp',
 '7ACEUD7UsmmXrnj4OLt8f9',
 '7KAGJwWQQui8b0uqwXRkSr',
 '5SFwozJTdBZNUFYYrAqH4q',
 '246dkjvS1zLTtiykXe5h60',
 '5cj0lLjcoR7YOSnhnX0Po5',
 '4PUr4W5mWlzMkyVpSiX2ZN',
 '3PhoLpVuITZKcymswpck5b',
 '6M2wZ9GZgrQXHCFfjv46we',
 '6n28c9qs9hNGriNa72b26u',
 '3A9B6c1CrSPauiOblw7pWz',
 '4pnp4w9g30yLfVIAFnZMRd',
 '1c70yCa8sRgIiQxl3HOEFo',
 '6qqNVTkY8uBg9cP3Jd7DAH',
 '7GZJ2POiwPZoW7UVYjNj8i',
 '28y5ZcfpdZAfeEE5ftCfUg',
 '53XhwfbYqKCa1cC15pYq2q',
 '1dMdLlZI5X74N3yhF5qgEJ',
 '0CNyacW4B30MKOqqDwuvIG',
 '1nh6COzuoesMf7VqKgJWTl',
 '4GNC7GD6oZMSxPGyXy4MNB',
 

In [8]:
# Iterate over the artists and collect all the tracks from the artists
for artist_id in artists_ids:
    albums = sp.artist_albums(artist_id, album_type='album')
    for album in albums['items']:
        album_tracks = sp.album_tracks(album['id'])
        for album_track in album_tracks['items']:
            track_id = album_track['id']
            track_name = album_track['name']
            artist_name = album_track['artists'][0]['name']
            
            # Retrieve audio features for the tracks
            audio_features = sp.audio_features(track_id)
            if audio_features and len(audio_features) > 0:
                features = audio_features[0]
                if features is not None:
                    feature_data = {
                        'track_name': track_name,
                        'artist_name': artist_name,
                        'track_id': track_id,
                        'danceability': features.get('danceability'),
                        'energy': features.get('energy'),
                        'speechiness': features.get('speechiness'),
                        'acousticness': features.get('acousticness'),
                        'instrumentalness': features.get('instrumentalness'),
                        'liveness': features.get('liveness'),
                        'valence': features.get('valence'),
                        'tempo': features.get('tempo')
                    }
                    song_data.append(feature_data)
song_data

Max Retries reached


SpotifyException: http status: 429, code:-1 - /v1/audio-features/?ids=4drTfZwGTDNrbg3dQ17d0w:
 Max Retries, reason: too many 429 error responses

In [None]:
# Create a function to iterate over the artists and collect all the tracks from the artists
def get_audio_features(artist):
    
    # Get tracks from artist
    results = sp.search(q=f'artist:{artist}', limit=50)
    
    # Extract the track ids
    track_ids = [track['id'] for track in results['tracks']['items']]
    song_names = [track['name'] for track in results['tracks']['items']]
    
    # Extract the audio features
    audio_features = sp.audio_features(track_ids)
   
    # Store audio features in a dataframe
    df = pd.DataFrame(audio_features)
    df['artist'] = artist
    df['song_name'] = song_names
    return df

In [None]:
# Create loop and select artists you want to get songs from
df = pd.DataFrame()

for artist in artists:
    df_artist = get_audio_features(artist)
    df = pd.concat([df, df_artist])
    
df = df.reset_index(drop=True)

In [None]:
df.artist.value_counts()

In [None]:
# select the features that we need
x = df[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']]

# standarize the data
scaler = StandardScaler()
x_prep = scaler.fit_transform(x)

# train and predict
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(x_prep)
clusters = kmeans.predict(x_prep)

#create new dataframe with title, artist and cluster assigned
scaled_df = pd.DataFrame(x_prep, columns=x.columns)
scaled_df['song_name'] = df['song_name']
scaled_df['artist'] = df['artist']
scaled_df['cluster'] = clusters
scaled_df

In [None]:
# analyzing our results, we can start to see some winners
scaled_df.groupby(['cluster', 'artist'], as_index=False).count().sort_values(['cluster', 'key'], ascending=[True, False])[['artist', 'cluster', 'key']].reset_index(drop=True)

Check out how to visualize your clusters [here](https://towardsdatascience.com/how-to-build-an-amazing-music-recommendation-system-4cce2719a572).

In [None]:
# Toxic - Britney Spears
# Yellow Submarine - The Beatles
# Stairway to Heaven - Led Zeppelin
# Wonderwall - Oasis
# It's my life - Bon Jovi
# Californication - Red Hot Chilli Peppers
# Helena - My Chemical Romance

song_name = input('Choose a song: ')

# Searching for the song in spotipy and getting the audio features
results = sp.search(q=f'track:{song_name}', limit=1)
track_id = results['tracks']['items'][0]['id']
audio_features = sp.audio_features(track_id)

#Converting it into a dataframe
df_ = pd.DataFrame(audio_features)
new_features = df_[x.columns]

# Scaling the audio features
scaled_x = scaler.transform(new_features)

# I assign the cluster to the song
cluster = kmeans.predict(scaled_x)

# I take another close song in the same cluster to recommend
filtered_df = scaled_df[scaled_df['cluster'] == cluster[0]][x.columns]
closest, _ = pairwise_distances_argmin_min(scaled_x, filtered_df)
scaled_df.loc[closest[0]]['song_name'], scaled_df.loc[closest[0]]['artist']

In [None]:
# scaled_x

In [None]:
scaled_df[scaled_df.song_name == 'Love Yourself']

#### Next Steps:

In [None]:
# put everything inside a function
def recommend_song():
    # get song id
    song_name = input('Choose a song: ')
    results = sp.search(q=f'track:{song_name}', limit=1)
    track_id = results['tracks']['items'][0]['id']
    # get song features with the obtained id
    audio_features = sp.audio_features(track_id)
    # create dataframe
    df_ = pd.DataFrame(audio_features)
    new_features = df_[x.columns]
    # scale features
    scaled_x = scaler.transform(new_features)
    # predict cluster
    cluster = kmeans.predict(scaled_x)
    # filter dataset to predicted cluster
    filtered_df = scaled_df[scaled_df['cluster'] == cluster[0]][x.columns]
    # get closest song from filtered dataset
    closest, _ = pairwise_distances_argmin_min(scaled_x, filtered_df)
    # return it in a readable way
    print('\n [RECOMMENDED SONG]')
    return ' - '.join([scaled_df.loc[closest]['song_name'].values[0], scaled_df.loc[closest]['artist'].values[0]])

In [None]:
recommend_song()

In [None]:
### --> SUGGESTIONS: Error handling, but it is a MVP!...