In [26]:
import time
import requests
from datetime import datetime
from bs4 import BeautifulSoup
import pandas as pd

In [4]:
base_url = "https://www.billboard.com/charts/hot-100/"

In [28]:
# calculate last day of the month
def get_last_day_of_month(year, month):
    last_days = [31, 30, 29, 28, 27] # options for last day
    for i in last_days:
        try:
            end = datetime(year, month, i)
        except ValueError:
            continue
        else:
            return end.date()

In [32]:
# get the billboards url for the last week of a specific month
def get_top_100_url(year, month):
    last_day = get_last_day_of_month(year, month)
    url = f"https://www.billboard.com/charts/hot-100/{last_day}/" # find the url for the last week of the month (last day input works great)
    return url

In [33]:
# out of the 100, use html parsing to find the top 10 songs in particular
def get_top_10_songs(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        song_entries = soup.find_all('ul', class_='o-chart-results-list-row') # specific tag we need to find
        top_10_songs = [] # store all the data found in a list for later reference

        for entry in song_entries[:10]: # go through all of these items
            title_tag = entry.find('h3', id='title-of-a-story') # find the h3 tag in the data (contains title of songs)
            # find the span tag in the data (contains artists)
            artist_tags = entry.find_all('span', class_='c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only')
            song_title = title_tag.text.strip() # formatting of string
            artists = ' & '.join([artist.text.strip() for artist in artist_tags]) # join together all artists but separate by "&" to distinguish separate names
            song_info = song_title + artists # combine to give our query
            song_info = song_info.replace("Featuring","") # delete filler words that may be found in the string
            top_10_songs.append(song_info) # add this to our list

        return top_10_songs

    else:
        print(f"Error fetching {url}: Status code {response.status_code}") # if the url is not found (no issues!)
        return []

In [34]:
# now we want to get access the appropriate url from songdata.io
def get_songdata_url(song):
    search_url = f"https://songdata.io/search?query={'+'.join(song.split())}" # let us split our query string into words that are joined by "+" (noted from url of search queries)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    response = requests.get(search_url, headers=headers) # grab response

    if response.status_code == 200: # if data found!
        soup = BeautifulSoup(response.text, 'html.parser') # parse
        first_td = soup.find('td', class_='table_name') # find the first table with this class
        if first_td:
            link_tag = first_td.find('a') # then find the first <a></a> tag within the <td></td> tags
            if link_tag:
                song_url = 'https://songdata.io' + link_tag.get('href') # then lets concactenate to find the full url
                return song_url

    print(f"No Songdata URL found for {song}") # if the url is not found
    return None

In [35]:
# get the audio features locations in the songdata.io url for each song (runs for input feature to avoid repetition of code for each separate feature)

def get_feature_value(soup, feature_name):
    time.sleep(0.2) # extensive rate-limiting while scraping normally so need to pause execution (0.2 minimum value helped us reach 1 hour runtime)
    if feature_name == "BPM": # BPM's are in a unique location
      bpm_dl = soup.find('dl', class_='card grid my-1 py-1') # find <dl> tag
      if bpm_dl:
          bpm_dd = bpm_dl.find('dd', style='font-size:1.5rem;font-weight:600;margin:0') # find nesting <dd> tag
          if bpm_dd:
              bpm_val = bpm_dd.text.strip() # format string appropriately
              return float(bpm_val) # return as a float to allow average calculations later on

      print("BPM not found in song.") # if bpm not found (no issues!)
      return None

    feature_dt = soup.find('dt', string=feature_name) # for all other features find <dt> tag
    if feature_dt:
        feature_dd = feature_dt.find_next_sibling('dd') # find nested <dd> tag
        if feature_dd:
            feature_val = feature_dd.text.strip() # format string appropriately
            return float(feature_val.replace('%', '').strip()) # remove character "%" since we want float value

    print(f"{feature_name} not found in song.") # if feature is not found
    return None

In [36]:
# no call to find the individual feature values
def get_features_for_song(song_url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    response = requests.get(song_url, headers=headers)

    if response.status_code == 200: # if response found
        soup = BeautifulSoup(response.text, 'html.parser') # parse!
        features = { # create dictionary to store all feature data
            "Acousticness": get_feature_value(soup, "Acousticness"), # for this feature, call get_feature_value to get value
            "Danceability": get_feature_value(soup, "Danceability"), # do the same for each feature
            "Energy": get_feature_value(soup, "Energy"),
            "Instrumentalness": get_feature_value(soup, "Instrumentalness"),
            "BPM": get_feature_value(soup, "BPM")
        }
        return features

    print(f"Error fetching features for {song_url}") # if the url is not found
    return None

In [37]:
# overarching function to run for selected time range (2010 to 2023)
def collect_data(start_year, end_year):
    audio_data = {}
    for year in range(start_year, end_year + 1): # need to add 1 to end_year to run through 2023 data
        for month in range(1, 13): # run through all 12 months
            last_day = get_last_day_of_month(year, month)
            month_str = last_day.strftime('%Y-%m-%d') # format string

            top_100_url = get_top_100_url(year, month) # get our top 100 per week

            top_10_songs = get_top_10_songs(top_100_url) # get our top 10 per week

            song_data_urls = [] # store all urls here!

            for song in top_10_songs: # per song
                song_url = get_songdata_url(song) # get our songdata.io url
                if song_url:
                    song_data_urls.append(song_url) # store this url
                else:
                    song_data_urls.append(None)

            features_data = [] # store all feature data here!
            for song_url in song_data_urls: # go through all urls
                if song_url:
                    features = get_features_for_song(song_url) # find feature data for each song
                    if features:
                        features_data.append(features) # store

            if features_data:

                avg_features = {} # average our features!
                for feature in features_data[0].keys(): # lets go through the keys for each index in features_data
                    avg_features[feature] = (sum([f[feature] for f in features_data]) / len(features_data))/100 # then find the average of these features

                audio_data[month_str] = avg_features # add this information to our overarching audio_data dictionary
                print(f"Average features for {month_str}: {avg_features}") # print for runtime updates
            else:
                audio_data[month_str] = {feature: None for feature in ["Danceability", "Energy", "Loudness", "Tempo", "Acousticness"]} # otherwise input None

    return audio_data

def create_dataframe(start_year, end_year): # now create our dataframe (using Pandas) for easy plotting!
    audio_data = collect_data(start_year, end_year) # input our start and end years
    df = pd.DataFrame(audio_data).T # transpose to get features as column values
    return df

df = create_dataframe(2010, 2023) # call

Average features for 2010-01-31: {'Acousticness': 0.198, 'Danceability': 0.7390000000000001, 'Energy': 0.71, 'Instrumentalness': 0.086, 'BPM': 1.23}
Average features for 2010-02-28: {'Acousticness': 0.13, 'Danceability': 0.794, 'Energy': 0.57, 'Instrumentalness': 0.0, 'BPM': 1.09}
Average features for 2010-03-31: {'Acousticness': 0.175, 'Danceability': 0.73, 'Energy': 0.7390000000000001, 'Instrumentalness': 0.085, 'BPM': 1.213}
Average features for 2010-04-30: {'Acousticness': 0.14300000000000002, 'Danceability': 0.727, 'Energy': 0.7140000000000001, 'Instrumentalness': 0.0, 'BPM': 1.182}
Average features for 2010-05-31: {'Acousticness': 0.17300000000000001, 'Danceability': 0.685, 'Energy': 0.779, 'Instrumentalness': 0.09699999999999999, 'BPM': 1.1340000000000001}
Average features for 2010-06-30: {'Acousticness': 0.187, 'Danceability': 0.7240000000000001, 'Energy': 0.752, 'Instrumentalness': 0.01, 'BPM': 1.031}
Average features for 2010-07-31: {'Acousticness': 0.14800000000000002, 'Danc

In [41]:
df.head() # let's check format

Unnamed: 0,Acousticness,Danceability,Energy,Instrumentalness,BPM
2010-01-31,0.198,0.739,0.71,0.086,1.23
2010-02-28,0.13,0.794,0.57,0.0,1.09
2010-03-31,0.175,0.73,0.739,0.085,1.213
2010-04-30,0.143,0.727,0.714,0.0,1.182
2010-05-31,0.173,0.685,0.779,0.097,1.134


In [45]:
import plotly.express as px

# Assuming df is your DataFrame containing the audio features data
# Plotting each feature over time
df.reset_index(inplace=True)  # Reset index to use 'index' as a column for months

# Create the line plot
fig = px.line(df, x='index', y=df.columns[1:],  # 'index' is the month_str (x-axis), and features as y-axis
              labels={'index': 'Month', 'value': 'Feature Value (in decimal)'},  # Axis labels
              title="Audio Features Over Time (2010-2023)",
              markers=True)

# Show the plot
fig.show()