# Collecting Billboard Hot 100s and Spotify URIs

The Python package billboard.py doesn't support extracting year-end Hot 100 lists, and Billboard now requires a Pro subscription to view its Year-End Hot 100 charts. To work around this, I used Beautiful Soup to scrape each Billboard Year-End Hot 100 from 2000 through 2024 from Wikipedia, then manually filled in the few entries the scraper missed because of the irregular HTML structure of the chart tables.  

## Imports

In [1]:
# One-time environment setup. The commands are wrapped in a string literal so
# they do not run on every execution; move them out of the string to install.
'''
!pip install billboard.py
!pip install spotipy
!pip install lyricsgenius
!pip install tqdm
'''

'\n!pip install billboard.py\n!pip install spotipy\n!pip install lyricsgenius\n!pip install tqdm\n'

In [2]:
import numpy as np
import pandas as pd

# billboard rankings
import billboard

# for web scraping the billboard rankings
import requests
from bs4 import BeautifulSoup
import time
from tqdm.notebook import tqdm
import warnings

# for genius collaborators
import lyricsgenius

## Billboard Hot 100

In [3]:
# Scrape the Billboard Year-End Hot 100 table for each year from Wikipedia and
# collect one record per chart row (year, rank, title, artist) into `bb_100`.

# define the range
years = range(2000, 2025)
# create an empty list for the songs
bb_100 = []

# set headers to mimic a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/115.0.0.0 Safari/537.36'
}

# seconds to wait on a request before giving up; without a timeout a stalled
# connection would block the whole loop indefinitely
request_timeout = 10

# loop through each year's Wikipedia page
for year in years:
    # base url
    url = f'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{year}'
    try:
        response = requests.get(url, headers=headers, timeout=request_timeout)
        if response.status_code != 200:
            print(f'Failed {year}: Status {response.status_code}')
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        # only the first table with class "wikitable" on the page is read
        table = soup.find('table', {'class': 'wikitable'})
        if not table:
            print(f'No table found for {year}')
            continue

        rows = table.find_all('tr')
        year_songs = 0

        for row in rows[1:]:  # skip header
            cols = row.find_all(['td', 'th'])
            # rows with fewer than three cells are skipped.
            # NOTE(review): presumably these are rows whose artist cell is
            # shared with the row above via rowspan -- confirm; such rows are
            # the ones that have to be back-filled by hand afterwards.
            if len(cols) < 3:
                continue

            # get text from any nested tags
            rank = ' '.join([x.get_text(strip=True) for x in cols[0].find_all(string=True)]).strip()
            title = ' '.join([x.get_text(strip=True) for x in cols[1].find_all(string=True)]).strip()
            artist = ' '.join([x.get_text(strip=True) for x in cols[2].find_all(string=True)]).strip()

            if not rank or not title or not artist:
                continue

            bb_100.append({
                'year': year,
                'rank': rank,
                'title': title,
                'artist': artist
            })
            year_songs += 1

        # if fewer than 100 songs scraped, warn
        if year_songs < 100:
            print(f'Warning: {year_songs} songs scraped for {year} (expected 100)')

        print(f'Loaded {year} ({year_songs} songs)')
        time.sleep(1)  # polite delay

    except Exception as e:
        # best-effort: report the failure (including timeouts) and move on to the next year
        print(f'Failed {year}: {e}')

# convert to DataFrame; naming the columns explicitly keeps the frame usable
# (and the lines below from raising KeyError) even if no rows were scraped
bb_100 = pd.DataFrame(bb_100, columns=['year', 'rank', 'title', 'artist'])

# strip stray leading/trailing whitespace from the text columns
bb_100['title'] = bb_100['title'].str.strip()
bb_100['artist'] = bb_100['artist'].str.strip()

Loaded 2000 (99 songs)
Loaded 2001 (100 songs)
Loaded 2002 (100 songs)
Loaded 2003 (100 songs)
Loaded 2004 (100 songs)
Loaded 2005 (100 songs)
Loaded 2006 (100 songs)
Loaded 2007 (100 songs)
Loaded 2008 (98 songs)
Loaded 2009 (100 songs)
Loaded 2010 (100 songs)
Loaded 2011 (100 songs)
Loaded 2012 (99 songs)
Loaded 2013 (99 songs)
Loaded 2014 (100 songs)
Loaded 2015 (99 songs)
Loaded 2016 (98 songs)
Loaded 2017 (100 songs)
Loaded 2018 (100 songs)
Loaded 2019 (100 songs)
Loaded 2020 (100 songs)
Loaded 2021 (100 songs)
Loaded 2022 (100 songs)
Loaded 2023 (100 songs)
Loaded 2024 (99 songs)


### Find Missing

In total, 9 tracks were missing from the scraped data. I located each one manually on the corresponding Wikipedia page and added it to the dataframe. 

In [4]:
# ranks were collected as text; convert them to numbers (unparseable values become NaN)
bb_100['rank'] = pd.to_numeric(bb_100['rank'], errors='coerce')

# every year-end chart should contain ranks 1 through 100
expected_ranks = set(range(1, 101))

# report, per year, which of the expected ranks are absent
for year in bb_100['year'].unique():
    in_year = bb_100['year'] == year
    found = bb_100.loc[in_year, 'rank'].dropna().astype(int)
    absent = sorted(expected_ranks.difference(found))
    if absent:
        print(f'Year {year} is missing ranks: {absent}')

Year 2000 is missing ranks: [23]
Year 2008 is missing ranks: [10, 17]
Year 2012 is missing ranks: [17]
Year 2013 is missing ranks: [18]
Year 2015 is missing ranks: [10]
Year 2016 is missing ranks: [2, 21]
Year 2024 is missing ranks: [20]


In [5]:
# Manually collected chart entries to back-fill into the scraped data.
# Each tuple is (year, rank, title, artist); keeping one entry per line
# prevents the fields from drifting out of alignment.
missing_rows = [
    (2000, 23, 'I Need to Know', 'Marc Anthony'),
    (2008, 10, 'Forever', 'Chris Brown'),
    (2008, 17, "Don't Stop the Music", 'Rihanna'),
    (2012, 17, 'Whistle', 'Flo Rida'),
    (2013, 18, 'Wrecking Ball', 'Miley Cyrus'),
    (2015, 10, 'The Hills', 'The Weeknd'),      # artist name is spelled "Weeknd"
    (2016, 2, 'Sorry', 'Justin Bieber'),        # was misspelled 'Sory'
    (2016, 21, 'Heathens', 'Twenty One Pilots'),
    (2024, 20, 'Snooze', 'SZA'),
]

# make into dataframe
bb_missing = pd.DataFrame(missing_rows, columns=['year', 'rank', 'title', 'artist'])

In [6]:
# stack the scraped rows and the manually entered rows into one table,
# discarding the original indexes so the result is numbered 0..n-1
bb_all = pd.concat(objs=[bb_100, bb_missing], ignore_index=True)

# show (rows, columns) of the combined table
print(bb_all.shape)

(2500, 4)


In [7]:
# write bb_all to 'bb_all.csv' (relative path), leaving out the DataFrame index column
bb_all.to_csv('bb_all.csv', index=False)

### Get the Most Recent (2025) Hot 100 Ranking

In [8]:
# fetch the most recent Hot 100 ranking
chart = billboard.ChartData('hot-100')

# build one record per chart entry and load them straight into a DataFrame
bb_2025 = pd.DataFrame([
    {
        'Rank': entry.rank,
        'Title': entry.title,
        'Artist': entry.artist,
        'Last Week': entry.lastPos,
        'Peak Position': entry.peakPos,
        'Weeks on Chart': entry.weeks,
    }
    for entry in chart
])

# save the snapshot without the index column
bb_2025.to_csv('bb_2025.csv', index=False)