## scrape top songs

From `https://kworb.net/spotify`, get the names of the most popular artists and their most famous songs.

I'm scraping the top 500, but change it as you like.

*Note*: Increase `DELAY` if your requests get blocked for being sent too frequently.

In [None]:
import json
import time

import requests
from bs4 import BeautifulSoup

In [None]:
# Number of artists to collect.
NUM_ARTISTS = 500
# Seconds to sleep after each artist-page request; raise it if you get blocked.
DELAY = 0

# Root of the kworb Spotify charts and the page listing artists by listeners.
BASE_URL = "https://kworb.net/spotify"
ARTISTS_URL = BASE_URL + "/listeners.html"

In [None]:
# Download the artist ranking page.
# The timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status() surfaces HTTP errors (e.g. when the site blocks us)
# right here instead of as a confusing parsing failure further down.
response = requests.get(ARTISTS_URL, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.content, "html.parser")

In [None]:
# Take the first table on the page, then every row of its first <tbody>.
table = soup.find_all("table")[0]
table_body = table.find_all("tbody")[0]
top_rows = table_body.find_all("tr")

In [None]:
# Take the first table on the page, then every row of its first <tbody>.
table = soup.find_all("table")[0]
top_rows = table.find_all("tbody")[0].find_all("tr")

# Look up each row's first link once. Rows without any link are dropped here
# so they cannot crash the comprehensions below with `None["href"]`.
artist_anchors = [
    anchor for anchor in (row.find("a") for row in top_rows) if anchor is not None
]
unfiltered_top_artist_links = [
    f"{BASE_URL}/{anchor['href']}" for anchor in artist_anchors
]
# Normalise names: spaces and slashes are replaced with underscores.
unfiltered_top_artist_names = [
    anchor.get_text().replace(" ", "_").replace("/", "_") for anchor in artist_anchors
]

top_artist_links = {}

for top_artist_name, top_artist_link in zip(
    unfiltered_top_artist_names, unfiltered_top_artist_links
):
    # Check the limit *before* inserting so NUM_ARTISTS = 0 yields an empty
    # result, and measure the dict itself so duplicate names are not counted
    # twice (which would leave fewer than NUM_ARTISTS entries).
    if len(top_artist_links) >= NUM_ARTISTS:
        break

    # some artists don't have the same table structure and don't have song names under them.
    # skip those
    if not top_artist_link.endswith("songs.html"):
        continue

    # If two artists normalise to the same name, keep the first one
    # encountered (the earlier row in the table).
    top_artist_links.setdefault(top_artist_name, top_artist_link)

# Kept for any later cell that reads it: the number of artists actually kept.
cur_artist_no = len(top_artist_links)

In [None]:
# Maps artist name -> href of that artist's first non-featured song.
top_songs = {}

for top_artist_name, top_artist_link in top_artist_links.items():
    # Time out rather than hang, and fail loudly on HTTP errors (e.g. when
    # the site starts blocking requests) instead of parsing an error page.
    artist_response = requests.get(top_artist_link, timeout=30)
    artist_response.raise_for_status()

    artist_soup = BeautifulSoup(artist_response.content, "html.parser")

    # The songs are read from the second table on the page. Look it up once
    # per artist instead of re-searching the document for every candidate row.
    # Row 0 is skipped, as before (presumably the header row).
    artist_tables = artist_soup.find_all("table")
    song_rows = artist_tables[1].find_all("tr")[1:] if len(artist_tables) > 1 else []

    for song_row in song_rows:
        song_anchor = song_row.find("a")
        if song_anchor is None:
            # Row without a link: nothing to record, try the next one.
            continue

        # stars denote featuring
        # do not want this!
        if "*" in song_anchor.get_text(strip=True):
            continue

        top_songs[top_artist_name] = song_anchor["href"]
        break
    else:
        # No usable row: report it and move on rather than aborting the whole
        # scrape with an IndexError part-way through.
        print(f"No non-featured song found for {top_artist_name}; skipping.")

    time.sleep(DELAY)

In [None]:
# Sanity checks on the scraped data: one song per requested artist, and every
# stored link must be a Spotify track URL.
song_count = len(top_songs)
assert song_count == NUM_ARTISTS, "The number of wanted and obtained songs do not match."

for song_url in top_songs.values():
    is_track_url = song_url.startswith("https://open.spotify.com/track/")
    assert is_track_url, f"URL is off {song_url}"

In [None]:
# Save the artist -> song link mapping as JSON in the current directory.
with open("song_links.json", "w") as out_file:
    out_file.write(json.dumps(top_songs))