Use this notebook to scrape weekly top 40 charts
- USA Top 40 singles
- https://top40-charts.com/chart.php?cid=27

In [26]:
# dependencies
import pandas as pd
import re
import timeit
from IPython.display import clear_output
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager

In [106]:
# create a time series for charts
# note: chart weekday changes after 03/27/2000, need to concatenate two time series into one
ts1 = pd.date_range(start='1997-07-07', end='2000-03-27', freq='W-MON', inclusive='both', normalize=True).to_series()
ts2 = pd.date_range(start='2000-04-01', end='2022-10-15', freq='W-SAT', inclusive='both', normalize=True).to_series()

ts = pd.concat([ts1, ts2])
# create a base URL for all charts
chart_url = 'https://top40-charts.com/chart.php?cid=27&date='

# concatenate URLs and dates to get full URLs for each weekly chart
urls = [] # empty list to hold full URLs
for w, date in enumerate(ts):
    
    ts_date = re.search('(\d{4}-\d{2}-\d{2})(?:\s)', str(ts[w])).group(1)
    url_date = chart_url + ts_date
    urls.append(url_date)

In [104]:
# function to scrape chart data 
# input: a list of weekly chart URLs to scrape

def get_top40_charts(weekly_charts):
    
    # start timer
    start = timeit.default_timer()
    
    # set up an empty dictionary to hold the final dataset
    top40_data = []

    # counter to keep track of total songs scraped
    total_songs = 0

    # set up the browser
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=True)

    for w, chart in enumerate(weekly_charts):
        
        # get the chart's date
        url_date = re.search('date=(\d{4}-\d{2}-\d{2})', chart).group(1)

        # navigate to website
        browser.visit(chart)
        
        # parse the html
        chart_html = browser.html
        chart_soup = soup(chart_html, 'html.parser')
        
        # list of html data for all chart positions
        chart_rows = chart_soup.find_all('tr', class_='latc_song')
        
        # empty lists to hold chart data
        position = []
        artist = []
        title = []
        
        for row in chart_rows:
            
            # position is saved as 'chid' attribute
            position.append(row['chid'])
            
            # extract song titles using regex
            row_a = row.find_all('a')
            row_title = re.search('(?:;" title="View song details">)(.+)(?:</a>, <a href)', str(row_a)).group(1)
            title.append(row_title)
            
            # extract artist names using regex
            # depending on the date, artist name is located in different places,
            # so using a try/except blocks to use whichever works
            
            # empty list to hold multiple artist name searches
            artist_tries = []
            
            try:
                row_artist_1 = re.search('(?:>)(.+)(?:</a>)', str(row_a[2])).group(1)
                artist_tries.append(row_artist_1)            
            except Exception as e:
                print(f'{url_date} regex try1 error: {e}')
                pass
            
            try:
                row_artist_2 = re.search('(?:>)(.+)(?:</a>)', str(row_a[1])).group(1)
                artist_tries.append(row_artist_2)
            except Exception as e:
                print(url_date)
                print(f'{url_date} regex try2 error: {e}')
                pass

            try:
                row_artist_3 = re.search('(?:style="text-decoration: none; ">)(.+)(?:</a>, <a chid)', str(row_a)).group(1)
                artist_tries.append(row_artist_3)
            except Exception as e:
                print(url_date)
                print(f'{url_date} regex try3 error: {e}')
                pass
            
            # if the artist search captured the song title, remove it from the list
            artist_tries.remove(row_title)
            
            # keep the shortest string search result that worked: 
            # some searches capture html attributes, resulting in long strings - ignore those
            # but this won't work if only one search worked, so use try/except
            try:
                row_artist = min(artist_tries, key=len)
            except:
                row_artist = artist_tries
            
            # save the best artist search to the list
            artist.append(row_artist)
        
        
        # create datetime data
        date_list = [url_date] * len(artist)
        #date_list = pd.to_datetime(date_list)

        # save the week's charts to a dictionary
        current_chart = {
            'week' : date_list,
            'position' : position,
            'song' : title,
            'artist' : artist
        }
        
        # track how many songs have been scraped
        total_songs += len(title)

        # save the week's chart 
        top40_data.append(current_chart)
        
        
        # log the progress
        clear_output()
        checkpoint = timeit.default_timer()
        print(f'Last saved date: {url_date}')
        print(f'{(w+1)}/{len(weekly_charts)} charts scraped')
        print(f'{(w+1)/len(weekly_charts)*100:.2f}% complete')
        print(f'{(checkpoint-start)/60:.2f} minutes elapsed...')
        

    # save all data to a dataframe
    final_df = pd.concat(pd.DataFrame(chart_dict) for chart_dict in top40_data)

    # end timer, log results
    clear_output()
    stop = timeit.default_timer()
    print(f'Last saved date: {url_date}')
    print(f'{(w+1)/len(weekly_charts)*100:.2f}% complete')
    print(f'Total runtime: {(stop-start)/60:.2f} minutes')
    print(f"{total_songs:.0f} total songs and {len(top40_data):.0f} total weeks scraped")
    
    # results
    return final_df

In [107]:
top40_df = get_top40_charts(urls)
top40_df.sample(10)

Last saved date: 2022-10-15
100.00% complete
Total runtime: 41.45 minutes
52768 total songs and 1320 total weeks scraped


Unnamed: 0,week,position,song,artist
4,2000-04-22,5,I Try,Macy Gray
19,2002-12-28,20,I'm With You,Avril Lavigne
4,1998-04-06,5,3 Am,Matchbox 20
32,1998-01-26,33,"No, No, No",Destiny's Child
18,2022-06-18,19,Despues De La Playa,Bad Bunny
27,2008-05-03,28,Realize,Colbie Caillat
2,2000-06-24,3,Be With You,Enrique Iglesias
17,2017-03-04,18,24K Magic,Bruno Mars
14,2006-07-15,15,So What,Field Mob &amp; Ciara
8,2009-06-20,9,Permanent,David Cook


In [109]:
top40_df.to_csv("../00_data/top40_1997_2022_raw.csv", index=False)