Use this notebook to scrape weekly top 40 charts

In [None]:
# artist search
# https://www.songkick.com/developer/artist-search

In [1]:
# dependencies
import pandas as pd
import re
import timeit
from IPython.display import clear_output
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# create a time series for charts
ts = pd.date_range(start='1997-07-07', end='2022-10-12', freq='W-MON', inclusive='both', normalize=True)
#ts = pd.date_range(start='1997-07-07', end='1997-07-14', freq='W', closed=None, normalize=True)
ts = pd.Series(ts)

# create list of base URLs for each chart
chart_url = 'https://top40-charts.com/chart.php?cid=27&date='
urls = [chart_url] * 1317


# save to dataframe
url_df = pd.DataFrame()
url_df['base_url'] = urls
url_df['date'] = ts

# concatenate columns to get full URLs for each weekly chart
url_df['url'] = url_df['base_url'].astype(str) + url_df['date'].astype(str)
url_df.drop(['base_url', 'date'], axis=1, inplace=True)

url_df.head()

Unnamed: 0,url
0,https://top40-charts.com/chart.php?cid=27&date...
1,https://top40-charts.com/chart.php?cid=27&date...
2,https://top40-charts.com/chart.php?cid=27&date...
3,https://top40-charts.com/chart.php?cid=27&date...
4,https://top40-charts.com/chart.php?cid=27&date...


In [7]:
# function to scrape chart data 
def get_top40_charts(weekly_charts):
    
    # start timer
    start = timeit.default_timer()
    
    # set up an empty dataframe to hold the data
    top40_df = pd.DataFrame()
    top40_df['week'] = []
    top40_df['position'] = []
    top40_df['song'] = []
    top40_df['artist'] = []

    # set up the browser
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=True)

    for w in range(len(weekly_charts)):
        
        # navigate to website
        url = weekly_charts['url'][w]
        browser.visit(url)
        
        # parse the html
        html = browser.html
        html_soup = soup(html, 'html.parser')
        
        # list of html data for all chart positions
        chart_rows = html_soup.find_all('tr', class_='latc_song')
        
        # empty lists to hold chart data
        position = []
        artist = []
        title = []
        
        for row in chart_rows:
            
            # position is saved as 'chid' attribute
            position.append(row['chid'])
            
            # extract song titles using regex
            row_a = row.find_all('a')
            row_title = re.search('(?:;" title="View song details">)(.+)(?:</a>, <a href)', str(row_a)).group(1)
            title.append(row_title)
            
            # extract artist names using regex
            # depending on the date, artist name is located in different places,
            # so using a try/except blocks to use whichever works
            
            # empty list to hold multiple artist name searches
            artist_tries = []
            
            try:
                row_artist_1 = re.search('(?:>)(.+)(?:</a>)', str(row_a[2])).group(1)
                artist_tries.append(row_artist_1)            
            except:
                pass
            try:
                row_artist_2 = re.search('(?:>)(.+)(?:</a>)', str(row_a[1])).group(1)
                artist_tries.append(row_artist_2)
            except:
                pass

            try:
                row_artist_3 = re.search('(?:style="text-decoration: none; ">)(.+)(?:</a>, <a chid)', str(row_a)).group(1)
                artist_tries.append(row_artist_3)
            except:
                pass
            
            # if the artist search captured the song title, remove it from the list
            artist_tries.remove(row_title)
            
            # keep the shortest string search result that worked: 
            # some searches capture html attributes, resulting in long strings - ignore those
            # but this won't work if only one search worked, so use try/except
            try:
                row_artist = min(artist_tries, key=len)
            except:
                row_artist = artist_tries
            
            # save the best artist search to the list
            artist.append(row_artist)
        
        # create datetime data
        url_date = re.search('date=(\d{4}-\d{2}-\d{2})',url).group(1)
        date_list = [url_date] * len(artist)
        date_list = pd.to_datetime(date_list)
        
        # save the week's charts to a dataframe
        current_df = pd.DataFrame()
        current_df['week'] = date_list
        current_df['position'] = position
        current_df['song'] = title
        current_df['artist'] = artist
        current_df.head()
        
        # append the week's dataframe to the master dataframe
        top40_df = pd.concat([top40_df, current_df])
        
        # log the progress
        clear_output()
        checkpoint = timeit.default_timer()
        print(f'Last saved date: {url_date}')
        print(f'{(w+1)}/{len(weekly_charts)} charts scraped')
        print(f'{(w+1)/len(weekly_charts)*100:.2f}% complete')
        print(f'{(checkpoint-start)/60:.2f} minutes elapsed...')
        
    # end timer, log results
    clear_output()
    stop = timeit.default_timer()
    print(f'Last saved date: {url_date}')
    print(f'{(w+1)/len(weekly_charts)*100:.2f}% complete')
    print(f'Total runtime: {(stop-start)/60:.2f} minutes')
    print(f"{len(top40_df)} total songs and {len(top40_df)/40:.0f} total weeks scraped")
    
    # results
    return top40_df

In [14]:
top40_2 = get_top40_charts(url_df)

top40_2.sample(10)

Last saved date: 1997-11-24
21/1317 charts scraped
1.59% complete
0.52 minutes elapsed...


KeyboardInterrupt: 

In [9]:
top40_2.to_csv("../00_data/top40_1997_2022_raw2.csv", index=False)

In [13]:
len(top40_2)

5720

In [619]:
url_df.url[142]

'https://top40-charts.com/chart.php?cid=27&date=2000-03-27'