In [2]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.30.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium)
  Downloading attrs-25.3.0-py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.30.0-py3-none-any.whl (9.4 MB)
   ---------------------------------------- 0.0/9.4 MB ? eta -:--:--
   --- ------------------------------------ 0.8/9.4 MB 11.2 MB/s eta 0:00:01
   ---------------- ----------------------- 3.9/9.4 MB 13.8 MB/s eta 0:00:01
   ------------------------------ --------- 7.1/9.4 MB 14.1 MB

In [3]:
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import pandas as pd
from datetime import datetime, timedelta

def _collect_stat_texts(container, skip=()):
    """
    Collect the statistic-like texts found under a parsed element.

    Parameters:
    container: Parsed element exposing ``select`` (a BeautifulSoup tag)
    skip (iterable): Elements to leave out; matched by identity, entries may be None

    Returns:
    list of str: Stripped texts of the ``span.c-label`` elements whose text is
    all digits, '★' or '-', in the order ``select`` returns them
    """
    stat_texts = []
    for span in container.select('span.c-label'):
        # Identity, not ==: BeautifulSoup's == compares markup, so it could also
        # drop a different span that merely looks the same as the rank/artist one.
        if any(span is skipped for skipped in skip):
            continue
        text = span.text.strip()
        if text.isdigit() or text == '★' or text == '-':
            stat_texts.append(text)
    return stat_texts


def _parse_song_row(row):
    """
    Extract one chart entry from a song row element.

    Parameters:
    row: Parsed ``div.o-chart-results-list-row-container`` element

    Returns:
    dict: Keys 'Rank', 'Title', 'Artist', 'Last Week', 'Peak Position' and
    'Weeks on Chart'; every value is a string, "N/A" when it could not be found
    """
    # Rank is the big number on the left of the row
    rank_elem = row.select_one('span.c-label.a-font-primary-bold-l')
    title_elem = row.select_one('h3#title-of-a-story')
    artist_elem = row.select_one('span.c-label.a-font-primary-s')

    rank = rank_elem.text.strip() if rank_elem else "N/A"
    title = title_elem.text.strip() if title_elem else "N/A"
    artist = artist_elem.text.strip() if artist_elem else "N/A"

    # The rank and artist spans are not statistics, so leave them out
    not_stats = (rank_elem, artist_elem)
    stats = _collect_stat_texts(row, skip=not_stats)

    # Special handling for the first song: if its statistics are missing from
    # the row itself, look in the enclosing <li>. The rank span is excluded as an
    # element rather than by value - a #1 song always has a peak of "1", so
    # dropping every "1" would throw away the very statistics being looked for.
    if rank == "1" and len(stats) < 3:
        parent = row.find_parent('li')
        if parent is not None:
            stats = _collect_stat_texts(parent, skip=not_stats)

    # The last 3 values are taken as last week / peak / weeks on chart
    if len(stats) >= 3:
        last_week, peak_pos, weeks_on_chart = stats[-3:]
    else:
        last_week = peak_pos = weeks_on_chart = "N/A"

    return {
        'Rank': rank,
        'Title': title,
        'Artist': artist,
        'Last Week': last_week,
        'Peak Position': peak_pos,
        'Weeks on Chart': weeks_on_chart,
    }


def scrape_billboard_radio_songs_with_dates(start_date, end_date, weekly=True, preview_rows=3):
    """
    Scrape Billboard Streaming Songs for multiple dates

    Despite the function name, the URL built here targets the
    ``streaming-songs`` chart. Every chart date from start_date to end_date
    (inclusive) is loaded in Chrome with JavaScript disabled, parsed with
    BeautifulSoup, and the combined result is written to
    'billboard_streaming_songs_{start_date}_to_{end_date}.csv' in the current
    working directory.

    Parameters:
    start_date (str): Start date in 'YYYY-MM-DD' format
    end_date (str): End date in 'YYYY-MM-DD' format
    weekly (bool): If True, capture weekly data, otherwise try every day (not recommended)
    preview_rows (int): Number of rows to show in preview (default 3)

    Returns:
    pandas.DataFrame: One row per scraped song with the columns 'Chart Date',
    'Rank', 'Title', 'Artist', 'Last Week', 'Peak Position', 'Weeks on Chart'
    (all strings). Empty, but with those columns, when nothing was scraped.

    Raises:
    ValueError: If start_date or end_date is not in 'YYYY-MM-DD' format
    (raised before the browser is started)
    """
    # Convert string dates to datetime objects
    start = datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.strptime(end_date, '%Y-%m-%d')
    date_increment = timedelta(days=7) if weekly else timedelta(days=1)

    # Fixed column order, also used as the CSV header when no rows were collected
    columns = ['Chart Date', 'Rank', 'Title', 'Artist',
               'Last Week', 'Peak Position', 'Weeks on Chart']

    # Configure Chrome to run without JavaScript
    chrome_options = Options()
    chrome_options.add_experimental_option("prefs", {"profile.managed_default_content_settings.javascript": 2})

    # Initialize the driver
    driver = webdriver.Chrome(options=chrome_options)

    # Rows collected across all dates
    all_chart_data = []

    try:
        current_date = start
        while current_date <= end:
            formatted_date = current_date.strftime('%Y-%m-%d')
            # Advance right away so that every skip path below moves on to the next date
            current_date += date_increment

            # Build the URL with the date
            url = f"https://www.billboard.com/charts/streaming-songs/{formatted_date}/"
            print(f"\nScraping data for {formatted_date}...")

            # Access the page and give it time to load
            driver.get(url)
            time.sleep(3)

            html_content = driver.page_source

            # Check if the page has a "Page not found" message
            if "Page Not Found" in html_content:
                print(f"No data available for {formatted_date}, skipping...")
                continue

            soup = BeautifulSoup(html_content, 'html.parser')

            # Find all song rows
            song_rows = soup.select('div.o-chart-results-list-row-container')
            print(f"Found {len(song_rows)} song rows for {formatted_date}")

            if len(song_rows) == 0:
                print(f"No songs found for {formatted_date}, skipping...")
                continue

            # Track how many rows we've processed for preview
            rows_processed = 0

            for row in song_rows:
                try:
                    song = _parse_song_row(row)
                except Exception as e:
                    # Best effort: one malformed row must not abort the whole scrape
                    print(f"Error processing row: {e}")
                    continue

                # Add the data with the chart date
                all_chart_data.append({'Chart Date': formatted_date, **song})

                # Only print for the first preview_rows rows
                rows_processed += 1
                if rows_processed <= preview_rows:
                    print(f"Added: {formatted_date} - {song['Rank']}. {song['Title']} - {song['Artist']} | LW: {song['Last Week']} | Peak: {song['Peak Position']} | Weeks: {song['Weeks on Chart']}")
                elif rows_processed == preview_rows + 1:
                    print(f"... (collecting remaining {len(song_rows) - preview_rows} songs) ...")

            # Add a small delay between requests to be considerate
            time.sleep(2)

        # Create a DataFrame for all the extracted data
        df = pd.DataFrame(all_chart_data, columns=columns)

        # Save the data to a CSV file
        output_filename = f'billboard_streaming_songs_{start_date}_to_{end_date}.csv'
        df.to_csv(output_filename, index=False)
        print(f"\nAll data saved to '{output_filename}'")

        return df

    finally:
        # Close the browser
        driver.quit()

if __name__ == "__main__":
    # Demo run: weekly Streaming Songs charts from 2024-01-06 through 2024-12-28.
    # Both values use the YYYY-MM-DD format; edit them to scrape another period.
    start_date, end_date = "2024-01-06", "2024-12-28"

    scrape_billboard_radio_songs_with_dates(
        start_date=start_date,
        end_date=end_date,
        preview_rows=3,
    )


Scraping data for 2024-01-06...
Found 50 song rows for 2024-01-06
Added: 2024-01-06 - 1. Rockin' Around The Christmas Tree - Brenda Lee | LW: 1 | Peak: 1 | Weeks: 50
Added: 2024-01-06 - 2. All I Want For Christmas Is You - Mariah Carey | LW: 2 | Peak: 1 | Weeks: 62
Added: 2024-01-06 - 3. Jingle Bell Rock - Bobby Helms | LW: 3 | Peak: 2 | Weeks: 43
... (collecting remaining 47 songs) ...

Scraping data for 2024-01-13...
Found 50 song rows for 2024-01-13
Added: 2024-01-13 - 1. Lovin On Me - Jack Harlow | LW: 19 | Peak: 1 | Weeks: 8
Added: 2024-01-13 - 2. I Remember Everything - Zach Bryan Featuring Kacey Musgraves | LW: 28 | Peak: 1 | Weeks: 19
Added: 2024-01-13 - 3. Cruel Summer - Taylor Swift | LW: 49 | Peak: 1 | Weeks: 37
... (collecting remaining 47 songs) ...

Scraping data for 2024-01-20...
Found 50 song rows for 2024-01-20
Added: 2024-01-20 - 1. Lovin On Me - Jack Harlow | LW: 1 | Peak: 1 | Weeks: 9
Added: 2024-01-20 - 2. I Remember Everything - Zach Bryan Featuring Kacey Musgrav