### Billboard US Webscrapping 

The Jupyter notebook is what we used to scrape the billboard hot 100 data from 2021 to 2023. 

In [2]:
# import scraping libraries
import requests                      # to request/get data from websites
from bs4 import BeautifulSoup as bs  # to parse and extract data from websites

# import other libraries
import time
import random
import pandas as pd
import numpy as np
import re
import datetime

In [3]:
# get the dates of all Saturdays in a given year in order to create the lists of URLS for scraping
def all_saturdays(year):
  dates = []
  """Generator that yields all Saturdays in a given year."""
  # January 1st of the given year
  dt = datetime.date(year, 1, 1)

  # First Saturday of the given year
  dt += datetime.timedelta(days=6 - dt.weekday())

  while dt.year == year:
    yield dt
    dt += datetime.timedelta(days=7)


In [13]:
# get the dates of all Saturdays in 2021 that kaggle data is not aviailable for
dt2021 = all_saturdays(2021)
y3 = [str(r) for r in dt2021]
y_gap = y3[45:52]

In [14]:
# get the dates of all Saturdays in 2022 and 2023
dt2023= all_saturdays(2023)
y1 = [str(r) for r in dt2023]
dt2022 = all_saturdays(2022)
y2 = [str(r) for r in dt2022]

dates = y_gap + y2 + y1

In [15]:
# create the list of URLs for scraping
urls = []
for date in dates:
    url = "https://www.billboard.com/charts/hot-100/" + date + "/"
    urls.append(url)


In [16]:
# web crawler to scrape the data from the URLs
def get_music_info (dates):
    data = []
    for date in dates:
        time.sleep(random.randint(0, 1))    
        url = "https://www.billboard.com/charts/hot-100/" + date + "/"
        header = { "User-Agent" : "small scraper for classroom purposes bao0211@uchicago.edu" }  # add your email
        response = requests.get(url, headers = header) 
        soup = bs(response.text, "html.parser")
        for e in soup.find_all(attrs={'class':'o-chart-results-list-row-container'}):
            temp_dict = {}
            for index, value in enumerate(e.find_all('span', class_='c-label')):
                temp_dict[index] = value
            
            data.append({'date':date,
                            'rank':e.find('span').get_text(strip=True),
                            'title':e.h3.get_text(strip=True),
                            'author':e.h3.find_next('span').get_text(strip=True),
                            'last_week':temp_dict[2].get_text(strip=True),
                            'peak':temp_dict[3].get_text(strip=True),
                            'total_week':temp_dict[4].get_text(strip=True)})
    return data

In [17]:
# get the data
data = get_music_info(dates)
data[2]

{'date': '2021-11-14',
 'rank': '3',
 'title': 'Industry Baby',
 'author': 'Lil Nas X & Jack Harlow',
 'last_week': '3',
 'peak': '1',
 'total_week': '16'}

In [187]:
import csv

In [None]:
# write the data to a csv file
fieldnames = data[0].keys()

# Specify the name of the CSV file
csv_filename = 'billboard_data_rest.csv'

# Write data to CSV file
with open(csv_filename, mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    for row in data:
        writer.writerow(row)