# **Scraping the Billboard Year-End Hot 100 (2004–2024)**

In [2]:
import os

# Optional: switch to the project folder so that relative paths (such as the
# output CSV written later in this notebook) resolve there. Left disabled
# because the location is machine-specific; point it at your own checkout.
#new_path = r"path/to/final_project"

# Change the working directory (only needed if the kernel started elsewhere)
#os.chdir(new_path)

# Show the directory that relative paths will be resolved against
print("Current Working Directory:", os.getcwd())

Current Working Directory: c:\Users\Enzo\Documents\BSE\T2\TEXT_MINING\Final_Paper\Text_mining_final_project


In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Scrape the Billboard Year-End Hot 100 tables from Wikipedia, one page per
# year, collect them into a single DataFrame `df` and write it to a CSV file.

# Define the range of years to scrape (range() excludes the stop value: 2004-2024)
years = range(2004, 2025)

# Request settings
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the cell forever
REQUEST_DELAY = 1     # seconds to pause between pages
REQUEST_HEADERS = {
    # Wikimedia asks automated clients to identify themselves with a descriptive
    # User-Agent. Replace the placeholder with real contact details before use.
    "User-Agent": "billboard-year-end-scraper/1.0 (educational text-mining project)"
}
EXPECTED_ROWS_PER_YEAR = 100  # used only for a sanity notice, never to drop data
OUTPUT_CSV = "billboard_hot_100_2004_2024.csv"

# List to store all data: one [year, ranking, title, artist] record per table row
all_data = []

# A Session reuses the underlying connection across the per-year requests
with requests.Session() as session:
    session.headers.update(REQUEST_HEADERS)

    for year in years:
        print(f"Scraping data for the year {year}...")

        # Construct the Wikipedia URL for the specific year
        url = f"https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{year}"

        # Rows are staged per year so a failure half-way through a page does
        # not leave a partial year in all_data.
        year_rows = []

        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()  # Raise an error if the request fails
            response.encoding = "utf-8"  # Decode the body as UTF-8

            soup = BeautifulSoup(response.text, "html.parser")

            # Find the first wikitable on the page
            table = soup.find("table", {"class": "wikitable"})
            if table is None:
                # Explicit error instead of an opaque AttributeError on None
                raise ValueError("no table with class 'wikitable' found on the page")

            for row in table.find_all("tr")[1:]:  # Skip header row
                cols = row.find_all("td")
                if len(cols) >= 3:  # Ensure row has the required columns
                    ranking = cols[0].text.strip()
                    title = cols[1].text.strip()
                    artist = cols[2].text.strip()
                    year_rows.append([year, ranking, title, artist])

        except Exception as e:
            # Deliberately best-effort: one bad year is reported and skipped
            # so the remaining years are still scraped.
            print(f"Failed to scrape data for {year}: {e}")

        else:
            all_data.extend(year_rows)
            # Rows with fewer than three <td> cells are skipped above; surface
            # that instead of letting the year silently come up short.
            if len(year_rows) != EXPECTED_ROWS_PER_YEAR:
                print(
                    f"Note: parsed {len(year_rows)} rows for {year} "
                    f"(expected {EXPECTED_ROWS_PER_YEAR})"
                )

        finally:
            # Pause after every request, including failed ones, to avoid
            # overwhelming Wikipedia's servers.
            time.sleep(REQUEST_DELAY)

# Create DataFrame directly with the final column names
df = pd.DataFrame(all_data, columns=["year", "ranking", "title", "artist"])

# Remove quotation marks from title column
df["title"] = df["title"].str.replace('"', '', regex=False)

# Save to CSV with UTF-8 encoding ("utf-8-sig" writes a leading byte-order mark)
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")

print(f"Data saved to {OUTPUT_CSV}")

# Display DataFrame
print(df)

Scraping data for the year 2004...
Scraping data for the year 2005...
Scraping data for the year 2006...
Scraping data for the year 2007...
Scraping data for the year 2008...
Scraping data for the year 2009...
Scraping data for the year 2010...
Scraping data for the year 2011...
Scraping data for the year 2012...
Scraping data for the year 2013...
Scraping data for the year 2014...
Scraping data for the year 2015...
Scraping data for the year 2016...
Scraping data for the year 2017...
Scraping data for the year 2018...
Scraping data for the year 2019...
Scraping data for the year 2020...
Scraping data for the year 2021...
Scraping data for the year 2022...
Scraping data for the year 2023...
Scraping data for the year 2024...
Data saved to billboard_hot_100_2004_2024.csv
      year ranking               title                                artist
0     2004       1               Yeah!  Usher featuring Lil Jon and Ludacris
1     2004       2                Burn                           

In [4]:
# Write the current contents of `df` to the CSV file (overwrites any existing
# file of that name). "utf-8-sig" is UTF-8 with a leading byte-order mark.
df.to_csv(
    path_or_buf="billboard_hot_100_2004_2024.csv",
    encoding="utf-8-sig",
    index=False,
)