# Scraping data

In [7]:
import os

# Folder to use as the notebook's working directory
new_path = r"C:\Users\aleja\OneDrive\Escritorio\Term_2\Text_Mining\final_project"

# Switch to it so that relative file names resolve inside this folder
os.chdir(new_path)

# Echo the active directory as a sanity check
print(f"Current Working Directory: {os.getcwd()}")


Current Working Directory: C:\Users\aleja\OneDrive\Escritorio\Term_2\Text_Mining\final_project


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Scrape the Billboard Year-End Hot 100 table from Wikipedia for each year,
# collect [year, ranking, title, artist] rows and save them to a CSV file.

# Define the range of years to scrape (1981 through 2021; the end is exclusive)
years = range(1981, 2022)

# Seconds to wait for a response before giving up. Without a timeout a
# stalled connection would block the notebook indefinitely.
REQUEST_TIMEOUT = 30

# Identify the client explicitly, as asked by Wikimedia's User-Agent policy
REQUEST_HEADERS = {
    "User-Agent": "billboard-year-end-scraper/1.0 (educational text-mining project)"
}

# List to store all data
all_data = []

for year in years:
    print(f"Scraping data for the year {year}...")

    # Construct the Wikipedia URL for the specific year
    url = f"https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{year}"

    try:
        # Request page content and decode it as UTF-8
        response = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()  # Raise an error if the request fails
        response.encoding = "utf-8"  # Ensure UTF-8 encoding

        # Parse page content
        soup = BeautifulSoup(response.text, "html.parser")

        # Find the first wikitable on the page; fail with a clear message
        # instead of an AttributeError on None when the page has none.
        table = soup.find("table", {"class": "wikitable"})
        if table is None:
            raise ValueError(f"no wikitable found at {url}")

        # Extract table rows
        rows = table.find_all("tr")

        # Collect this year's rows separately so an empty result can be reported.
        # NOTE(review): rows with fewer than three <td> cells are skipped; if a
        # page merges cells (e.g. via rowspan) those songs would be dropped --
        # verify the per-year row counts in the resulting DataFrame.
        year_rows = []
        for row in rows[1:]:  # Skip header row
            cols = row.find_all("td")
            if len(cols) >= 3:  # Ensure row has the required columns
                ranking = cols[0].text.strip()
                title = cols[1].text.strip()
                artist = cols[2].text.strip()
                year_rows.append([year, ranking, title, artist])

        if not year_rows:
            print(f"Warning: no chart rows were parsed for {year}")
        all_data.extend(year_rows)

    except Exception as e:
        # Best effort: report the failure and carry on with the next year
        print(f"Failed to scrape data for {year}: {e}")

    finally:
        # Sleep for 1 second to avoid overwhelming Wikipedia's servers; done in
        # `finally` so the pause also applies after a failed request.
        time.sleep(1)

# Create DataFrame directly with the final column names
df = pd.DataFrame(all_data, columns=["year", "ranking", "title", "artist"])

# Remove quotation marks from title column
df["title"] = df["title"].str.replace('"', '', regex=False)

# Save to CSV with UTF-8 encoding
df.to_csv("billboard_hot_100_1981_2021.csv", index=False, encoding="utf-8-sig")

print("Data saved to billboard_hot_100_1981_2021.csv")

# Display DataFrame
print(df)


Scraping data for the year 1981...
Scraping data for the year 1982...
Scraping data for the year 1983...
Scraping data for the year 1984...
Scraping data for the year 1985...
Scraping data for the year 1986...
Scraping data for the year 1987...
Scraping data for the year 1988...
Scraping data for the year 1989...
Scraping data for the year 1990...
Scraping data for the year 1991...
Scraping data for the year 1992...
Scraping data for the year 1993...
Scraping data for the year 1994...
Scraping data for the year 1995...
Scraping data for the year 1996...
Scraping data for the year 1997...
Scraping data for the year 1998...
Scraping data for the year 1999...
Scraping data for the year 2000...
Scraping data for the year 2001...
Scraping data for the year 2002...
Scraping data for the year 2003...
Scraping data for the year 2004...
Scraping data for the year 2005...
Scraping data for the year 2006...
Scraping data for the year 2007...
Scraping data for the year 2008...
Scraping data for th

In [12]:
# Write the scraped chart data to disk again: no index column, and
# "utf-8-sig" as the file encoding.
df.to_csv(
    "billboard_hot_100_1981_2021.csv",
    index=False,
    encoding="utf-8-sig",
)