In [2]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import requests
import numpy as np
import datetime
from datetime import datetime, timedelta
import locale
import random
import time

In [102]:
# Define a function for scraping data from billboard hot100 songs 

def scrape_billboard_data(start_date, end_date):
    
    data = []
    current_date = start_date

    while current_date >= end_date:
        url = f"https://www.billboard.com/charts/hot-100/{current_date.strftime('%Y-%m-%d')}"
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        
        items = soup.find_all('li', class_='o-chart-results-list__item')

        for item in items:
            # Find the title and artist
            title_tag = item.find('h3', class_='c-title')
            artist_tag = item.find('span', class_='c-label')

            if title_tag and artist_tag:
                title = title_tag.text.strip()
                artist = artist_tag.text.strip()
                data.append((current_date.strftime('%Y-%m-%d'), title, artist))

        # Add random delay between 1 and 3 seconds
        delay = random.uniform(1, 3)
        time.sleep(delay)

        current_date -= timedelta(weeks=1)

    return data


In [None]:
#Scrape years 2000-2024
start_date = datetime(2024,4,20).date()
end_date = datetime(2000, 1, 1).date()

# Scrape data for multiple dates within the function
data = scrape_billboard_data(start_date, end_date)

# Create a DataFrame
df = pd.DataFrame(data, columns=['Date', 'Title', 'Artist'])


In [None]:
df.to_csv("data/billboard_2000-2024.csv")

In [None]:
# Scrape 1990-1999

start_date = datetime(1999,12,26).date()
end_date = datetime(1990, 1, 1).date()

# Scrape data for multiple dates within the function
data2 = scrape_billboard_data(start_date, end_date)

# Create a DataFrame
df2 = pd.DataFrame(data2, columns=['Date', 'Title', 'Artist'])

In [None]:
df2.to_csv("data/billboard_1999-1990.csv")

In [None]:
#Scrape 1980-1989

start_date = datetime(1989,12,30).date()
end_date = datetime(1980, 1, 1).date()

# Scrape data for multiple dates within the function
data3 = scrape_billboard_data(start_date, end_date)

# Create a DataFrame
df3 = pd.DataFrame(data3, columns=['Date', 'Title', 'Artist'])

In [None]:
df3.to_csv("data/billboard_1980-1989.csv")

In [None]:
#Scrape 1970-1979

start_date = datetime(1979,12,30).date()
end_date = datetime(1970, 1, 1).date()

# Scrape data for multiple dates within the function
data3 = scrape_billboard_data(start_date, end_date)

# Create a DataFrame
df3 = pd.DataFrame(data3, columns=['Date', 'Title', 'Artist'])

In [None]:
df3.to_csv("data/billboard_1970-1979.csv")

In [None]:
# Scrape 1960-1969

start_date = datetime(1969,12,30).date()
end_date = datetime(1960, 1, 1).date()

# Scrape data for multiple dates within the function
data4 = scrape_billboard_data(start_date, end_date)

# Create a DataFrame
df4 = pd.DataFrame(data4, columns=['Date', 'Title', 'Artist'])

In [None]:
df4.to_csv("data/billboard_1960-1969.csv")

In [None]:
# Scrape last available years 1958-1959

start_date = datetime(1959,12,30).date()
end_date = datetime(1958, 1, 1).date()

# Scrape data for multiple dates within the function
data5 = scrape_billboard_data(start_date, end_date)

# Create a DataFrame
df5 = pd.DataFrame(data4, columns=['Date', 'Title', 'Artist'])

In [None]:
df5.to_csv("data/billboard_1958-59.csv")

In [18]:
# Re-read all CSVs

df1= pd.read_csv("data/billboard_2000-2024.csv")
df2= pd.read_csv("data/billboard_1999-1990.csv")
df3= pd.read_csv("data/billboard_1980-1989.csv")
df4= pd.read_csv("data/billboard_1970-1979.csv")
df4= pd.read_csv("data/billboard_1970-1979.csv")
df5= pd.read_csv("data/billboard_1960-1969.csv")
df6= pd.read_csv("data/billboard_1958-59.csv")

In [19]:
# Concat all dfs, clean columns, and drop any null rows

df_full= pd.concat([df1,df2,df3,df4,df5, df6])
df_full.columns=df_full.columns.str.lower()
df_full= df_full.dropna(how="any")
df_full.shape

(387287, 4)

In [20]:
df_full=df_full.drop(columns="unnamed: 0")

In [21]:
df_full

Unnamed: 0,date,title,artist
0,2024-04-20,Like That,"Future, Metro Boomin & Kendrick Lamar"
1,2024-04-20,Too Sweet,Hozier
2,2024-04-20,Beautiful Things,Benson Boone
3,2024-04-20,Lose Control,Teddy Swims
4,2024-04-20,Texas Hold 'Em,Beyonce
...,...,...,...
52195,1960-01-05,"The Theme From ""A Summer Place""",Percy Faith And His Orchestra
52196,1960-01-05,Let The Good Times Roll,Ray Charles
52197,1960-01-05,Tell Her For Me,Adam Wade
52198,1960-01-05,Bulldog,The Fireballs


In [22]:
# Since I scraped weekly charts songs can repeat mutliple times for different dates, 
# I have to drop duplicates of title and artits

df_full = df_full.drop_duplicates( 
  subset = ['artist', 'title'], 
  keep = 'first').reset_index(drop = True) 

In [23]:
df_full.shape

(30376, 3)

In [25]:
# Check songs per year

df_full['date'] = pd.to_datetime(df_full['date'])
df_full['year'] = df_full['date'].dt.year

songs_per_year = df_full.groupby('year').size().reset_index(name='num_songs')
songs_per_year[songs_per_year['year']==1960]


Unnamed: 0,year,num_songs
0,1960,591


In [26]:
df1960=df_full[df_full['year']==1960]


In [28]:
df1960.shape

(591, 4)

In [29]:

df1960 = df1960.drop_duplicates( 
  subset = ['artist', 'title'], 
  keep = 'first').reset_index(drop = True) 

df1960.shape

(591, 4)

In [30]:
df_full['date'] = pd.to_datetime(df_full['date'])

df_full['year'] = pd.DatetimeIndex(df_full['date']).year

In [31]:
# Since I don't have songs from 2024 in my lyrics dataset I remove year 2024

df_full = df_full[df_full['year']<2023]

In [32]:
df_full.shape

(29417, 4)

In [203]:
df_full.to_csv("data/billboard_weekly_1958-2024.csv")