In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta

In [183]:
## FUNCTIONS and VARIABLES

def firstrowstats(row):
    '''
    Extract the chart statistics from the first row of a chart page.

    The first row carries different CSS classes from every other row,
    so it needs its own set of selectors.

    Arguments:
    row -- the soup data for the first row on a given page

    Returns:
    [rank, title, artist, lastweek, peak, weekcount], each a stripped string
    '''

    def text_of(tag):
        # first child of the tag, with surrounding whitespace removed
        return tag.contents[0].strip()

    rank_class = "c-label a-font-primary-bold-l u-font-size-32@tablet u-letter-spacing-0080@tablet"
    artist_class = "c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only u-font-size-20@tablet"
    stats_class = 'c-label a-font-primary-bold-l a-font-primary-m@mobile-max u-font-weight-normal@mobile-max lrv-u-padding-tb-050@mobile-max u-font-size-32@tablet'

    # rank label of the entry
    rank = text_of(row.find("span", {'class': rank_class}))
    # song title
    title = text_of(row.find("h3", {'id': "title-of-a-story"}))
    # artist name
    artist = text_of(row.find("span", {'class': artist_class}))
    # first three matching spans: last week's rank, peak position, weeks on chart
    stat_spans = row.find_all('span', {'class': stats_class})[:3]
    lastweek, peak, weekcount = [text_of(span) for span in stat_spans]
    return [rank, title, artist, lastweek, peak, weekcount]
    
#returns stats of row besides row 0
def rowstats(row):
    '''
    Extract the chart statistics from any row other than the first one.

    Arguments:
    row -- the soup data for the given row

    Returns:
    [rank, title, artist, lastweek, peak, weekcount], each a stripped string
    '''
    selectors = {
        'rank': "c-label a-font-primary-bold-l u-font-size-32@tablet u-letter-spacing-0080@tablet",
        'artist': "c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only",
        'stats': 'c-label a-font-primary-m lrv-u-padding-tb-050@mobile-max',
    }

    # rank label of the entry
    rank_tag = row.find("span", {'class': selectors['rank']})
    rank = rank_tag.contents[0].strip()

    # song title
    title_tag = row.find("h3", {'id': "title-of-a-story"})
    title = title_tag.contents[0].strip()

    # artist name
    artist_tag = row.find("span", {'class': selectors['artist']})
    artist = artist_tag.contents[0].strip()

    # last week's rank, peak position and weeks on chart come from the
    # first three spans that carry the stats class
    stat_texts = []
    for span in row.find_all('span', {'class': selectors['stats']})[:3]:
        stat_texts.append(span.contents[0].strip())
    lastweek, peak, weekcount = stat_texts

    return [rank, title, artist, lastweek, peak, weekcount]
    
# returns stats of page
def pagestats(df, data, date):
    '''
    Appends one row per chart entry found on a page to the dataframe (in place).

    Arguments:
    df -- dataframe that we use to store the extracted results
    data -- soup data for a given chart page
    date -- date of the page; stored as the first value of every added row

    Returns:
    None. If the page contains no chart rows, df is left untouched and a
    warning is printed instead of raising IndexError.
    '''

    # One soup element per chart entry on the page
    rows = data.find_all("div", {"class": "o-chart-results-list-row-container"})

    # A page without chart rows must not abort a long multi-week scrape
    if not rows:
        print(f"WARN: no chart rows found for {date}")
        return

    # The first entry uses different HTML classes, so it has its own parser
    df.loc[len(df.index)] = [date] + firstrowstats(rows[0])

    # Every remaining entry shares the same markup
    for row in rows[1:]:
        df.loc[len(df.index)] = [date] + rowstats(row)
        
def getdata(df, weeklist):
    '''
    Loops through every date in the list, requests the Billboard Hot 100
    page for that week and adds its entries to the dataframe (in place)

    Arguments
    df -- dataframe to store the data in
    weeklist -- iterable of dates (each must support strftime), one per chart week

    Raises
    requests.exceptions.RequestException -- on timeout, connection failure
        or an HTTP error status; rows added for earlier weeks stay in df
    '''

    for week in weeklist:
        date = week.strftime('%Y-%m-%d')
        print(f"INIT: request page {date}")
        # a timeout keeps one stalled connection from hanging the whole scrape
        r = requests.get(f'https://www.billboard.com/charts/hot-100/{date}', timeout=30)
        # fail loudly on 4xx/5xx instead of parsing an error page
        r.raise_for_status()
        # name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed
        data = BeautifulSoup(r.content, 'html.parser')
        # the original date object (not the string) is stored in the dataframe
        pagestats(df, data, week)

def updatedata(df):
    '''
    Brings the dataframe up to date (in place).

    An empty dataframe triggers a full download; otherwise only the weeks
    after the most recent stored date are requested.

    Arguments:
    df -- DataFrame used to store the data
    '''

    if len(df['Date']) != 0:
        # latest date already stored
        maxdate = max(df['Date'].value_counts().keys())
        # resume one week after the latest stored date
        resume_from = maxdate + timedelta(days=7)
        datelist = pd.date_range(start=resume_from, end=datetime.today(), freq='W-SAT')
        print(f"INIT: getdata from {maxdate}")
        getdata(df, datelist)
        return

    # nothing stored yet: fetch the whole history
    gatheralldata(df)
    
def gatheralldata(df):
    '''
    Downloads the full chart history into the dataframe (in place):
        one request per Saturday from 1958-08-02 up to today

    Arguments:
    df -- DataFrame used to store the data
    '''

    first_chart = datetime.strptime("1958-08-02", '%Y-%m-%d')
    saturdays = pd.date_range(start=first_chart, end=datetime.today(), freq='W-SAT')
    getdata(df, saturdays)
    
def createtable():
    '''
    Builds an empty DataFrame that has one column per scraped chart field
    '''

    return pd.DataFrame(
        columns=["Date", "Rank", "Title", "Artist", "LastWeek", "Peak", "WeeksOnChart"]
    )

In [184]:
# Start from an empty table that already has the chart columns
df1=createtable()
df1

Unnamed: 0,Date,Rank,Title,Artist,LastWeek,Peak,WeeksOnChart


In [185]:
# Scrape the full history into df1 (one request per week, so this is slow).
# The recorded run ended with KeyboardInterrupt; rows added before that stay in df1.
gatheralldata(df1)

INIT: request page 1958-08-02
INIT: request page 1958-08-09
INIT: request page 1958-08-16
INIT: request page 1958-08-23
INIT: request page 1958-08-30
INIT: request page 1958-09-06
INIT: request page 1958-09-13


KeyboardInterrupt: 

In [186]:
# Inspect what was collected before the run was interrupted
df1

Unnamed: 0,Date,Rank,Title,Artist,LastWeek,Peak,WeeksOnChart
0,1958-08-02,1,Poor Little Fool,Ricky Nelson,-,1,1
1,1958-08-02,2,Patricia,Perez Prado And His Orchestra,-,2,1
2,1958-08-02,3,Splish Splash,Bobby Darin,-,3,1
3,1958-08-02,4,Hard Headed Woman,Elvis Presley With The Jordanaires,-,4,1
4,1958-08-02,5,When,Kalin Twins,-,5,1
...,...,...,...,...,...,...,...
595,1958-09-06,96,My Lucky Love,Doug Franklin With The Bluenotes,-,96,1
596,1958-09-06,97,Endless Sleep,Jody Reynolds,92,17,6
597,1958-09-06,98,Leroy,Jack Scott,89,56,5
598,1958-09-06,99,Angel Baby,Dean Martin,88,30,6


In [3]:
# Get list of all weeks (Saturdays) from the beginning to today
earliestDate="1958-08-02"
dates = pd.date_range(end = datetime.today(), start = datetime.strptime(earliestDate, '%Y-%m-%d'), freq='W-SAT')

# Create df with every column that pagestats writes: the date plus the six row stats
col = ["Date", "Rank", "Title", "Artist", "LastWeek", "Peak", "WeeksOnChart"]
df = pd.DataFrame(columns= col)

# getdata takes the target dataframe first, then the list of weeks
getdata(df, dates)

In [67]:
# There are some missing values: why?
# Rows stored per chart date, then only the dates with at most 99 rows
dateCounts = df['Date'].value_counts()
dateCounts = dateCounts[dateCounts.values <= 99]
pd.DataFrame(dateCounts.keys())

# They are not available on the billboard website (does not exist)

Unnamed: 0,0
0,1977-01-08
1,1976-12-25
2,1977-02-26
3,1977-02-19
4,1976-12-11
5,1977-01-01
6,1976-12-18
7,1977-01-22
8,1977-01-15
9,1977-02-12


In [94]:
# Latest chart date already stored in df
maxdate= max(df['Date'].value_counts().keys())
# Saturdays from one week after maxdate up to today (empty in the recorded output)
dates = pd.date_range(end = datetime.today(), start = maxdate + timedelta(days=7), freq='W-SAT')
dates

DatetimeIndex([], dtype='datetime64[ns]', freq='W-SAT')

In [47]:
# Same check as before: chart dates that have at most 99 stored rows
dateCounts = df['Date'].value_counts()
dateCounts = dateCounts[dateCounts.values <= 99]
pd.DataFrame(dateCounts.keys())
# type(df['Date'])

Unnamed: 0,0
0,1977-01-08
1,1976-12-25
2,1977-02-26
3,1977-02-19
4,1976-12-11
5,1977-01-01
6,1976-12-18
7,1977-01-22
8,1977-01-15
9,1977-02-12


In [55]:
# Number of rows in df whose date is one of the dates in dateCounts
# df['Date'].isin(dateCounts.keys())
len(df[df['Date'].isin(dateCounts.keys())])

1287

In [60]:
# Number of distinct Rank values that occur in at most 3368 rows
# NOTE(review): 3368 is presumably the number of weekly charts collected; confirm and name the constant
len(df['Rank'].value_counts()[df['Rank'].value_counts().values<=3368])

98

In [9]:
# Persist the scraped data; the row index is written as the first CSV column
# because index=False is not passed
df.to_csv('Data/billboard100.csv')

In [108]:
# Fetch the chart page without a date in the URL and split it into chart rows
# (timeout so a stalled connection cannot hang the kernel)
r = requests.get('https://www.billboard.com/charts/hot-100/', timeout=30)
# name the parser explicitly so results do not depend on which parsers are installed
data = BeautifulSoup(r.content, 'html.parser')
df2 = pd.DataFrame()
week = datetime.today()
rows = data.find_all("div", {"class":"o-chart-results-list-row-container"} )

In [130]:
# Read the fill attribute of the first <circle> element in the third row and look it up in `colors`.
# Only the last expression is displayed; the first line's value is discarded.
# NOTE(review): `colors` is not defined in any cell shown here; presumably a dict
# mapping fill values to trend labels. Confirm and define it before this cell.
rows[2].find("circle").attrs['fill']
colors[rows[2].find("circle").attrs['fill']]

'Increase'

In [164]:
#last week
rows[2].find('span', {'class':'c-label a-font-primary-m lrv-u-padding-tb-050@mobile-max'}).contents[0].strip()
rows[0].find('span', {'class':'c-label a-font-primary-bold-l a-font-primary-m@mobile-max u-font-weight-normal@mobile-max lrv-u-padding-tb-050@mobile-max u-font-size-32@tablet'}).contents[0].strip()


'1'

In [170]:
# Same selector probe repeated: regular-row stats span (rows[2]) vs first-row stats span (rows[0]).
# Only the last expression is displayed.
rows[2].find('span', {'class':'c-label a-font-primary-m lrv-u-padding-tb-050@mobile-max'}).contents[0].strip()
rows[0].find('span', {'class':'c-label a-font-primary-bold-l a-font-primary-m@mobile-max u-font-weight-normal@mobile-max lrv-u-padding-tb-050@mobile-max u-font-size-32@tablet'}).contents[0].strip()


'1'

In [165]:
# Same selector probe repeated: regular-row stats span (rows[2]) vs first-row stats span (rows[0]).
# Only the last expression is displayed.
rows[2].find('span', {'class':'c-label a-font-primary-m lrv-u-padding-tb-050@mobile-max'}).contents[0].strip()
rows[0].find('span', {'class':'c-label a-font-primary-bold-l a-font-primary-m@mobile-max u-font-weight-normal@mobile-max lrv-u-padding-tb-050@mobile-max u-font-size-32@tablet'}).contents[0].strip()


'1'

In [175]:
# Take the first three stats spans of the third row and unpack their stripped text.
# The cell ends with an assignment, so nothing is displayed; the bare find_all result is discarded.
rows[2].find_all('span', {'class':'c-label a-font-primary-m lrv-u-padding-tb-050@mobile-max'})
# rows[0].find('span', {'class':'c-label a-font-primary-bold-l a-font-primary-m@mobile-max u-font-weight-normal@mobile-max lrv-u-padding-tb-050@mobile-max u-font-size-32@tablet'}).contents[0].strip()
lastweek, peak, weekcount = [i.contents[0].strip() for i in rows[2].find_all('span', {'class':'c-label a-font-primary-m lrv-u-padding-tb-050@mobile-max'})[:3]]

In [182]:
# Show the three values unpacked from the stats spans
print(lastweek, peak, weekcount)

1 1 3


In [180]:
# Same unpacking, using the first-row stats selector.
# NOTE(review): `row` is not assigned at top level in any cell shown here; this cell
# relies on kernel state (presumably row = rows[0]). Confirm and define it.
# lastweek, peak, weekcount = [i.contents[0].strip() for i in row.find_all('span', {'class':'c-label a-font-primary-m lrv-u-padding-tb-050@mobile-max'})[:3]]
lastweek, peak, weekcount = [i.contents[0].strip() for i in row.find_all('span', {'class':'c-label a-font-primary-bold-l a-font-primary-m@mobile-max u-font-weight-normal@mobile-max lrv-u-padding-tb-050@mobile-max u-font-size-32@tablet'})[:3]]