In [1]:
# Imports
import re
from urllib.parse import urljoin

import nltk
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs4
pd.set_option('display.max_rows', None)

In [4]:
def ScrapeSongsOfYear(YearSongsURL):
    '''
    Function -> ScrapeSongsOfYear() Requests the page once and returns the cells of its year-end list
        table, so the following steps can keep working on the result without hitting the server again.
        The part of the website being scraped is the year-end list table containing the number the
        song ranked, the title of the song, and the artist.

    (Input): A string URL of the website to scrape
    (Output): The <td></td> tags of the first "wikitable sortable" table, in document order:
            <td> number the song ranked </td>
            <td> song title </td>
            <td> song artist</td>
            ... repeated for every row of the table.
            Returns False when the response status is not 200 or the expected
            section/table cannot be found on the page.
    '''
    page = requests.get(YearSongsURL)
    # Checking status code
    if page.status_code != 200:
        return False

    soup = bs4(page.content, 'html.parser')
    section = soup.find(class_='mw-parser-output')
    if section is None:
        # find() returns None when nothing matches; report it the same way as a bad status
        return False

    table = section.find('table', {"class": "wikitable sortable"})  # specific table
    if table is None:
        return False

    # Fall back to the table itself when the markup has no explicit <tbody>
    body = table.find('tbody')
    if body is None:
        body = table

    return body.find_all('td')

In [12]:
def ExtractData(data):
    '''
    Function -> ExtractData() Parses the html cells and puts the information scraped into a dataframe.
                Also takes the urls hyperlinked to the song name and artist name and appends them into
                lists. It accounts for when the song title or song artist has no link.

    (Input): List of <td> cells outputted from ScrapeSongsOfYear(), ordered as
             (hit number, song title, song artist) for every row. A falsy value (for example the
             False returned by a failed scrape) is treated as "no rows".
    (Output): A df consisting of the Billboard Hit Number, the song title and the artist name
              (the "Genre" column is created but left unfilled).
              A list of URLs consisting of the song wikipedia page that was linked
              A list of URLs consisting of the artist wikipedia page that was linked
              In both lists the string "NoneType" marks a cell without a usable link.
    (Raises): ValueError when the number of cells is not a multiple of 3, because the
              cells could then not be split into (number, song, artist) rows.
    '''
    df = pd.DataFrame(columns=["Billboard Hit Number", "Song", "Artist", "Genre"])
    # Nothing scraped (empty list or the False sentinel): hand back empty results instead of crashing
    if not data:
        return df, [], []

    links = []
    titleAndName = []
    for cell in data:
        titleAndName.append(cell.get_text().strip())
        link_elem = cell.find('a')
        href = link_elem.get('href') if link_elem is not None else None
        if not href:
            # Placeholder keeps the list aligned with the cells so the slicing below stays correct
            links.append("NoneType")
        else:
            # urljoin resolves site-relative hrefs and leaves absolute URLs untouched
            links.append(urljoin("https://en.wikipedia.org", href))

    if len(titleAndName) % 3 != 0:
        raise ValueError(
            "Expected cells in groups of 3 (number, song, artist) but got "
            + str(len(titleAndName)) + " cells"
        )

    df["Billboard Hit Number"] = titleAndName[::3]  # every 3rd element starting at index 0 (Hit Number)
    df["Song"] = titleAndName[1::3]                 # every 3rd element starting at index 1 (Song title)
    df["Artist"] = titleAndName[2::3]               # every 3rd element starting at index 2 (Song Artist)

    songlink = links[1::3]
    artistlink = links[2::3]

    return df, songlink, artistlink


In [14]:
# Models needed by ExtractPronouns: sentence/word tokenizer and part-of-speech tagger.
# NOTE(review): newer NLTK releases may use different resource names - confirm against the installed version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


def ExtractPronouns(text):
    '''
    Function -> ExtractPronouns() Extracts the pronouns from a text scraped from the Artists wikipedia page
                to be able to label the gender of the artist

    (Input): Text of a paragraph taken from the page by the getArtistGender() function
    (Output): A list, in order of appearance, of the lower-cased words that are either
              - tagged as a pronoun (PRP / PRP$) and one of she, her, he, him, his, or
              - exactly "band", "collaboration" or "group".
              Every other word (including other pronouns such as "it" or "they") is left out.
    '''
    # Only keep the word if its one of the below
    keep_list = {'she', 'her', 'band', 'collaboration', 'group', 'him', 'his', 'he'}
    group_words = {"band", "collaboration", "group"}

    pronouns = []
    for sentence in nltk.sent_tokenize(text):
        # Tag the parts of speech of each word in the sentence
        for word, pos in nltk.pos_tag(nltk.word_tokenize(sentence)):
            if pos in ('PRP', 'PRP$') or word in group_words:
                lowered = word.lower()
                # Filter while collecting: removing items from a list that is being
                # iterated skips the element after each removal and lets unwanted words through
                if lowered in keep_list:
                    pronouns.append(lowered)

    return pronouns


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/angeleparkcollin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
def getArtistGender(links):
    '''
    Function -> getArtistGender() Scrapes the Gender of the artist by grabbing the introduction part of the
                text on each wikipedia page. Then uses the ExtractPronouns function to extract just the
                pronouns in the text to be able to label the artist more precisely.

    (Input): A list of URLs of artist wikipedia pages ("NoneType" marks an artist without a link)
    (Output): A list with one label per input link, in the same order: 'Female', 'Male', 'Group'
              or 'Unknown'. 'Unknown' is used when there is no link, the page did not return
              status 200, the introduction cannot be found, or no matching word is found in it.
    '''
    # The label is decided by the first word returned by ExtractPronouns
    label_for = {
        'she': 'Female', 'her': 'Female',
        'his': 'Male', 'him': 'Male', 'he': 'Male',
        'band': 'Group', 'collaboration': 'Group', 'group': 'Group',
    }

    genders = []
    for link in links:
        # Default covers every failure path, so the output always lines up with the input
        gender = 'Unknown'
        if link != "NoneType":  # "NoneType" -> no link attached to Artist name in the Billboard hits page
            page = requests.get(link)
            if page.status_code == 200:
                soup = bs4(page.content, 'html.parser')
                section = soup.find(class_='mw-body-content mw-content-ltr')
                paragraphs = section.find_all('p') if section is not None else []

                intro = paragraphs[1].text if len(paragraphs) > 1 else ''
                # If the intro is too short/non existent then grab the next paragraph of the wikipedia text
                if len(intro) < 5 and len(paragraphs) > 2:
                    intro = paragraphs[2].text

                pronouns = ExtractPronouns(intro)
                if pronouns:
                    gender = label_for.get(pronouns[0], 'Unknown')
        genders.append(gender)

    return genders

In [39]:
def getSongGenre(links, df, start_index=0):
    '''
    Function -> getSongGenre() Fills the "Genre" column of the dataframe with the first genre listed in
                the infobox of each song wikipedia page. The string "None" is written when there is no
                link, no infobox, no Genre row, or the Genre row is empty.

    NOTE(review): getSongGenre() is defined again in a later cell of this notebook; keep only one.

    (Input): A list of URLs of song wikipedia pages ("NoneType" marks a song without a link),
             the dataframe to fill, and the index label of the row that belongs to the first link.
             Link i is written to row label start_index + i.
    (Output): The same dataframe, with the Genre column filled in place
    '''
    superscript_pattern = re.compile(r'\[\d+\]')  # citation markers such as [1]

    # enumerate advances the row label for every link, including the ones without a page,
    # so a missing link can no longer shift the genres that follow it
    for row_label, link in enumerate(links, start=start_index):
        genre = "None"
        if link != "NoneType":  # ie. there is a link
            page = requests.get(link)
            soup = bs4(page.content, 'html.parser')
            table = soup.find('table', class_='infobox')
            box = table.find('th', string='Genre') if table is not None else None
            row = box.find_parent('tr') if box is not None else None
            genres = row.find('td') if row is not None else None
            if genres is not None:
                # One candidate per text line; drop citation markers, then blank entries
                cleaned = (superscript_pattern.sub('', line).strip() for line in genres.text.split('\n'))
                genre = next((name for name in cleaned if name), "None")
        df.loc[row_label, "Genre"] = genre

    return df


In [22]:
def getSongGenre(links, df, start_index=0):
    '''
    Function -> getSongGenre() Scrapes the song Genre by grabbing the genre section on the info box on each
                wikipedia page. Takes just the first genre indicated as its the most common genre of the song.
                If theres no link then set Genre as "None"; the same happens if theres no infobox
                (1950s-1970s), no Genre row in the infobox, or an empty Genre row.

    (Input): A list of URLs of song wikipedia pages, a dataframe{Billboard hit number, song, artist, genre, gender},
             and a start_index=0 to fill the dataframe at the correct index
             (link i is written to row label start_index + i).
    (Output): The same dataframe, with the song Genre column filled out in place
    '''
    # Compiled once instead of on every iteration; matches citation markers such as [1]
    superscript_pattern = re.compile(r'\[\d+\]')

    for row_label, link in enumerate(links, start=start_index):
        genre = "None"
        if link != "NoneType":  # ie. there is a link
            page = requests.get(link)
            soup = bs4(page.content, 'html.parser')
            table = soup.find('table', class_='infobox')
            # Each lookup may come back as None, so stop walking down as soon as one is missing
            box = table.find('th', string='Genre') if table is not None else None
            row = box.find_parent('tr') if box is not None else None
            genres = row.find('td') if row is not None else None
            if genres is not None:
                # Strip the citation markers before trimming so no stray whitespace is left behind
                cleaned = (superscript_pattern.sub('', line).strip() for line in genres.text.split('\n'))
                # First non-empty entry, or "None" when the cell holds no usable text
                genre = next((name for name in cleaned if name), "None")
        df.loc[row_label, "Genre"] = genre

    return df


In [25]:
# Run main - will take a while to run all ~ 15 mins?
def main():
    '''
    Run main() to build and display one dataframe per year-end list, for one year of every decade
    from 1952 to 2022.
    Uncomment df.to_csv to start appending to file. I recommend changing the file name first, as the
    file is appended to, not overwritten.
    '''
    # Scrape from wikipedia
    url_template = "https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{}"
    # Every list has 100 songs, except 1952 which has 30 songs
    years = [2022, 2012, 2002, 1992, 1982, 1972, 1962, 1952]

    for year in years:
        data = ScrapeSongsOfYear(url_template.format(year))
        if not data:
            # ScrapeSongsOfYear returns False when the page could not be fetched
            print("Skipping " + str(year) + ": no table cells could be scraped")
            continue
        df, songlinks, artistlink = ExtractData(data)
        genders = getArtistGender(artistlink)
        df['Genders'] = pd.Series(genders)
        getSongGenre(songlinks, df)  # fills df["Genre"] in place
        display(df)
        # Append each df into csv file
#         df.to_csv('songsOfYear.csv', mode='a', index=False, header=False)

if __name__ == "__main__":
    main()


Unnamed: 0,Billboard Hit Number,Song,Artist,Genre,Genders
0,1,"""Stranger on the Shore""",Acker Bilk,Easy listening • Jazz,Male
1,2,"""I Can't Stop Loving You""",Ray Charles,Country,Male
2,3,"""Mashed Potato Time""",Dee Dee Sharp,"R&B, pop",Female
3,4,"""Roses Are Red (My Love)""",Bobby Vinton,Pop,Male
4,5,"""The Stripper""",David Rose,Jazz,Male
5,6,"""Johnny Angel""",Shelley Fabares,Pop,Female
6,7,"""The Loco-Motion""",Little Eva,"Pop, rhythm and blues",Female
7,8,"""Let Me In""",The Sensations,,Group
8,9,"""The Twist""",Chubby Checker,"Rock and roll, pop",Male
9,10,"""Soldier Boy""",The Shirelles,R&B,Group
