In [177]:
import requests
import re
from bs4 import BeautifulSoup as bs4
import pandas as pd
import numpy as np
import nltk
from tqdm import tqdm
pd.set_option('display.max_rows', None)

In [207]:
def ScrapeSongsOfYear(YearSongsURL):
    page = requests.get(YearSongsURL)
    soup = bs4(page.content, 'html.parser')
    section = soup.find(class_='mw-parser-output')
    table = section.find('table',{"class":"wikitable sortable"}) #specific table
    body = table.find('tbody')
    rows = body.find_all('tr')
    row = body.find_all('td')

    return row

In [211]:
def ExtractData(data):
    df = pd.DataFrame(columns=["Index","Song","Artist","Genre"])
    links = []
    titleAndName = []
    
    for row in data:
        link_elem = row.find('a')
        info = row.get_text().strip()
        titleAndName.append(info)
        if link_elem is None: 
            links.append("NoneType") # If no link on artist/song
        else:
            link = link_elem.get('href')
            links.append("https://en.wikipedia.org"+link)
    
    df["Index"]= titleAndName[::3]
    df["Song"]=titleAndName[1::3]
    df["Artist"]=titleAndName[2::3]
    
    songlink = links[1::3]
    artistlink = links[2::3]

    return df,songlink,artistlink


In [212]:
#extract just the pronouns from a text
nltk.download('punkt') # download the necessary nltk data

def extract_pronouns(text):
    # Tokenize the text into sentences
    sentences = nltk.sent_tokenize(text)
    
    # Use nltk's POS tagging to tag the parts of speech in each sentence
    tagged_sentences = [nltk.pos_tag(nltk.word_tokenize(sentence)) for sentence in sentences]
    
    # Extract the pronouns from the tagged sentences
    pronouns = []
    for tagged_sentence in tagged_sentences:
        for word, pos in tagged_sentence:
#             print(word,pos)
            if pos == 'PRP' or pos == 'PRP$' or word == "band" or word=="collaboration":
                pronouns.append(word.lower())
    
    keep_list =['she','her','band','collaboration','him','his','he']
    for p in pronouns:
        if p not in keep_list:
            pronouns.remove(p)
    
    return pronouns


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/angeleparkcollin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [213]:
def getArtistGender(links):
    genders =[]
    for link in links:
        if link == "NoneType": # ie. No link
            genders.append('Unknown')
            continue
        page = requests.get(link)
        soup = bs4(page.content, 'html.parser')
        section = soup.find(class_='mw-body-content mw-content-ltr')

        intro = section.find_all('p')[1].text
        
        if len(intro) < 5:
            intro = section.find_all('p')[2].text
        pronouns = extract_pronouns(intro)
        if len(pronouns) == 0:
            pronouns = ['Unknown']
        
        if pronouns[0] == "she" or pronouns[0] == "her":
            genders.append('Female')
        elif pronouns[0] == "his" or pronouns[0] == "him" or pronouns[0]== 'he':
            genders.append('Male')
        elif pronouns[0] =='band' or pronouns[0] =='collaboration':
            genders.append('Group')
        else:
            genders.append('Unknown')

    return genders

In [214]:
def getSongGenre(links, df, start_index=0):
    for link in links:
        page = requests.get(link)
        soup = bs4(page.content, 'html.parser')
        table = soup.find('table', class_='infobox').find('th', string='Genre') 
        if table is None:
            df.loc[start_index, "Genre"] = "None"
        else:
            genres = table.find_parent('tr').find('td')
            genre_names = [genre.strip() for genre in genres.text.split('\n') if genre.strip()]
            superscript_pattern = r'\[\d+\]'
            genre_names = [re.sub(superscript_pattern, '', text) for text in genre_names]
            df.loc[start_index, "Genre"] = genre_names[0]
            
        start_index += 1
    return df

In [223]:
#Rescrapre from wikipedia
Y2022="https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_2022"
Y2012="https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_2012"
Y2002="https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_2002"
Y1992="https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_1992"
Y1982="https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_1982"
Y1972="https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_1972"
Y1962="https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_1962"
Y1952="https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_1952"
Y1942="https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_1942"
# years = [Y2022,Y2012,Y2002,Y1992,Y1982,Y1972,Y1962,Y1952,Y1942]
years = [Y2022,Y2012]
for year in years:
    data = ScrapeSongsOfYear(year)
    df,songlinks,artistlink = ExtractData(data)
    genders = getArtistGender(artistlink)
    df['Genders'] = pd.Series(genders)
    getSongGenre(songlinks,df)

    # Put df into csv file
    df.to_csv('songsOfYear.csv', mode='a', index=False, header=False)


In [216]:
# df,songlinks,artistlink = ExtractData(data)

In [217]:
# genders = getArtistGender(artistlink)
# df['Genders'] = pd.Series(genders)

In [218]:
# getSongGenre(songlinks,df)

Unnamed: 0,Index,Song,Artist,Genre,Genders
0,1,"""Somebody That I Used to Know""",Gotye featuring Kimbra,Art pop,Male
1,2,"""Call Me Maybe""",Carly Rae Jepsen,Pop,Female
2,3,"""We Are Young""",Fun featuring Janelle Monáe,Stadium rockindie rockindie pop,Group
3,4,"""Payphone""",Maroon 5 featuring Wiz Khalifa,Pop,Group
4,5,"""Lights""",Ellie Goulding,Synth-pop,Female
5,6,"""Glad You Came""",The Wanted,Eurodance,Group
6,7,"""Stronger (What Doesn't Kill You)""",Kelly Clarkson,Dance-pop,Female
7,8,"""We Found Love""",Rihanna featuring Calvin Harris,Electro housedance-pop,Female
8,9,"""Starships""",Nicki Minaj,Dance-pop,Female
9,10,"""What Makes You Beautiful""",One Direction,Power pop,Group


In [219]:
# # Put df into csv file
# df.to_csv("songsofYear.csv", index=False)