In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import csv

In [2]:
def fetch_hot100(year):
    csv_rows = [['Rank', 'Title', 'Artist(s)', 'Artists Separately']]

    url = 'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_' + str(year)

    html = urlopen(url)
    soup = BeautifulSoup(html, 'html.parser')

    table = soup.find('table', {'class':'wikitable sortable'}).tbody

    num = 1
    for row in table.find_all('tr'):
        # skip empty rows
        cells_found = False
        if len(row.find_all('td')) == 3:
            num_cell, title_cell, artist_cell = row.find_all('td')
            cells_found = True
        
        if not cells_found:
            continue

        # extract title and artist from their cells
        title = title_cell.text.strip()

        artist = artist_cell.text.strip() # artists as a string
        artists_list = list(map(lambda x: x.getText(), artist_cell.find_all('a'))) # list of artists
        artists_separately = ";".join(artists_list)

        # create new row for the csv file
        row = [num, title, artist, artists_separately]
        csv_rows.append(row)

        num += 1
        # print(f'Title: {title}, Artist(s): {artist}', artists_list) # ! DEBUG

    # append all rows to new csv file
    csv_name = 'hot100files/' + str(year) + '.csv'
    with open(csv_name, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(csv_rows)

    print(f'Collected {year}')

In [19]:
START_YEAR = 1959
CUR_YEAR = 2023

In [None]:

for year in range(START_YEAR, CUR_YEAR):
    fetch_hot100(year)
        

In [3]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import pandas as pd

In [4]:
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

In [5]:
intToKey = {
    0:'C', 1:'C#/Db', 2:'D', 3:'D#/Eb', 4:'E', 5:'F', 6:'F#/Gb',
    7:'G', 8:'G#/Ab', 9:'A', 10:'A#/Bb', 11:'B', -1:'NA'
}

intToMode = {
    0:'minor', 1:'major'
}

In [6]:
year = 2022
hot100_path = f'hot100files/{year}.csv'
hot100_data = pd.read_csv(hot100_path)
hot100_data

Unnamed: 0,Rank,Title,Artist(s),Artists Separately
0,1,"""Heat Waves""",Glass Animals,Glass Animals
1,2,"""As It Was""",Harry Styles,Harry Styles
2,3,"""Stay""",The Kid Laroi and Justin Bieber,The Kid Laroi;Justin Bieber
3,4,"""Easy on Me""",Adele,Adele
4,5,"""Shivers""",Ed Sheeran,Ed Sheeran
...,...,...,...,...
95,96,"""Flower Shops""",Ernest featuring Morgan Wallen,Ernest;Morgan Wallen
96,97,"""To the Moon""",Jnr Choi and Sam Tompkins,
97,98,"""Unholy""",Sam Smith and Kim Petras,Sam Smith;Kim Petras
98,99,"""One Mississippi""",Kane Brown,Kane Brown


In [13]:
target = hot100_data.iloc[95]
target

Rank                                              96
Title                                 "Flower Shops"
Artist(s)             Ernest featuring Morgan Wallen
Artists Separately              Ernest;Morgan Wallen
Name: 95, dtype: object

In [14]:
query = f'track:{target["Title"]} artist:'
query += target['Artist(s)']
results = sp.search(q=query, limit=1, type='track')

In [16]:
results

{'tracks': {'href': 'https://api.spotify.com/v1/search?query=track%3A%22Flower+Shops%22+artist%3AErnest+featuring+Morgan+Wallen&type=track&offset=0&limit=1',
  'items': [],
  'limit': 1,
  'next': None,
  'offset': 0,
  'previous': None,
  'total': 0}}

In [15]:
uri = results['tracks']['items'][0]['uri']
uri

IndexError: list index out of range

In [11]:
features = sp.audio_features([uri])
pd.DataFrame.from_dict(features, orient='columns')

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.761,0.525,11,-6.9,1,0.0944,0.44,7e-06,0.0921,0.531,80.87,audio_features,02MWAaffLxlfxAUY7c5dvx,spotify:track:02MWAaffLxlfxAUY7c5dvx,https://api.spotify.com/v1/tracks/02MWAaffLxlf...,https://api.spotify.com/v1/audio-analysis/02MW...,238805,4
