# Import Statements

In [1]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Web Scrape

This project's aim is to build a content-based recommendation model that allows users to input a song, preferably one recently made, and returns recommendations for songs with similar qualities that have been on the Billboard Hot 100. The flow of this project will be:

1. Scrape Wiki for the Hot 100 data that is needed (song, artist, year on Hot 100)
2. Use Spotify's WebAPI to gather metadata (basic track metadata, track features, and track analysis) for every song on the Hot 100 list that Spotify has data for
3. Get the song the user would like recommendations for, along with an optional decade if the user chooses one, gather the same metadata as for the Hot 100 songs, and add this song temporarily to the dataset.
4. Find the cosine similarity score for the dataset and sort data to provide the user with the top 10 songs most similar to the one that was input.

To start this project, I need to collect my data. I will be using the data from Wikipedia for Billboard's Hot 100 for each year from 1960 through 1999 as my main reference data when making recommendations.

In [3]:
# Scrape Wikipedia's "List of Billboard Hot 100 number ones of <year>" page
# for every year from 1960 through 1999 and combine the yearly tables into
# the `data` dataframe, tagging each row with the year of its page.
FIRST_YEAR, LAST_YEAR = 1960, 1999
WIKI_URL = "https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number_ones_of_"

# Collect one dataframe per year and concatenate once after the loop;
# concatenating inside the loop re-copies the accumulated frame every time.
yearly_frames = []

for year in range(FIRST_YEAR, LAST_YEAR + 1):

    # Getting the connection to the Billboard Hot 100 page for this year.
    # The timeout keeps the cell from hanging forever on a stalled request.
    response = requests.get(WIKI_URL + str(year), timeout=30)

    # Accept every 2xx status code (200 through 299 inclusive).
    if not 200 <= response.status_code < 300:
        raise Exception(f"Wiki Request Error: {response.status_code}")

    # Create our soup and parse out the tables
    # containing the billboard list of songs.
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table', {'class': "wikitable"})

    # The needed table is the second table in tables; fail with a clear
    # message instead of an IndexError if the page layout is different.
    if len(tables) < 2:
        raise Exception(
            f"Wiki Parse Error: expected at least 2 wikitables for {year}, "
            f"found {len(tables)}"
        )
    billboard_table = tables[1]

    # read_html returns a list of dataframes; the single table passed in is
    # the first element. The HTML is wrapped in StringIO because passing a
    # literal HTML string to read_html is deprecated in recent pandas.
    year_df = pd.read_html(StringIO(str(billboard_table)))[0]

    # Add a column for the year the song was on the billboard list
    year_df['year'] = year
    yearly_frames.append(year_df)

    # sanity check to make sure pages are being scraped
    print(f"Done with year {year}")

    # Wait 3 seconds before getting another pull
    time.sleep(3)

# Row labels are left as parsed per page (they repeat across years),
# matching what the later reset_index call expects to clean up.
data = pd.concat(yearly_frames)

Done with year 1960
Done with year 1961
Done with year 1962
Done with year 1963
Done with year 1964
Done with year 1965
Done with year 1966
Done with year 1967
Done with year 1968
Done with year 1969
Done with year 1970
Done with year 1971
Done with year 1972
Done with year 1973
Done with year 1974
Done with year 1975
Done with year 1976
Done with year 1977
Done with year 1978
Done with year 1979
Done with year 1980
Done with year 1981
Done with year 1982
Done with year 1983
Done with year 1984
Done with year 1985
Done with year 1986
Done with year 1987
Done with year 1988
Done with year 1989
Done with year 1990
Done with year 1991
Done with year 1992
Done with year 1993
Done with year 1994
Done with year 1995
Done with year 1996
Done with year 1997
Done with year 1998
Done with year 1999


In [4]:
# Display the combined dataframe built by the scraping loop
data

Unnamed: 0,No.,Issue date,Song,Artist(s),Reference,year,Ref.,Issue date[1]
0,24,January 4,"""El Paso""",Marty Robbins,[1],1960,,
1,24,January 11,"""El Paso""",Marty Robbins,[2],1960,,
2,25,January 18,"""Running Bear""",Johnny Preston,[3],1960,,
3,25,January 25,"""Running Bear""",Johnny Preston,[4],1960,,
4,25,February 1,"""Running Bear""",Johnny Preston,[5],1960,,
...,...,...,...,...,...,...,...,...
47,850,November 27,"""Smooth""",Santana featuring Rob Thomas,,1999,[50],
48,850,December 4,"""Smooth""",Santana featuring Rob Thomas,,1999,[51],
49,850,December 11,"""Smooth""",Santana featuring Rob Thomas,,1999,[52],
50,850,December 18,"""Smooth""",Santana featuring Rob Thomas,,1999,[53],


In [5]:
# List the distinct values in the 'year' column to confirm which years are present
data['year'].unique()

array([1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970,
       1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981,
       1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992,
       1993, 1994, 1995, 1996, 1997, 1998, 1999], dtype=int64)

The Issue date looks to be the issue of the Billboard chart when the song first made #1 on the charts, and the date when they lost their #1 status. As of right now, I do not believe this information would be relevant to me. I could see myself using this potentially to create a feature 'weighted score' which could be used as a determinant for how popular a song was when deciding if it would be appropriate to recommend. However, I think I will remove the column right now and come back to it later if needed. I would also like to remove the Reference, Ref., Issue date[1], and No. columns. These columns will be of no use to me in my recommender. 

In [6]:
# Revisit this later if the length of time a song
# spent at #1 on the charts turns out to be needed.
# The columns listed here are removed from the dataframe.
data = data.drop(
    columns=['No.', 'Issue date', 'Reference', 'Ref.', 'Issue date[1]']
)

In [7]:
# Preview the first five rows after dropping the unused columns
data.head()

Unnamed: 0,No.,Song,Artist(s),year
0,24,"""El Paso""",Marty Robbins,1960
1,24,"""El Paso""",Marty Robbins,1960
2,25,"""Running Bear""",Johnny Preston,1960
3,25,"""Running Bear""",Johnny Preston,1960
4,25,"""Running Bear""",Johnny Preston,1960


The Hot 100 officially began on Monday, August 4, 1958, and the No. column records the order of songs that have made it onto the Hot 100. This could be useful as an index further down the line, but it is removed together with the other unused columns in the drop above, so it would need to be re-scraped if it is needed later. The next and last thing I want to do to clean this data is to remove all duplicate entries. We have duplicate entries now because of the original data scrape including both the date the song was made #1 on the Billboard Hot 100 and the day it left #1. Additionally, some songs returned to the Hot 100 in multiple years, causing them to be entered another time with their position in the now removed 'No.' column being recorded as 're'. Since I have removed the column corresponding to that, there isn't a need for the duplicate entries.

When removing duplicates I wanted to be careful not to remove entries with the same song name but different artists. Over the years there have been a number of songs that have been re-recorded by new artists years after their original fame. At times, the re-recording can even have an entirely different sound than the previous version. This would provide much different metadata for each of the versions, thus improving our recommender. As such, I will remove duplicate entries where there is a matching song name and artist, rather than simply by song name. I will disregard the year column, as I would be fine losing songs performed by the same artist that appear in the Hot 100 in multiple years at this time.

In [11]:
# Keep one row per (song, artist) pair, then renumber the rows from zero
data = (
    data
    .drop_duplicates(subset=['Song', 'Artist(s)'])
    .reset_index(drop=True)
)

# Lift the row-display limit so the whole dataframe is rendered below
pd.set_option('display.max_rows', None)
data

Unnamed: 0,Song,Artist(s),year
0,"""El Paso""",Marty Robbins,1960
1,"""Running Bear""",Johnny Preston,1960
2,"""Teen Angel""",Mark Dinning,1960
3,"""Theme from A Summer Place""",Percy Faith,1960
4,"""Stuck on You""",Elvis Presley,1960
5,"""Cathy's Clown""",The Everly Brothers,1960
6,"""Everybody's Somebody's Fool""",Connie Francis,1960
7,"""Alley Oop""",Hollywood Argyles,1960
8,"""I'm Sorry""",Brenda Lee,1960
9,"""Itsy Bitsy Teenie Weenie Yellow Polkadot Bikini""",Brian Hyland,1960


In [18]:
# Every row whose song title appears more than once in the dataframe.
# Exact (song, artist) duplicates were dropped earlier, so these are
# titles that reached the list under more than one artist entry.
data.loc[data.duplicated(subset='Song', keep=False)]

Unnamed: 0,Song,Artist(s),year
8,"""I'm Sorry""",Brenda Lee,1960
38,"""Please Mr. Postman""",The Marvelettes,1961
52,"""The Loco-Motion""",Little Eva,1962
59,"""Go Away Little Girl""",Steve Lawrence,1963
129,"""My Love""",Petula Clark,1966
136,"""When a Man Loves a Woman""",Percy Sledge,1966
150,"""You Keep Me Hangin' On""",The Supremes,1966
152,"""Good Vibrations""",The Beach Boys,1966
176,"""Honey""",Bobby Goldsboro,1968
205,"""Venus""",Shocking Blue,1970


In [19]:
# (rows, columns) of the de-duplicated dataframe
data.shape

(828, 3)

All of the song names in the 'Song' column have a leading and trailing quotation mark. As a final step, let's remove those, then I can write this dataset to a csv file and move it to the data directory.

In [20]:
# Strip the leading and trailing double-quote characters from each song title
data = data.assign(Song=data['Song'].str.strip('"'))

In [21]:
# Preview the first five rows to check the quotation marks are gone
data.head()

Unnamed: 0,Song,Artist(s),year
0,El Paso,Marty Robbins,1960
1,Running Bear,Johnny Preston,1960
2,Teen Angel,Mark Dinning,1960
3,Theme from A Summer Place,Percy Faith,1960
4,Stuck on You,Elvis Presley,1960


In [22]:
# Save the cleaned dataframe to the data directory. The row index is
# written as the first (unnamed) column, which is to_csv's default.
data.to_csv(path_or_buf='../data/hot100_60-99.csv')