In [7]:
# Widen the notebook's main container so wide DataFrame output is easier to read.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import HTML, display
display(HTML("<style>.container { width:95% !important; }</style>")) #reduce margins in jupyter notebook

# Scrape the top 100 lists
From http://billboardtop100of.com/

In [2]:
# 'http://billboardtop100of.com/' + year + '-2/' -- this is our lovely, consistent url format

In [8]:
from bs4 import BeautifulSoup
from pprint import pprint

import requests
import random
import numpy as np
import pandas as pd
import re
import time

In [9]:
# Range of chart years to scrape: `start` is inclusive, `stop` is exclusive.
start = 1960  # somewhat arbitrary cutoff; scraping earlier years is much more time consuming
stop = 2013   # exclusive; our dataset was released in 2011 and last updated in 2012 as far as I can tell

# Every year in [start, stop) and every chart position 1..100 (default step of 1).
years = np.arange(start, stop)
ranks = np.arange(1, 101)

In [10]:
def get_top_100(year):
    """Scrape one year's chart table from billboardtop100of.com.

    Parameters
    ----------
    year : int or str
        Chart year. It is converted with ``str()`` and inserted into the URL
        ``http://billboardtop100of.com/<year>-2/``.

    Returns
    -------
    pandas.DataFrame
        One row per table row that has at least three ``<td>`` cells, with
        string columns ``Rating``, ``Artist``, ``Song`` and ``Year``.
        Empty if the table contains no such rows.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an HTTP error status.
    ValueError
        If the page contains no ``<table>`` element.
    """
    # get our soup
    url = "http://" + 'billboardtop100of.com/' + str(year) + '-2/'
    response = requests.get(url, timeout=30)  # never hang forever on a stalled connection
    response.raise_for_status()               # surface 4xx/5xx instead of parsing an error page
    soup = BeautifulSoup(response.text, 'html.parser')

    # extract the first table from the page
    table = soup.find('table')
    if table is None:
        raise ValueError("no <table> found at " + url)

    # Parse row by row so a malformed row cannot shift every later entry,
    # and keep every row (entries sharing a rank are not collapsed).
    records = []
    for tr in table.find_all('tr'):
        cells = [td.text.strip() for td in tr.find_all('td')]
        if len(cells) < 3:
            # header / spacer rows carry no (rank, artist, title) triple
            continue
        rating, artist, song = cells[:3]
        records.append({'Rating': rating,
                        'Artist': artist,
                        'Song': song,
                        'Year': str(year)})

    # move our list of dicts into a dataframe
    return pd.DataFrame(records)

In [11]:
# Scrape every year in `years`. Per-year frames are collected in a list and
# concatenated once at the end (DataFrame.append was removed in pandas 2.0 and
# growing a frame inside a loop is quadratic).
year_frames = []

# default placeholder frame; replaced by a year-specific one whenever a scrape fails
filler_db = pd.DataFrame(np.nan,index=np.arange(100),columns=('Rating','Artist','Song','Year'))

for year in years: # loop through our list of years to scrape all pages
    try:
        year_frames.append(get_top_100(year))
    except Exception as exc:
        # Record 100 placeholder rows for this year so that we know which years
        # failed and can fill them in later. `Year` is stored as a string to
        # match the rows produced by get_top_100.
        print("failed to scrape " + str(year) + ": " + repr(exc))
        d = {'Rating':np.nan, 'Artist':np.nan, 'Song':np.nan, 'Year':str(year) }
        filler_db = pd.DataFrame(data=d, index=np.arange(100))
        year_frames.append(filler_db)

    delay = random.randint(1, 4) # pause 1-4 seconds between requests to be polite to the server
    time.sleep(delay)

# Each year keeps its own 0-based index, matching the original append behaviour.
historical_top_100 = pd.concat(year_frames) if year_frames else pd.DataFrame()

In [12]:
pprint(historical_top_100) # verify we extracted information correctly; expect (stop-start)*100 rows (not columns) if every year yielded 100 entries

                                     Artist Rating  \
0                               Percy Faith      1   
1                                Jim Reeves      2   
2                           Everly Brothers      3   
3                            Johnny Preston      4   
4                              Mark Dinning      5   
5                                Brenda Lee      6   
6                             Elvis Presley      7   
7                               Jimmy Jones      8   
8                             Elvis Presley      9   
9                            Chubby Checker     10   
10                           Connie Francis     11   
11                             Bobby Rydell     12   
12                            Brothers Four     13   
13                               Jack Scott     14   
14                            Marty Robbins     15   
15                        Hollywood Argyles     16   
16                           Connie Francis     17   
17                          

In [13]:
historical_top_100.to_csv('historical_top_100.csv') # export the scraped charts to a CSV file (relative path, so it lands in the current working directory)

In [14]:
# Re-load the CSV written by the export cell to check it round-trips.
# pd.DataFrame.from_csv is deprecated (removed in pandas 1.0); pd.read_csv with
# index_col=0 restores the saved index column. The relative path reads the file
# that was just exported instead of a machine-specific absolute path.
hist = pd.read_csv('historical_top_100.csv', index_col=0) # check our csv

  """Entry point for launching an IPython kernel.


In [15]:
hist.head() # preview the first rows of the re-loaded CSV as a sanity check -- looks good!


Unnamed: 0,Artist,Rating,Song,Year
0,Percy Faith,1,Theme From “A Summer Place”,1960
1,Jim Reeves,2,He’ll Have To Go,1960
2,Everly Brothers,3,Cathy’s Clown,1960
3,Johnny Preston,4,Running Bear,1960
4,Mark Dinning,5,Teen Angel,1960
