# An Analysis of the WMJI Majic 105.7 Top 500 Songs of 1995

**Abstract:** This notebook presents the results of an exploratory analysis of the "Majic Number" list of top 500 songs, as found in the April 28, 1999 snapshot of the WMJI radio station website.

## Fetch the Data

In [1]:
from bs4 import BeautifulSoup
import requests
import pickle
from datetime import datetime
import sys
import os

In [2]:
# Controls where the parsed pages come from:
#   True  -> load the most recent pickle cache found on disk
#   False -> download the pages and write a new cache file
# Toggle this to False if you need to fetch the
# data for the first time (i.e. before any cache file exists)
useCache = True

In [3]:
# The root web page from which to download the data set.
# This is a web.archive.org capture of the station's Top 500 index
# page; the 14-digit number in the URL (19990428111902) appears to be
# the capture timestamp, i.e. 1999-04-28 11:19:02.
url = 'http://web.archive.org/web/19990428111902/http://www.wmji.com/fullsite/top500/top500.html'

In [4]:
def writeCache(listOfBeautifulSoups, timestamp, cachePrefix='',
              cacheSuffix='.pickle' ):
    '''writeCache(): ListOfBeautifulSoup datetime.datetime [Str1] [Str2] -> <File>
    Purpose: to write the ListOfBeautifulSoup to a cache file in pickle
             format. datetime.datetime is the timestamp (in UTC) at
             which the data were fetched, and is written into the name
             of <File> with the space between date and time replaced
             by "T" (e.g. 2020-01-02T03:04:05). Optional keywords
             cachePrefix and cacheSuffix set the start and end of the
             cache file's name.
    Returns: None. The file is created at
             cachePrefix + timestamp + cacheSuffix.
    '''
    # Name the file after the fetch time handed in by the caller, not
    # after the moment of writing, so the name records when the data
    # were actually downloaded.
    path = cachePrefix + str(timestamp).replace(' ','T') + cacheSuffix
    with open(path, 'wb') as rawCache:
        defaultRecLimit = sys.getrecursionlimit()
        # The limit is raised around the dump, presumably because
        # pickling deeply nested soups can exceed the default recursion
        # depth. Never lower a limit that is already higher.
        sys.setrecursionlimit(max(defaultRecLimit, 10**6))
        try:
            pickle.dump(listOfBeautifulSoups, rawCache)
        finally:
            # Restore the interpreter-wide limit even if pickling fails.
            sys.setrecursionlimit(defaultRecLimit)

In [5]:
def readCache(path):
    '''readCache(): Str --> ListOfSoups
    Purpose: to load the pickled object stored in the cache file
             at "path" and return it. "path" is required.
    Note:    unpickling can execute arbitrary code, so only read
             cache files that you created yourself.
    '''
    with open(path, 'rb') as cacheFile:
        return pickle.load(cacheFile)

In [6]:
def findLatestCache(directory='./',cachePrefix='',
                   cacheSuffix='.pickle'):
    '''findLatestCache(): [Str1] [Str2] [Str3] -> Str
    Purpose: to find the latest cache file in the current working
             directory or in the specified directory (subdirectories
             are searched too), and return the path of that file
             as a Str that can be passed straight to open().
             A file counts as a cache when its name starts with
             cachePrefix and ends with cacheSuffix. "Latest" means
             the file name that sorts last, which is the most recent
             one when the names embed an ISO-style timestamp, as the
             names produced by writeCache do.
    Raises:  ValueError if no matching file is found.
    '''
    caches = list()

    for dirpath, dirnames, filenames in os.walk(directory):
        for file in filenames:
            # Anchor the match to the ends of the name so that e.g.
            # "x.pickle.bak" is not mistaken for a cache file.
            if file.startswith(cachePrefix) and file.endswith(cacheSuffix):
                # Keep the directory so the result is openable even when
                # the file is not in the current working directory.
                caches.append(os.path.join(dirpath, file))

    if not caches:
        raise ValueError("No cache file found.")

    # Compare by file name first so the directory part of the path does
    # not influence which cache is considered latest; the full path only
    # breaks ties between identically named files.
    return max(caches, key=lambda path: (os.path.basename(path), path))

In [7]:
# Populate listOfSoups, either from the newest cache file on disk or by
# downloading the index page and every song-list page it links to.
if useCache:
    listOfSoups = readCache(findLatestCache())
else:
    # Only the fetch branch needs this, so import it here.
    from urllib.parse import urljoin

    # Fetch the initial URL; fail loudly on an HTTP error instead of
    # parsing (and later caching) an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Parse the result and enumerate links to follow. The raw bytes are
    # handed to BeautifulSoup so it can detect the page's encoding
    # (decoding as UTF-8 up front can fail on non-UTF-8 pages), and the
    # parser is named explicitly so results do not depend on which
    # optional parsers happen to be installed.
    rootSoup = BeautifulSoup(response.content, 'html.parser')
    # Collect the URLs to the data set; href=True skips anchors that
    # have no href attribute at all.
    links = [ link for link in rootSoup.find_all('a', href=True)
              if "songsnumberpt" in link['href'] ]
    # Collect and parse the pages at each link
    listOfSoups = list()

    # Directory part of the index URL (everything up to the last "/")
    rootPage = url.rsplit('/', 1)[0] + '/'

    fetchTime = datetime.utcnow()

    for link in links:
        # urljoin resolves relative hrefs against rootPage and leaves
        # absolute or root-relative hrefs intact.
        thisLink = urljoin(rootPage, link['href'])
        response = requests.get(thisLink, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        listOfSoups.append(soup)

    writeCache(listOfSoups,fetchTime)

In [None]:
# Retrieve data from the songs table on each page.
# Result: pages is a list with one dict per page, holding
#   'fields' : list of header-cell texts (empty if no header row found)
#   'values' : list of rows, each a list of cell texts in column order
pages = list()
for page in listOfSoups:
    # There's only one table on each page
    table = page.find_all('table')[0]
    # Start fresh for every page so a page without a header row cannot
    # silently reuse the previous page's field names.
    fields = list()
    rows = list()
    # Ask for every <tr> below the table rather than walking
    # table.contents: this skips whitespace strings between tags and
    # still finds rows that the parser nested inside other elements.
    for row in table.find_all('tr'):
        # Look only at the row's own cells so that rows nested inside
        # this one are not folded into it.
        # NOTE(review): assumes cells are direct children of their <tr>;
        # verify against the actual pages.
        headers = row.find_all('th', recursive=False)
        if headers:
            fields = [header.text for header in headers]
            continue
        cells = row.find_all('td', recursive=False)
        if cells:
            # A list (not a set) keeps column order and repeated values,
            # so each entry lines up with the matching name in fields.
            rows.append([cell.text for cell in cells])

    pages.append({ 'fields' : fields,
                   'values' : rows})

In [33]:
# LEFT OFF HERE. Something's amiss with parsing the table.
# When you just inspect table.contents, you get nice plaintext rows.
# When you try to parse each row, you get three times the number
# of rows you expect.
#
# Diagnostic: take the first table of the first page and print the
# type of each of its direct children, to see how the parser
# structured the table.
mypage = listOfSoups[0]
table = mypage.find_all('table')[0]
for item in table:
    print(type(item))

<class 'bs4.element.NavigableString'>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>


In [13]:
# NOTE(review): logicalRows is not assigned in any cell shown above;
# presumably it came from a cell that is no longer in the notebook.
# Unless it is defined elsewhere, this raises NameError on a fresh kernel.
len(logicalRows)

126