In [135]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from collections import OrderedDict

<h2>Function 1: Get a Show's Info</h2>

In [136]:
def get_all_show_info(url):
    
    #URL request/response cycle  
    response = requests.get(url)
    if response.status_code != 200:
        print("Failure! Could not connect with setlist.fm -- output from Function 1")
    results_page = BeautifulSoup(response.content, 'lxml')
    
    all_show_info = list()
    
    #Part 1 -- Get the date 
    try:
        full_date = results_page.find_all('div', class_="breadCrumbBar")[0].find_all('span')[-1].get_text().rstrip('Setlist')
        month, day, year = full_date.strip().replace(",","").split(" ")
    except:
        pass
    
    #Create a tuple of the date info and add it to the all_show_info list
    try:
        date_tuple = (month, day, year)
        all_show_info.append(date_tuple)
    except:
        pass
    
    #Part 2 -- Get the location info
    try:
        header_info = results_page.find('h1').find_all('span')
    except:
        pass
    
    try:
        location_info = header_info[3].find('span').get_text()
    except:
        pass
    
    try:
        venue, city, state, country = location_info.split(",")
    except:
        pass
    
    try:
        location_tuple = (venue, city, state, country) 
        all_show_info.append(location_tuple)
    except:
        pass
        
    #Part 3 -- Get the songs
    setlist = list()
    try:
        set_list_list = results_page.find_all('a', 'songLabel')
        for song in set_list_list:
            setlist.append(song.get_text())
   
        setlist_tuple = tuple(setlist)
        all_show_info.append(setlist_tuple)
    
    except:
        pass
    
    return all_show_info

<h4> Test out the function -- get_all_show_info</h4>

In [137]:
show_data = get_all_show_info('https://www.setlist.fm/setlist/grateful-dead/1984/san-francisco-civic-auditorium-san-francisco-ca-53d607d9.html')
print(show_data)


[('December', '31', '1984'), ('San Francisco  Civic Auditorium', ' San Francisco', ' CA', ' USA'), ('Shakedown Street', 'Minglewood Blues', 'Peggy-O', 'Jack Straw', 'Bird Song', 'Hell in a Bucket', "Don't Ease Me In", 'Sugar Magnolia', 'Scarlet Begonias', 'Fire on the Mountain', 'Man Smart, Woman Smarter', 'Drums', 'Space', 'Spanish Jam', 'The Wheel', 'Throwing Stones', 'Turn On Your Love Light', "Gimme Some Lovin'", "Uncle John's Band", 'Around and Around', "It's All Over Now, Baby Blue")]


In [138]:
show_data_output = get_all_show_info('https://www.setlist.fm/setlist/grateful-dead/1984/san-francisco-civic-auditorium-san-francisco-ca-53d607d9.html')


<h2>Function 2: Add a show's info and setlist to a new dataframe</h2>

In [147]:
def add_all_show_info_to_df(show_data_output):
    from collections import OrderedDict
    
    try:
        show_month = show_data_output[0][0]
    except:
        pass
    
    try:
        show_day = show_data_output[0][1]
    except:
        pass
    
    try:
        show_year = show_data_output[0][2]
    except:
        pass
    
    try:
        show_venue = show_data_output[1][0] 
    except:
        pass
    
    try:
        show_city = show_data_output[1][1]
    except:
        pass
    
    try:
        show_state = show_data_output[1][2]
    except:
        pass
    
    try:
        show_country = show_data_output[1][3]
    except:
        pass
    
    show_as_index = "%s-%s-%s" % (show_year, show_month, show_day)
    
    try:
        show_dict = {
    'Index': [show_as_index], 'Year': [show_year], 'Month': [show_month], 'Day': [show_day],
    'Venue': [show_venue], 'City': [show_city], "State": [show_state], "Country": [show_country]}
    except:
        pass
    
    try:
        set_list = show_data_output[2]
        song_dict = OrderedDict() #Create an ordered dict to hold the songs/setlist

        for number, song_title in enumerate(set_list):
            key = number+1 #This ensures the track listing starts at 1 instead of 0
            pretty_key = "Song" + str(key) 
            value = song_title
            song_dict[pretty_key] = value #Add each key, value to the ordered dict where key=Song# and value = track name

        show_dict.update(song_dict) #Append/add the songs to the show dict
    
    except:
        pass
    
    try:
        show_dict
    except:
        column_list = ['Index','Year','Month','Day','Venue','City','State','Country' ]
        df = pd.DataFrame(columns = column_list)
    else: 
        df = pd.DataFrame(show_dict)
    
    return df

<h2>Function 3: Get a show's info and create a dataframe of it</h2>

In [148]:
def get_show_info_and_make_df(show_url):
    show_info_as_list = get_all_show_info(str(show_url))
    show_as_df = add_all_show_info_to_df(show_info_as_list)
    return show_as_df
    


<h2>Function 4: Combine the first and subsequent show DFs</h2>

In [149]:
def add_more_shows(first_show_df, second_show_df):
    combo_df = first_show_df.append(second_show_df, ignore_index=True, sort=False)
    return combo_df

<h3>##################################################################</h3>

<h2>Function 5: Search for an artist and return a link to their most recent shows</h2>

In [269]:
def find_an_artist():    
    artist_name = input(" Please enter the name of an artist")
    if ' ' in artist_name:
        artist_name = artist_name.replace(" ", "%20")    
    url = "https://www.setlist.fm/search?query=" + artist_name

    response = requests.get(url)
    if response.status_code != 200:
        print("Failed to connect with website -- output from Function 5")

    return url  

<h4> Test out the function</h4>

In [270]:
find_an_artist()

 Please enter the name of an artistbilly joel


'https://www.setlist.fm/search?query=billy%20joel'

<h2>Function 6: Grab first ten shows' URLs and add to a list</h2>

In [326]:
def create_list_of_artist_sets(setlist_link):
    response = requests.get(setlist_link)
    if response.status_code != 200:
        print("Failed to connect with website -- output from Function 6")
        
    
    results_page = BeautifulSoup(response.content, 'lxml')
    messy_list_show_links = results_page.find_all('div', class_='rightColumn col-xs-12 col-md-9')[0].find_all('h2')
    
    show_links_list = list()
    
    counter = 0
    for show in messy_list_show_links:    
        try:
            raw_url = show.find('a').get('href')
            actual_url = 'https://www.setlist.fm/'+ raw_url 
            show_links_list.append(actual_url)
        except:
            pass
    return show_links_list

In [327]:
create_list_of_artist_sets("https://www.setlist.fm/search?query=billy%20joel")

['https://www.setlist.fm/setlist/billy-joel/2018/madison-square-garden-new-york-ny-6b97be66.html',
 'https://www.setlist.fm/setlist/billy-joel/2018/madison-square-garden-new-york-ny-53963335.html',
 'https://www.setlist.fm/setlist/billy-joel/2018/bbandt-field-winston-salem-nc-4396ab87.html',
 'https://www.setlist.fm/setlist/billy-joel/2018/madison-square-garden-new-york-ny-23e91c6f.html',
 'https://www.setlist.fm/setlist/billy-joel/2018/kauffman-stadium-kansas-city-mo-13e9dd61.html',
 'https://www.setlist.fm/setlist/billy-joel/2018/wrigley-field-chicago-il-53e8535d.html',
 'https://www.setlist.fm/setlist/billy-joel/2018/madison-square-garden-new-york-ny-73e8d2e9.html',
 'https://www.setlist.fm/setlist/billy-joel/2018/fenway-park-boston-ma-23e8a49f.html',
 'https://www.setlist.fm/setlist/billy-joel/2018/fenway-center-boston-ma-7be8d620.html',
 'https://www.setlist.fm/setlist/billy-joel/2018/citizens-bank-park-philadelphia-pa-beb2536.html']

<h2>Function 7: Combine the previous two functions. Enter an artist, find their setlist page and add their first page of show URLs to a list</h2>

In [334]:
def find_artist_and_setlists():
    artist_setlist_page = find_an_artist()
    list_of_setlists = create_list_of_artist_sets(artist_setlist_page)
    return list_of_setlists, artist_setlist_page

<h4> Test out the function</h4>

In [336]:
one, two = find_artist_and_setlists()

 Please enter the name of an artistbilly joel


<h2>Function 8: Combine previous two functions. From an artist first page of setlists, get the number of pages of setlists and return a list of links to each show</h2>

In [338]:
two

'https://www.setlist.fm/search?query=billy%20joel'

In [395]:
def grab_all_setlists():
    first_ten_setlists, artist_setlist_page = find_artist_and_setlists()
    response = requests.get(artist_setlist_page)
    
    if response.status_code != 200:
        print("Failure! Could not connect with setlist.fm -- output from Function 8")

    results_page = BeautifulSoup(response.content, 'lxml')

    set_list_page_links = list()
    pagenumber = 2    
    
    total_num_pages =  results_page.find_all('div', class_='col-xs-12 noTopBorder noTopPadding hidden-print text-center listPager-lg')
    li_list = total_num_pages[0].find_all('li')
    last_page_text = int(li_list[8].get_text())
    
    #total_num_pages =  int(results_page.find_all('div', class_='col-xs-12 noTopBorder noTopPadding hidden-print text-center listPager-lg')[0].find_all('li')[-2].get_text())

    if last_page_text > 50:
        print("We've found", last_page_text, "pages worth of music for this artist. That means", last_page_text*10, "shows!")
        user_decision = (input("Are you sure you want to proceed (Y/N)?")).lower()[0]

        if user_decision == "n":
            print("Abort! Abort!")

        if user_decision == "y":
            for num in range(last_page_text):
                show_pages = 'https://www.setlist.fm/setlists/billy-joel-7bd6be40.html' + '?page=' + str(pagenumber) 
                set_list_page_links.append(show_pages)
                pagenumber += 1
        
        set_list_page_links.pop()
        
        return first_ten_setlists, set_list_page_links 

In [400]:
the_first_ten, pages_2_on = grab_all_setlists()

 Please enter the name of an artistbilly joel
We've found 166 pages worth of music for this artist. That means 1660 shows!
Are you sure you want to proceed (Y/N)?y


<h2>Function 8: Combine the previous three functions. Enter an artist, find their setlist page and add the first (most recent) 10 shows
    to a dataframe</h2>


In [410]:
pages_2_on

['https://www.setlist.fm/setlists/billy-joel-7bd6be40.html?page=2',
 'https://www.setlist.fm/setlists/billy-joel-7bd6be40.html?page=3',
 'https://www.setlist.fm/setlists/billy-joel-7bd6be40.html?page=4',
 'https://www.setlist.fm/setlists/billy-joel-7bd6be40.html?page=5',
 'https://www.setlist.fm/setlists/billy-joel-7bd6be40.html?page=6',
 'https://www.setlist.fm/setlists/billy-joel-7bd6be40.html?page=7',
 'https://www.setlist.fm/setlists/billy-joel-7bd6be40.html?page=8',
 'https://www.setlist.fm/setlists/billy-joel-7bd6be40.html?page=9',
 'https://www.setlist.fm/setlists/billy-joel-7bd6be40.html?page=10',
 'https://www.setlist.fm/setlists/billy-joel-7bd6be40.html?page=11',
 'https://www.setlist.fm/setlists/billy-joel-7bd6be40.html?page=12',
 'https://www.setlist.fm/setlists/billy-joel-7bd6be40.html?page=13',
 'https://www.setlist.fm/setlists/billy-joel-7bd6be40.html?page=14',
 'https://www.setlist.fm/setlists/billy-joel-7bd6be40.html?page=15',
 'https://www.setlist.fm/setlists/billy-jo

In [403]:
the_first_ten

['https://www.setlist.fm/setlist/billy-joel/2018/madison-square-garden-new-york-ny-53963335.html',
 'https://www.setlist.fm/setlist/billy-joel/2018/bbandt-field-winston-salem-nc-4396ab87.html',
 'https://www.setlist.fm/setlist/billy-joel/2018/madison-square-garden-new-york-ny-23e91c6f.html',
 'https://www.setlist.fm/setlist/billy-joel/2018/kauffman-stadium-kansas-city-mo-13e9dd61.html',
 'https://www.setlist.fm/setlist/billy-joel/2018/wrigley-field-chicago-il-53e8535d.html',
 'https://www.setlist.fm/setlist/billy-joel/2018/madison-square-garden-new-york-ny-73e8d2e9.html',
 'https://www.setlist.fm/setlist/billy-joel/2018/fenway-park-boston-ma-23e8a49f.html',
 'https://www.setlist.fm/setlist/billy-joel/2018/fenway-center-boston-ma-7be8d620.html',
 'https://www.setlist.fm/setlist/billy-joel/2018/citizens-bank-park-philadelphia-pa-beb2536.html']

In [411]:
def create_master_df():
    the_first_ten, pages_2_on = grab_all_setlists()
    
    all_shows_df = get_show_info_and_make_df(the_first_ten[0])
    del the_first_ten[0]
    
    for show_url in the_first_ten:
        show_df = get_show_info_and_make_df(show_url) #Create a df of the first show in the list (now show #2)
        combo_frame = add_more_shows(all_shows_df, show_df) #combine first and second show dfs into a single df
        all_shows_df = combo_frame #Overwrite the all_shows_df (which is the first show only) with the combo frame which is shows 1 and 2
    
    for page_url in pages_2_on:
        next_ten_shows = create_list_of_artist_sets(show_url)
        
        for show_link in next_ten_show:
            show_data = get_all_show_info(show_url) #Create a df of the first show in the list 
            show_as_df = add_all_show_info_to_df(show_data)
            combo_frame = add_more_shows(all_shows_df, show_as_df)
            all_shows_df = combo_frame

    return all_shows_df 



In [412]:
#create_master_df()

 Please enter the name of an artistbilly joel
Failed to connect with website
Failed to connect with website


IndexError: list index out of range