# 0: Import libraries

In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from collections import OrderedDict

# 1: Get the link for all shows

## 1.1: Define a function to get the links of all pages for an artist
##### Only 10 shows are displayed per page, so we need to get url for all pages first 

In [68]:
def grab_all_page_links(artist_name):

    page_links = list()
    if ' ' in artist_name:
        artist_name = artist_name.replace(" ", "%20")        
    url = "https://www.setlist.fm/search?query=" + artist_name
    response = requests.get(url)
    
    if response.status_code != 200:
        print("Failed to connect with website")
    
    results_page = BeautifulSoup(response.content, 'lxml')
    
    total_num_pages =  results_page.find_all('div', class_='col-xs-12 noTopBorder noTopPadding hidden-print text-center listPager-lg')
    li_list = total_num_pages[0].find_all('li')
    last_page_text = int(li_list[8].get_text())
    
    for num in range(2,last_page_text + 1 ):
                show_page = 'https://www.setlist.fm/search?page=' + str(num) + '&query=' + artist_name
                page_links.append(show_page)
    
    page_links.insert(0, url)
        
    return page_links

## 1.2: Define a function to get the links for all shows
##### We iterate on all pages and for each page we extract the url of all shows, then we put them in a list 

In [71]:
def create_list_show_links(page_links):
    
    show_links_list = list()
    
    for link in page_links:
        response = requests.get(link)
        if response.status_code != 200:
            print("Failed to connect with website")
        results_page = BeautifulSoup(response.content, 'lxml')
        try:
            messy_list_show_links = results_page.find_all('div', class_='row contentBox visiblePrint')[0].find_all('h2')
        except:
            pass

        try:
            for show in messy_list_show_links:    
                try:
                    raw_url = show.find('a').get('href')
                    raw_url = raw_url.lstrip("..") 
                    actual_url = 'https://www.setlist.fm/'+ raw_url 
                    show_links_list.append(actual_url)
                except:
                    pass
        except:
            pass
    
    return show_links_list

In [74]:
list_of_shows = create_list_show_links(grab_all_page_links('billy joel'))

# 2: Extract info for shows and populate a dataframe

## 2.1: Define a function to get show info given a link
##### We scrap the data and put it in a list

In [79]:
def get_all_show_info(url):
    
    try:
        response = requests.get(url)
        if response.status_code != 200:
            print(response.status_code)
        results_page = BeautifulSoup(response.content, 'lxml')
    except requests.Timeout as e:
        print("It is time to timeout")
        print(str(e))
        pass
    
    all_show_info = list()
    
    #Part 1 -- Get the date 
    try:
        full_date = results_page.find_all('div', class_="breadCrumbBar")[0].find_all('span')[-1].get_text().rstrip('Setlist')
        month, day, year = full_date.strip().replace(",","").split(" ")
    except:
        pass
    
    #Create a tuple of the date info and add it to the all_show_info list
    try:
        date_tuple = (month, day, year)
        all_show_info.append(date_tuple)
    except:
        pass
    
    #Part 2 -- Get the location info
    try:
        header_info = results_page.find('h1').find_all('span')
    except:
        pass
    
    try:
        location_info = header_info[3].find('span').get_text()
    except:
        pass
    
    try:
        venue, city, state, country = location_info.split(",")
    except:
        pass
    
    try:
        location_tuple = (venue, city, state, country) 
        all_show_info.append(location_tuple)
    except:
        pass
        
    #Part 3 -- Get the songs
    setlist = list()
    try:
        set_list_list = results_page.find_all('a', 'songLabel')
        for song in set_list_list:
            setlist.append(song.get_text())
   
        setlist_tuple = tuple(setlist)
        all_show_info.append(setlist_tuple)
    
    except:
        pass
    
    return all_show_info

## 2.2: Build an empty dataframe
##### We list all songs and make columns named after them

In [116]:
all_songs = list()
for show in list_of_shows:
    set_list = get_all_show_info(show)[-1]
    all_songs.extend(set_list)

In [125]:
all_songs_unique = list(set(all_songs))

In [190]:
column_list = ['Year','Month','Day','Venue','City','State','Country'] + all_songs_unique
df = pd.DataFrame(columns = column_list)

## 2.3: Populate the dataframe
##### We iterate on the list of show links and fill the dataframe, with a 1 in the corresponding column if a song has been played during that show (else we set the value to 0) 

In [None]:
for show_num in range(len(list_of_shows)):
    all_show_info = get_all_show_info(list_of_shows[show_num])
    df = df.append({'Year': 0}, ignore_index=True)
    df['Year'][show_num] = all_show_info[0][2]
    df['Month'][show_num] = all_show_info[0][0]
    df['Day'][show_num] = all_show_info[0][1]
    if len(all_show_info) == 3:
        df['Venue'][show_num] = all_show_info[1][0]
        df['City'][show_num] = all_show_info[1][1]
        df['State'][show_num] = all_show_info[1][2]
        df['Country'][show_num] = all_show_info[1][3]
    for column in all_songs_unique:
        df[column][show_num] = 0
    for song in all_show_info[-1]:
        df[song][show_num] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#i

In [186]:
df

Unnamed: 0,Year,Month,Day,Venue,City,State,Country,Anchors Aweigh,Gimme Some Lovin',Whole Lotta Love,...,Angels We Have Heard On High,Pop Goes the Weasel / Circus Music / Angelina / Zooma Zooma / Pop Goes the Weasel,The Mexican Connection,Bohemian Rhapsody,Pressure,The Night Is Still Young,Honky Cat,Somewhere Along the Line,Me and Julio Down by the Schoolyard,Sherry / Unchained Melody / Speedoo / The Lion Sleeps Tonight
0,2018.0,November,10.0,Madison Square Garden,New York,NY,USA,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2018.0,October,27.0,Madison Square Garden,New York,NY,USA,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,,,,,,,,,,...,,,,,,,,,,
