In [1]:
# Imports: standard library first, then third-party (splinter drives Chrome through chromedriver)
import json
import math
import os
import re
from pprint import pprint
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser

def init_browser():
    """Start a splinter-controlled Chrome session and return it.

    The `chromedriver` executable is passed by name, and the browser is
    launched with ``headless=False``.
    """
    return Browser("chrome", executable_path="chromedriver", headless=False)

In [2]:
"""
This function is to parse out content using a regular expression pattern from a string.
<param>regex_pattern</param>
<param>string</param>
"""
def parse_definition(regex_pattern, string):
    """Extract the first capture group of ``regex_pattern`` from ``string``.

    The pattern is applied with ``re.search`` using the
    ``re.MULTILINE | re.DOTALL`` flags, so it may match anywhere in ``string``
    and ``.`` also matches newlines.

    Parameters
    ----------
    regex_pattern : str
        Regular expression containing at least one capture group.
    string : str
        Text to search.

    Returns
    -------
    str
        The text captured by group 1, or the literal string ``"None"`` (not the
        ``None`` object) when the pattern does not match.
    """
    # Search once and reuse the match object instead of running the regex twice.
    match = re.search(regex_pattern, string, flags=re.MULTILINE | re.DOTALL)
    if match is None:
        # Keep the historical string sentinel so existing callers see no change.
        return "None"
    return match.group(1)

In [3]:
def artist_code(artist):
    """Look up an artist's numeric Songkick id by scraping the site's search page.

    Parameters
    ----------
    artist : str
        Artist name, e.g. ``"Dream Theater"``.

    Returns
    -------
    str
        The digits captured from the result link's ``/artists/<digits>-`` path,
        or the string ``"None"`` when the link does not follow that pattern
        (the sentinel comes from ``parse_definition``).

    Raises
    ------
    LookupError
        If the search page does not contain the expected result link.
    """
    # quote_plus still turns spaces into "+", and additionally escapes characters
    # such as "&" or "#" that would otherwise corrupt the query string.
    url = f'https://www.songkick.com/search?utf8=✓&type=initial&query={quote_plus(artist)}'

    browser = init_browser()
    try:
        browser.visit(url)
        html = browser.html
    finally:
        # Always shut the browser down; otherwise every lookup leaves a Chrome
        # window and a chromedriver process running.
        browser.quit()

    soup = bs(html, "html.parser")
    # Walk down to the result link one level at a time, stopping at the first
    # missing node so a failed search gives a clear error instead of an
    # AttributeError on None.
    container = soup.find("div", class_="sticky-container")
    row = container.find("div", class_="row") if container is not None else None
    col = row.find("div", class_="col-8 primary") if row is not None else None
    result_list = col.find("ul") if col is not None else None
    link = result_list.find("a", class_="thumb") if result_list is not None else None
    if link is None:
        raise LookupError(f"No Songkick search result found for {artist!r}")

    page_link = link["href"]
    # The id is the run of digits between "/artists/" and the following "-".
    return parse_definition('/artists/(\\d+)-', str(page_link))

In [4]:
# Builds the per-artist event records (a list of dicts) intended for loading into MongoDB
def _fetch_gigography_page(artist_code, page, api_key):
    """Download one gigography page and return its ``resultsPage`` object."""
    request_timeout_s = 30  # fail instead of hanging the notebook on a stalled connection
    response = requests.get(
        f"https://api.songkick.com/api/3.0/artists/{artist_code}/gigography.json",
        params={"apikey": api_key, "page": page},
        timeout=request_timeout_s,
    )
    # Surface HTTP errors (bad key, unknown artist id) here rather than as a
    # KeyError further down.
    response.raise_for_status()
    return response.json()["resultsPage"]


def _is_us_event_with_coordinates(event):
    """Return True when the city string's last comma-separated part is "US" and the venue has lat/lng."""
    country = str(event["location"]["city"]).split(",")[-1].strip()
    venue = event["venue"]
    return country == "US" and venue["lat"] is not None and venue["lng"] is not None


def concert_info(artist_code, api_key=None):
    """Collect an artist's US gigography events from the Songkick API.

    Every gigography page is downloaded. Events are kept when their city string
    ends in "US" and the venue has both latitude and longitude. The kept events
    are numbered from 1 in download order and written to
    ``<artist_code>_events.csv`` in the current working directory.

    Parameters
    ----------
    artist_code : str or int
        Songkick artist id, as returned by ``artist_code()``.
    api_key : str, optional
        Songkick API key. When omitted, the ``SONGKICK_API_KEY`` environment
        variable is used, falling back to the key this notebook shipped with.

    Returns
    -------
    list of dict
        One record per kept event with the keys "Event Number", "Event Name",
        "Popularity", "Venue", "Longitude", "Latitude", "City", "Start Date"
        and "Start Time". "Event Number" is an int; every other value is a
        string (a missing start time appears as the string "None").
    """
    if api_key is None:
        # NOTE(review): the literal fallback keeps existing runs working, but a key
        # committed to a notebook should be rotated and supplied through the
        # environment only.
        api_key = os.environ.get("SONGKICK_API_KEY", "ORhDmrgUGkhdnXgP")

    events_per_page = 50  # page size the original page-count arithmetic assumed

    # The first response provides the total count and is also page 1 of the data,
    # so it is reused below instead of being downloaded a second time.
    first_page = _fetch_gigography_page(artist_code, 1, api_key)
    total_pages = math.ceil(first_page["totalEntries"] / events_per_page)

    rows = []
    for page in range(1, total_pages + 1):
        results_page = first_page if page == 1 else _fetch_gigography_page(artist_code, page, api_key)
        # A page without events carries no "event" key in its results object.
        for event in results_page["results"].get("event", []):
            if not _is_us_event_with_coordinates(event):
                continue
            rows.append({
                "Event Number": len(rows) + 1,
                "Event Name": str(event["displayName"]),
                "Popularity": str(event["popularity"]),
                "Venue": str(event["venue"]["displayName"]),
                "Longitude": str(event["venue"]["lng"]),
                "Latitude": str(event["venue"]["lat"]),
                "City": str(event["location"]["city"]),
                "Start Date": str(event["start"]["date"]),
                "Start Time": str(event["start"]["time"]),
            })

    # Passing the columns explicitly keeps the header stable even with zero events.
    columns = [
        "Event Number", "Event Name", "Popularity", "Venue", "Longitude",
        "Latitude", "City", "Start Date", "Start Time",
    ]
    artist_pd = pd.DataFrame(rows, columns=columns)
    indexed_pd = artist_pd.set_index("Event Number")
    # The f-string also accepts an int artist code, where "+" concatenation would raise.
    indexed_pd.to_csv(f"{artist_code}_events.csv")
    # Round-trip through JSON so the result contains only plain Python types.
    return json.loads(indexed_pd.reset_index().to_json(orient="records"))

In [5]:
def _events_for(artist_name):
    """Resolve the artist's Songkick id, then return that artist's event records."""
    return concert_info(artist_code(artist_name))


# One lookup plus download per artist; each call also writes that artist's CSV file.
dream_events = _events_for("Dream Theater")
bruno_events = _events_for("Bruno Mars")
garth_events = _events_for("Garth Brooks")
eminem_events = _events_for("Eminem")
radiohead_events = _events_for("Radiohead")
beyonce_events = _events_for("Beyonce")
kanye_events = _events_for("Kanye West")
mana_events = _events_for("Mana")
odesza_events = _events_for("Odesza")
jcole_events = _events_for("J Cole")
mayer_events = _events_for("John Mayer")
russ_events = _events_for("Russ")
sza_events = _events_for("SZA")
trippie_events = _events_for("Trippie Redd")

In [6]:
# Display the records returned for Trippie Redd (a list with one dict per event)
trippie_events

[{'City': 'Mountain View, CA, US',
  'Event Name': 'Rolling Loud Festival 2017',
  'Event Number': 1,
  'Latitude': '37.42332',
  'Longitude': '-122.07803',
  'Popularity': '0.649488',
  'Start Date': '2017-10-21',
  'Start Time': 'None',
  'Venue': 'Shoreline Amphitheatre'},
 {'City': 'San Bernardino, CA, US',
  'Event Name': 'Rolling Loud Festival 2017',
  'Event Number': 2,
  'Latitude': '34.08973',
  'Longitude': '-117.29406',
  'Popularity': '0.326749',
  'Start Date': '2017-12-16',
  'Start Time': '13:00:00',
  'Venue': 'NOS Events Center'},
 {'City': 'Minneapolis, MN, US',
  'Event Name': 'Trippie Redd with Tiiiiiiiiiip at Music Hall (January 13, 2018)',
  'Event Number': 3,
  'Latitude': '44.9804803',
  'Longitude': '-93.2744249',
  'Popularity': '0.008772',
  'Start Date': '2018-01-13',
  'Start Time': '19:00:00',
  'Venue': 'Music Hall'},
 {'City': 'Santa Ana, CA, US',
  'Event Name': 'Trippie Redd at The Observatory (January 14, 2018)',
  'Event Number': 4,
  'Latitude': '33