In [118]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import numpy as np
import pandas as pd

In [77]:
driver = webdriver.Chrome()

In [78]:
driver.get("https://www.playbill.com/production/gross?production=00000150-aea5-d936-a7fd-eef572240001")

In [79]:
col0 = driver.find_elements(by = By.CLASS_NAME, value = "col-0")
col1 = driver.find_elements(by = By.CLASS_NAME, value = "col-3")
col2 = driver.find_elements(by = By.CLASS_NAME, value = "col-4")
col3 = driver.find_elements(by = By.CLASS_NAME, value = "col-6")

In [80]:
def check_elements(iter1):
    for e in iter1:
        print(e.text)
    print(len(iter1))

In [63]:
# # For checking the structuring of the elements which have been scraped
# check_elements(col0)
# check_elements(col1[1:])
# check_elements(col2[1:])
# check_elements(col3[1:])

Conclusion: 
1. col0 starts with column name and continues with column values. Can be used as is, except that column header should be sliced out.
2. col1 starts with a column name, which is, however, a wrong label for the data, the html source for the website has repeated a class "col-1" twice. Thus, the column is actually "Avg Ticket - Top Ticket" and the text under it should be split into 2 different columns.
3. col2 is similar to col1, and the column name is also wrong, it should be - "Seats sold - Seats in theatre." The second column value is seen to remain mostly constant and might be truncated because it doesn't provide any additional insight.

Added Edit:
4. col3 is the % of seats filled (% cap), however, is wrongly labelled as perfs.
5. col2 "Seats in theatre" is seen to not remain constant after a large number of weeks.

In [81]:
i = 1
dfcol1 = [] # Avg Ticket
dfcol2 = [] # Top Ticket
dfcol3 = [] # Seats sold
dfcol4 = [] # Seats in theatre
for x, y in zip(col1[1:], col2[1:]):
    l1, l2 = x.text.split('\n'), y.text.split('\n')
    dfcol1.append(l1[0])
    dfcol2.append(l1[1])
    dfcol3.append(l2[0])
    dfcol4.append(l2[1])

In [87]:
next_page = driver.find_element(by = By.CLASS_NAME, value = "bsp-pagination-navigate-next")

In [88]:
next_page.click()

In [89]:
col0 = driver.find_elements(by = By.CLASS_NAME, value = "col-0")
col1 = driver.find_elements(by = By.CLASS_NAME, value = "col-3")
col2 = driver.find_elements(by = By.CLASS_NAME, value = "col-4")
col3 = driver.find_elements(by = By.CLASS_NAME, value = "col-6")

In [None]:
# For checking the structuring of the elements which have been scraped
check_elements(col0)
check_elements(col1[1:])
check_elements(col2[1:])
check_elements(col3[1:])

In [67]:
driver.quit()

In [114]:
def collect_data(df_list):
    dfcol0 = df_list[0] # Week Ending
    dfcol1 = df_list[1] # Avg Ticket
    dfcol2 = df_list[2] # Top Ticket
    dfcol3 = df_list[3] # Seats sold
    dfcol4 = df_list[4] # Seats in the Theatre
    dfcol5 = df_list[5] # % Cap

    driver = webdriver.Chrome()
    data_link = "https://www.playbill.com/production/gross?production=00000150-aea5-d936-a7fd-eef572240001"
    driver.get(data_link)

    while (True):
        col0 = driver.find_elements(by = By.CLASS_NAME, value = "col-0")
        col1 = driver.find_elements(by = By.CLASS_NAME, value = "col-3")
        col2 = driver.find_elements(by = By.CLASS_NAME, value = "col-4")
        col3 = driver.find_elements(by = By.CLASS_NAME, value = "col-6")

        for e0, e1, e2, e3 in zip(col0[1:], col1[1:], col2[1:], col3[1:]):
            l1 = e1.text.split('\n')
            l2 = e2.text.split('\n')
            dfcol0.append(e0.text)
            dfcol1.append(l1[0])
            # Top ticket values don't exist for some pages
            if (len(l1) == 2):
                dfcol2.append(l1[1])
            else:
                dfcol2.append(np.nan)
            dfcol3.append(l2[0])
            dfcol4.append(l2[1])
            dfcol5.append(e3.text)

        # Next button element doesn't load sometimes and leads to premature ending of data collection program
        driver.implicitly_wait(5)
        try:
            next_page = driver.find_element(by = By.CLASS_NAME, value = "bsp-pagination-navigate-next")
        except NoSuchElementException:
            print("Finished collecting data!")
            driver.quit()
            break
        except Exception as e:
            print(f"An exception {e} occurred, ceasing process.")
            driver.quit()
            break
        next_page.click()

    return df_list


In [115]:
df_list = [[], [], [], [], [], []]
collect_data(df_list)

Finished collecting data!


[['SEP 10, 2023',
  'SEP 3, 2023',
  'AUG 27, 2023',
  'AUG 20, 2023',
  'AUG 13, 2023',
  'AUG 6, 2023',
  'JUL 30, 2023',
  'JUL 23, 2023',
  'JUL 16, 2023',
  'JUL 9, 2023',
  'JUL 2, 2023',
  'JUN 25, 2023',
  'JUN 18, 2023',
  'JUN 11, 2023',
  'JUN 4, 2023',
  'MAY 28, 2023',
  'MAY 21, 2023',
  'MAY 14, 2023',
  'MAY 7, 2023',
  'APR 30, 2023',
  'APR 23, 2023',
  'APR 16, 2023',
  'APR 9, 2023',
  'APR 2, 2023',
  'MAR 26, 2023',
  'MAR 19, 2023',
  'MAR 12, 2023',
  'MAR 5, 2023',
  'FEB 26, 2023',
  'FEB 19, 2023',
  'FEB 12, 2023',
  'FEB 5, 2023',
  'JAN 29, 2023',
  'JAN 22, 2023',
  'JAN 15, 2023',
  'JAN 8, 2023',
  'JAN 1, 2023',
  'DEC 25, 2022',
  'DEC 18, 2022',
  'DEC 11, 2022',
  'DEC 4, 2022',
  'NOV 27, 2022',
  'NOV 20, 2022',
  'NOV 13, 2022',
  'NOV 6, 2022',
  'OCT 30, 2022',
  'OCT 23, 2022',
  'OCT 16, 2022',
  'OCT 9, 2022',
  'OCT 2, 2022',
  'SEP 25, 2022',
  'SEP 18, 2022',
  'SEP 11, 2022',
  'SEP 4, 2022',
  'AUG 28, 2022',
  'AUG 21, 2022',
  'AUG 14

In [120]:
print("Week Endings:")
print(df_list[0][-5:-1])
print("Avg Tickets:")
print(df_list[1][-5:-1])
print("Top Tickets:")
print(df_list[2][-5:-1])
print("Seats sold:")
print(df_list[3][-5:-1])
print("Seats in theatre:")
print(df_list[4][-5:-1])
print("% Cap:")
print(df_list[5][-5:-1])
print("")
for i in range(5):
    print(len(df_list[i]))
print(len(df_list))

Week Endings:
['NOV 16, 1997', 'NOV 9, 1997', 'NOV 2, 1997', 'OCT 26, 1997']
Avg Tickets:
['$56.52', '$56.42', '$56.75', '$58.12']
Top Tickets:
['$75.00', '$75.00', '$75.00', '$75.00']
Seats sold:
['11,397', '13,372', '11,811', '12,460']
Seats in theatre:
['1,745', '1,745', '1,745', '1,745']
% Cap:
['81.64%', '95.79%', '84.61%', '89.26%']

1268
1268
1268
1268
1268
6


In [122]:
bway_lk_data = pd.DataFrame({
    "Week Endings": df_list[0],
    "Avg Ticket Price ($)": df_list[1],
    "Top Ticket Price ($)": df_list[2],
    "Seats Sold": df_list[3],
    "Seats in Theatre": df_list[4],
    "% Cap": df_list[5]
})

In [123]:
bway_lk_data.to_csv("bway-lk-data.csv", index=False)