In [10]:
import json
import pandas as pd

# Load the library records and build the DataFrame used by the rest of the notebook.
# The encoding is passed explicitly: without it open() falls back to the platform
# default (often not UTF-8 on Windows), which can mis-decode non-ASCII JSON text.
with open('data/nypl_all.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
df = pd.DataFrame(data)

In [11]:
from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

# Scrape the weekly opening hours for each library page listed in `df`.
#
# Results:
#   library_hours    -- dict mapping library id -> {day name: hours text}
#   unable_to_scrape -- list of URLs whose page could not be loaded or parsed
headless = False
max_libraries = 10  # how many rows of df to scrape; set to None to process every row

# Build the Edge driver. The headless flag is the only option that differs between modes.
options = Options()
if headless:
    options.add_argument("--headless=new")  # Use "--headless=new" for newer Edge versions
driver = webdriver.Edge(options=options)
wait = WebDriverWait(driver, 10)

unable_to_scrape = []
library_hours = {}

days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
closed_label = 'Temporarily Closed'
targets = df[['id', 'URL']].values.tolist()[0:max_libraries]

try:
    # Wrap the main scraping loop with tqdm progress bar
    for library_id, url in tqdm(targets, desc='Scraping library pages'):
        try:
            # Navigation is inside the try so a page that fails to load is recorded
            # in unable_to_scrape instead of aborting the whole run.
            driver.get(url)

            # Wait for whichever of the three known page layouts shows up first.
            # NOTE(review): the 'tabs--r59kd5t6---...' ids and the 'css-fvtdov' class used
            # below look auto-generated and may change between site builds -- confirm.
            condition = EC.any_of(
                EC.visibility_of_element_located((By.XPATH, "//h2[text()='Regular Hours']")),
                EC.visibility_of_element_located((By.XPATH, "//p[@data-testid='ds-text' and contains(text(), 'Temporarily Closed')]")),
                EC.visibility_of_element_located((By.XPATH, "//button[@id='tabs--r59kd5t6---tab-1' and normalize-space(text())='Upcoming Hours']"))
            )
            header_element = wait.until(condition)
            # Read the text once; every .text access is a round trip to the browser.
            header_text = header_element.text.strip()

            if header_text == 'Regular Hours':
                hours_table = header_element.find_element(By.XPATH, "./following-sibling::table[1]")
                rows = hours_table.find_elements(By.TAG_NAME, 'tr')

                hours_data = {}
                for row in rows[1:]:  # rows[0] is skipped, as in the original (header row)
                    cols = row.find_elements(By.TAG_NAME, 'th') + row.find_elements(By.TAG_NAME, 'td')
                    if not cols:
                        # A row without cells carries no day name; skip it rather than
                        # storing an entry under an empty key.
                        continue
                    day = cols[0].text
                    hours = cols[1].text if len(cols) > 1 else ''
                    hours_data[day] = hours
                library_hours[library_id] = hours_data

            elif closed_label in header_text:
                # The locator matches with contains(), so the notice may hold more text
                # than the bare label; a substring test keeps both checks in agreement.
                library_hours[library_id] = dict.fromkeys(days_of_week, closed_label)

            elif header_text == 'Upcoming Hours':
                upcoming_tab_panel = wait.until(
                    EC.visibility_of_element_located((By.ID, "tabs--r59kd5t6---tabpanel-1"))
                )
                table = upcoming_tab_panel.find_element(By.CSS_SELECTOR, "table.css-fvtdov")
                rows = table.find_elements(By.CSS_SELECTOR, "tbody tr")
                schedule_data = {}
                for row in rows:
                    # Only the day (th) and the second td (hours) are kept; the first td
                    # holds the date, which is not stored.
                    day = row.find_element(By.CSS_SELECTOR, "th p[data-testid='ds-text']").text.strip()
                    hours = row.find_element(By.CSS_SELECTOR, "td:nth-of-type(2) p[data-testid='ds-text']").text.strip()
                    schedule_data[day] = hours
                library_hours[library_id] = schedule_data

            else:
                # An element matched but its text fits none of the known layouts:
                # record the URL so the library is not silently dropped.
                unable_to_scrape.append(url)

        except Exception:
            # Best effort: any timeout or parsing failure marks the URL for manual follow-up.
            unable_to_scrape.append(url)
finally:
    # Always close the browser, even if the loop is interrupted.
    driver.quit()


Scraping library pages: 100%|██████████| 10/10 [00:25<00:00,  2.52s/it]


In [12]:
# Show the URLs that the scraping loop recorded as failed (appended when an exception was raised).
unable_to_scrape

[]

In [13]:
# Show the scraped schedules: a dict keyed by library id, each value mapping day name -> hours text.
library_hours

{'125th-street': {'Monday': '11 AM–7 PM',
  'Tuesday': '10 AM–6 PM',
  'Wednesday': '10 AM–6 PM',
  'Thursday': '11 AM–7 PM',
  'Friday': '10 AM–5 PM',
  'Saturday': '10 AM–5 PM',
  'Sunday': 'Closed'},
 '53rd-street': {'Monday': '11 AM–6 PM',
  'Tuesday': '11 AM–6 PM',
  'Wednesday': '11 AM–6 PM',
  'Thursday': '11 AM–6 PM',
  'Friday': '11 AM–6 PM',
  'Saturday': '11 AM–6 PM',
  'Sunday': 'Closed'},
 '58th-street': {'Monday': '10 AM–6 PM',
  'Tuesday': '10 AM–6 PM',
  'Wednesday': '10 AM–6 PM',
  'Thursday': '10 AM–6 PM',
  'Friday': '10 AM–5 PM',
  'Saturday': '10 AM–5 PM',
  'Sunday': 'Closed'},
 '67th-street': {'Monday': '11 AM–7 PM',
  'Tuesday': '11 AM–7 PM',
  'Wednesday': '11 AM–7 PM',
  'Thursday': '11 AM–7 PM',
  'Friday': '10 AM–5 PM',
  'Saturday': '10 AM–5 PM',
  'Sunday': 'Closed'},
 '96th-street': {'Monday': '10 AM–7 PM',
  'Tuesday': '10 AM–7 PM',
  'Wednesday': 'CLOSED',
  'Thursday': '10 AM–7 PM',
  'Friday': '10 AM–5 PM',
  'Saturday': '10 AM–5 PM',
  'Sunday': 'CLO

In [14]:
# Display the library table built from data/nypl_all.json.
df

Unnamed: 0,id,name,address_line1,address_line2,locality,administrative_area,postal_code,__typename,Name,URL,Address,Zip,Phone,Accessibility,Todays Hours,Map Link
0,125th-street,125th Street Library 10035,224 East 125th Street,224 East 125th Street,New York,NY,10035,RefineryLocation,125th Street Library 10035,https://www.nypl.org/locations/125th-street,"224 East 125th Street\nNew York, NY 10035",10035,212-534-5050,Fully Accessible,,http://maps.google.com/maps?f=q&hl=en&saddr=&d...
1,53rd-street,53rd Street Library 10019,18 West 53rd Street,18 West 53rd Street,New York,NY,10019,RefineryLocation,53rd Street Library 10019,https://www.nypl.org/locations/53rd-street,"18 West 53rd Street\nNew York, NY 10019",10019,212-714-8400,Fully Accessible,,http://maps.google.com/maps?f=q&hl=en&saddr=&d...
2,58th-street,58th Street Library 10022,127 East 58th Street,127 East 58th Street,New York,NY,10022,RefineryLocation,58th Street Library 10022,https://www.nypl.org/locations/58th-street,"127 East 58th Street\nNew York, NY 10022",10022,212-759-7358,Partially Accessible : All parts of the librar...,,http://maps.google.com/maps?f=q&hl=en&saddr=&d...
3,67th-street,67th Street Library 10065,328 East 67th Street,328 East 67th Street,New York,NY,10065,RefineryLocation,67th Street Library 10065,https://www.nypl.org/locations/67th-street,"328 East 67th Street\nNew York, NY 10065",10065,212-734-1717,Fully Accessible,,http://maps.google.com/maps?f=q&hl=en&saddr=&d...
4,96th-street,96th Street Library 10128,112 East 96th Street,112 East 96th Street,New York,NY,10128,RefineryLocation,96th Street Library 10128,https://www.nypl.org/locations/96th-street,"112 East 96th Street\nNew York, NY 10128",10128,212-289-0908,Fully Accessible,,http://maps.google.com/maps?f=q&hl=en&saddr=&d...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,west-new-brighton,West New Brighton Library 10310,976 Castleton Avenue,976 Castleton Avenue,Staten Island,NY,10310,RefineryLocation,West New Brighton Library 10310,https://www.nypl.org/locations/west-new-brighton,"976 Castleton Avenue\nStaten Island, NY 10310",10310,718-442-1416,Fully Accessible,,http://maps.google.com/maps?f=q&hl=en&saddr=&d...
124,westchester-square,Westchester Square Library 10461,2521 Glebe Avenue,2521 Glebe Avenue,Bronx,NY,10461,RefineryLocation,Westchester Square Library 10461,https://www.nypl.org/locations/westchester-square,"2521 Glebe Avenue\nBronx, NY 10461",10461,718-863-0436,Today's Hours:,,http://maps.google.com/maps?f=q&hl=en&saddr=&d...
125,woodlawn-heights,Woodlawn Heights Library 10470,4355 Katonah Avenue,4355 Katonah Avenue,Bronx,NY,10470,RefineryLocation,Woodlawn Heights Library 10470,https://www.nypl.org/locations/woodlawn-heights,"4355 Katonah Avenue\nBronx, NY 10470",10470,718-519-9627,Fully Accessible,,http://maps.google.com/maps?f=q&hl=en&saddr=&d...
126,woodstock,Woodstock Library 10456,761 East 160th Street,761 East 160th Street,Bronx,NY,10456,RefineryLocation,Woodstock Library 10456,https://www.nypl.org/locations/woodstock,"761 East 160th Street\nBronx, NY 10456",10456,718-665-6255,Fully Accessible,,http://maps.google.com/maps?f=q&hl=en&saddr=&d...


In [15]:
# Copy each library's scraped hours into one column per weekday of df.
# Rows whose id is not in library_hours are not assigned to.
for library_id, hours in library_hours.items():
    # The row selection depends only on the library, so compute it once per library
    # rather than once per day.
    row_mask = df['id'] == library_id
    for day_of_week in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']:
        # A scraped table may not list every weekday; use an empty string for a
        # missing day instead of raising KeyError and aborting the merge.
        df.loc[row_mask, day_of_week] = hours.get(day_of_week, '')

In [16]:
# Write the table to an Excel workbook in the working directory, leaving out the index.
df.to_excel(excel_writer='data.xlsx', index=False)

In [17]:
import os
import shutil
import subprocess
import sys

# Open the exported workbook with the desktop's default application.
# os.startfile exists only on Windows, so other platforms use their own launcher
# command ('open' on macOS, 'xdg-open' elsewhere) when it is available.
if hasattr(os, 'startfile'):
    os.startfile('data.xlsx')
else:
    launcher = 'open' if sys.platform == 'darwin' else 'xdg-open'
    if shutil.which(launcher):
        subprocess.run([launcher, 'data.xlsx'], check=False)
    else:
        print("No file launcher found; open 'data.xlsx' manually.")

In [18]:
# df.to_json('data/data.json', orient="records", indent=4)