In [10]:
from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from tqdm import tqdm
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd

def store_response():
    """Download the Brooklyn Public Library locations map and cache it as JSON.

    The payload is written to ``data/responses/brooklyn_locations_map.json``
    only after the request succeeded with HTTP 200 and the body parsed as
    JSON. On any failure a message is printed and an existing cache file is
    left untouched.

    Returns:
        None
    """
    import os

    url = "https://www.bklynlibrary.org/api/locations/v1/map"
    output_path = 'data/responses/brooklyn_locations_map.json'

    try:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as e:
        print("An error occurred:", e)
        return

    if response.status_code != 200:
        print("Request failed with status code:", response.status_code)
        return

    try:
        data = response.json()
    except ValueError as e:
        # Response.json() raises a ValueError subclass when the body is not JSON.
        print("An error occurred:", e)
        return

    # Open the file only once the payload is known to be good, so a failed
    # request never truncates a previously stored response.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(data, f, indent=4)

def get_records():
    """Load the cached locations response and index its entries by id.

    Each entry's id is the last ``/``-separated segment of its ``'path'``
    value; it is stored on the entry under ``'id'`` and used as the key of
    the returned mapping.

    Returns:
        dict: ``{id: entry}`` for every entry in the cached JSON file.
    """
    source_path = 'data/responses/brooklyn_locations_map.json'
    with open(source_path, 'r') as handle:
        libraries = json.load(handle)

    by_id = {}
    for entry in libraries:
        slug = entry['path'].split('/')[-1]
        entry['id'] = slug
        by_id[slug] = entry
    return by_id

# def get_records():
#     df = response_to_df()
#     return df.to_dict(orient='records')

def get_urls():
    """Return the ``'path'`` of every cached entry whose ``'status'`` is ``'0'``.

    Returns:
        list: paths in the same order as they appear in the cached JSON file.
    """
    source_path = 'data/responses/brooklyn_locations_map.json'
    with open(source_path, 'r') as handle:
        libraries = json.load(handle)

    # The comparison is against the string '0', not the integer 0.
    return [entry['path'] for entry in libraries if entry['status'] == '0']

def selenium_scrape_active_hours(
    records,
    headless,
    timeout=8,
    skip_urls=("https://www.bklynlibrary.org/locations/annex",),
):
    """Fetch the rendered hours table HTML for each scrapable record.

    A record is scraped when its ``'status'`` is ``'0'``, it has a non-empty
    ``'path'``, and that path is not listed in ``skip_urls``.

    Args:
        records: mapping of record id to a dict with ``'status'`` and ``'path'``.
        headless: when truthy, Edge is started with ``--headless``.
        timeout: seconds to wait for ``table.hours-table`` on each page.
        skip_urls: paths that are never visited.

    Returns:
        dict: ``{record id: outerHTML of the hours table}``. The value is
        ``''`` for a page that failed to load or never showed the table.
        Records that were not scraped have no entry.
    """
    def _is_target(record):
        url = record.get('path')
        return record.get('status') == '0' and bool(url) and url not in skip_urls

    # Do not start a browser (slow, needs a local driver) when nothing qualifies.
    if not any(_is_target(record) for record in records.values()):
        return {}

    options = Options()
    if headless:
        options.add_argument('--headless')
    driver = webdriver.Edge(options=options)
    to_scrape = {}
    try:
        # The wait object only holds the driver and timeout, so one is enough.
        wait = WebDriverWait(driver, timeout)
        for record_id, record in tqdm(records.items(), desc="Scraping URLs"):
            if not _is_target(record):
                continue
            url = record['path']
            try:
                # Navigation is inside the try so that one bad page does not
                # abort the run and discard everything scraped so far.
                driver.get(url)
                element = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "table.hours-table"))
                )
                to_scrape[record_id] = element.get_attribute("outerHTML")
            except Exception as e:
                # Best effort: report the failure and keep going.
                print(f"Element not found for {url}: {e}")
                to_scrape[record_id] = ''  # Store empty string if element not found
    finally:
        driver.quit()
    return to_scrape

def scrape_active_hours(html):
    """Parse an hours table's HTML into a ``{day: hours}`` mapping.

    Args:
        html: markup containing a ``<table class="hours-table">``; may be
            empty or ``None``.

    Returns:
        dict: text of the first ``td`` of each row mapped to the text of the
        second ``td``. Empty when ``html`` is empty or contains no such table.
    """
    # The Selenium step stores '' for pages whose table never appeared.
    if not html:
        return {}

    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', class_='hours-table')
    if table is None:
        return {}

    active_hours = {}
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # Rows without two td cells (e.g. th-only header rows) carry no day/hours pair.
        if len(cells) < 2:
            continue
        day = cells[0].get_text(strip=True)
        active_hours[day] = cells[1].get_text(strip=True)

    return active_hours

In [11]:
records = get_records()
to_scrape = selenium_scrape_active_hours(records, headless=True)
# `library_id` rather than `id`: a top-level loop variable named `id` would
# replace the builtin for the rest of the kernel session.
for library_id, html in to_scrape.items():
    # Failed scrapes are stored as ''; there is no hours table to parse for them.
    if not html:
        continue
    records[library_id].update(scrape_active_hours(html))

Scraping URLs: 100%|██████████| 68/68 [00:28<00:00,  2.35it/s]


In [14]:
# Persist the records (including any hours merged in above) as indented JSON.
with open('data/dataframes/brooklynpl.json', 'w') as out_file:
    json.dump(records, fp=out_file, indent=4)