In [None]:
%pip install pandas openpyxl selenium seleniumbase beautifulsoup4

In [None]:
from selenium import webdriver
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import pandas as pd
import logging
import os

In [None]:
# Configure the root logger once: INFO and above goes both to the
# 'scraper.log' file and to the console stream.
logging.basicConfig(
    handlers=[logging.FileHandler('scraper.log'), logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [None]:
class BillScraper:
    """Scrape bill records from a year-filtered search page into Excel files.

    The scraper drives a SeleniumBase browser, picks a year in the page's
    search form, parses the rendered results with BeautifulSoup and writes
    one workbook per year.
    """

    def __init__(self, url):
        """Remember the page URL; the browser is started by ``initialize_driver``."""
        self.url = url
        self.driver = None

    def initialize_driver(self):
        """Start a headless undetected-Chrome session and open ``self.url``.

        Raises:
            Exception: Whatever driver start-up or the page load raised; it is
                logged and then re-raised so the caller can abort.
        """
        try:
            # SeleniumBase's ``Driver`` factory builds its own Chrome options
            # and has no ``options`` parameter, so headless mode and the extra
            # Chromium switches are requested through its own keyword args
            # ("chromium_arg" takes a comma-separated list without spaces).
            self.driver = Driver(
                uc=True,
                headless=True,
                chromium_arg="--disable-gpu,--no-sandbox",
            )
            self.driver.get(self.url)

            logging.info("Driver initialized successfully")
        except Exception as e:
            logging.error(f"Failed to initialize driver: {str(e)}")
            raise

    def wait_for_element(self, by, value, timeout=10):
        """Wait until an element is present in the DOM.

        Args:
            by: Selenium locator strategy (e.g. ``By.ID``).
            value: Locator value matching ``by``.
            timeout: Maximum number of seconds to wait.

        Returns:
            The located element, or ``None`` if it did not appear in time.
        """
        try:
            return WebDriverWait(self.driver, timeout).until(
                EC.presence_of_element_located((by, value))
            )
        except TimeoutException:
            logging.warning(f"Timeout waiting for element: {value}")
            return None

    def scrape_bill_data(self, year):
        """Run the search for ``year`` and parse every bill in the results.

        Args:
            year: Year to select in the search form (compared as a string
                against the ``<option>`` values).

        Returns:
            A list of bill dictionaries as produced by ``extract_bill_info``;
            an empty list if the form could not be used or anything failed.
        """
        try:
            year_dropdown = self.wait_for_element(By.ID, "dnn_ctr17012_StateNetDB_ddlYear")
            if year_dropdown is None:
                logging.error(f"Error scraping bill data: year selector not found for {year}")
                return []
            Select(year_dropdown).select_by_value(str(year))

            search_button = self.wait_for_element(By.ID, "dnn_ctr17012_StateNetDB_btnSearch")
            if search_button is None:
                logging.error(f"Error scraping bill data: search button not found for {year}")
                return []

            # Click through JavaScript so the click does not depend on the
            # button being unobstructed in the viewport.
            self.driver.execute_script("arguments[0].scrollIntoView(true);", search_button)
            self.driver.execute_script("arguments[0].click();", search_button)
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, 'h2Headers1'))
            )
            # Fixed grace period on top of the explicit wait.
            # NOTE(review): on a second search the headers of the previous
            # result set presumably still satisfy the wait above, so this
            # sleep is what gives the new results time to render - confirm.
            time.sleep(5)

            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            year_data = []
            for state_element in soup.find_all('div', class_='h2Headers1'):
                state = state_element.get_text(strip=True)
                for bill_div in state_element.find_next_siblings('div'):
                    # The next header div starts the following state's bills.
                    if 'h2Headers1' in bill_div.get('class', []):
                        break
                    bill_data = self.extract_bill_info(bill_div, state, year)
                    if bill_data:
                        year_data.append(bill_data)
            return year_data

        except Exception as e:
            logging.error(f"Error scraping bill data: {str(e)}")
            return []

    def get_field(self, elements, label):
        """Return the text that follows the label element containing ``label``.

        The first element whose text contains ``label`` is used. Its following
        siblings are walked, skipping ``<br>`` tags and whitespace-only text,
        until a non-empty text node is found.

        Args:
            elements: Parsed label elements (the ``<b>`` tags of one bill).
            label: Label text to look for, e.g. ``"Status:"``.

        Returns:
            The stripped text value, or ``""`` when the label is missing or is
            followed by another tag / nothing instead of text.
        """
        try:
            for element in elements:
                if label not in element.get_text(strip=True):
                    continue

                node = element.next_sibling
                while node is not None:
                    # Text nodes carry no tag name; tags always do. Only text
                    # nodes are strings, so only they may be ``strip()``-ed
                    # (calling ``.strip()`` on a tag raises).
                    tag_name = getattr(node, 'name', None)
                    if tag_name is None:
                        text = node.strip()
                        if text:
                            return text
                    elif tag_name != 'br':
                        # A different tag means the value is not plain text.
                        break
                    node = node.next_sibling

                logging.warning(f"Field '{label}' not found or has no valid sibling.")
                return ""

            logging.warning(f"Field '{label}' not found.")
            return ""
        except Exception as e:
            logging.error(f"Error getting field '{label}': {str(e)}")
            return ""

    def extract_bill_info(self, bill_div, state, year):
        """Build the record for one bill from its ``<div>`` and following siblings.

        Args:
            bill_div: Parsed ``<div>`` that holds the bill's link.
            state: State name taken from the preceding header.
            year: Year the search was run for.

        Returns:
            A dict with the keys ``Year``, ``State``, ``Bill Name``,
            ``Bill Link``, ``Bill Title``, ``Status``, ``Date of Last Action``,
            ``Authors``, ``Topics`` and ``Summary``; ``None`` if parsing failed.
        """
        try:
            bill_link_element = bill_div.find('a')
            if bill_link_element:
                bill_name = bill_link_element.get_text(strip=True)
                # ``.get`` keeps the record when an anchor has no href.
                bill_link = bill_link_element.get('href', '')
            else:
                bill_name = ""
                bill_link = ""

            bill_title_element = bill_div.find_next('div', style="font-weight: bold;")
            bill_title = bill_title_element.get_text(strip=True) if bill_title_element else ""

            # Collect the <b> label tags between this <div> and the next <hr>.
            b_elements = []
            sibling = bill_div.find_next_sibling()
            while sibling and sibling.name != 'hr':
                if sibling.name == 'b':
                    b_elements.append(sibling)
                sibling = sibling.find_next_sibling()

            status = self.get_field(b_elements, "Status:")
            date_of_last_action = self.get_field(b_elements, "Date of Last Action:")

            # Merge both author fields; stripping "; " removes the separator
            # left over when either side is empty.
            primary_author = self.get_field(b_elements, "Author:")
            additional_authors = self.get_field(b_elements, "Additional Authors:")
            authors = f"{primary_author}; {additional_authors}".strip("; ")

            topics = self.get_field(b_elements, "Topics:")
            summary = self.get_field(b_elements, "Summary:")

            logging.info(f"Extracted data for bill: {bill_name}")
            return {
                'Year': year,
                'State': state,
                'Bill Name': bill_name,
                'Bill Link': bill_link,
                'Bill Title': bill_title,
                'Status': status,
                'Date of Last Action': date_of_last_action,
                'Authors': authors,
                'Topics': topics,
                'Summary': summary
            }

        except Exception as e:
            logging.error(f"Error extracting bill info: {str(e)}")
            return None

    def save_to_excel(self, data, year):
        """Write ``data`` to ``../Data/bills_data_<year>.xlsx``.

        Args:
            data: List of bill dictionaries (one row each).
            year: Year used in the output file name.

        Failures are logged and not re-raised.
        """
        try:
            df = pd.DataFrame(data)

            directory = "../Data"
            filename = f'bills_data_{year}.xlsx'

            # ``exist_ok`` avoids the check-then-create race of a separate
            # ``os.path.exists`` test.
            os.makedirs(directory, exist_ok=True)

            file_path = os.path.join(directory, filename)
            df.to_excel(file_path, index=False)

            logging.info(f"Data saved to {file_path}")
        except Exception as e:
            logging.error(f"Error saving data to Excel: {str(e)}")

    def _close_driver(self):
        """Quit the browser if one is running; never raises."""
        if self.driver is None:
            return
        try:
            self.driver.quit()
        except Exception as e:
            logging.error(f"Error closing driver: {str(e)}")
        finally:
            self.driver = None

    def run(self, years):
        """Scrape and save every year in ``years``, then close the browser.

        Args:
            years: Iterable of years to process in order.

        Errors are logged rather than raised; the browser is closed in all
        cases.
        """
        try:
            self.initialize_driver()
            for year in years:
                logging.info(f"Starting scrape for year {year}")
                year_data = self.scrape_bill_data(year)
                self.save_to_excel(year_data, year)
        except Exception as e:
            logging.error(f"Error running scraper: {str(e)}")
        finally:
            # Single cleanup path for both success and failure.
            self._close_driver()

In [None]:
if __name__ == "__main__":
    # Page that hosts the year-filtered bill search form.
    url = "https://www.ncsl.org/energy/energy-state-bill-tracking-archive-2008-2022"
    scraper = BillScraper(url)

    # Only two years are scraped for now; for the whole archive use instead:
    # years = list(range(2008, 2023))  # 2008-2022
    # scraper.run(years)
    scraper.run([2017, 2018])