In [178]:
import numpy as np
import requests
from bs4 import BeautifulSoup, NavigableString, CData, Tag
from io import BytesIO
import PyPDF2

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys


class MyBeautifulSoup(BeautifulSoup):
    def _all_strings_plus(  self, strip=True, types=NavigableString, 
                                aRef={'a': lambda a: f"<{a.get('href', '')}>"}, 
                                skipTags=['script', 'style']    ):
        # verify types
        if hasattr(types,'__iter__') and not isinstance(types,type):
            types = tuple([t for t in types if isinstance(t, type)])
        if not (types and isinstance(types,(type,tuple))): types = NavigableString
        
        # skip text in tags included in aRef
        skipTags += list(aRef.keys())
        
        for descendant in self.descendants:
            # yield extra strings according to aRef
            if isinstance(descendant, Tag) and descendant.name in aRef:
                extraStr = aRef[descendant.name](descendant)
                if isinstance(extraStr, str): yield extraStr

            # skip text nodes DIRECTLY inside a Tag in aRef
            if descendant.parent.name in aRef: continue

            # skip ALL text nodes inside skipTags 
            if skipTags and descendant.find_parent(skipTags): continue

            # default behavior
            if not isinstance(descendant, types): continue

            if strip:
                descendant = descendant.strip()
                if len(descendant) == 0: continue
            yield descendant
    
    def get_text_plus(self, separator=" ", srcUrl=None, **aspArgs):
        if srcUrl and isinstance(srcUrl, str):
            def hrefStr(aTag):
                href = aTag.get('href')
                if not (href is None or href.startswith('javascript')):
                    return f"<{urljoin(srcUrl, href)}>"
            aspArgs.setdefault('aRef', {})
            aspArgs['aRef']['a'] = hrefStr
        
        return separator.join(self._all_strings_plus(**aspArgs)) 

### BeautifulSoup & PDF testing

In [None]:
source_html = requests.get('https://azure-na-assets.contentstack.com/v3/assets/blt71bfe6e8a1c2d265/bltc320e2642a070852/Exelon_Environment_Policy_Poster-8.5x11v2.pdf')

In [None]:
raw_data = source_html.content
with BytesIO(raw_data) as data:
    read_pdf = PyPDF2.PdfFileReader(data)

    for page in range(read_pdf.getNumPages()):
        print(read_pdf.getPage(page).extractText())

In [None]:
source_html = requests.get('https://www.bge.com/safety-community/environment/our-initiatives')

In [None]:
soup = MyBeautifulSoup(source_html.text, 'html.parser')
# soup.text

In [None]:
for script in soup(['script', 'style']):
    script.decompose()
len(soup.get_text_plus())

In [None]:
text_links_parsed = ' '.join(soup.get_text_plus().split())

In [None]:
soup.get_text_plus()

### Selenium Testing

In [42]:
# Function to recursively find text in all iframes in a web page. Maybe I dont need this. 
def find_all_iframes(driver) -> list:
    iframes = driver.find_elements_by_xpath("//iframe")
    text_per_frame = []
    print(iframes)
    frame_soup = MyBeautifulSoup(driver.page_source, 'html.parser')
    frame_text_links = frame_soup.get_text_plus()
    text_per_frame.append(frame_text_links)
    for index, iframe in enumerate(iframes):
        # Your sweet business logic applied to iframe goes here.

        
        driver.switch_to.frame(index)
        # html = driver.execute_script("return document.body.innerHTML;")
        frame_soup = MyBeautifulSoup(driver.page_source, 'html.parser')
        frame_text_links = frame_soup.get_text_plus()
        text_per_frame.append(frame_text_links)
        nested_iframe_text = find_all_iframes(driver)
        if nested_iframe_text:
            text_per_frame.append(nested_iframe_text)
        driver.switch_to.parent_frame()
        
    return text_per_frame

In [35]:
# Create web driver object
from selenium.webdriver.common.by import By
driver = webdriver.Chrome('chromedriver-win64\chromedriver.exe')

In [126]:
# Method to navigate to provided URL
actions = ActionChains(driver)
wait = WebDriverWait(driver, 20)
driver.get('https://www.bge.com/safety-community/environment/our-initiatives')

In [180]:
# Augment HTML with text in shadow DOM
shdw_dom_text = []
all_elements = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*')))
for el in all_elements:
    try:
        if driver.execute_script('return arguments[0].shadowRoot', el):
            # el.click()
            
            shdw_el = driver.execute_script(f"return document.querySelector('{el.tag_name}').shadowRoot.querySelector('div')")
            # print(shdw_el.find_elements(By.CSS_SELECTOR, "p"))
            frame_soup = MyBeautifulSoup(shdw_el.get_attribute('innerHTML'), 'html.parser')
            # frame_text_links = frame_soup.get_text_plus()
            print('-'*25)
            print(frame_soup.get_text_plus(separator='\n'))
            
    except Exception as e:
        pass

-------------------------
<#mainContent>
<https://www.bge.com>
<https://secure.bge.com/accounts/login>
<https://www.bge.com//Pages/UserRegistrationLanding.aspx>
Menu
<https://www.bge.com/outages/experiencing-an-outage>
<https://www.bge.com/safety-community/safety/natural-gas-safety>
<https://www.bge.com/my-account/my-dashboard/pay-my-bill>
<https://secure.bge.com/CustomerServices/service/landing>
<https://www.bge.com/MyAccount/CustomerSupport/Pages/ContactUs.aspx>
Language
EN
<https://www.bge.com>
<https://es.bge.com:443/>
<https://secure.bge.com/accounts/login>
<https://www.bge.com//Pages/UserRegistrationLanding.aspx>
My Account
<https://secure.bge.com/MyAccount/MyBillUsage/pages/secure/MyBillUsage.aspx>
Back
<https://www.bge.com/my-account/my-dashboard/pay-my-bill>
<https://secure.bge.com/MyAccount/MyBillUsage/pages/secure/MyBillDetails.aspx>
<https://www.bge.com/my-account/my-dashboard/billing-options>
<https://secure.bge.com/MyAccount/MyBillUsage/Pages/Secure/AccountHistory.aspx>
<

In [None]:
inter_frame_text = find_all_iframes(driver)

In [54]:
iframes = driver.find_elements_by_xpath("//iframe")
iframes

[<selenium.webdriver.remote.webelement.WebElement (session="5208603e2fe9d0d22d54fde2f26ee173", element="91A8327A3AA54DEC0366A0BDB01552C1_element_43")>,
 <selenium.webdriver.remote.webelement.WebElement (session="5208603e2fe9d0d22d54fde2f26ee173", element="91A8327A3AA54DEC0366A0BDB01552C1_element_44")>,
 <selenium.webdriver.remote.webelement.WebElement (session="5208603e2fe9d0d22d54fde2f26ee173", element="91A8327A3AA54DEC0366A0BDB01552C1_element_45")>,
 <selenium.webdriver.remote.webelement.WebElement (session="5208603e2fe9d0d22d54fde2f26ee173", element="91A8327A3AA54DEC0366A0BDB01552C1_element_1262")>,
 <selenium.webdriver.remote.webelement.WebElement (session="5208603e2fe9d0d22d54fde2f26ee173", element="91A8327A3AA54DEC0366A0BDB01552C1_element_1260")>]

In [156]:
# Attribute that contains HTML of current page. Reference to Selenium methods
drop_downs = driver.find_element(By.CSS_SELECTOR, "body")#.shadow_root
driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
# spans = driver.find_elements_by_class_name('euds-accordion')
# print([span.text for span in spans])

# shadow_dict = driver.execute_script('return arguments[0].shadowRoot', drop_downs)
data = driver.execute_script('return arguments[0].shadowRoot', drop_downs)
data

In [160]:
# javascript is still showing up...
soup = MyBeautifulSoup(driver.page_source, 'html.parser')
soup.get_text_plus()

"Our Initiatives | BGE - An Exelon Company Due to technical issues, currently we are only able to provide eligible bill assistance recommendations through the Assistance Finder if you go through the process as a guest. Our teams are working on resolving this and we apologize for any inconvenience. Please use this link <https://secure.bge.com/assistance/finder/account-type> to evaluate assistance options you may be eligible for. Customer bills will increase effective with usage starting in January 2024 as authorized by the Maryland Public Service Commission. The average residential electric bill will increase by $1.76 per month and the average residential natural gas bill will increase by $6.43 per month. See <https://www.bge.com/smart-energy/innovation-technology/multi-year-plan> for details. Our Initiatives Our Initiatives Putting our commitment to the environment into action, we are involved in various conservation projects throughout our service area. <> <javascript:window.parent.bi

In [157]:
html = driver.execute_script("return document.body.innerHTML;", drop_downs)