# This notebook tests Selenium's Expected Conditions and WebDriverWait, captures the browser console logs, and uses those logs to decide how the scraper proceeds

In [163]:
# Imports
# For webscraping, import the following libraries from selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Exceptions
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException

# Functions

## a) Login

In [164]:
# SSL CERT ISSUE FUNCTION
def ssl_cert():
    """Click past the SSL warning shown by the browser.

    Clicks the element with id 'details-button' and then the element with id
    'proceed-link' on the page currently loaded in the global ``driver``.
    """
    for element_id in ('details-button', 'proceed-link'):
        driver.find_element(By.ID, element_id).click()
    
# LOGIN FUNCTION
def login(username="john@smith.com", password="password"):
    """Open the site's login form and sign in through the global ``driver``.

    Parameters
    ----------
    username : str, optional
        Text typed into the field with id '__ac_name'. The default is the
        placeholder value this notebook originally hard-coded; pass real
        credentials in at call time (e.g. read with ``getpass``) instead of
        committing them to the notebook.
    password : str, optional
        Text typed into the field with id '__ac_password'.

    Raises
    ------
    IndexError
        If the page contains no table row with class 'odd-row'.
    selenium.common.exceptions.TimeoutException
        If the username field does not appear within 10 seconds.
    """
    # Indexing fails loudly when the results table has not rendered any odd row.
    first_odd_row = driver.find_elements(By.XPATH, "//tr[@class='odd-row']")[0]

    # NOTE: an XPath that starts with "//" is evaluated against the whole
    # document even when it is called on an element, so this clicks the first
    # link found in an 8th table cell anywhere on the page, not necessarily
    # inside `first_odd_row`. The lookup is kept as it was so the function
    # still works when the first odd row itself has no link.
    # NOTE(review): switch to ".//td[8]/a" if the link must come from that row.
    login_link = first_odd_row.find_element(By.XPATH, "//td[8]/a")
    login_link.click()

    # Wait for the login form, then fill it in and submit with ENTER.
    username_field = WebDriverWait(driver, 10).until(
        lambda drv: drv.find_element(By.ID, "__ac_name")
    )
    username_field.send_keys(username)

    password_field = driver.find_element(By.ID, "__ac_password")
    password_field.send_keys(password)
    password_field.send_keys(Keys.ENTER)

## b) Pipeline functions

In [162]:
# CHOOSE FILERS FUNCTION
def choose_filers():
    """Pick the option whose value is '1' in the 'searchBoxFilingType' dropdown.

    Only the first matching option is clicked; nothing happens when no option
    carries that value.
    """
    filing_type_select = driver.find_element(By.ID, 'searchBoxFilingType')
    candidates = filing_type_select.find_elements(By.TAG_NAME, "option")

    # Stop at the first option whose value attribute matches.
    match = next(
        (candidate for candidate in candidates if candidate.get_attribute("value") == '1'),
        None,
    )
    if match is not None:
        match.click()
        
# CHOOSE ISSUE FUNCTION
def choose_issue(issue):
    """Pick the option of the 'searchBoxIssue' dropdown whose value equals ``str(issue)``.

    Only the first matching option is clicked; nothing happens when no option
    carries that value.
    """
    wanted_value = str(issue)
    issue_select = driver.find_element(By.ID, 'searchBoxIssue')
    candidates = issue_select.find_elements(By.TAG_NAME, "option")

    # Stop at the first option whose value attribute matches.
    match = next(
        (candidate for candidate in candidates if candidate.get_attribute("value") == wanted_value),
        None,
    )
    if match is not None:
        match.click()
        
# CHOOSE STOCK INDEX FUNCTION
def choose_stock_index(i):
    """Click the option at list position ``i`` of the 'searchBoxStockIndex' dropdown.

    ``i`` indexes the list of option elements directly, so an out-of-range
    position raises IndexError.
    """
    stock_index_select = driver.find_element(By.ID, 'searchBoxStockIndex')
    stock_index_select.find_elements(By.TAG_NAME, "option")[i].click()
    
# CHOOSE YEAR FUNCTION
def set_year(year):
    """Pick the option of the 'searchBoxYear' dropdown whose value equals ``str(year)``.

    Only the first matching option is clicked; nothing happens when no option
    carries that value.
    """
    wanted_value = str(year)
    year_select = driver.find_element(By.ID, 'searchBoxYear')
    candidates = year_select.find_elements(By.TAG_NAME, "option")

    # Stop at the first option whose value attribute matches.
    match = next(
        (candidate for candidate in candidates if candidate.get_attribute("value") == wanted_value),
        None,
    )
    if match is not None:
        match.click()

# GET NO OF COMPANIES FUNCTION
def get_no_of_companies():
    """Read the number of companies from the element with id 'lblFoundCount'.

    The element's text is split on single spaces and the token at index 5 is
    parsed as the count, so this depends on the exact wording of that label.

    Returns
    -------
    int
        The parsed count (also printed).

    Raises
    ------
    IndexError
        If the text has fewer than six space-separated tokens.
    ValueError
        If the token at index 5 is not an integer literal.
    """
    label_text = driver.find_element(By.ID, 'lblFoundCount').text
    company_total = int(label_text.split(' ')[5])
    print("Number of companies: ", company_total)
    return company_total

# GET NO OF PAGES FUNCTION
def get_no_of_pages(NO_OF_COMPANIES, per_page=100):
    """Return how many result pages are needed to list ``NO_OF_COMPANIES`` rows.

    Parameters
    ----------
    NO_OF_COMPANIES : int
        Total number of companies in the result set.
    per_page : int, optional
        Number of companies listed on one page (100 by default, the page size
        this notebook assumes).

    Returns
    -------
    int
        ``NO_OF_COMPANIES / per_page`` rounded up; 0 when there are no
        companies. The value is also printed.

    Raises
    ------
    ValueError
        If ``per_page`` is not positive.
    """
    if per_page <= 0:
        raise ValueError("per_page must be a positive integer")

    # A partially filled last page still counts as a page.
    full_pages, leftover = divmod(NO_OF_COMPANIES, per_page)
    PAGES = full_pages + (1 if leftover else 0)
    print("Number of pages: ", PAGES)
    return PAGES

# GET NEXT PAGE FUNCTION
def get_next_page(CURRENT_PAGE):
    """Click the pager entry for page number ``CURRENT_PAGE``.

    Waits up to 10 seconds for a span whose class is exactly 'page ' and whose
    ``data`` attribute equals the page number, then clicks it. Despite the
    parameter name, the value is the page to navigate to.
    """
    pager_xpath = ".//span[@class='page ' and @data=" + str(CURRENT_PAGE) + "]"

    def _find_pager_entry(drv):
        return drv.find_element(By.XPATH, pager_xpath)

    WebDriverWait(driver, 10).until(_find_pager_entry).click()


## **C) ACCESS REPORT**

In [248]:
def _cell_text(row, column):
    """Return the text of the ``column``-th table cell (1-based) of ``row``, waiting up to 10 s for it."""
    cell = WebDriverWait(row, 10).until(
        lambda element: element.find_element(By.XPATH, f".//td[{column}]")
    )
    return cell.text


def _overlay_style_is(expected_style):
    """Build a wait condition that holds once the 'secAbstractDetailsOverlay' style equals ``expected_style``."""
    def _condition(drv):
        overlay = drv.find_element(By.ID, 'secAbstractDetailsOverlay')
        return overlay.get_attribute('style') == expected_style
    return _condition


def _report_failure(company_name, label):
    """Print the error banner for ``company_name`` and return placeholder counts tagged with ``label``."""
    print('-----------------------------------')
    print('Company: ', company_name)
    print(label)
    print()
    # (green, climate, sustain, total words)
    return '-', '-', '-', 'ERROR: ' + label


def _read_report(row, company_name, year):
    """Open the report overlay for ``row`` and count words in its excerpts.

    Returns a ``(green, climate, sustain, total_words)`` tuple. When the
    browser console shows an 'Uncaught TypeError' / 'Uncaught SyntaxError'
    the counts are '-' and ``total_words`` is an 'ERROR: ...' label instead.
    """
    report_link = WebDriverWait(row, 10).until(
        EC.element_to_be_clickable((By.XPATH, ".//td[8]/a"))
    )
    report_link.click()
    print('View Report clicked')

    # Reading the browser log right after the click shows whether the page's
    # own script failed.
    logs = driver.get_log('browser')
    print('-----------------------------------')
    print('Logs: ', logs)

    if 'Uncaught TypeError' in str(logs):
        counts = _report_failure(company_name, 'UNCAUGHT TYPEERROR')
        # Read the log once more so old entries are not seen by the next row.
        driver.get_log('browser')
        return counts

    try:
        WebDriverWait(driver, 20).until(_overlay_style_is('display: block;'))
    except TimeoutException:
        # The overlay never opened: look at the console again before retrying.
        logs = driver.get_log('browser')

        if 'Uncaught SyntaxError:' in str(logs):
            counts = _report_failure(company_name, 'UNCAUGHT SYNTAXERROR')
            # The busy indicator is hidden by hand here before moving on.
            driver.execute_script("document.getElementById('busyIndicator').style.display = 'none';")
            driver.get_log('browser')
            return counts

        if 'Uncaught TypeError' in str(logs):
            counts = _report_failure(company_name, 'UNCAUGHT TYPEERROR')
            driver.get_log('browser')
            return counts

        print('TimeoutException')
        if driver.find_element(By.ID, 'busyIndicator').get_attribute('style') == 'display: none;':
            # Nothing is loading, so the first click presumably had no effect:
            # click the link again.
            print('!we went here!')
            report_link.click()
        else:
            # The busy indicator is still shown; just keep waiting.
            print('damn it!')
        # A second timeout here propagates to the caller.
        WebDriverWait(driver, 20).until(_overlay_style_is('display: block;'))

    # Expand the extended disclosures when the toggle offers them.
    toggle = driver.find_element(By.ID, 'toggleExtended')
    if toggle.text == 'View Extended Disclosures':
        toggle.click()
        WebDriverWait(driver, 10).until(lambda drv: drv.find_element(By.CLASS_NAME, 'excerpt'))

    # Every disclosure text carries the class 'excerpt'.
    excerpts = WebDriverWait(driver, 10).until(
        lambda drv: drv.find_elements(By.CLASS_NAME, 'excerpt')
    )
    data = [excerpt.text for excerpt in excerpts]
    print('-----------------------------------')
    print('Data: ', data)

    joined_text = ' '.join(data)
    lowered_text = joined_text.lower()
    word_count = len(joined_text.split())
    # Substring counts, so e.g. 'sustainable' also counts towards 'sustain'.
    green_count = lowered_text.count('green')
    climate_count = lowered_text.count('climate')
    sustain_count = lowered_text.count('sustain')

    print('-----------------------------------')
    print('Company: ', company_name)
    print('Year: ', year)
    print('Green: ', green_count)
    print('Climate: ', climate_count)
    print('Sustain: ', sustain_count)
    print('Total Words: ', word_count)
    print()

    # Close the overlay and wait until it is hidden again before the next row.
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'boxclose'))).click()
    print('Close button clicked')
    WebDriverWait(driver, 10).until(_overlay_style_is('display: none;'))
    print('Report page closed')
    print()

    return green_count, climate_count, sustain_count, word_count


def get_report(YEAR, df):
    """Scrape every company row of the current results page and add new rows to ``df``.

    For each row of the 'companyList' table body (even and odd rows, sorted
    by company name) the company name, ticker, industry group, issue and
    stock index are read. If the 8th cell reads 'View Report' the report
    overlay is opened and its excerpts are word-counted; otherwise the row is
    recorded as 'No Disclosure'.

    Parameters
    ----------
    YEAR : int
        Stored in the 'Year' column of every record (it is not read from the
        page).
    df : pandas.DataFrame
        Results collected so far. Must have the columns 'Company', 'Issue',
        'Stock Index' and 'Year', which together identify a record.

    Returns
    -------
    pandas.DataFrame
        ``df`` with one row appended per record whose
        (Company, Issue, Stock Index, Year) combination was not present yet.
        ``df`` itself is returned when nothing new was found.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the table rows, a cell, or the report overlay do not appear in time.
    """
    even_rows = WebDriverWait(driver, 10).until(
        lambda drv: drv.find_elements(By.XPATH, "//tbody[@id='companyList']//tr[@class='even-row']")
    )
    odd_rows = WebDriverWait(driver, 10).until(
        lambda drv: drv.find_elements(By.XPATH, "//tbody[@id='companyList']//tr[@class='odd-row']")
    )
    rows = even_rows + odd_rows
    rows.sort(key=lambda r: r.find_element(By.XPATH, ".//td[2]").text)

    records = []
    for row in rows:
        company_name = _cell_text(row, 2)
        ticker = _cell_text(row, 3)
        industry_group = _cell_text(row, 4)
        issue = _cell_text(row, 6)
        stock_index = _cell_text(row, 7)
        if stock_index == '':
            stock_index = '-'
        view_report = _cell_text(row, 8)

        if view_report == 'View Report':
            green_count, climate_count, sustain_count, word_count = _read_report(
                row, company_name, YEAR
            )
        else:
            green_count, climate_count, sustain_count = '-', '-', '-'
            word_count = 'No Disclosure'
            print('-----------------------------------')
            print('Company: ', company_name)
            print('NO DISCLOSURE')
            print()

        # Exactly one record per table row, whichever path was taken above.
        records.append({
            'Company': company_name,
            'Ticker': ticker,
            'Industry Group': industry_group,
            'Issue': issue,
            'Stock Index': stock_index,
            'Year': YEAR,
            'Green': green_count,
            'Climate': climate_count,
            'Sustain': sustain_count,
            'Total Words': word_count,
        })

    print('DONE!')

    # Keep only records that are neither in `df` already nor repeated on this
    # page, then append them with a single concat instead of one per row.
    new_records = []
    keys_added = set()
    for record in records:
        key = (record['Company'], record['Issue'], record['Stock Index'], record['Year'])
        already_in_df = ((df['Company'] == record['Company']) &
                         (df['Issue'] == record['Issue']) &
                         (df['Stock Index'] == record['Stock Index']) &
                         (df['Year'] == record['Year'])).any()
        if already_in_df or key in keys_added:
            continue
        keys_added.add(key)
        new_records.append(record)
        print(f"{record['Company']} added to the DataFrame!")

    if new_records:
        df = pd.concat([df, pd.DataFrame(new_records)], ignore_index=True)

    print(f"Number of companies added: {len(new_records)}")
    print()
    print('='*50)
    print()
    print()

    return df

# LOGIN

In [260]:
# PATHS
CERES_URL = 'https://tools.ceres.org/resources/tools/sec-sustainability-disclosure/'

# Chrome options: record every browser-console message so that
# driver.get_log('browser') can be inspected later
options = webdriver.ChromeOptions()
options.set_capability("goog:loggingPrefs", {"browser": "ALL"})

# Start the browser and open the site
driver = webdriver.Chrome(options=options)
driver.get(CERES_URL)

# The SSL warning page (with its 'details-button') is only shown when the
# browser rejects the certificate. Click through it when it appears, but do
# not abort the notebook when the site loads directly.
try:
    WebDriverWait(driver, 10).until(lambda drv: drv.find_element(By.ID, 'details-button'))
except TimeoutException:
    print('No SSL warning page detected; continuing.')
else:
    ssl_cert()

# Wait (up to 100 s) for the results table to render its first odd row, then log in
WebDriverWait(driver, 100).until(lambda drv: drv.find_element(By.XPATH, "//tr[@class='odd-row']"))
login()

# **RUN THE PIPELINE**

## 1) Set the variables

In [166]:
# RESET EVERYTHING
# Search settings
ISSUE, YEAR, TOTAL_YEARS = 1, 2009, 15
# Paging state
PAGES = CURRENT_PAGE = 1
NO_OF_COMPANIES = 0

# To start from scratch instead of loading a saved CSV, create an empty frame
# with one column per recorded field:
# report_df = pd.DataFrame(columns=['Company', 'Ticker', 'Industry Group', 'Issue', 'Stock Index', 'Year', 'Green', 'Climate', 'Sustain', 'Total Words'])

In [136]:
# Load the partial results saved by an earlier run as report_df. When no such
# file exists yet (e.g. a first run), start from an empty frame that has the
# columns get_report() writes, so the pipeline can still run.
try:
    report_df = pd.read_csv('ceres_words_partial.csv')
except FileNotFoundError:
    print("'ceres_words_partial.csv' not found; starting with an empty report_df")
    report_df = pd.DataFrame(columns=[
        'Company', 'Ticker', 'Industry Group', 'Issue', 'Stock Index',
        'Year', 'Green', 'Climate', 'Sustain', 'Total Words',
    ])

## **2) LET'S GO**

In [264]:
# START THE PROCESS
# Display names of the issues, keyed by the option value passed to choose_issue()
ISSUE_NAMES = {
    1: "Climate Change",
    2: "Hydraulic Fracturing",
    3: "Water Risk",
    4: "Carbon Asset Risk",
    5: "Human & Workers' Rights",
}

# scroll to the top of the page
driver.execute_script("window.scrollTo(0, 0)")

print('===============================================')

# 1) Choose filers
choose_filers()
# 2) Choose stock index
choose_stock_index(0)

# TEST: just one year
YEAR = 2022
set_year(YEAR)
# wait for the div with id='busyIndicator' to disappear by checking the style attribute
WebDriverWait(driver, 100).until(lambda drv: drv.find_element(By.ID, 'busyIndicator').get_attribute('style') == 'display: none;')

# Resume from whatever page the pager currently marks as current
# (the span with class="page  current ")
CURRENT_PAGE = int(driver.find_element(By.XPATH, ".//span[@class='page  current ']").text)

# 3) Get number of companies
# NOTE(review): the count and page total are read before an issue is chosen
# below; confirm that choosing an issue does not change the result count.
NO_OF_COMPANIES = get_no_of_companies()
# 4) Get number of pages
PAGES = get_no_of_pages(NO_OF_COMPANIES)

# Loop through the issues (currently only issue 5)
for i in range(5, 6):
    issue_text = ISSUE_NAMES.get(i, "")
    print("CURRENT ISSUE: ", issue_text)

    choose_issue(i)
    # wait for the div with id='busyIndicator' to disappear by checking the style attribute
    WebDriverWait(driver, 100).until(lambda drv: drv.find_element(By.ID, 'busyIndicator').get_attribute('style') == 'display: none;')
    # scroll to the top of the page
    driver.execute_script("window.scrollTo(0, 0)")

    # Loop through the pages, starting from the current page.
    # NOTE(review): CURRENT_PAGE is not reset between issues, so widening the
    # range above would leave every issue after the first with no pages to visit.
    for page in range(CURRENT_PAGE, PAGES + 1):
        print("Current page: ", page)
        # 5) Scrape the page and merge the new rows into report_df
        report_df = get_report(YEAR, report_df)

        # 6) Go to next page
        CURRENT_PAGE += 1

        # if current page is greater than the number of pages, then stop
        if CURRENT_PAGE > PAGES:
            print("No more pages!")
            print()
            break
        else:
            # scroll to the top of the page so the pager can be clicked
            driver.execute_script("window.scrollTo(0, 0)")
            get_next_page(CURRENT_PAGE)
        print()


Number of companies:  4666
Number of pages:  47
CURRENT ISSUE:  Human & Workers' Rights
Current page:  19
-----------------------------------
Company:  GENETHERA INC
NO DISCLOSURE

View Report clicked
-----------------------------------
Logs:  []
-----------------------------------
Data:  ['We have adopted a Code of Business Conduct and Ethics for all of our employees, including our principal executive officer and principal financial officer. Copies of our Code of Business Conduct and Ethics are available on our web site.', 'We continue to execute our business continuity plan and have implemented a comprehensive set of actions for the health and safety of our customers, employees and business partners. We have implemented work from home policies where appropriate.\nWe continue to implement strong physical and cyber-security measures to ensure our systems remain functional to both serve our operational needs with a remote workforce and to provide uninterrupted service to our customers. 

KeyboardInterrupt: 

### Clear loading icon

In [261]:
# Clear the loading icon: run JavaScript in the page that sets the inline
# style of the element with id 'busyIndicator' to display: none. This is a
# manual helper cell; it changes only the page shown in the global `driver`.
driver.execute_script("document.getElementById('busyIndicator').style.display = 'none';")

### Try going through every page and collecting the company names, then check them against df

In [189]:
# define function to see every company name per page
def see_company_name(df):
    """Append the company names listed on the current results page to ``df``.

    Reads the second cell of every even and odd row of the 'companyList'
    table body, sorts the names, and appends them as new rows of the
    'Company' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame collected so far; any columns other than 'Company' are left
        empty (NaN) in the appended rows.

    Returns
    -------
    pandas.DataFrame
        A new frame with the page's names appended in sorted order and a
        fresh 0..n-1 index; ``df`` itself when the page yields no names.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If no even rows or no odd rows are found within 10 seconds.
    """
    even_rows = WebDriverWait(driver, 10).until(
        lambda drv: drv.find_elements(By.XPATH, "//tbody[@id='companyList']//tr[@class='even-row']")
    )
    odd_rows = WebDriverWait(driver, 10).until(
        lambda drv: drv.find_elements(By.XPATH, "//tbody[@id='companyList']//tr[@class='odd-row']")
    )

    # Read each name once, then sort the plain strings.
    names = sorted(
        row.find_element(By.XPATH, ".//td[2]").text for row in even_rows + odd_rows
    )
    if not names:
        return df

    # One concat for the whole page instead of one per row.
    return pd.concat([df, pd.DataFrame({'Company': names})], ignore_index=True)

In [190]:
# Collect the company names of every results page into names_df
names_df = pd.DataFrame(columns=['Company'])

# Start from the top of the page
driver.execute_script("window.scrollTo(0, 0)")

# 1) Filers and 2) stock index
choose_filers()
choose_stock_index(0)

# TEST: a single year only
set_year(2022)
# The results are ready once the 'busyIndicator' div has style 'display: none;'
WebDriverWait(driver, 100).until(
    lambda drv: drv.find_element(By.ID, 'busyIndicator').get_attribute('style') == 'display: none;'
)

# Always begin at the first page
CURRENT_PAGE = 1

# 3) Number of companies and 4) number of pages
NO_OF_COMPANIES = get_no_of_companies()
PAGES = get_no_of_pages(NO_OF_COMPANIES)

# Walk through the pages
for page in range(CURRENT_PAGE, PAGES + 1):
    print("Current page: ", page)
    # add this page's names
    names_df = see_company_name(names_df)

    # running total of collected names
    print('Current Count: ', len(names_df))

    # 6) Move on to the next page, if there is one
    CURRENT_PAGE += 1
    if CURRENT_PAGE <= PAGES:
        # the pager is only clickable from the top of the page
        driver.execute_script("window.scrollTo(0, 0)")
        get_next_page(CURRENT_PAGE)
        print()
        continue

    print("No more pages!")
    print()
    break

Number of companies:  4666
Number of pages:  47
Current page:  1
Current Count:  100

Current page:  2
Current Count:  200

Current page:  3
Current Count:  300

Current page:  4
Current Count:  400

Current page:  5
Current Count:  500

Current page:  6
Current Count:  600

Current page:  7
Current Count:  700

Current page:  8
Current Count:  800

Current page:  9
Current Count:  900

Current page:  10
Current Count:  1000

Current page:  11
Current Count:  1100

Current page:  12
Current Count:  1200

Current page:  13
Current Count:  1300

Current page:  14
Current Count:  1400

Current page:  15
Current Count:  1500

Current page:  16
Current Count:  1600

Current page:  17
Current Count:  1700

Current page:  18
Current Count:  1800

Current page:  19
Current Count:  1900

Current page:  20
Current Count:  2000

Current page:  21
Current Count:  2100

Current page:  22
Current Count:  2200

Current page:  23
Current Count:  2300

Current page:  24
Current Count:  2400

Current pa

In [218]:
# List every company name in names_df that repeats an earlier row
# (the first occurrence of each name is not included)
is_repeat = names_df.duplicated(subset=['Company'])
duplicates = names_df.loc[is_repeat, 'Company'].tolist()