In [1]:
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from time import sleep

# Configure Edge options
# The '--inprivate' switch is passed straight to the browser; presumably it
# opens an InPrivate window so no profile state carries over between runs
# (TODO confirm against the Edge command-line documentation).
options = webdriver.EdgeOptions()
options.add_argument('--inprivate')

# Initialize the Edge driver
# `driver` is a module-level object: the scraping functions defined in the
# following cells read it as a global instead of receiving it as a parameter.
driver = webdriver.Edge(options=options)
driver.maximize_window()
# Load an initial page so the browser session is up before scraping starts.
driver.get('https://www.google.com/')

In [2]:
# CSS path to the <bvl-tabs> widget of an issuer detail page. Every selector
# used while navigating is built from this prefix.
_ISSUER_TABS_CSS = (
    "#page-view > main > section:nth-child(1) > bvl-issuer-details"
    " > div > div > bvl-tabs"
)
# CSS path to the toolbar that holds the two drop-downs used below.
_RESOURCES_TOOLBAR_CSS = (
    _ISSUER_TABS_CSS
    + " > bvl-tab:nth-child(7) > div > div > bvl-resources > bvl-toolbar > div"
)


def _click_when_clickable(css_selector, timeout=20):
    """Wait up to ``timeout`` seconds for the element to be clickable, then click it."""
    WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector))
    ).click()


def _pick_dropdown_option(toggle_css, option_css):
    """Open a drop-down through its toggle icon, click one option, then pause 2 s."""
    _click_when_clickable(toggle_css)
    _click_when_clickable(option_css)
    sleep(2)


def navigate_and_extract(url):
    """Open an issuer page and apply the filters needed before scraping.

    Steps, in order: load ``url``, dismiss the popup if one is present, click
    the tab at ``li:nth-child(5)``, pick the 4th entry of the year drop-down
    and the 2nd entry of the type drop-down, then scroll to the results.

    Args:
        url: Address of the issuer detail page to load in the global ``driver``.

    Returns:
        None in every case. Timeouts and missing elements are reported with
        ``print`` and end the function early; they are not raised.

    Raises:
        Any Selenium error other than ``TimeoutException`` /
        ``NoSuchElementException`` raised after the popup step propagates to
        the caller (for example an intercepted click).
    """
    # Local import: WebDriverException is not among the notebook's top-level imports.
    from selenium.common.exceptions import WebDriverException

    driver.get(url)
    sleep(3)

    # Best-effort popup dismissal. Only Selenium errors are ignored, so that
    # Ctrl-C / kernel interrupts are no longer swallowed as they were by a
    # bare ``except:``.
    try:
        driver.find_element(By.XPATH, '//button[@class="close"]').click()
    except WebDriverException:
        pass

    driver.execute_script("window.scrollTo(0, window.scrollY + 700)")
    sleep(1)

    try:
        _click_when_clickable(_ISSUER_TABS_CSS + " > ul > li:nth-child(5) > a")
        sleep(2)

        driver.execute_script("window.scrollTo(0, window.scrollY + 500)")

        year_dropdown_css = _RESOURCES_TOOLBAR_CSS + " > div:nth-child(4) > div"
        try:
            _pick_dropdown_option(
                year_dropdown_css + " > div.g-site-select--label > i",
                year_dropdown_css + " > div.g-site-select--list.default > div:nth-child(4)",
            )
        except (TimeoutException, NoSuchElementException):
            print(f"No se encontró el selector de año en {url}. Continuando con el siguiente enlace.")
            return

        type_dropdown_css = _RESOURCES_TOOLBAR_CSS + " > div.g-site-select"
        try:
            _pick_dropdown_option(
                type_dropdown_css + " > div.g-site-select--label > i",
                type_dropdown_css + " > div.g-site-select--list.default > div:nth-child(2)",
            )
        except (TimeoutException, NoSuchElementException):
            print(f"No se encontró el selector de tipo en {url}. Continuando con el siguiente enlace.")
            return

        # scrollTo takes (x, y). The previous call passed
        # document.body.scrollHeight as x, which only affects horizontal
        # scrolling; the vertical target of 1420 px is unchanged.
        driver.execute_script('window.scrollTo(0, 1420);')

    except (TimeoutException, NoSuchElementException) as e:
        print(f"Error en {url}: {e}. Continuando con el siguiente enlace.")
        return


In [3]:
def extract_dates_and_links(max_rows=8, timeout=20):
    """Collect the date text and link element of the first rows of the results table.

    Rows are read top to bottom from the table currently shown by the global
    ``driver``. Reading stops at the first row that does not appear within
    ``timeout`` seconds; rows collected before that are still returned.

    Args:
        max_rows: Number of table rows to read (default 8, the value that was
            previously hard-coded).
        timeout: Seconds to wait for each cell to be present in the DOM.

    Returns:
        ``(dates, links)``: two lists of equal length. ``dates[k]`` is the text
        of the date cell and ``links[k]`` is the live ``<a>`` WebElement of the
        same row. The elements are only usable while the page stays loaded.
    """
    row_css = (
        "#page-view > main > section:nth-child(1) > bvl-issuer-details > div > div"
        " > bvl-tabs > bvl-tab:nth-child(7) > div > div > bvl-resources > section"
        " > bvl-table-file-link > table > tbody > tr:nth-child({row})"
    )
    dates = []
    links = []
    wait = WebDriverWait(driver, timeout)

    try:
        for i in range(1, max_rows + 1):
            current_row = row_css.format(row=i)
            date = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, current_row + " > td:nth-child(2) > span")
                )
            ).text
            link = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, current_row + " > td:nth-child(1) > a")
                )
            )
            # Append only after both cells were found, so the two lists stay
            # index-aligned even when the link lookup times out.
            dates.append(date)
            links.append(link)
            print(f"Fecha {i}: {date}, Enlace {i}: {link.text}")
    except TimeoutException as e:
        print(f"Error extrayendo fechas y enlaces: {e}")
    sleep(2)

    return dates, links

In [16]:
# Selectors for the modal that opens when a statement link is clicked.
_MODAL_CSS = "body > bvl-shared-modal > div > div > div"
_MODAL_CLOSE_CSS = _MODAL_CSS + " > div.shared-modal-header > button"
# Third cell of table row {row} inside the modal.
_MODAL_CELL_CSS = (
    _MODAL_CSS
    + " > div.shared-modal-body > bvl-dynamic-table > div.g-site-table-content"
    " > table > tbody > tr:nth-child({row}) > td:nth-child(3)"
)
_COMPANY_NAME_XPATH = (
    '//*[@id="page-view"]/main/section/bvl-issuer-details/div/div/bvl-tabs'
    '/bvl-tab[5]/div/div[2]/bvl-resources/section/bvl-section-header/header/div[1]/h1'
)

# Output field -> table row read from the modal.
# NOTE(review): the row numbers assume every issuer's statement has the same
# layout -- verify against the site.
_BALANCE_SHEET_ROWS = {
    'TOTAL ACTIVOS': 38,
    'TOTAL PASIVOS': 69,
    'TOTAL PATRIMONIO': 78,
}
_INCOME_STATEMENT_ROWS = {
    'INGRESOS ORDINARIOS': 2,
    'GANANCIA BRUTA': 4,
    'GANANCIA OPERATIVA': 10,
    'GANANCIA NETA': 26,
}


def _hide_media_player_overlay():
    """Hide the ``div.media-player`` element if it exists; report the outcome either way."""
    try:
        overlay = driver.find_element(By.CSS_SELECTOR, "div.media-player")
        driver.execute_script("arguments[0].style.display = 'none';", overlay)
        print("Overlay (media-player) ocultado correctamente.")
    except Exception:
        print("No se encontró el overlay (media-player). Continuando con normalidad.")


def _read_modal_rows(row_by_field):
    """Read one numeric value per field from the open modal, keeping the mapping's order."""
    return {
        field: extract_numeric_value(_MODAL_CELL_CSS.format(row=row))
        for field, row in row_by_field.items()
    }


def _close_statement_modal(timeout=20):
    """Click the modal's close button once it becomes clickable."""
    WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, _MODAL_CLOSE_CSS))
    ).click()


def extract_financial_data(links, dates):
    """Open each statement link and build one record per reporting date.

    ``links[0:4]`` are read as balance sheets and create the records;
    ``links[4:8]`` are read as income statements and fill the income fields of
    the record that has the same ``DATE``.

    Args:
        links: Clickable WebElements, index-aligned with ``dates``.
        dates: Date strings, one per link.

    Returns:
        A list of dicts with the keys ``COMPANY_NAME``, ``DATE``,
        ``TOTAL ACTIVOS``, ``TOTAL PASIVOS``, ``TOTAL PATRIMONIO``,
        ``INGRESOS ORDINARIOS``, ``GANANCIA BRUTA``, ``GANANCIA OPERATIVA`` and
        ``GANANCIA NETA``. Income fields stay 0 when no matching income
        statement was read. A link that fails (including a missing index) is
        reported with ``print`` and skipped.
    """
    financial_data_list = []

    for i in range(4):
        try:
            _hide_media_player_overlay()
            links[i].click()
            try:
                company_name = WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.XPATH, _COMPANY_NAME_XPATH))
                ).text
                financial_data = {'COMPANY_NAME': company_name, 'DATE': dates[i]}
                financial_data.update(_read_modal_rows(_BALANCE_SHEET_ROWS))
                # Placeholders that the income-statement pass fills in later.
                financial_data.update(dict.fromkeys(_INCOME_STATEMENT_ROWS, 0))
            finally:
                # Close the modal even when reading failed; an open modal
                # would intercept the click on every following link.
                _close_statement_modal()

            financial_data_list.append(financial_data)
            print(f"Datos extraídos para {company_name} en la fecha {dates[i]}: {financial_data}")
        except Exception as e:
            print(f"Error extrayendo datos financieros (activos, pasivos, patrimonio): {e}")

    for i in range(4, 8):
        try:
            links[i].click()
            try:
                financial_data = {'DATE': dates[i]}
                financial_data.update(_read_modal_rows(_INCOME_STATEMENT_ROWS))
            finally:
                _close_statement_modal()

            for data in financial_data_list:
                if data['DATE'] == financial_data['DATE']:
                    # Only fill fields that are missing or still at the 0
                    # placeholder; values already read are never overwritten.
                    for key, value in financial_data.items():
                        if key not in data or data[key] == 0:
                            data[key] = value
                    print(f"Datos actualizados para la fecha {dates[i]}: {financial_data}")
        except Exception as e:
            print(f"Error extrayendo datos financieros (ingresos, ganancias): {e}")

    return financial_data_list

def _parse_numeric_text(text):
    """Convert a table cell's text to an int, truncating any fractional part.

    Thousands separators (``,``) are removed. Text wrapped in parentheses,
    e.g. ``(1,234)``, is read as a negative number -- this assumes the site
    may render negatives in accounting style (TODO confirm). ``None`` and
    blank text give 0. Anything else that is not a number raises
    ``ValueError``.
    """
    if text is None:
        return 0
    cleaned = text.strip().replace(",", "")
    if not cleaned:
        return 0
    is_negative = cleaned.startswith("(") and cleaned.endswith(")")
    if is_negative:
        cleaned = cleaned[1:-1].strip()
    number = float(cleaned)
    return int(-number if is_negative else number)


def extract_numeric_value(selector):
    """Read the text of the first element matching ``selector`` and return it as an int.

    Args:
        selector: CSS selector evaluated with ``document.querySelector`` in the
            page currently loaded in the global ``driver``.

    Returns:
        The parsed integer, or 0 when the cell is empty, the element is
        missing, or the text cannot be parsed. Failures are reported with
        ``print`` and never raised.
    """
    try:
        # Pass the selector as a script argument instead of splicing it into
        # the JavaScript source, so quotes in a selector cannot break the script.
        value = driver.execute_script(
            "return document.querySelector(arguments[0]).textContent", selector
        )
        return _parse_numeric_text(value)
    except Exception as e:
        print(f"Error extrayendo valor numérico: {e}")
        return 0

In [20]:
# List of company URLs to scrape
company_urls = [
    'https://www.bvl.com.pe/emisores/detalle?companyCode=21400',
    'https://www.bvl.com.pe/emisores/detalle?companyCode=52105',
    'https://www.bvl.com.pe/emisores/detalle?companyCode=74100',
    'https://www.bvl.com.pe/emisores/detalle?companyCode=70500',
    'https://www.bvl.com.pe/emisores/detalle?companyCode=73600',
    'https://www.bvl.com.pe/emisores/detalle?companyCode=21802',
    'https://www.bvl.com.pe/emisores/detalle?companyCode=73658',
]

# Data storage: one dict per (company, reporting date), accumulated across URLs
financial_data_list = []

for url in company_urls:
    print(f"Procesando URL: {url}")
    try:
        navigate_and_extract(url)
        dates, links = extract_dates_and_links()
        if not links:
            # Nothing to click: skip this issuer instead of letting the
            # extraction step fail once per expected link.
            print(f"No se encontraron enlaces en {url}. Continuando con el siguiente enlace.")
            continue
        financial_data = extract_financial_data(links, dates)
        financial_data_list.extend(financial_data)
    except Exception as e:
        # One failing issuer must not abort the whole run.
        print(f"Error procesando la URL {url}: {e}")

# Convert the list of dictionaries to a DataFrame
financial_data_df = pd.DataFrame(financial_data_list)

# Save the DataFrame to an Excel file
financial_data_df.to_excel('financial_data.xlsx', index=False)

Procesando URL: https://www.bvl.com.pe/emisores/detalle?companyCode=21400
Fecha 1: 01/08/2022, Enlace 1: Estado de Situación Financiera
Fecha 2: 02/05/2022, Enlace 2: Estado de Situación Financiera
Fecha 3: 15/02/2023, Enlace 3: Estado de Situación Financiera
Fecha 4: 31/10/2022, Enlace 4: Estado de Situación Financiera
Fecha 5: 01/08/2022, Enlace 5: Estado de Resultados
Fecha 6: 31/10/2022, Enlace 6: Estado de Resultados
Fecha 7: 02/05/2022, Enlace 7: Estado de Resultados
Fecha 8: 15/02/2023, Enlace 8: Estado de Ganancias y P¿rdidas
Overlay (media-player) ocultado correctamente.
Datos extraídos para ALICORP S.A.A. Y SUBSIDIARIAS en la fecha 01/08/2022: {'COMPANY_NAME': 'ALICORP S.A.A. Y SUBSIDIARIAS', 'DATE': '01/08/2022', 'TOTAL ACTIVOS': 14586854, 'TOTAL PASIVOS': 847192, 'TOTAL PATRIMONIO': 3371872, 'INGRESOS ORDINARIOS': 0, 'GANANCIA BRUTA': 0, 'GANANCIA OPERATIVA': 0, 'GANANCIA NETA': 0}
Overlay (media-player) ocultado correctamente.
Datos extraídos para ALICORP S.A.A. Y SUBSIDIA

In [21]:
# Display the DataFrame built by the scraping cell above (one row per company and date).
financial_data_df

Unnamed: 0,COMPANY_NAME,DATE,TOTAL ACTIVOS,TOTAL PASIVOS,TOTAL PATRIMONIO,INGRESOS ORDINARIOS,GANANCIA BRUTA,GANANCIA OPERATIVA,GANANCIA NETA
0,ALICORP S.A.A. Y SUBSIDIARIAS,01/08/2022,14586854,847192,3371872,3909390,696116,315392,142175
1,ALICORP S.A.A. Y SUBSIDIARIAS,02/05/2022,13513195,847192,3032585,3348494,650638,311727,159838
2,ALICORP S.A.A. Y SUBSIDIARIAS,15/02/2023,14011900,847192,2920698,4163506,665276,241660,100180
3,ALICORP S.A.A. Y SUBSIDIARIAS,31/10/2022,15075038,847192,3398818,3989503,749241,325130,131306
4,FALABELLA PERU S.A. Y SUBSIDIARIAS,04/11/2022,3629942,515606,2379421,102651,63373,48458,28835
5,FALABELLA PERU S.A. Y SUBSIDIARIAS,03/05/2022,3633096,515606,2328615,98374,60249,49524,32157
6,FALABELLA PERU S.A. Y SUBSIDIARIAS,28/02/2023,3645520,515606,2393765,114438,70542,31220,14344
7,FALABELLA PERU S.A. Y SUBSIDIARIAS,11/08/2022,3650132,515606,2350586,97444,55181,41254,21971
8,FERREYCORP S.A.A. Y SUBSIDIARIAS,27/07/2022,6045603,958894,2380441,1581346,393956,161079,61287
9,FERREYCORP S.A.A. Y SUBSIDIARIAS,15/02/2023,6248756,946063,2488337,1874353,474417,179886,139155
