In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm

In [2]:
def fetch_earnings_data(ticker):
    def initialize_driver():
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('--window-size=1920x1080')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        return driver
    
    url = f"https://finance.yahoo.com/calendar/earnings?symbol={ticker}"
    
    for _ in range(3):  # Intentar un máximo de 3 veces
        driver = initialize_driver()
        driver.get(url)
        
        try:
            # Aceptar cookies si el botón está presente
            accept_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="consent-page"]/div/div/div/form/div[2]/div[2]/button[1]'))
            )
            accept_button.click()
        except Exception as e:
            print("Cookie accept button not found or not clickable:", e)

        try:
            # Esperar a que la página se cargue completamente y la tabla esté presente
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'table.W\\(100\\%\\) tbody'))
            )
            
            # Encontrar las filas de la tabla de earnings
            rows = driver.find_elements(By.CSS_SELECTOR, 'table.W\\(100\\%\\) tbody tr')
            data = []

            for row in rows:
                cols = row.find_elements(By.TAG_NAME, 'td')
                if len(cols) == 6:
                    cols = [elem.text for elem in cols]
                    data.append(cols)
                else:
                    print(f"Unexpected number of columns ({len(cols)}) found. Retrying...")
                    driver.quit()  # Cerrar el WebDriver antes de volver a intentar
                    break  # Salir del bucle actual y reintentar
            else:
                # Si todas las filas tienen el número correcto de columnas, salir del bucle
                driver.quit()
                break
        except Exception as e:
            print("Error while fetching the data:", e)
            driver.quit()
    else:
        print("Failed to fetch data with the correct number of columns after 3 attempts")
        return pd.DataFrame()
    
    # Crear el DataFrame
    columns = ['Symbol', 'Company', 'Earnings Date', 'EPS Estimate', 'Reported EPS', 'Surprise(%)']
    df = pd.DataFrame(data, columns=columns)
    
    # Conversión de tipos de datos
    df['Earnings Date'] = pd.to_datetime(df['Earnings Date'].str[:12], format='%b %d, %Y', errors='coerce')
    df['EPS Estimate'] = pd.to_numeric(df['EPS Estimate'], errors='coerce')
    df['Reported EPS'] = pd.to_numeric(df['Reported EPS'], errors='coerce')
    df['Surprise(%)'] = pd.to_numeric(df['Surprise(%)'].str.replace('%', ''), errors='coerce')
    
    return df

# Ejemplo de uso:
# ticker = 'AAPL'
# df = fetch_earnings_data(ticker)
# print(df)


In [3]:
def fetch_earnings_data_parallel(ticker_list):
    return pd.concat(Parallel(n_jobs=-1)(delayed(fetch_earnings_data)(ticker) for ticker in  tqdm(ticker_list, desc="Fetching earnings data")), ignore_index=True)

In [4]:
ticker_lst = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'FB']

In [5]:
earnings_data = fetch_earnings_data_parallel(ticker_lst)

Fetching earnings data: 100%|██████████| 5/5 [00:00<00:00, 521.45it/s]


Unexpected number of columns (7) found. Retrying...
Unexpected number of columns (7) found. Retrying...
Unexpected number of columns (7) found. Retrying...


In [None]:
earnings_data

In [14]:
# Combinar los resultados en un solo DataFrame
combined_df = pd.concat(results, ignore_index=True)

In [None]:
combined_df

In [2]:
def fetch_earnings_data(ticker):
    # Set up Selenium to run headlessly
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--window-size=1920x1080')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    # Initialize ChromeDriver using webdriver_manager
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
    url = f"https://finance.yahoo.com/calendar/earnings?symbol={ticker}"
    driver.get(url)
    
    # Accept cookies if the button is present
    try:
        accept_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="consent-page"]/div/div/div/form/div[2]/div[2]/button[1]'))
        )
        accept_button.click()
    except Exception as e:
        print("Cookie accept button not found or not clickable:", e)

    # Wait for the page to fully load and the table to be present
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'table.W\\(100\\%\\) tbody'))
    )

    # Find the rows of the earnings table
    rows = driver.find_elements(By.CSS_SELECTOR, 'table.W\\(100\\%\\) tbody tr')

    data = []

    for row in rows:
        cols = row.find_elements(By.TAG_NAME, 'td')
        cols = [elem.text for elem in cols]
        data.append(cols)

    # Close the WebDriver
    driver.quit()

    # Assuming the data structure is as expected, create a DataFrame
    columns = ['Symbol', 'Company', 'Earnings Date', 'EPS Estimate', 'Reported EPS', 'Surprise(%)']
    df = pd.DataFrame(data, columns=columns)
    
    df['Earnings Date'] = pd.to_datetime(df['Earnings Date'].str[:12], format='%b %d, %Y', errors='coerce')
    df['EPS Estimate'] = pd.to_numeric(df['EPS Estimate'], errors='coerce')
    df['Reported EPS'] = pd.to_numeric(df['Reported EPS'], errors='coerce')
    df['Surprise(%)'] = pd.to_numeric(df['Surprise(%)'], errors='coerce')

    return df

In [6]:
# Example usage:
ticker = "SAP"
earnings_data = fetch_earnings_data(ticker)

ValueError: 6 columns passed, passed data had 7 columns

In [4]:
earnings_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Symbol         100 non-null    object        
 1   Company        100 non-null    object        
 2   Earnings Date  100 non-null    datetime64[ns]
 3   EPS Estimate   94 non-null     float64       
 4   Reported EPS   93 non-null     float64       
 5   Surprise(%)    93 non-null     float64       
dtypes: datetime64[ns](1), float64(3), object(2)
memory usage: 4.8+ KB


In [40]:
earnings_data['Earnings Date'] = pd.to_datetime(earnings_data['Earnings Date'].str[:12], format='%b %d, %Y', errors='coerce')
earnings_data['EPS Estimate'] = pd.to_numeric(earnings_data['EPS Estimate'], errors='coerce')
earnings_data['Reported EPS'] = pd.to_numeric(earnings_data['Reported EPS'], errors='coerce')
earnings_data['Surprise(%)'] = pd.to_numeric(earnings_data['Surprise(%)'], errors='coerce')

In [42]:
earnings_data.head(20)

Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise(%)
0,MSFT,Microsoft Corp,2025-04-23,,,
1,MSFT,Microsoft Corp,2025-01-28,,,
2,MSFT,Microsoft Corp,2024-12-06,,,
3,MSFT,Microsoft Corp,2024-10-22,,,
4,MSFT,Microsoft Corp,2024-07-23,2.93,,
5,MSFT,Microsoft Corp,2024-06-06,,,
6,MSFT,Microsoft Corp,2024-05-29,,,
7,MSFT,Microsoft Corp,2024-05-21,2.6,2.71,4.23
8,MSFT,Microsoft Corporation,2024-04-25,2.82,2.94,4.32
9,MSFT,Microsoft Corp,2024-03-06,2.78,2.93,5.4


In [6]:
# Extract the time and timezone information into a new column
earnings_data['Earnings Time'] = earnings_data['Earnings Date'].str.extract(r'(\d{1,2} [AP]MEDT)')

# Extract just the date part from the "Earnings Date" column
earnings_data['Earnings Date'] = earnings_data['Earnings Date'].str.extract(r'(\b\w+ \d{1,2}, \d{4})')

# Convert string date to datetime
earnings_data['Earnings Date'] = pd.to_datetime(earnings_data['Earnings Date'], format='%b %d, %Y')

# Convert datetime to desired string format
earnings_data['Earnings Date'] = earnings_data['Earnings Date'].dt.strftime('%Y-%m-%d')

#earnings_data['Surprise(%)'] = earnings_data['Surprise(%)'].str.replace('+', '').astype(float)

earnings_data.tail(5)


Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise(%),Earnings Time
95,MSFT,Microsoft Corporation,,0.21,0.28,29.06,
96,MSFT,Microsoft Corporation,,0.21,0.21,0.8,
97,MSFT,Microsoft Corporation,,0.25,0.25,-3.56,
98,MSFT,Microsoft Corporation,,0.21,0.25,15.25,
99,MSFT,Microsoft Corporation,,0.2,0.22,9.5,


In [7]:
earnings_data

Unnamed: 0,0,1,2,3,4,5
0,MSFT,Microsoft Corp,"Apr 23, 2025, 4 PMEDT",-,-,-
1,MSFT,Microsoft Corp,"Jan 28, 2025, 4 PMEST",-,-,-
2,MSFT,Microsoft Corp,"Dec 06, 2024, 11 AMEST",-,-,-
3,MSFT,Microsoft Corp,"Oct 22, 2024, 6 AMEDT",-,-,-
4,MSFT,Microsoft Corp,"Jul 23, 2024, 6 AMEDT",2.93,-,-
...,...,...,...,...,...,...
95,MSFT,Microsoft Corporation,"Oct 17, 2002, 12 AMEDT",0.21,0.28,+29.06
96,MSFT,Microsoft Corporation,"Jul 18, 2002, 12 AMEDT",0.21,0.21,+0.8
97,MSFT,Microsoft Corporation,"Apr 18, 2002, 12 AMEDT",0.25,0.25,-3.56
98,MSFT,Microsoft Corporation,"Jan 17, 2002, 12 AMEST",0.21,0.25,+15.25
