In [1]:
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
import time
import pandas as pd
import numpy as np

In [2]:
driver = webdriver.Chrome()
driver.maximize_window()

# explicit wait
wait = WebDriverWait(driver, 5)

# function to check if webpage is fully loaded
def wait_for_page_to_load(driver , wait):
    # wait for the webpage to load
    page_title = driver.title
    try:
        wait.until(
            lambda d: d.execute_script("return document.readyState") == 'complete'
        )
    except:
        print(f"The page \"{page_title}\" did not get fully loaded within the given duration.")
    else:
        print(f'The page \"{page_title}\" is successfully loaded.')
        
url = 'https://finance.yahoo.com/'
driver.get(url)
wait_for_page_to_load(driver, wait)

# hovering over the element market
actions = ActionChains(driver)
markets_menu = wait.until(
    EC.presence_of_element_located((By.XPATH, '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/a[1]/span[1]'))
)
actions.move_to_element(markets_menu).perform()

# click on trending tickers
trending_tickers = wait.until(
    EC.element_to_be_clickable((By.XPATH, '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/div[1]/ul[1]/li[4]/a[1]/div[1]'))
)

trending_tickers.click()
wait_for_page_to_load(driver, wait)

# click on Most Active
most_active = wait.until(
	EC.element_to_be_clickable((By.XPATH, '/html[1]/body[1]/div[2]/main[1]/section[1]/section[1]/section[1]/section[1]/section[1]/div[1]/nav[1]/ul[1]/li[1]/a[1]/span[1]'))
)
most_active.click()
wait_for_page_to_load(driver, wait)

data = []

# scraping the data
# extract , check, click infinite loop
while True:
    # scraping
    wait.until(
        EC.presence_of_element_located((By.TAG_NAME, "table"))
    )
    rows = driver.find_elements(By.CSS_SELECTOR, 'table tbody tr')
    
    for row in rows:
        values = row.find_elements(By.TAG_NAME, 'td')
        stock = {
            "name" : values[1].text,
            "symbol" : values[0].text,
            "price" : values[3].text,
            "change" : values[4].text,
            "volume" : values[6].text,
            "market_cap" : values[8].text,
            "pe_ratio" : values[9].text,
        }
        data.append(stock)
    
    #click next
    try:
        next_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="nimbus-app"]/section/section/section/section/section[1]/div/div[3]/div[3]/button[3]' ))
        )
    except:
        print(f'The\"next\" button is not clickable. We havve navigated through all the pages.')
        break
    else:
        next_button.click()
        time.sleep(1)

driver.quit()

The page "Yahoo Finance - Stock Market Live, Quotes, Business & Finance News" is successfully loaded.
The page "Yahoo Finance - Stock Market Live, Quotes, Business & Finance News" is successfully loaded.
The page "Top Trending Stocks: US stocks with the highest interest today - Yahoo Finance" is successfully loaded.
The"next" button is not clickable. We havve navigated through all the pages.


In [3]:
data

[{'name': 'UnitedHealth Group Incorporated',
  'symbol': 'UNH',
  'price': '282.12',
  'change': '+1.06',
  'volume': '14.378M',
  'market_cap': '255.922B',
  'pe_ratio': '11.80'},
 {'name': 'Spotify Technology S.A.',
  'symbol': 'SPOT',
  'price': '700.98',
  'change': '+7.88',
  'volume': '2.019M',
  'market_cap': '1.05T',
  'pe_ratio': '192.66'},
 {'name': 'Warner Bros. Discovery, Inc.',
  'symbol': 'WBD',
  'price': '13.70',
  'change': '+0.21',
  'volume': '95.792M',
  'market_cap': '33.895B',
  'pe_ratio': '--'},
 {'name': 'Intel Corporation',
  'symbol': 'INTC',
  'price': '20.68',
  'change': '-0.02',
  'volume': '85.489M',
  'market_cap': '90.516B',
  'pe_ratio': '--'},
 {'name': 'Lucid Group, Inc.',
  'symbol': 'LCID',
  'price': '2.7900',
  'change': '-0.1300',
  'volume': '72.245M',
  'market_cap': '8.51B',
  'pe_ratio': '--'},
 {'name': 'American Airlines Group Inc.',
  'symbol': 'AAL',
  'price': '11.58',
  'change': '+0.08',
  'volume': '69.871M',
  'market_cap': '7.641B

In [4]:
len(data)

263

In [5]:
stocks_df = (
    pd
    .DataFrame(data)
    .apply(lambda col: col.str.strip() if col.dtype == 'object' else col)
    .assign(
        price=lambda df_: pd.to_numeric(df_.price),
        change=lambda df_: pd.to_numeric(df_.change.str.replace("+", "", regex=False)),
        volume=lambda df_: pd.to_numeric(df_.volume.str.replace("M", "", regex=False)),
        market_cap=lambda df_: df_.market_cap.apply(
            lambda val: float(val.replace("B", "")) if "B" in val else float(val.replace("T", "")) * 1000
        ),
        pe_ratio=lambda df_: pd.to_numeric(
            df_.pe_ratio
                .replace("--", np.nan)
                .str.replace(",", "", regex=False)
        )
    )
    .rename(columns={
        "price": "price_usd",
        "volume": "volume_M",
        "market_cap": "market_cap_B"
    })
)

stocks_df.dtypes

name             object
symbol           object
price_usd       float64
change          float64
volume_M        float64
market_cap_B    float64
pe_ratio        float64
dtype: object

In [6]:
stocks_df.to_excel('yahoo-stocks-data.xlsx', index = False)