In [None]:
# ===== Standard Library =====
import time

# ===== Data Handling =====
import numpy as np
import pandas as pd

# ===== Selenium Imports =====
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
class StocksScraper:
    """Scrape a paginated stocks table with Selenium and save it as a CSV.

    Intended call order: ``access_url`` -> ``access_most_activeStocks`` ->
    ``extract_StocksData`` -> ``clean_and_save_data``.  Every scraped row is
    stored in ``self.data`` as a dict of raw (uncleaned) strings.
    """

    # Zero-based <td> position of each scraped field inside a table row.
    # The key order here is also the column order of the saved CSV.
    _CELL_INDEX = {
        "name": 1,
        "symbol": 0,
        "price": 3,
        "change": 4,
        "volume": 6,
        "market_cap": 8,
        "pe_ratio": 9,
    }

    # Unit every numeric column is expressed in after cleaning
    # (volume in millions, market cap in billions, the rest unscaled).
    _NUMERIC_UNITS = {
        "price": 1.0,
        "change": 1.0,
        "volume": 1e6,
        "market_cap": 1e9,
        "pe_ratio": 1.0,
    }

    # Magnitude suffixes understood by ``_parse_number``.
    _MAGNITUDES = {"K": 1e3, "M": 1e6, "B": 1e9, "T": 1e12}

    # Cell texts that are treated as "no value" and become NaN.
    _MISSING_MARKERS = frozenset({"", "-", "--", "N/A"})

    # Absolute XPaths used for navigation.  They mirror the page layout at the
    # time of writing and will break if the site's markup changes.
    _MARKETS_MENU_XPATH = '/html[1]/body[1]/div[2]/header[1]/div[1]/nav[1]/ol[1]/li[3]/a[1]/div[1]'
    _STOCKS_LINK_XPATH = '/html[1]/body[1]/div[2]/header[1]/div[1]/nav[1]/ol[1]/li[3]/ol[1]/li[1]/a[1]'
    _TRENDING_TAB_XPATH = '/html[1]/body[1]/div[2]/div[3]/main[1]/section[1]/section[1]/section[1]/section[1]/section[1]/div[1]/div[1]/div[1]/a[2]'
    _MOST_ACTIVE_TAB_XPATH = '/html[1]/body[1]/div[2]/div[3]/main[1]/section[1]/section[1]/section[1]/section[1]/section[1]/div[1]/div[1]/div[1]/a[1]'

    def __init__(self, driver, timeout=10):
        """Store the driver and build a shared explicit wait.

        Args:
            driver: Selenium WebDriver instance used for all browser actions.
            timeout: Seconds each explicit wait may block before timing out.
        """
        self.driver = driver
        self.wait = WebDriverWait(self.driver, timeout=timeout)
        self.data = []


    def wait_for_the_page_to_load(self):
        """Block until ``document.readyState`` is ``"complete"`` and log the result.

        A timeout is reported with a message instead of being raised.  Any
        other WebDriver error propagates to the caller.
        """
        try:
            self.wait.until(
                lambda d: d.execute_script("return document.readyState") == "complete"
            )
        except TimeoutException:
            # The title is read *after* waiting so the message names the page
            # we were waiting for, not the one we navigated away from.
            print(f'The page "{self.driver.title}" did not get fully loaded within the given duration')
        else:
            print(f'The page "{self.driver.title}" is Successfully loaded!')


    def access_url(self, url):
        """Navigate the browser to ``url`` and wait for the page to load."""
        self.driver.get(url)
        self.wait_for_the_page_to_load()


    def _click_and_wait(self, xpath):
        """Click the element at ``xpath`` once clickable, then wait for the page load."""
        element = self.wait.until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        )
        element.click()
        self.wait_for_the_page_to_load()


    def access_most_activeStocks(self):
        """Navigate Markets -> Stocks -> Trending -> Most Active.

        Raises:
            TimeoutException: if any navigation element does not become
                available within the configured timeout.
        """
        # Hover over the Markets menu so its sub-menu becomes clickable.
        markets_menu = self.wait.until(
            EC.presence_of_element_located((By.XPATH, self._MARKETS_MENU_XPATH))
        )
        ActionChains(self.driver).move_to_element(markets_menu).perform()

        # Click Stocks, then Trending, then Most Active, waiting after each.
        for xpath in (
            self._STOCKS_LINK_XPATH,
            self._TRENDING_TAB_XPATH,
            self._MOST_ACTIVE_TAB_XPATH,
        ):
            self._click_and_wait(xpath)


    def _wait_for_new_rows(self, marker_cell, marker_text):
        """Wait until the table content differs from the page just scraped.

        Args:
            marker_cell: A cell element taken from the page that was scraped.
            marker_text: The text that cell held when it was scraped.

        Returns:
            True when the cell went stale or its text changed, False when
            neither happened within the timeout.
        """
        def _changed(_driver):
            try:
                return marker_cell.text != marker_text
            except StaleElementReferenceException:
                # The old row was removed from the DOM: new rows were rendered.
                return True

        try:
            self.wait.until(_changed)
        except TimeoutException:
            return False
        return True


    def extract_StocksData(self):
        """Scrape every page of the table into ``self.data``.

        Each row becomes a dict with the keys of ``_CELL_INDEX`` holding the
        raw cell text.  Pagination stops when the "next" button is no longer
        clickable, or when the table does not change after clicking it.
        """
        required_cells = max(self._CELL_INDEX.values()) + 1
        next_button = (By.CSS_SELECTOR, 'button[data-testid="next-page-button"]')

        while True:
            self.wait.until(
                EC.presence_of_element_located((By.TAG_NAME, "table"))
            )

            rows = self.driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
            print(f"Found {len(rows)} rows on the current page")

            # (symbol cell, its text) of the first scraped row; used after the
            # click to detect that the next page has actually been rendered.
            marker = None

            for row in rows:
                cells = row.find_elements(By.TAG_NAME, "td")

                # Skip rows that do not have every column we read; indexing
                # them would raise IndexError and abort the whole scrape.
                if len(cells) < required_cells:
                    continue

                stock = {
                    field: cells[position].text
                    for field, position in self._CELL_INDEX.items()
                }
                self.data.append(stock)

                if marker is None:
                    marker = (cells[self._CELL_INDEX["symbol"]], stock["symbol"])

            # Click Next button
            try:
                next_btn = self.wait.until(EC.element_to_be_clickable(next_button))
            except TimeoutException:
                print('The "next" button is not clickable. We have navigated through all the pages.')
                break

            self.driver.execute_script(
                "arguments[0].scrollIntoView({block:'center'});",
                next_btn
            )
            time.sleep(0.5)  # give the scroll a moment to settle before clicking
            next_btn.click()

            # Without this wait the loop would immediately re-read the old
            # rows (duplicates) or hit elements that go stale mid-iteration.
            if marker is not None and not self._wait_for_new_rows(*marker):
                print('The table did not change after clicking "next". Stopping to avoid duplicate rows.')
                break


    @staticmethod
    def _clean_text(value):
        """Strip surrounding whitespace from strings; return other values as-is."""
        return value.strip() if isinstance(value, str) else value


    @classmethod
    def _parse_number(cls, text, unit=1.0):
        """Convert a scraped numeric string to a float expressed in ``unit``.

        Handles thousands separators (``"1,234.5"``), a leading plus sign
        (``"+2.5"``) and K/M/B/T magnitude suffixes (``"45.3M"``).

        Args:
            text: Raw cell text (or a missing value).
            unit: Size of one output unit, e.g. ``1e6`` to return millions.

        Returns:
            The parsed value divided by ``unit``, or NaN for missing values
            and the placeholders in ``_MISSING_MARKERS``.

        Raises:
            ValueError: if the text is neither a number nor a known placeholder.
        """
        if pd.isna(text):
            return np.nan

        cleaned = str(text).strip().replace(",", "").replace("+", "")
        if cleaned in cls._MISSING_MARKERS:
            return np.nan

        factor = cls._MAGNITUDES.get(cleaned[-1].upper())
        if factor is None:
            factor = 1.0
        else:
            cleaned = cleaned[:-1]

        number = float(cleaned)

        # Scale with a single exact power-of-ten ratio so that e.g. "45.3M"
        # expressed in millions stays exactly 45.3.
        if factor >= unit:
            return number * (factor / unit)
        return number / (unit / factor)


    def clean_and_save_data(self, filename="temp"):
        """Clean the scraped rows and write them to ``{filename}.csv``.

        Text columns are whitespace-stripped.  ``price``, ``change``,
        ``volume``, ``market_cap`` and ``pe_ratio`` are converted to floats
        (volume in millions, market cap in billions), then ``price``,
        ``volume`` and ``market_cap`` are renamed to ``price_usd``,
        ``volume_m`` and ``market_cap_B``.  With no scraped rows, a CSV
        containing only the header is written.

        Args:
            filename: Output path without the ``.csv`` extension.

        Raises:
            KeyError: if the scraped rows lack one of the numeric columns.
            ValueError: if a numeric cell holds unparseable text.
        """
        if self.data:
            stocks_df = pd.DataFrame(self.data)
        else:
            # Keep the schema stable even when nothing was scraped.
            stocks_df = pd.DataFrame(columns=list(self._CELL_INDEX))

        for column in stocks_df.columns:
            if column not in self._NUMERIC_UNITS:
                stocks_df[column] = stocks_df[column].map(self._clean_text)

        for column, unit in self._NUMERIC_UNITS.items():
            stocks_df[column] = (
                stocks_df[column]
                .map(lambda text, unit=unit: self._parse_number(text, unit))
                .astype(float)
            )

        stocks_df = stocks_df.rename(
            columns={
                "price": "price_usd",
                "volume": "volume_m",
                "market_cap": "market_cap_B",
            }
        )

        stocks_df.to_csv(f"{filename}.csv", index=False)

In [None]:
if __name__ == "__main__":
    # Entry point: open Chrome, scrape the Most Active stocks table from
    # Yahoo Finance and save the cleaned result to "yahoo-temp.csv".
    url = 'https://finance.yahoo.com/'

    driver = webdriver.Chrome()
    try:
        driver.maximize_window()

        # 5 = explicit-wait timeout in seconds used by every scraper step.
        scraper = StocksScraper(driver, 5)

        scraper.access_url(url)
        scraper.access_most_activeStocks()
        scraper.extract_StocksData()
        scraper.clean_and_save_data("yahoo-temp")
    finally:
        # Always release the browser, even when a step above raises.
        # quit() ends the whole WebDriver session; close() would only close
        # the current window.
        driver.quit()

The page "Yahoo Finance - Stock Market Live, Quotes, Business & Finance News" is Successfully loaded!
The page "Yahoo Finance - Stock Market Live, Quotes, Business & Finance News" is Successfully loaded!
The page "Most Active Stocks: US stocks with the highest trading volume today - Yahoo Finance" is Successfully loaded!
The page "Top Trending Stocks: US stocks with the highest interest today - Yahoo Finance" is Successfully loaded!


In [None]:
scraper.data[:5]  # preview the first few scraped rows instead of dumping the whole list