In [3]:
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
# Start a Chrome session and maximise its window.
driver = webdriver.Chrome()
driver.maximize_window()

# Shared explicit wait: each `wait.until(...)` call polls for at most 5 seconds.
wait = WebDriverWait(driver, timeout=5)

def wait_for_the_page_to_load(driver, wait):
    """Wait until the current document reports ``readyState == "complete"``.

    Prints a one-line status message naming the page. A timeout is reported
    with a message instead of being raised; any other error propagates.

    Args:
        driver: WebDriver whose ``title`` is shown in the status message.
        wait: ``WebDriverWait`` (or compatible object) whose ``until`` method
            polls the ready-state check.
    """
    try:
        wait.until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
    except TimeoutException:
        # Only a timeout means "not loaded in time"; KeyboardInterrupt and
        # unrelated driver errors must not be reported as a slow page.
        print(f"The page \"{driver.title}\" did not get fully loaded within the given duration")
    else:
        # The title is read after the wait so that it names the page that has
        # just finished loading rather than the one being navigated away from.
        print(f"The page \"{driver.title}\" is Successfully loaded!")


url = 'https://finance.yahoo.com/'

# Initialised before any browser work so `data` is always defined (possibly
# empty) for the cells that follow, even if navigation fails.
data = []

# All browser interaction runs inside try/finally so that the Chrome process
# is shut down even when a locator times out part-way through.
try:
    driver.get(url)
    wait_for_the_page_to_load(driver, wait)

    # hovering on markets menu
    actions = ActionChains(driver)
    markets_menu = wait.until(
        EC.presence_of_element_located((By.XPATH, '/html[1]/body[1]/div[2]/header[1]/div[1]/nav[1]/ol[1]/li[3]/a[1]/div[1]'))
    )
    actions.move_to_element(markets_menu).perform()

    # Navigation = click on stocks --> trending --> most active
    # NOTE(review): these absolute XPaths depend on the exact page layout and
    # will need updating whenever the site's markup changes.
    stocks = wait.until(
        EC.element_to_be_clickable((By.XPATH, '/html[1]/body[1]/div[2]/header[1]/div[1]/nav[1]/ol[1]/li[3]/ol[1]/li[1]/a[1]'))
    )
    stocks.click()
    wait_for_the_page_to_load(driver, wait)

    trending_tickers = wait.until(
        EC.element_to_be_clickable((By.XPATH, '/html[1]/body[1]/div[2]/div[3]/main[1]/section[1]/section[1]/section[1]/section[1]/section[1]/div[1]/div[1]/div[1]/a[2]'))
    )
    trending_tickers.click()
    wait_for_the_page_to_load(driver, wait)

    active_stk = wait.until(
        EC.element_to_be_clickable((By.XPATH, '/html[1]/body[1]/div[2]/div[3]/main[1]/section[1]/section[1]/section[1]/section[1]/section[1]/div[1]/div[1]/div[1]/a[1]'))
    )
    active_stk.click()
    wait_for_the_page_to_load(driver, wait)

    # SCRAPING THE DATA: one iteration per results page.
    while True:
        wait.until(
            EC.presence_of_element_located((By.TAG_NAME, "table"))
        )

        rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
        print(len(rows))

        for row in rows:
            values = row.find_elements(By.TAG_NAME, 'td')

            # The highest cell index read below is 9; skip any row that does
            # not have that many cells instead of failing with IndexError.
            if len(values) < 10:
                continue

            stock = {
                "name": values[1].text,
                "symbol": values[0].text,
                "price": values[3].text,
                "change": values[4].text,
                "volume": values[6].text,
                "market_cap": values[8].text,
                "pe_ratio": values[9].text
            }
            data.append(stock)

        # click next; a timeout here is the signal that the last page was reached
        try:
            next_btn = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[data-testid="next-page-button"]'))
            )
        except TimeoutException:
            print("The \"next\" button is not clickable. We have navigated through all the pages.")
            break

        driver.execute_script("arguments[0].scrollIntoView({block:'center'});", next_btn)
        time.sleep(0.5)
        next_btn.click()
        # NOTE(review): nothing here waits for the table to refresh after the
        # click, so the next iteration may read the previous page's rows if the
        # page is slow to update - TODO confirm and add an explicit wait.
finally:
    driver.quit()

In [None]:
# Preview only the first few scraped records; displaying the whole list would
# flood the notebook output.
data[:5]

In [5]:
# Number of stock records collected across all scraped pages.
len(data)

426

In [49]:
import pandas as pd
import numpy as np

In [51]:
# Power-of-ten exponent for each supported magnitude suffix.
_SUFFIX_EXPONENTS = {"K": 3, "M": 6, "B": 9, "T": 12}

# Placeholder strings treated as "no value".
_MISSING_MARKERS = {"", "--", "-", "N/A"}


def _parse_abbreviated(text, unit):
    """Convert an abbreviated figure such as ``"4.45T"`` to a float in ``unit``.

    Args:
        text: Raw cell text, e.g. ``"161.888M"``, ``"22.003B"`` or ``"--"``.
        unit: Target magnitude, one of the keys of ``_SUFFIX_EXPONENTS``.

    Returns:
        The value expressed in ``unit`` (a text without a suffix is taken as a
        plain count), or ``nan`` for missing/placeholder text.

    Raises:
        ValueError: If the numeric part of ``text`` cannot be parsed.
    """
    if pd.isna(text):
        return np.nan
    cleaned = str(text).strip().replace(",", "")
    if cleaned in _MISSING_MARKERS:
        return np.nan
    suffix = cleaned[-1].upper()
    if suffix in _SUFFIX_EXPONENTS:
        number, exponent = cleaned[:-1], _SUFFIX_EXPONENTS[suffix]
    else:
        number, exponent = cleaned, 0
    # When suffix and target unit match the factor is exactly 1.0, so values
    # already in the target unit are returned without rounding noise.
    return float(number) * 10.0 ** (exponent - _SUFFIX_EXPONENTS[unit])


stocks_df = (
    pd
    .DataFrame(data)
    # Strip whitespace from text columns; both the legacy object dtype and the
    # dedicated pandas string dtype are recognised.
    .apply(
        lambda col: col.str.strip()
        if (col.dtype == "object" or pd.api.types.is_string_dtype(col))
        else col
    )
    .assign(
        # regex=False makes every replacement a literal one regardless of the
        # pandas version's default ("+" is a regex metacharacter).
        price=lambda df_: pd.to_numeric(df_.price.str.replace(",", "", regex=False)),
        change=lambda df_: pd.to_numeric(
            df_.change
            .str.replace("+", "", regex=False)
            .str.replace(",", "", regex=False)
        ),
        # Volume is expressed in millions and market cap in billions, whatever
        # suffix the raw text carries.
        volume=lambda df_: df_.volume.map(lambda val: _parse_abbreviated(val, "M")),
        market_cap=lambda df_: df_.market_cap.map(lambda val: _parse_abbreviated(val, "B")),
        pe_ratio=lambda df_: (
            df_
                .pe_ratio
                .replace("--", np.nan)
                .str.replace(",", "", regex=False)
                .pipe(pd.to_numeric)
        ),
    )
    .rename(columns={
        "price": "price_usd",
        "volume": "volume_m",
        "market_cap": "market_cap_B"
    })
)

stocks_df

Unnamed: 0,name,symbol,price_usd,change,volume_m,market_cap_B,pe_ratio
0,ZIM Integrated Shipping Services Ltd.,ZIM,22.20,-4.19,161.888,4450.000,46.27
1,"Rivian Automotive, Inc.",RIVN,17.73,3.73,127.927,22.003,
2,Transocean Ltd.,RIG,6.54,0.51,98.175,7.205,
3,Ford Motor Company,F,14.12,0.27,87.400,56.331,
4,Plug Power Inc.,PLUG,1.89,0.07,84.314,2.630,
...,...,...,...,...,...,...,...
421,Deutsche Bank Aktiengesellschaft,DB,35.28,-1.39,5.020,72.113,
422,Robert Half Inc.,RHI,24.77,-0.62,5.016,2.520,19.11
423,Texas Instruments Incorporated,TXN,226.16,3.10,5.012,205.494,40.92
424,Teva Pharmaceutical Industries Limited,TEVA,33.98,0.06,5.011,39.574,28.03


In [53]:
# Write the cleaned table to disk, omitting the positional index column.
output_path = "yahoo-stocks-data.csv"
stocks_df.to_csv(output_path, index=False)

In [1]:
class StocksScraper:
    """Selenium helper that navigates to the most-active stocks listing.

    Attributes:
        driver: The WebDriver instance supplied by the caller.
        wait: ``WebDriverWait`` bound to ``driver`` with the given timeout.
        data: List intended to hold scraped records (initially empty).
    """

    # NOTE(review): absolute XPaths depend on the exact page layout and will
    # need updating whenever the site's markup changes.
    MARKETS_MENU_XPATH = '/html[1]/body[1]/div[2]/header[1]/div[1]/nav[1]/ol[1]/li[3]/a[1]/div[1]'
    STOCKS_LINK_XPATH = '/html[1]/body[1]/div[2]/header[1]/div[1]/nav[1]/ol[1]/li[3]/ol[1]/li[1]/a[1]'
    TRENDING_LINK_XPATH = '/html[1]/body[1]/div[2]/div[3]/main[1]/section[1]/section[1]/section[1]/section[1]/section[1]/div[1]/div[1]/div[1]/a[2]'
    MOST_ACTIVE_LINK_XPATH = '/html[1]/body[1]/div[2]/div[3]/main[1]/section[1]/section[1]/section[1]/section[1]/section[1]/div[1]/div[1]/div[1]/a[1]'

    def __init__(self, driver, timeout=10):
        """Store the driver and build an explicit wait of ``timeout`` seconds."""
        self.driver = driver
        self.wait = WebDriverWait(self.driver, timeout=timeout)
        self.data = []

    def wait_for_the_page_to_load(self):
        """Wait for ``document.readyState == "complete"`` and print the outcome.

        A timeout is reported with a message instead of being raised; any
        other error propagates to the caller.
        """
        try:
            self.wait.until(
                lambda d: d.execute_script("return document.readyState") == "complete"
            )
        except TimeoutException:
            print(f"The page \"{self.driver.title}\" did not get fully loaded within the given duration")
        else:
            # Read the title after the wait so it names the page just loaded.
            print(f"The page \"{self.driver.title}\" is Successfully loaded!")

    def access_url(self, url):
        """Open ``url`` in the browser and wait for the page to finish loading."""
        self.driver.get(url)
        self.wait_for_the_page_to_load()

    def _click_and_wait(self, xpath):
        """Click the element at ``xpath`` once clickable, then wait for the page load."""
        link = self.wait.until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        )
        link.click()
        self.wait_for_the_page_to_load()

    def access_most_activeStocks(self):
        """Hover the markets menu, then click stocks --> trending --> most active.

        Raises:
            TimeoutException: If the menu or any link does not become
                available within the configured timeout.
        """
        # hover to markets menu
        markets_menu = self.wait.until(
            EC.presence_of_element_located((By.XPATH, self.MARKETS_MENU_XPATH))
        )
        ActionChains(self.driver).move_to_element(markets_menu).perform()

        # Navigation = click on stocks --> trending --> most active
        for xpath in (
            self.STOCKS_LINK_XPATH,
            self.TRENDING_LINK_XPATH,
            self.MOST_ACTIVE_LINK_XPATH,
        ):
            self._click_and_wait(xpath)

    

In [4]:
# Entry-point guard: `__name__` equals "__main__" (with double underscores)
# when this code is run directly, including inside a notebook kernel.
if __name__ == "__main__":
    driver = webdriver.Chrome()
    driver.maximize_window()

    url = 'https://finance.yahoo.com/'
    scraper = StocksScraper(driver, 5)

    # The browser session is left open after this call; call `driver.quit()`
    # once finished with `scraper`.
    scraper.access_url(url)
    