# Step-by-Step Procedural Code

## 1. Install Required Libraries

In [None]:
# Install required libraries
!pip install selenium pandas numpy openpyxl ipykernel==6.29.5

## 2. Import Libraries

In [19]:
import time

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

## 3. Initialize the WebDriver

In [20]:
# Initialize the WebDriver
# `driver` is the shared Chrome session used by every later cell of the procedural walkthrough.
driver = webdriver.Chrome()
# NOTE(review): presumably maximized so the full-width navigation bar is rendered,
# which the absolute XPaths used for navigation depend on - confirm.
driver.maximize_window()

## 4. Define Helper Functions

In [21]:
# Function to wait for the page to load
def wait_for_page_to_load(driver, wait):
    """Wait until ``document.readyState`` is ``"complete"`` and print the outcome.

    Args:
        driver: Selenium WebDriver whose current page is being loaded.
        wait: ``WebDriverWait`` bound to ``driver``; its timeout bounds the wait.

    Returns:
        None. A timeout is reported with a printed message instead of being raised;
        any other error (e.g. a dead browser session) propagates to the caller.
    """
    try:
        wait.until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
    except TimeoutException:
        loaded = False
    else:
        loaded = True

    # Read the title only after waiting: right after a click the driver may still
    # expose the previous page, which made the message name the wrong page.
    page_title = driver.title
    if loaded:
        print(f'The page "{page_title}" is fully loaded.\n')
    else:
        print(f'The page "{page_title}" did not get fully loaded within the given duration.\n')

## 5. Access Yahoo Finance

In [23]:
# Access Yahoo Finance
url = "https://finance.yahoo.com/"
driver.get(url)

# Wait for the page to load
# `wait` uses a 5-second timeout and is reused by the navigation and extraction cells below.
wait = WebDriverWait(driver, 5)
wait_for_page_to_load(driver, wait)

The page "Yahoo Finance - Stock Market Live, Quotes, Business & Finance News" is fully loaded.



## 6. Navigate to Most Active Stocks

In [24]:
# Locators for the three navigation targets, kept together so they are easy to update.
# NOTE(review): these are absolute XPaths, so they are tied to the exact page layout.
MARKETS_MENU_XPATH = '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/a[1]/span[1]'
TRENDING_TICKERS_XPATH = '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/div[1]/ul[1]/li[4]/a[1]/div[1]'
MOST_ACTIVE_XPATH = '/html[1]/body[1]/div[2]/main[1]/section[1]/section[1]/section[1]/article[1]/section[1]/div[1]/nav[1]/ul[1]/li[1]/a[1]/span[1]'

# Step 1: hover over the Markets menu
actions = ActionChains(driver)
markets_locator = (By.XPATH, MARKETS_MENU_XPATH)
markets_menu = wait.until(EC.presence_of_element_located(markets_locator))
actions.move_to_element(markets_menu).perform()

# Step 2: open "Trending Tickers" from the menu
trending_locator = (By.XPATH, TRENDING_TICKERS_XPATH)
trending_tickers = wait.until(EC.element_to_be_clickable(trending_locator))
trending_tickers.click()
wait_for_page_to_load(driver, wait)

# Step 3: switch to the "Most Active" tab
most_active_locator = (By.XPATH, MOST_ACTIVE_XPATH)
most_active = wait.until(EC.element_to_be_clickable(most_active_locator))
most_active.click()
wait_for_page_to_load(driver, wait)

The page "Yahoo Finance - Stock Market Live, Quotes, Business & Finance News" is fully loaded.

The page "Top Trending Stocks: US stocks with the highest interest today - Yahoo Finance" is fully loaded.



## 7. Extract Data

In [25]:
# Extract data from all pages
NEXT_BUTTON_XPATH = '//*[@id="nimbus-app"]/section/section/section/article/section[1]/div/div[3]/div[3]/button[3]'
MIN_CELLS_PER_ROW = 10  # the highest cell index read below is 9

data = []
while True:
    # Wait for the table to load
    wait.until(
        EC.presence_of_element_located((By.TAG_NAME, "table"))
    )
    rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
    for row in rows:
        values = row.find_elements(By.TAG_NAME, "td")
        # Skip rows that do not carry the full set of columns (e.g. placeholder
        # rows) instead of failing with an IndexError.
        if len(values) < MIN_CELLS_PER_ROW:
            continue
        stock = {
            "name": values[1].text,
            "symbol": values[0].text,
            "price": values[3].text,
            "change": values[4].text,
            "volume": values[6].text,
            "market_cap": values[8].text,
            "pe_ratio": values[9].text,
        }
        data.append(stock)

    # Click next
    try:
        next_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, NEXT_BUTTON_XPATH))
        )
    except TimeoutException:
        # Only a timeout means "no further page". Any other error (browser crash,
        # Ctrl-C) must surface, otherwise the data set is silently truncated.
        print('The "next" button is not clickable. We have navigated through all the pages.')
        break
    else:
        next_button.click()
        # Fixed pause to let the next page of the table render.
        time.sleep(1)

The "next" button is not clickable. We have navigated through all the pages.


## 8. Clean and Save Data

In [26]:
# Clean and save data
STOCK_COLUMNS = ["name", "symbol", "price", "change", "volume", "market_cap", "pe_ratio"]
_MAGNITUDE = {"K": 3, "M": 6, "B": 9, "T": 12}  # suffix -> power of ten
_PLACEHOLDERS = {"", "-", "--", "N/A"}  # texts treated as missing values


def _parse_scaled(text, unit):
    """Convert texts such as '45.5M', '1.2B', '3,456' or '--' to a float expressed in `unit`.

    `unit` is one of the `_MAGNITUDE` suffixes. Missing or unparsable values become NaN.
    """
    cleaned = str(text).strip().replace(",", "")
    if cleaned.upper() in _PLACEHOLDERS:
        return np.nan
    suffix = cleaned[-1].upper()
    if suffix in _MAGNITUDE:
        number, power = cleaned[:-1], _MAGNITUDE[suffix]
    else:
        number, power = cleaned, 0
    try:
        # Scaling by a power-of-ten difference keeps same-unit values exact ('45.5M' -> 45.5).
        return float(number) * 10.0 ** (power - _MAGNITUDE[unit])
    except ValueError:
        return np.nan


def _to_number(col):
    """Parse a text column to floats, ignoring thousands separators and a leading '+'."""
    cleaned = (
        col.astype(str)
        .str.strip()
        .str.replace(",", "", regex=False)
        .str.replace("+", "", regex=False)
    )
    # Placeholders such as '-' or '--' become NaN instead of raising.
    return pd.to_numeric(cleaned, errors="coerce")


stocks_df = (
    # Explicit columns keep the pipeline working even when nothing was scraped.
    pd.DataFrame(data, columns=STOCK_COLUMNS)
    .assign(
        name=lambda df_: df_["name"].astype(str).str.strip(),
        symbol=lambda df_: df_["symbol"].astype(str).str.strip(),
        price=lambda df_: _to_number(df_["price"]),
        change=lambda df_: _to_number(df_["change"]),
        volume=lambda df_: df_["volume"].map(lambda val: _parse_scaled(val, "M")).astype(float),
        market_cap=lambda df_: df_["market_cap"].map(lambda val: _parse_scaled(val, "B")).astype(float),
        pe_ratio=lambda df_: _to_number(df_["pe_ratio"]),
    )
    .rename(columns={
        "price": "price_usd",
        "volume": "volume_M",
        "market_cap": "market_cap_B"
    })
)

# Save to Excel
stocks_df.to_excel("yahoo_finance_stocks.xlsx", index=False)
print("Data saved to yahoo_finance_stocks.xlsx")

Data saved to yahoo_finance_stocks.xlsx


## 9. Close the Browser

In [27]:
# Close the browser
# Ends the session started in the "Initialize the WebDriver" cell; re-run that cell
# before using `driver` again.
driver.quit()

# Define the StocksScraper Class

In [None]:
class StocksScraper:
    """Scrape the "Most Active" stocks table from Yahoo Finance with Selenium.

    Typical flow: ``access_url`` -> ``access_most_active_stocks`` ->
    ``extract_stocks_data`` -> ``clean_and_save_data``.

    Attributes:
        driver: the Selenium WebDriver passed to the constructor.
        wait: ``WebDriverWait`` on ``driver`` used for every explicit wait.
        data: list of raw row dicts (all values are cell texts) filled by
            ``extract_stocks_data``.
    """

    # NOTE(review): absolute XPaths are tied to the exact page layout.
    MARKETS_MENU_XPATH = '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/a[1]/span[1]'
    TRENDING_TICKERS_XPATH = '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/div[1]/ul[1]/li[4]/a[1]/div[1]'
    MOST_ACTIVE_XPATH = '/html[1]/body[1]/div[2]/main[1]/section[1]/section[1]/section[1]/article[1]/section[1]/div[1]/nav[1]/ul[1]/li[1]/a[1]/span[1]'
    NEXT_BUTTON_XPATH = '//*[@id="nimbus-app"]/section/section/section/article/section[1]/div/div[3]/div[3]/button[3]'

    # Keys of every row dict, in output order.
    COLUMNS = ("name", "symbol", "price", "change", "volume", "market_cap", "pe_ratio")
    # The highest table-cell index read while extracting a row is 9.
    MIN_CELLS_PER_ROW = 10
    # Magnitude suffix -> power of ten.
    _MAGNITUDE = {"K": 3, "M": 6, "B": 9, "T": 12}
    # Cell texts treated as missing values.
    _PLACEHOLDERS = frozenset({"", "-", "--", "N/A"})

    def __init__(self, driver, timeout=10):
        """Store the driver and create the shared explicit wait.

        Args:
            driver: Selenium WebDriver controlling the browser.
            timeout: seconds each explicit wait may take before timing out.
        """
        self.driver = driver
        self.wait = WebDriverWait(self.driver, timeout=timeout)
        self.data = []

    # Wait while webpage loads
    def wait_for_page_to_load(self):
        """Wait until ``document.readyState`` is ``"complete"`` and print the outcome.

        A timeout is reported with a printed message instead of being raised; any
        other error propagates to the caller.
        """
        try:
            self.wait.until(
                lambda d: d.execute_script("return document.readyState") == "complete"
            )
        except TimeoutException:
            loaded = False
        else:
            loaded = True

        # Read the title only after waiting: right after a click the driver may
        # still expose the previous page, which made the message name the wrong page.
        page_title = self.driver.title
        if loaded:
            print(f'The page "{page_title}" is fully loaded.\n')
        else:
            print(f'The page "{page_title}" did not get fully loaded within the given duration.\n')

    # Access main URL
    def access_url(self, url):
        """Open ``url`` in the browser and wait for the page to finish loading."""
        self.driver.get(url)
        self.wait_for_page_to_load()

    # Access most active stocks webpage
    def access_most_active_stocks(self):
        """Navigate Markets -> Trending Tickers -> Most Active through the page UI.

        Raises:
            TimeoutException: if one of the navigation elements does not become
                available within the configured timeout.
        """
        # Hover to markets menu
        actions = ActionChains(self.driver)
        markets_menu = self.wait.until(
            EC.presence_of_element_located((By.XPATH, self.MARKETS_MENU_XPATH))
        )
        actions.move_to_element(markets_menu).perform()

        # Click on Trending Tickers
        trending_tickers = self.wait.until(
            EC.element_to_be_clickable((By.XPATH, self.TRENDING_TICKERS_XPATH))
        )
        trending_tickers.click()
        self.wait_for_page_to_load()

        # Click on Most Active
        most_active = self.wait.until(
            EC.element_to_be_clickable((By.XPATH, self.MOST_ACTIVE_XPATH))
        )
        most_active.click()
        self.wait_for_page_to_load()

    # Extract data from all pages
    def extract_stocks_data(self):
        """Append one raw dict per table row to ``self.data`` for every result page.

        Pagination stops when the "next" button does not become clickable within
        the timeout. Any other error propagates, so a failure is never mistaken
        for the last page.
        """
        while True:
            self.wait.until(
                EC.presence_of_element_located((By.TAG_NAME, "table"))
            )
            rows = self.driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
            for row in rows:
                values = row.find_elements(By.TAG_NAME, "td")
                # Skip rows without the full set of columns instead of failing
                # with an IndexError.
                if len(values) < self.MIN_CELLS_PER_ROW:
                    continue
                stock = {
                    "name": values[1].text,
                    "symbol": values[0].text,
                    "price": values[3].text,
                    "change": values[4].text,
                    "volume": values[6].text,
                    "market_cap": values[8].text,
                    "pe_ratio": values[9].text,
                }
                self.data.append(stock)

            # Click next
            try:
                next_button = self.wait.until(
                    EC.element_to_be_clickable((By.XPATH, self.NEXT_BUTTON_XPATH))
                )
            except TimeoutException:
                print('The "next" button is not clickable. We have navigated through all the pages.')
                break
            else:
                next_button.click()
                # Fixed pause to let the next page of the table render.
                time.sleep(1)

    @classmethod
    def _parse_scaled(cls, text, unit):
        """Convert texts such as '45.5M', '1.2B', '3,456' or '--' to a float in ``unit``.

        ``unit`` is one of the ``_MAGNITUDE`` suffixes. Missing or unparsable
        values become NaN.
        """
        cleaned = str(text).strip().replace(",", "")
        if cleaned.upper() in cls._PLACEHOLDERS:
            return np.nan
        suffix = cleaned[-1].upper()
        if suffix in cls._MAGNITUDE:
            number, power = cleaned[:-1], cls._MAGNITUDE[suffix]
        else:
            number, power = cleaned, 0
        try:
            # Scaling by a power-of-ten difference keeps same-unit values exact.
            return float(number) * 10.0 ** (power - cls._MAGNITUDE[unit])
        except ValueError:
            return np.nan

    @classmethod
    def _to_scaled(cls, col, unit):
        """Apply ``_parse_scaled`` to every value of ``col`` and return a float Series."""
        return col.map(lambda val: cls._parse_scaled(val, unit)).astype(float)

    @staticmethod
    def _to_number(col):
        """Parse a text column to floats, ignoring thousands separators and a leading '+'."""
        cleaned = (
            col.astype(str)
            .str.strip()
            .str.replace(",", "", regex=False)
            .str.replace("+", "", regex=False)
        )
        # Placeholders such as '-' or '--' become NaN instead of raising.
        return pd.to_numeric(cleaned, errors="coerce")

    # Clean and save data
    def clean_and_save_data(self, filename="yahoo_finance_stocks"):
        """Convert the scraped texts to numbers and write them to ``<filename>.xlsx``.

        Volume is expressed in millions (``volume_M``) and market capitalization in
        billions (``market_cap_B``) whatever magnitude suffix the page used.

        Args:
            filename: output path without the ``.xlsx`` extension.

        Returns:
            The cleaned ``pandas.DataFrame`` that was written to disk.
        """
        stocks_df = (
            # Explicit columns keep the pipeline working even when nothing was scraped.
            pd.DataFrame(self.data, columns=list(self.COLUMNS))
            .assign(
                name=lambda df_: df_["name"].astype(str).str.strip(),
                symbol=lambda df_: df_["symbol"].astype(str).str.strip(),
                price=lambda df_: self._to_number(df_["price"]),
                change=lambda df_: self._to_number(df_["change"]),
                volume=lambda df_: self._to_scaled(df_["volume"], "M"),
                market_cap=lambda df_: self._to_scaled(df_["market_cap"], "B"),
                pe_ratio=lambda df_: self._to_number(df_["pe_ratio"]),
            )
            .rename(columns={
                "price": "price_usd",
                "volume": "volume_M",
                "market_cap": "market_cap_B"
            })
        )
        stocks_df.to_excel(f"{filename}.xlsx", index=False)
        print(f"Data saved to {filename}.xlsx")
        return stocks_df

## Initialize and Run the Scraper

In [None]:
# Initialize the scraper
url = "https://finance.yahoo.com/"
driver = webdriver.Chrome()

try:
    driver.maximize_window()
    scraper = StocksScraper(driver, 5)

    # Scrape data
    scraper.access_url(url)
    scraper.access_most_active_stocks()
    scraper.extract_stocks_data()
    scraper.clean_and_save_data("yahoo_finance_stocks")
finally:
    # Close the browser even when a scraping step fails, so no Chrome
    # session is left running.
    driver.quit()