**Yahoo Stock Data Scraping and Cleaning Process**

This code automates the process of scraping stock data from Yahoo Finance, focusing on "Trending Tickers" and "Most Active" stocks.

The steps include:

1. **Accessing the URL:** Opens the browser, waits for the page to load.

2. **Navigating the Menu:** Hovers over the "Markets" menu, then clicks through to the "Trending Tickers" and "Most Active" sections.

3. **Extracting Data:** Scrapes stock information (symbol, name, price, etc.) and navigates through multiple pages.

4. **Data Cleaning:** Cleans the scraped data, handles missing values, converts columns to numeric types, and exports it to an Excel file.

This process is automated using Selenium for web interaction and Pandas for data cleaning and export.

In [None]:
import time
import numpy as np
import pandas as pd
from selenium import webdriver                                    # Main Selenium module for browser automation
from selenium.webdriver.common.by import By                       # Locates elements on the page
from selenium.webdriver.common.keys import Keys                   # Handles keyboard actions (e.g., pressing Enter)
from selenium.webdriver.support.select import Select              # Manages drop-down menus in forms
from selenium.webdriver.chrome.options import Options             # Configures Chrome options (e.g., headless mode)
from selenium.webdriver.support.ui import WebDriverWait           # Waits for elements to be ready before interaction
from webdriver_manager.chrome import ChromeDriverManager          # Automatically manages ChromeDriver installation
from selenium.webdriver.common.action_chains import ActionChains  # Simulates complex user actions (e.g., mouse movements)
from selenium.webdriver.support import expected_conditions as EC  # Waits for specific conditions to occur before continuing

# Start a Chrome session and enlarge the window before any navigation happens
driver = webdriver.Chrome()
driver.maximize_window()

# Explicit wait reused by every element lookup below; each lookup gives up after 5 seconds
wait = WebDriverWait(driver, timeout=5)

# Helper that waits for the current page to finish loading
def wait_for_page_load(driver, wait):
  """Wait until the current document reports ``readyState == 'complete'``.

  This is a best-effort check: a failure is reported on stdout instead of
  being raised, so the calling script can carry on.

  Args:
    driver: WebDriver whose ``title`` is used in the status message.
    wait: Object exposing ``until(condition)`` (e.g. a ``WebDriverWait``);
      the condition is called with the driver it was built around.

  Returns:
    True when the page finished loading within the wait's timeout,
    False otherwise.
  """
  try:
    # Poll the browser until the DOM has finished loading
    wait.until(lambda d: d.execute_script('return document.readyState') == 'complete')
  except Exception:
    # Exception (not a bare except) keeps the best-effort behaviour while still
    # letting KeyboardInterrupt / SystemExit stop the script.
    print(f"Page {driver.title} is not Loaded completely.\n")
    return False

  # The title is read after the wait so it names the page that was just
  # loaded rather than the one being navigated away from.
  print(f"Page {driver.title} loaded succesfully,\n")
  return True

from selenium.common.exceptions import TimeoutException  # raised by wait.until() when its timeout expires

url = "https://finance.yahoo.com/"
stocks_data = []   # one dict per scraped table row, consumed by the cleaning step

# NOTE: the absolute XPaths below are tied to the page layout at the time of
# writing and must be updated if the site markup changes.
try:
  driver.get(url)
  wait_for_page_load(driver, wait)

  # Hover over the Markets menu so that its drop-down entries become clickable
  actions = ActionChains(driver)
  market_menu = wait.until(
      EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/header/div/div/div/div[4]/div/div/ul/li[3]/a/span')))
  actions.move_to_element(market_menu).perform()

  # Open the "Trending Tickers" page from the drop-down
  trending_tickers = wait.until(
      EC.element_to_be_clickable((By.XPATH, '/html/body/div[2]/header/div/div/div/div[4]/div/div/ul/li[3]/div/ul/li[4]/a/div')))
  trending_tickers.click()
  wait_for_page_load(driver, wait)

  # Switch to the "Most Active" tab
  most_active = wait.until(
      EC.element_to_be_clickable((By.XPATH, '/html/body/div[2]/main/section/section/section/article/section[1]/div/nav/ul/li[1]/a/span')))
  most_active.click()
  wait_for_page_load(driver, wait)

  # Walk through every page of the stocks table
  while True:
    # 1. Scrape the rows of the page currently shown
    wait.until(
        EC.presence_of_element_located((By.TAG_NAME, "table")))
    rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
    for row in rows:
      cells = row.find_elements(By.TAG_NAME, "td")
      if len(cells) < 10:
        # Not a full data row (the last index read below is 9); skip it
        # instead of failing with IndexError.
        continue
      stocks_data.append({
          "symbol": cells[0].text,      # Stock symbol
          "name": cells[1].text,        # Stock name
          "price": cells[3].text,       # Stock price (was cells[4], which duplicated "change")
          "change": cells[4].text,      # Stock price change
          "volume": cells[6].text,      # Stock volume
          "market_cap": cells[8].text,  # Market capitalization
          "pe_ratio": cells[9].text,    # Price-to-earnings ratio
      })

    # 2. Move on to the next page while the "next" arrow is still clickable
    try:
      next_button = wait.until(
          EC.element_to_be_clickable((By.XPATH, '//*[@id="nimbus-app"]/section/section/section/article/section[1]/div/div[3]/div[3]/button[3]')))
    except TimeoutException:
      # Only a timeout means "no further page"; any other error should surface
      print("the next button is not clickable. we have nevigated throgh all the pages")
      break
    next_button.click()
    time.sleep(1)   # give the table a moment to refresh before it is read again
finally:
  # Always close the browser, including when a lookup above fails
  driver.quit()

# CLEANING THE DATA

# Keys of every scraped record. Naming them explicitly keeps the frame
# well-formed (all columns present) even when nothing was scraped.
_RAW_COLUMNS = ["symbol", "name", "price", "change", "volume", "market_cap", "pe_ratio"]

# Magnitude suffixes accepted on abbreviated figures such as "1.2B"
_UNIT_FACTORS = {"K": 1e3, "M": 1e6, "B": 1e9, "T": 1e12}


def _to_number(text, unit = 1.0):
  """Parse a scraped figure into a float expressed in multiples of ``unit``.

  Thousands separators, a leading "+" and a trailing K/M/B/T suffix are
  understood. Anything that still is not a number (e.g. "-", "--", "")
  becomes NaN instead of raising.
  """
  cleaned = str(text).strip().replace(",", "").lstrip("+")
  suffix = cleaned[-1:].upper()
  if suffix in _UNIT_FACTORS:
    # Combine the two scale factors first so that a figure already in the
    # target unit (e.g. "2.5B" in billions) is multiplied by exactly 1.0
    factor = _UNIT_FACTORS[suffix] / unit
    cleaned = cleaned[:-1]
  else:
    factor = 1.0 / unit
  try:
    return float(cleaned) * factor
  except ValueError:
    return np.nan


df = pd.DataFrame(stocks_data, columns = _RAW_COLUMNS)

# Remove leading/trailing spaces from the text columns
for _text_col in ("symbol", "name"):
  df[_text_col] = df[_text_col].map(lambda v: v.strip() if isinstance(v, str) else v)

df = (df
    .assign(price = lambda d: d.price.map(_to_number).astype(float),                                         # plain number
            change = lambda d: d.change.map(_to_number).astype(float),                                       # signed number, "+" dropped
            volume = lambda d: d.volume.map(lambda v: _to_number(v, unit = 1e6)).astype(float),             # in millions
            market_cap = lambda d: d.market_cap.map(lambda v: _to_number(v, unit = 1e9)).astype(float),     # in billions
            pe_ratio = lambda d: d.pe_ratio.map(_to_number).astype(float))                                   # NaN when not reported
    .rename(columns = {
        "price" : "Price_USDT",
        "volume" : "Volume_M",
        "market_cap" : "Market_Cap_B"})
   )

# Cells that could not be parsed are NaN; df.isna().sum() shows how many per column
df.to_excel("Yahoo stocks scraped data by bharat.xlsx", index = False)
df  # last expression of the cell, so a notebook displays the cleaned frame

**Object-oriented programming** (OOP) is important for organizing code, making it more modular, reusable, and maintainable. By encapsulating each step—like **accessing** the URL, **navigating** stocks, **extracting** data, and **cleaning** it—into separate methods or classes, **OOP improves structure, reduces redundancy**, and simplifies future modifications or debugging, even if the initial code isn't OOP-based.


**The complete process is divided into 4 steps, each of which becomes a method of the `Scraping` class:**


**| Accessing the Main URL |**

1. Open the browser and maximize the window.

2. Initialize the Explicit Wait instance.

3. Access the main URL and wait for the page to load.


**| Accessing Most Active Stocks |**

4. Hover over the "Markets Menu."

5. Click on "Trending Tickers."

6. Click on "Most Active."


**| Extracting the Stocks Data |**

7. Scrape the data as dictionaries.

8. Navigate until the last page for more data.

9. Store the extracted data in a list.

10. Close the browser.

**| Clean & Save the Data |**

11. Clean the extracted data.

12. Export the cleaned data as an Excel file.




In [None]:
import time
import numpy as np
import pandas as pd
from selenium import webdriver                                    # Main Selenium module for browser automation
from selenium.webdriver.common.by import By                       # Locates elements on the page
from selenium.webdriver.common.keys import Keys                   # Handles keyboard actions (e.g., pressing Enter)
from selenium.webdriver.support.select import Select              # Manages drop-down menus in forms
from selenium.webdriver.chrome.options import Options             # Configures Chrome options (e.g., headless mode)
from selenium.webdriver.support.ui import WebDriverWait           # Waits for elements to be ready before interaction
from webdriver_manager.chrome import ChromeDriverManager          # Automatically manages ChromeDriver installation
from selenium.webdriver.common.action_chains import ActionChains  # Simulates complex user actions (e.g., mouse movements)
from selenium.webdriver.support import expected_conditions as EC  # Waits for specific conditions to occur before continuing

class Scraping:
  """Scrape the "Most Active" stocks table from Yahoo Finance and export it.

  Intended call order: ``Access_url`` -> ``Accessing_most_active_stocks`` ->
  ``Extracting_data`` -> ``Data_Cleaning``. The driver is supplied by the
  caller and is never quit by this class.

  Attributes:
    driver: The WebDriver used for all browser interaction.
    wait: ``WebDriverWait`` bound to ``driver`` with the given timeout.
    stocks_data: List of dicts, one per scraped table row (raw text values).
  """

  # Keys of every scraped record, in the order they are exported
  RAW_COLUMNS = ("symbol", "name", "price", "change", "volume", "market_cap", "pe_ratio")

  # Magnitude suffixes accepted on abbreviated figures such as "1.2B"
  _UNIT_FACTORS = {"K": 1e3, "M": 1e6, "B": 1e9, "T": 1e12}

  def __init__(self, driver, timeout = 10):
    """Store the driver and build the explicit wait (``timeout`` in seconds)."""
    self.driver = driver
    self.wait = WebDriverWait(self.driver, timeout = timeout)
    self.stocks_data = []

  #Method-1------------------------------------------------>
  def wait_for_page_load(self):
    """Wait until the document's ``readyState`` is ``'complete'``.

    Best effort: a failure is printed rather than raised.

    Returns:
      True when the page finished loading within the timeout, else False.
    """
    try:
      self.wait.until(lambda d : d.execute_script('return document.readyState') == 'complete')
    except Exception:
      # Exception (not a bare except) so Ctrl-C / SystemExit still stop the run
      print(f"Page {self.driver.title} is not Loaded completely.\n")
      return False
    # Title is read after the wait so it names the page that was just loaded
    print(f"Page {self.driver.title} loaded succesfully.\n")
    return True

  def Access_url(self, url):
    """Open ``url`` in the browser and wait for it to finish loading."""
    self.driver.get(url)
    self.wait_for_page_load()

  #Method 2------------------------------------------------>
  def Accessing_most_active_stocks(self):
    """Navigate Markets menu -> "Trending Tickers" -> "Most Active".

    The absolute XPaths are tied to the page layout at the time of writing.
    A lookup that exceeds the wait timeout raises from ``self.wait.until``.
    """
    # Hover over the Markets menu so that its drop-down entries become clickable
    actions = ActionChains(self.driver)
    market_menu = self.wait.until(
      EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/header/div/div/div/div[4]/div/div/ul/li[3]/a/span')))
    actions.move_to_element(market_menu).perform()

    # Open "Trending Tickers" from the drop-down
    trending_tickers = self.wait.until(
        EC.element_to_be_clickable((By.XPATH, '/html/body/div[2]/header/div/div/div/div[4]/div/div/ul/li[3]/div/ul/li[4]/a/div')))
    trending_tickers.click()
    self.wait_for_page_load()

    # Switch to the "Most Active" tab
    most_active = self.wait.until(
      EC.element_to_be_clickable((By.XPATH, '/html/body/div[2]/main/section/section/section/article/section[1]/div/nav/ul/li[1]/a/span')))
    most_active.click()
    self.wait_for_page_load()

  #Method 3-------------------------------------------------->
  def Extracting_data(self):
    """Scrape every page of the stocks table into ``self.stocks_data``.

    Each row is stored as a dict keyed by ``RAW_COLUMNS`` holding the raw
    cell text. Paging stops when the "next" button does not become
    clickable within the wait timeout.
    """
    # Imported here so the class does not depend on an extra file-level import
    from selenium.common.exceptions import TimeoutException

    next_button_xpath = '//*[@id="nimbus-app"]/section/section/section/article/section[1]/div/div[3]/div[3]/button[3]'
    while True:
      # Scrape the rows of the page currently shown
      self.wait.until(
          EC.presence_of_element_located((By.TAG_NAME, "table")))
      rows = self.driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
      for row in rows:
        cells = row.find_elements(By.TAG_NAME, "td")
        if len(cells) < 10:
          # Not a full data row (the last index read below is 9); skip it
          continue
        self.stocks_data.append({
            "symbol" : cells[0].text,
            "name" : cells[1].text,
            "price": cells[3].text,
            "change": cells[4].text,
            "volume": cells[6].text,
            "market_cap": cells[8].text,
            "pe_ratio" : cells[9].text,   # key must match the column read by clean_data
        })

      # Move on while the ">" button is still clickable
      try:
        next_button = self.wait.until(
            EC.element_to_be_clickable((By.XPATH, next_button_xpath)))
      except TimeoutException:
        # Only a timeout means "no further page"; other errors should surface
        print("the next button is not clickable. we have nevigated throgh all pages")
        break
      next_button.click()
      time.sleep(1)   # give the table a moment to refresh before it is read again

  #Method 4-------------------------------------------------------->
  @staticmethod
  def _to_number(text, unit = 1.0):
    """Parse a scraped figure into a float expressed in multiples of ``unit``.

    Thousands separators, a leading "+" and a trailing K/M/B/T suffix are
    understood. Anything that still is not a number (e.g. "-", "--", "")
    becomes NaN instead of raising.
    """
    cleaned = str(text).strip().replace(",", "").lstrip("+")
    suffix = cleaned[-1:].upper()
    if suffix in Scraping._UNIT_FACTORS:
      # Combine the two scale factors first so that a figure already in the
      # target unit (e.g. "2.5B" in billions) is multiplied by exactly 1.0
      factor = Scraping._UNIT_FACTORS[suffix] / unit
      cleaned = cleaned[:-1]
    else:
      factor = 1.0 / unit
    try:
      return float(cleaned) * factor
    except ValueError:
      return np.nan

  def clean_data(self):
    """Return ``self.stocks_data`` as a cleaned, numeric DataFrame.

    Text columns are stripped; price, change and P/E become floats; volume
    is expressed in millions and market cap in billions. Unparseable cells
    become NaN. An empty scrape yields an empty frame with all columns.
    """
    frame = pd.DataFrame(self.stocks_data, columns = list(self.RAW_COLUMNS))

    for text_col in ("symbol", "name"):
      frame[text_col] = frame[text_col].map(lambda v : v.strip() if isinstance(v, str) else v)

    to_number = self._to_number
    frame["price"] = frame["price"].map(to_number).astype(float)
    frame["change"] = frame["change"].map(to_number).astype(float)
    frame["volume"] = frame["volume"].map(lambda v : to_number(v, unit = 1e6)).astype(float)
    frame["market_cap"] = frame["market_cap"].map(lambda v : to_number(v, unit = 1e9)).astype(float)
    frame["pe_ratio"] = frame["pe_ratio"].map(to_number).astype(float)

    return frame.rename(columns = {
        "price" : "Price_USDT",
        "volume" : "Volume_M",
        "market_cap" : "Market_Cap_B"})

  def Data_Cleaning(self, filename="temp"):
    """Clean the scraped rows and write them to ``{filename}.xlsx``.

    Returns:
      The cleaned DataFrame that was exported.
    """
    df = self.clean_data()   # reads self.stocks_data (previously an undefined global)
    df.to_excel(f"{filename}.xlsx", index = False)
    return df

if __name__ == "__main__":      # run the full scrape only when executed as a script
  driver = webdriver.Chrome()
  try:
    driver.maximize_window()
    url = 'https://finance.yahoo.com/'
    scraper = Scraping(driver , 5)   # 5-second explicit-wait timeout
    scraper.Access_url(url)
    scraper.Accessing_most_active_stocks()
    scraper.Extracting_data()
    scraper.Data_Cleaning("Yahoo_stocks_scraped_data_by_Bharat")
  finally:
    # Close the browser even when one of the steps above raises
    driver.quit()
#END---------------------xx--------------------xx-------------------------xx-------------------------xx