# Imports

In [35]:
import time

# Data Cleaning
import numpy as np
import pandas as pd

# Web scraping
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Extracting the data

In [8]:
# ----- SCRAPING THE DATA -----

# Upper bound, in seconds, that every explicit wait below will block for.
EXPLICIT_WAIT_SECONDS = 5

# Start a Chrome session and maximise its window.
driver = webdriver.Chrome()
driver.maximize_window()

# Shared explicit-wait helper bound to this driver.
wait = WebDriverWait(driver, timeout=EXPLICIT_WAIT_SECONDS)

def wait_for_page_to_load(driver, wait):
	"""Block until the current document reports ``readyState == "complete"``.

	Prints a one-line status message naming the page; nothing is returned and
	a failed wait is reported rather than raised (best-effort check).

	Args:
		driver: WebDriver whose ``title`` is used in the status message.
		wait: Object exposing ``until(condition)``; the condition is called
			with a driver and must return a truthy value once the page is ready.

	Notes:
		The title is read *after* the wait. Reading it beforehand reports the
		title of the page being navigated away from.
	"""
	try:
		wait.until(
			lambda d: d.execute_script("return document.readyState") == "complete"
		)
	except Exception:
		# `Exception` (not a bare `except`) so Ctrl-C / SystemExit still
		# interrupt a long scrape instead of being reported as a slow page.
		print(f"The page \"{driver.title}\" did not get fully loaded within the given duration.\n")
	else:
		print(f"The page \"{driver.title}\" is fully loaded.\n")


# Open the Yahoo Finance home page.
url = "https://finance.yahoo.com/"
driver.get(url)
wait_for_page_to_load(driver, wait)

# Absolute XPaths of the three elements used to reach the "Most Active" table.
markets_menu_xpath = '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/a[1]/span[1]'
trending_tickers_xpath = '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/div[1]/ul[1]/li[4]/a[1]/div[1]'
most_active_xpath = '/html[1]/body[1]/div[2]/main[1]/section[1]/section[1]/section[1]/article[1]/section[1]/div[1]/nav[1]/ul[1]/li[1]/a[1]/span[1]'

# Hover over the Markets menu so that its drop-down entries become clickable.
markets_menu = wait.until(EC.presence_of_element_located((By.XPATH, markets_menu_xpath)))
actions = ActionChains(driver)
actions.move_to_element(markets_menu).perform()

# Open "Trending Tickers" from the drop-down.
trending_tickers = wait.until(EC.element_to_be_clickable((By.XPATH, trending_tickers_xpath)))
trending_tickers.click()
wait_for_page_to_load(driver, wait)

# Switch to the "Most Active" tab.
most_active = wait.until(EC.element_to_be_clickable((By.XPATH, most_active_xpath)))
most_active.click()
wait_for_page_to_load(driver, wait)

# Scrape every page of the table into `data`: a list of dicts, one per row.
data = []
try:
	while True:
		# wait for the table to be present before reading its rows
		wait.until(
			EC.presence_of_element_located((By.TAG_NAME, "table"))
		)
		rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
		for row in rows:
			values = row.find_elements(By.TAG_NAME, "td")
			# Cells 0..9 are indexed below; skip any row with fewer cells
			# rather than aborting the whole scrape with an IndexError.
			if len(values) < 10:
				continue
			stock = {
				"name": values[1].text,
				"symbol": values[0].text,
				"price": values[3].text,
				"change": values[4].text,
				"volume": values[6].text,
				"market_cap": values[8].text,
				"pe_ratio": values[9].text,
			}
			data.append(stock)

		# click next
		try:
			next_button = wait.until(
				EC.element_to_be_clickable((By.XPATH, '//*[@id="nimbus-app"]/section/section/section/article/section[1]/div/div[3]/div[3]/button[3]'))
			)
		except TimeoutException:
			# Only a timeout means "no clickable next button" (last page).
			# Any other error propagates so a broken session is not mistaken
			# for a complete scrape.
			print("The \"next\" button is not clickable. We have navigated through all the pages.")
			break
		else:
			next_button.click()
			# fixed pause to give the next page's rows time to render
			time.sleep(1)
finally:
	# Close the browser even if scraping fails part-way through.
	driver.quit()



The page "Yahoo Finance - Stock Market Live, Quotes, Business & Finance News" is fully loaded.

The page "Yahoo Finance - Stock Market Live, Quotes, Business & Finance News" is fully loaded.

The page "Top Trending Stocks: US stocks with the highest interest today - Yahoo Finance" is fully loaded.

The "next" button is not clickable. We have navigated through all the pages.


# Cleaning the data

In [16]:
# Inspect the raw scraped records: a list of dicts, one per table row, all values still text.
data

[{'name': 'NVIDIA Corporation',
  'symbol': 'NVDA',
  'price': '112.20',
  'change': '+1.49',
  'volume': '211.051M',
  'market_cap': '2.738T',
  'pe_ratio': '38.16'},
 {'name': 'Ford Motor Company',
  'symbol': 'F',
  'price': '9.45',
  'change': '-0.26',
  'volume': '124.724M',
  'market_cap': '37.578B',
  'pe_ratio': '6.47'},
 {'name': 'Palantir Technologies Inc.',
  'symbol': 'PLTR',
  'price': '98.40',
  'change': '+5.78',
  'volume': '116.263M',
  'market_cap': '230.783B',
  'pe_ratio': '517.89'},
 {'name': 'Bank of America Corporation',
  'symbol': 'BAC',
  'price': '37.99',
  'change': '+1.32',
  'volume': '77.147M',
  'market_cap': '288.83B',
  'pe_ratio': '11.34'},
 {'name': 'Tesla, Inc.',
  'symbol': 'TSLA',
  'price': '254.11',
  'change': '+1.76',
  'volume': '78.19M',
  'market_cap': '817.35B',
  'pe_ratio': '125.18'},
 {'name': 'Lucid Group, Inc.',
  'symbol': 'LCID',
  'price': '2.4400',
  'change': '-0.0900',
  'volume': '75.353M',
  'market_cap': '7.397B',
  'pe_ratio

In [18]:
# Number of stock records collected across all scraped pages.
len(data)

241

In [57]:
# Placeholders treated as "no value" in the scraped text.
_MISSING_TOKENS = ["-", "--", ""]
# Power of ten represented by each magnitude suffix ("" = plain number).
_SUFFIX_EXPONENTS = {"": 0, "K": 3, "M": 6, "B": 9, "T": 12}


def _abbreviated_to_unit(values: pd.Series, unit: str) -> pd.Series:
    """Convert abbreviated figures such as "211.051M" or "2.738T" to floats.

    Args:
        values: Text series; each entry is a number optionally followed by one
            of the suffixes K, M, B or T. Thousands separators are ignored.
        unit: Suffix the result is expressed in (e.g. "B" turns "2.738T" into
            about 2738.0 and leaves "37.578B" as 37.578).

    Returns:
        A float series; entries listed in ``_MISSING_TOKENS`` become NaN.

    Raises:
        ValueError: If an entry is neither a placeholder nor a parsable number,
            so an unexpected page format is not silently turned into NaN.
    """
    text = values.str.replace(",", "", regex=False).str.strip()
    parts = text.str.extract(r"^([+-]?\d*\.?\d+)\s*([KMBT]?)$")
    unparsed = parts[0].isna() & text.notna() & ~text.isin(_MISSING_TOKENS)
    if unparsed.any():
        raise ValueError(
            f"Unrecognised numeric values: {text[unparsed].unique().tolist()}"
        )
    exponent = parts[1].map(_SUFFIX_EXPONENTS).astype(float)
    # Scale by the *difference* in magnitude so a value already in `unit`
    # is multiplied by exactly 1.0 and keeps its parsed value.
    return pd.to_numeric(parts[0]) * 10.0 ** (exponent - _SUFFIX_EXPONENTS[unit])


def _plain_number(values: pd.Series) -> pd.Series:
    """Parse signed decimal text (optional "+", thousands separators) as floats.

    Entries listed in ``_MISSING_TOKENS`` become NaN; anything else that is not
    numeric makes ``pd.to_numeric`` raise.
    """
    cleaned = (
        values
        .mask(values.isin(_MISSING_TOKENS))
        # regex=False: match the characters literally regardless of the
        # installed pandas version's default for `regex`.
        .str.replace(",", "", regex=False)
        .str.replace("+", "", regex=False)
    )
    return pd.to_numeric(cleaned)


# Build the typed frame from the scraped text records and save it to CSV.
stocks_df = (
    pd.DataFrame(data)
    # Trim stray whitespace from every text column (object or string dtype).
    .apply(lambda col: col.str.strip() if pd.api.types.is_string_dtype(col) else col)
    .assign(
        price = lambda df_: _plain_number(df_.price),
        change = lambda df_: _plain_number(df_.change),
        volume = lambda df_: _abbreviated_to_unit(df_.volume, "M"),
        market_cap = lambda df_: _abbreviated_to_unit(df_.market_cap, "B"),
        pe_ratio = lambda df_: _plain_number(df_.pe_ratio),
    ).rename(columns={
        "price": "price_usd",
        "volume": "volume_M",
        "market_cap":"market_cap_B"
    })
)
stocks_df.to_csv("yahoo-most-active-stocks-data.csv")

In [55]:
# Check the resulting column dtypes after cleaning.
print(stocks_df.dtypes)

name             object
symbol           object
price_usd       float64
change          float64
volume_M        float64
market_cap_B    float64
pe_ratio        float64
dtype: object
