In [3]:
# Import necessary libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd

# Scrape the first AAPL news article from Yahoo Finance, score its text with
# FinBERT, and append the class probabilities as one row of ``df2``.

# Initialize an empty DataFrame to store results
df2 = pd.DataFrame()

# Locators are tried in order; the first one that matches within the wait wins.
NEWS_LINK_LOCATORS = (
    (By.CSS_SELECTOR, '.clamp.svelte-13zydns'),
    (By.CSS_SELECTOR, '.StretchedBox'),
)
READ_MORE_LOCATORS = (
    (By.CLASS_NAME, "readmoreButtonText"),
    (By.CSS_SELECTOR, ".link.rapid-noclick-resp.caas-button.collapse-button"),
)


def _wait_for_first(wait, condition, locators):
    """Return the first element satisfying *condition* for one of *locators*.

    Each locator gets the full timeout of *wait*. Returns ``None`` when every
    locator times out, so the caller decides how to react instead of having
    to unwind nested ``try``/``except`` blocks.
    """
    for locator in locators:
        try:
            return wait.until(condition(locator))
        except TimeoutException:
            continue
    return None


# Setup Chrome options for headless mode (optional)
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment to run Chrome in headless mode

# Setup WebDriver with Chrome, specifying service and options
service = Service()  # Specify the path to chromedriver if necessary
driver = webdriver.Chrome(service=service, options=chrome_options)

# Text of the news link that was clicked; stays None when no link was found
check_link = None
# Concatenated article paragraphs; stays empty when nothing usable was scraped
final_par = ''

# Everything that touches the browser lives inside try/finally so the driver is
# closed exactly once, whatever happens.
try:
    # Set timeouts for driver actions to avoid hanging.
    # No implicit wait is configured: every lookup below uses an explicit
    # WebDriverWait, and mixing both kinds leads to unpredictable wait times.
    driver.set_script_timeout(120)  # Timeout for scripts running in the browser
    driver.set_page_load_timeout(90)  # Page load timeout

    # Navigate to the Yahoo Finance page for AAPL news
    driver.get("https://finance.yahoo.com/quote/AAPL/news?p=AAPL")

    # Initialize WebDriverWait for managing timeouts and waits
    wait = WebDriverWait(driver, 10)

    # Try to find the news link using one CSS selector, and if not found, try another
    link = _wait_for_first(wait, EC.visibility_of_element_located, NEWS_LINK_LOCATORS)

    if link is None:
        # Skip the rest instead of calling exit(): in a notebook that can shut
        # down the kernel, and the finally clause already closes the browser.
        print("Link not found with either class. Exiting.")
    else:
        # Store the link text for checking and then click the link to navigate
        check_link = link.text
        link.click()

        # Attempt to find and click a "Read More" button to expand the content
        read_more_button = _wait_for_first(wait, EC.element_to_be_clickable, READ_MORE_LOCATORS)
        if read_more_button is None:
            print("No 'Read More' button found with either class name.")
        else:
            read_more_button.click()

        # Collect the text of every paragraph once (each .text access is a
        # round trip to the browser) and keep those with more than 15 words.
        paragraphs = wait.until(EC.presence_of_all_elements_located((By.TAG_NAME, 'p')))
        paragraph_texts = [par.text for par in paragraphs]
        final_par = ' '.join(text for text in paragraph_texts if len(text.split()) > 15)

# Ensure the WebDriver is closed properly in the end
finally:
    driver.quit()

# The browser is no longer needed, so the model is loaded only after it is closed.
if final_par:
    # Initialize the tokenizer and model for sentiment analysis
    tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

    # Tokenize the text (truncated to the first 512 tokens) and predict sentiment
    inputs = tokenizer(final_par, padding=True, truncation=True, max_length=512, return_tensors='pt')
    with torch.no_grad():  # inference only; no autograd graph needed
        outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Map probabilities to class names through the model's own label table
    # rather than relying on a hard-coded column order.
    scores = {
        model.config.id2label[idx].lower(): prob
        for idx, prob in enumerate(predictions[0].tolist())
    }
    positive, negative, neutral = [scores['positive']], [scores['negative']], [scores['neutral']]

    # Create a DataFrame with the results and append it to the existing DataFrame
    # NOTE(review): 'Headline' holds the article body text, not the link text
    # kept in check_link -- confirm that this is intended.
    table = {'Headline': final_par, "Positive": positive, "Negative": negative, "Neutral": neutral}
    df = pd.DataFrame(table, index=[0])
    df2 = pd.concat([df2, df], ignore_index=True)
elif check_link is not None:
    # A link was opened but no paragraph passed the length filter; scoring an
    # empty string would only add a meaningless row.
    print("No article text found to analyze.")

# Display the final DataFrame
df2


Unnamed: 0,Headline,Positive,Negative,Neutral
0,The benchmark index has hit new record highs t...,0.952614,0.022618,0.024768
