In [2]:
# Import necessary libraries for web scraping, text processing, and data handling
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
import time
import os

# --- Session setup -----------------------------------------------------------
# Everything below prepares module-level state that the polling loop reads:
# browser options, the last processed headline, the results table, the
# interruption flag and the ticker to follow.

# Browser options; the window is visible unless the headless flag is enabled.
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment to enable headless mode

# Resume from the headline stored by an earlier run, if one was saved.
# A missing state file simply means there is nothing to compare against yet.
previous_link_text = None
try:
    with open('previous_link_text.txt') as state_file:
        previous_link_text = state_file.read().strip()
except FileNotFoundError:
    pass

# Location of the CSV that accumulates the scraped articles and their scores.
df_file_path = 'scraped_articles.csv'

# Continue an existing results file when present; otherwise start with an
# empty table that already has the expected columns.
df2 = (
    pd.read_csv(df_file_path)
    if os.path.exists(df_file_path)
    else pd.DataFrame(columns=['Headline', 'Positive', 'Negative', 'Neutral'])
)

# Becomes True once a stop signal cuts a waiting period short.
interrupted = False

# Ticker symbol whose news page will be polled (surrounding whitespace removed).
stock = input('What stock do you want to check the news for? ').strip()

# Main workflow: repeatedly open the Yahoo Finance news page for `stock`; when
# the newest link text differs from the last one processed, open the article,
# score its text with FinBERT and append the scores to `df2`. The loop ends
# when a "stop_signal.txt" file appears. The `finally` clause saves the last
# link text and the table even if the loop is aborted by an exception.

STOP_SIGNAL_PATH = "stop_signal.txt"
FINBERT_MODEL_NAME = "ProsusAI/finbert"
POLL_STEP_SECONDS = 10  # granularity at which waiting periods check for a stop

# Locators tried in order; the first one that matches wins.
NEWS_LINK_LOCATORS = (
    (By.CSS_SELECTOR, '.clamp.svelte-13zydns'),
    (By.CSS_SELECTOR, '.StretchedBox'),
)
READ_MORE_LOCATORS = (
    (By.CLASS_NAME, "readmoreButtonText"),
    (By.CSS_SELECTOR, ".link.rapid-noclick-resp.caas-button.collapse-button"),
)

# Lazily filled cache so the tokenizer and model are loaded at most once per
# run, and only if an article actually has to be scored.
_finbert = {}


def _consume_stop_signal():
    """Return True and delete the signal file if a stop has been requested."""
    if os.path.exists(STOP_SIGNAL_PATH):
        print("Stop signal file detected. Exiting loop.")
        os.remove(STOP_SIGNAL_PATH)
        return True
    return False


def _sleep_unless_stopped(total_seconds):
    """Sleep for `total_seconds` in short slices.

    Returns True as soon as a stop signal is seen before a slice, and False
    if the whole period elapsed without one.
    """
    for _ in range(total_seconds // POLL_STEP_SECONDS):
        if _consume_stop_signal():
            return True
        time.sleep(POLL_STEP_SECONDS)
    return False


def _wait_for_first(wait, condition, locators):
    """Return the first element for which `condition(locator)` is met.

    Each locator is given the full timeout of `wait` in turn; None is returned
    when every locator times out.
    """
    for locator in locators:
        try:
            return wait.until(condition(locator))
        except TimeoutException:
            continue
    return None


def _read_article_text(wait):
    """Expand the open article if possible and return its long paragraphs.

    Only paragraphs with more than 15 words are kept; they are joined with
    single spaces. The result is an empty string if no paragraph qualifies.
    """
    read_more_button = _wait_for_first(wait, EC.element_to_be_clickable, READ_MORE_LOCATORS)
    if read_more_button is None:
        print("No 'Read More' button found with either class name.")
    else:
        read_more_button.click()

    paragraphs = wait.until(EC.presence_of_all_elements_located((By.TAG_NAME, 'p')))
    # Read each element's text once; every `.text` access queries the browser.
    texts = [par.text for par in paragraphs]
    return ' '.join(text for text in texts if len(text.split()) > 15)


def _score_sentiment(text):
    """Return the three FinBERT class probabilities for `text` as floats.

    The values are in logit index order 0, 1, 2, which this script stores as
    Positive, Negative and Neutral. Input beyond 512 tokens is truncated, so
    only the beginning of a long article contributes to the score.
    """
    if not _finbert:
        # Load both parts before caching so a failed download is retried whole.
        tokenizer = AutoTokenizer.from_pretrained(FINBERT_MODEL_NAME)
        model = AutoModelForSequenceClassification.from_pretrained(FINBERT_MODEL_NAME)
        _finbert.update(tokenizer=tokenizer, model=model)

    inputs = _finbert["tokenizer"](
        text, padding=True, truncation=True, max_length=512, return_tensors='pt'
    )
    # Inference only: no gradients are needed, so skip building the graph.
    with torch.no_grad():
        outputs = _finbert["model"](**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    return predictions[0].tolist()


try:
    while True:
        no_new_article = False

        # The browser lives only for the duration of one poll; it is closed
        # before the script goes idle so no Chrome process lingers while waiting.
        with webdriver.Chrome(options=chrome_options) as driver:
            # Set timeouts for the driver
            driver.implicitly_wait(10)
            driver.set_script_timeout(120)
            driver.set_page_load_timeout(90)

            try:
                # Navigate to Yahoo Finance news section for the specified stock
                driver.get(f"https://finance.yahoo.com/quote/{stock}/news?p={stock}")
                wait = WebDriverWait(driver, 10)

                link = _wait_for_first(wait, EC.visibility_of_element_located, NEWS_LINK_LOCATORS)
                if link is None:
                    # Fall through to the normal stop check and delay instead of
                    # immediately relaunching the browser.
                    print("Link not found with either class. Retrying after the usual delay.")
                else:
                    current_link_text = link.text
                    if current_link_text == previous_link_text:
                        no_new_article = True
                    else:
                        # Record the link as processed before opening it, so an
                        # article that fails to load is not retried forever.
                        previous_link_text = current_link_text
                        with open('previous_link_text.txt', 'w') as file:
                            file.write(previous_link_text)

                        # Click the link to navigate to the article
                        print(previous_link_text)
                        link.click()

                        final_par = _read_article_text(wait)
                        if final_par:
                            positive, negative, neutral = _score_sentiment(final_par)
                            # NOTE: the 'Headline' column holds the article text,
                            # matching the rows already stored in the CSV.
                            df = pd.DataFrame([{
                                'Headline': final_par,
                                "Positive": positive,
                                "Negative": negative,
                                "Neutral": neutral,
                            }])
                            # Start from the new row when nothing has been
                            # collected yet, so the score columns stay numeric.
                            df2 = df if df2.empty else pd.concat([df2, df], ignore_index=True)
                        else:
                            # Scoring an empty string would add a meaningless row.
                            print("No article text found; skipping sentiment analysis.")

            except Exception as e:
                # Keep polling after a scraping or scoring failure.
                print(f"An error occurred: {e}")

        if no_new_article:
            print("No new article found. Waiting for 5 minutes before checking again.")
            interrupted = _sleep_unless_stopped(300)
        elif _consume_stop_signal():
            interrupted = True
        else:
            print("Waiting a short while before checking for new articles again...")
            interrupted = _sleep_unless_stopped(60)

        if interrupted:
            break

finally:
    # Save the updated link text and DataFrame before exiting
    with open('previous_link_text.txt', 'w') as file:
        file.write(previous_link_text if previous_link_text else "")
    df2.to_csv(df_file_path, index=False)
    print(df2)


No new article found. Waiting for 5 minutes before checking again.
Stop signal file detected. Exiting loop.
                                            Headline  Positive  Negative  \
0  The benchmark index has hit new record highs t...  0.952614  0.022618   

    Neutral  
0  0.024768  
