<a href="https://colab.research.google.com/github/boyuedong/Finance-news-sentiment-analysis/blob/main/sentiment_analysis_nonbinary_categorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Install required packages (if not already installed)
!pip install requests beautifulsoup4 nltk transformers torch

# Import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import nltk

# Download the NLTK VADER lexicon so a traditional, lexicon-based sentiment score can be
# compared against the transformer output.
# NOTE(review): nothing visible in this cell uses VADER afterwards; the download is optional here.
nltk.download('vader_lexicon')

# Import Hugging Face transformers for AI-based sentiment analysis
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Load the transformer model used for sentiment analysis.
# "cardiffnlp/twitter-roberta-base-sentiment" is a three-class classifier (negative / neutral / positive).
# NOTE(review): this checkpoint's config reportedly exposes generic label names
# (LABEL_0 = negative, LABEL_1 = neutral, LABEL_2 = positive) rather than the words themselves;
# confirm that downstream label lookups match what the pipeline actually emits.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# Build the pipeline with its default settings. Per-class scores are not configured here;
# they are requested at call time by passing return_all_scores=True.
ai_sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Generic class ids that some checkpoints emit instead of human-readable label names
# (e.g. the original cardiffnlp twitter-roberta sentiment model), mapped to their meaning.
_LABEL_ALIASES = {
    "label_0": "negative",
    "label_1": "neutral",
    "label_2": "positive",
}


# Define a helper function to map the probability difference to our sentiment value
def compute_sentiment(scores, threshold=0.2):
    """Collapse per-class scores into a single sentiment value.

    Args:
        scores: Iterable of dicts, each with a 'label' and a 'score' key, as
            produced by a text-classification pipeline that returns all scores.
            Labels are matched case-insensitively; both the names
            "negative"/"positive" and the generic ids "LABEL_0"/"LABEL_2"
            are understood.
        threshold: Minimum absolute gap between the positive and negative
            scores for the result to be non-neutral.

    Returns:
        1 if positive exceeds negative by more than ``threshold``,
        -1 if negative exceeds positive by more than ``threshold``,
        0 otherwise (including when neither label is present).
    """
    score_map = {}
    for item in scores:
        label = str(item['label']).lower()
        # Without this normalisation a model emitting LABEL_n would never match
        # "positive"/"negative" and every input would silently score 0.
        score_map[_LABEL_ALIASES.get(label, label)] = item['score']

    pos = score_map.get("positive", 0)
    neg = score_map.get("negative", 0)
    # Compute difference between positive and negative scores
    diff = pos - neg
    if diff > threshold:
        return 1    # Positive
    if diff < -threshold:
        return -1   # Negative
    return 0        # Neutral

# Ticker symbol -> industry category; the keys decide which quote pages get scraped,
# the values are copied into the 'Industry' field of every result row.
tickers_info = {
    "AAPL": "Technology",
    "JPM": "Financial Services",
    "XOM": "Energy",
    "WMT": "Retail",
    "PFE": "Healthcare",
}

# Function to scrape Yahoo Finance news headlines for a given ticker
def scrape_yahoo_finance_news(ticker):
    """Fetch the ticker's Yahoo Finance quote page and return headline-like texts.

    Every <h3> element on the page is treated as a candidate headline; its
    stripped text is kept only when it is longer than 10 characters.

    Args:
        ticker: Ticker symbol interpolated into the quote-page URL.

    Returns:
        List of headline strings in document order. An empty list is returned
        (after printing the error) when the request fails or the server
        answers with an HTTP error status.
    """
    quote_url = f"https://finance.yahoo.com/quote/{ticker}?p={ticker}"
    request_headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        page = requests.get(quote_url, headers=request_headers, timeout=10)
        page.raise_for_status()  # Turn 4xx/5xx answers into exceptions
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        return []

    parsed = BeautifulSoup(page.text, 'html.parser')
    # Yahoo Finance news headlines are often contained within <h3> tags.
    h3_texts = (tag.get_text(strip=True) for tag in parsed.find_all('h3'))
    # The length filter discards short non-headline fragments.
    return [text for text in h3_texts if text and len(text) > 10]

# Function to process scraped headlines using the AI sentiment model
def analyze_headlines_with_ai(ticker, industry):
    """Scrape headlines for ``ticker`` and score each one with the AI pipeline.

    Args:
        ticker: Ticker symbol whose headlines are scraped.
        industry: Industry category copied into every result row.

    Returns:
        List of dicts, one per headline, with the keys 'Ticker', 'Industry',
        'Published', 'Headline', 'AI_DominantLabel' and 'AI_Sentiment'.
        A headline whose scoring raises is reported on stdout and recorded
        as "Neutral" / 0.
    """
    rows = []
    for headline in scrape_yahoo_finance_news(ticker):
        try:
            # With return_all_scores=True the pipeline answers with one list of
            # {'label', 'score'} dicts per input; [0] selects the list for this headline.
            class_scores = ai_sentiment_pipeline(headline, return_all_scores=True)[0]
            sentiment_value = compute_sentiment(class_scores, threshold=0.2)
            # The label shown to the user is simply the highest-scoring class.
            top_entry = max(class_scores, key=lambda entry: entry['score'])
            dominant_label = top_entry['label']
        except Exception as e:
            print(f"Error analyzing headline: {headline}\n{e}")
            dominant_label, sentiment_value = "Neutral", 0

        rows.append({
            'Ticker': ticker,
            'Industry': industry,
            # The scraped page gives no publish time here, so the processing time stands in for it.
            'Published': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'Headline': headline,
            'AI_DominantLabel': dominant_label,
            'AI_Sentiment': sentiment_value
        })
    return rows

# Loop through each ticker and gather AI sentiment analysis results
all_results = []
for ticker, industry in tickers_info.items():
    all_results.extend(analyze_headlines_with_ai(ticker, industry))

# Create a DataFrame to view the results.
# The column list is handed to the constructor instead of selecting with df[[...]] afterwards:
# when no headline could be scraped (blocked request, changed page layout) the frame built
# from an empty list has no columns, and the selection would raise KeyError. Passing
# `columns=` yields an empty frame with the expected header in that case and the same
# column order otherwise.
result_columns = ['Ticker', 'Industry', 'Published', 'Headline', 'AI_DominantLabel', 'AI_Sentiment']
df = pd.DataFrame(all_results, columns=result_columns)

# Display the DataFrame
print(df.head(20))



[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Device set to use cpu


   Ticker            Industry            Published  \
0    AAPL          Technology  2025-02-20 10:26:24   
1    AAPL          Technology  2025-02-20 10:26:24   
2    AAPL          Technology  2025-02-20 10:26:24   
3    AAPL          Technology  2025-02-20 10:26:24   
4    AAPL          Technology  2025-02-20 10:26:24   
5    AAPL          Technology  2025-02-20 10:26:24   
6    AAPL          Technology  2025-02-20 10:26:24   
7    AAPL          Technology  2025-02-20 10:26:24   
8    AAPL          Technology  2025-02-20 10:26:24   
9    AAPL          Technology  2025-02-20 10:26:24   
10   AAPL          Technology  2025-02-20 10:26:25   
11   AAPL          Technology  2025-02-20 10:26:25   
12   AAPL          Technology  2025-02-20 10:26:25   
13   AAPL          Technology  2025-02-20 10:26:25   
14   AAPL          Technology  2025-02-20 10:26:25   
15   AAPL          Technology  2025-02-20 10:26:25   
16   AAPL          Technology  2025-02-20 10:26:25   
17   AAPL          Technolog