# Import Libraries

In [1]:
import requests
import pandas as pd
from datetime import datetime, timedelta
import time
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
from typing import List, Dict, Any, Optional
import logging

# Set up logging
# Configure the root logger to emit INFO-and-above records and create the
# module-level logger that the class and functions below write to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define Functions

In [3]:
class StockSentimentAnalyzer:
    """
    Score recent news coverage of a stock with a HuggingFace sentiment model.

    Articles are fetched from NewsAPI's ``/v2/everything`` endpoint, each
    article's title + description is classified as positive / negative /
    neutral, and the per-article results are aggregated into one summary dict.
    """

    # Seconds to wait on NewsAPI before giving up; without a timeout a stalled
    # connection would block the caller indefinitely.
    REQUEST_TIMEOUT = 10

    # Token budget handed to the tokenizer so over-long inputs are truncated
    # instead of making the model raise.
    MAX_TOKENS = 512

    def __init__(self, news_api_key: str):
        """
        Initialize the sentiment analyzer with NewsAPI key and HuggingFace model.

        Args:
            news_api_key (str): Your NewsAPI key
        """
        self.news_api_key = news_api_key
        self.base_url = "https://newsapi.org/v2/everything"

        # Initialize HuggingFace sentiment analysis pipeline
        # Using FinBERT model specifically trained on financial text
        try:
            self.sentiment_analyzer = pipeline(
                "sentiment-analysis",
                model="ProsusAI/finbert",
                tokenizer="ProsusAI/finbert"
            )
            logger.info("FinBERT model loaded successfully")
        except Exception as e:
            logger.warning("FinBERT model failed to load: %s. Using default model.", e)
            # Fallback to general sentiment model
            self.sentiment_analyzer = pipeline(
                "sentiment-analysis",
                model="cardiffnlp/twitter-roberta-base-sentiment-latest"
            )

    def get_news_articles(self, ticker_symbol: str, company_name: Optional[str] = None,
                          days_back: int = 7, page_size: int = 100) -> List[Dict]:
        """
        Fetch news articles for a given stock ticker.

        Never raises: any request, HTTP or parsing failure is logged and an
        empty list is returned instead.

        Args:
            ticker_symbol (str): Stock ticker symbol (e.g., 'AAPL')
            company_name (str): Company name for better search results
            days_back (int): Number of days to look back for news
            page_size (int): Number of articles to fetch (clamped to 1..100)

        Returns:
            List[Dict]: List of news articles (empty on any failure)
        """
        # Calculate date range
        to_date = datetime.now()
        from_date = to_date - timedelta(days=days_back)

        # Construct search query; quoting forces exact-phrase matching
        if company_name:
            query = f'"{ticker_symbol}" OR "{company_name}"'
        else:
            query = f'"{ticker_symbol}"'

        # API parameters
        params = {
            'q': query,
            'language': 'en',
            'sortBy': 'relevancy',
            'pageSize': max(1, min(page_size, 100)),
            'from': from_date.strftime('%Y-%m-%d'),
            'to': to_date.strftime('%Y-%m-%d'),
        }
        # Send the key as a header rather than a query parameter: request
        # exceptions embed the full URL in their message, so a key in the
        # query string would end up in the error logs below.
        headers = {'X-Api-Key': self.news_api_key}

        try:
            response = requests.get(
                self.base_url,
                params=params,
                headers=headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()

            data = response.json()

            if data.get('status') == 'ok':
                articles = data.get('articles') or []
                logger.info("Fetched %d articles for %s", len(articles), ticker_symbol)
                return articles

            logger.error("API error: %s", data.get('message', 'Unknown error'))
            return []

        except requests.exceptions.RequestException as e:
            logger.error("Request failed for %s: %s", ticker_symbol, e)
            return []
        except Exception as e:
            # Deliberate catch-all (e.g. malformed JSON body) so one bad
            # response cannot abort a multi-ticker run.
            logger.error("Unexpected error for %s: %s", ticker_symbol, e)
            return []

    def analyze_text_sentiment(self, text: str) -> Dict[str, Any]:
        """
        Analyze sentiment of a single text using HuggingFace model.

        Args:
            text (str): Text to analyze

        Returns:
            Dict: ``sentiment`` ('positive' / 'negative' / 'neutral'),
            ``confidence`` (the model's score for its chosen label) and
            ``raw_label`` (the model's own label). If the model call fails the
            result is neutral with confidence 0.0 and raw_label 'error'.
        """
        try:
            # Let the tokenizer truncate at the token level. Counting
            # whitespace-separated words is not enough: one word can expand to
            # several sub-word tokens and still overflow the model's limit.
            result = self.sentiment_analyzer(
                text, truncation=True, max_length=self.MAX_TOKENS
            )[0]

            raw_label = result['label']
            score = float(result['score'])

            # Normalize labels (different models use different labels);
            # 'POS' / 'NEG' also match the longer 'POSITIVE' / 'NEGATIVE'.
            label = raw_label.upper()
            if 'POS' in label:
                sentiment = 'positive'
            elif 'NEG' in label:
                sentiment = 'negative'
            else:
                sentiment = 'neutral'

            return {
                'sentiment': sentiment,
                'confidence': score,
                'raw_label': raw_label
            }

        except Exception as e:
            logger.error("Sentiment analysis failed: %s", e)
            return {
                'sentiment': 'neutral',
                'confidence': 0.0,
                'raw_label': 'error'
            }

    def analyze_articles_sentiment(self, articles: List[Dict]) -> Dict[str, Any]:
        """
        Analyze sentiment for a list of news articles.

        Each article contributes a signed score: +confidence when positive,
        -confidence when negative, 0 when neutral. Articles with neither a
        title nor a description are skipped and not counted.

        Args:
            articles (List[Dict]): List of news articles

        Returns:
            Dict: Aggregated sentiment analysis. ``sentiment_score`` is the mean
            signed score, ``overall_sentiment`` is positive above 0.1 and
            negative below -0.1, and ``confidence`` is 1 minus the standard
            deviation of the scores, clamped to [0, 1]. Numeric values are
            plain Python ints/floats.
        """
        if not articles:
            return {
                'overall_sentiment': 'neutral',
                'sentiment_score': 0.0,
                'positive_count': 0,
                'negative_count': 0,
                'neutral_count': 0,
                'total_articles': 0,
                'confidence': 0.0
            }

        sentiments = []
        sentiment_scores = []

        for article in articles:
            # Combine title and description for analysis; either may be missing
            parts = (article.get('title'), article.get('description'))
            text_to_analyze = ' '.join(part for part in parts if part).strip()
            if not text_to_analyze:
                continue

            sentiment_result = self.analyze_text_sentiment(text_to_analyze)
            sentiment = sentiment_result['sentiment']
            sentiments.append(sentiment)

            # Convert sentiment to a signed numerical score
            if sentiment == 'positive':
                sentiment_scores.append(sentiment_result['confidence'])
            elif sentiment == 'negative':
                sentiment_scores.append(-sentiment_result['confidence'])
            else:
                sentiment_scores.append(0.0)

        # Calculate overall sentiment score (average). Cast to a built-in float
        # so numpy scalar types do not leak into the returned dict.
        overall_score = float(np.mean(sentiment_scores)) if sentiment_scores else 0.0

        # Determine overall sentiment
        if overall_score > 0.1:
            overall_sentiment = 'positive'
        elif overall_score < -0.1:
            overall_sentiment = 'negative'
        else:
            overall_sentiment = 'neutral'

        # Confidence is derived from the spread of scores (lower spread = more
        # confident); a single score has no spread.
        spread = float(np.std(sentiment_scores)) if len(sentiment_scores) > 1 else 0.0
        confidence = max(0.0, min(1.0, 1.0 - spread))  # Clamp between 0 and 1

        return {
            'overall_sentiment': overall_sentiment,
            'sentiment_score': round(overall_score, 3),
            'positive_count': sentiments.count('positive'),
            'negative_count': sentiments.count('negative'),
            'neutral_count': sentiments.count('neutral'),
            'total_articles': len(sentiments),
            'confidence': round(confidence, 3)
        }

    def get_stock_sentiment(self, ticker_symbol: str, company_name: Optional[str] = None,
                            days_back: int = 7, page_size: int = 100) -> Dict[str, Any]:
        """
        Get complete sentiment analysis for a stock.

        Args:
            ticker_symbol (str): Stock ticker symbol
            company_name (str): Company name (optional)
            days_back (int): Number of days to look back for news
            page_size (int): Number of articles to fetch (max 100)

        Returns:
            Dict: The aggregate from ``analyze_articles_sentiment`` plus
            ``ticker``, ``company_name``, ``analysis_date`` and
            ``articles_analyzed``. Note that ``articles_analyzed`` is the number
            of articles fetched, while ``total_articles`` counts only those
            that had text to score.
        """
        logger.info("Starting sentiment analysis for %s", ticker_symbol)

        # Fetch news articles
        articles = self.get_news_articles(
            ticker_symbol, company_name, days_back=days_back, page_size=page_size
        )

        # Analyze sentiment
        sentiment_results = self.analyze_articles_sentiment(articles)

        # Add metadata
        sentiment_results.update({
            'ticker': ticker_symbol,
            'company_name': company_name,
            'analysis_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'articles_analyzed': len(articles)
        })

        return sentiment_results

def analyze_multiple_stocks(tickers: List[str], news_api_key: str, 
                          company_names: Optional[Dict[str, str]] = None) -> pd.DataFrame:
    """
    Analyze sentiment for multiple stocks and return as DataFrame.

    One row is produced per entry in ``tickers``. A ticker whose analysis
    raises gets a neutral placeholder row with an extra ``error`` column
    rather than aborting the whole run.

    Args:
        tickers (List[str]): List of ticker symbols
        news_api_key (str): NewsAPI key
        company_names (Dict[str, str]): Optional mapping of ticker to company name

    Returns:
        pd.DataFrame: Sentiment analysis results for all stocks. The result
        columns are always present, even when ``tickers`` is empty.
    """
    result_columns = [
        'overall_sentiment', 'sentiment_score', 'positive_count',
        'negative_count', 'neutral_count', 'total_articles', 'confidence',
        'ticker', 'company_name', 'analysis_date', 'articles_analyzed',
    ]

    tickers = list(tickers)
    if not tickers:
        # Nothing to analyze: skip loading the model, but still return the
        # expected columns so callers can select them without a KeyError.
        return pd.DataFrame(columns=result_columns)

    analyzer = StockSentimentAnalyzer(news_api_key)
    results = []

    for ticker in tickers:
        company_name = company_names.get(ticker) if company_names else None

        try:
            sentiment_data = analyzer.get_stock_sentiment(ticker, company_name)
            results.append(sentiment_data)

            # Add delay to respect API rate limits
            time.sleep(0.1)

        except Exception as e:
            logger.error("Failed to analyze %s: %s", ticker, e)
            # Add error record with the same keys as a successful result so
            # the frame keeps a consistent schema (counts stay integers).
            results.append({
                'overall_sentiment': 'neutral',
                'sentiment_score': 0.0,
                'positive_count': 0,
                'negative_count': 0,
                'neutral_count': 0,
                'total_articles': 0,
                'confidence': 0.0,
                'ticker': ticker,
                'company_name': company_name,
                'articles_analyzed': 0,
                'error': str(e)
            })

    return pd.DataFrame(results)

# Example usage and integration function
def add_sentiment_to_stock_data(df_stocks: pd.DataFrame, news_api_key: str) -> pd.DataFrame:
    """
    Add sentiment analysis to existing stock DataFrame.

    Each distinct ticker is analyzed once, and the result is left-joined back
    onto ``df_stocks``, so the output has exactly the same rows as the input.
    Rows whose ticker is missing get NaN in the sentiment columns.

    Args:
        df_stocks (pd.DataFrame): Existing stock data with 'Ticker' column and
            optionally a 'Company_Name' column
        news_api_key (str): NewsAPI key

    Returns:
        pd.DataFrame: Stock data with sentiment columns added
            (News_Sentiment, Sentiment_Score, Positive_Articles,
            Negative_Articles, Neutral_Articles, Total_Articles,
            Sentiment_Confidence)
    """
    # Analyze each ticker only once: repeated tickers would waste API calls
    # and make the merge below multiply rows.
    tickers = df_stocks['Ticker'].dropna().drop_duplicates().tolist()

    # Get company names if available. Rows with a missing name are skipped
    # because NaN is truthy and would otherwise be searched for as "nan".
    company_names = {}
    if 'Company_Name' in df_stocks.columns:
        named = df_stocks.dropna(subset=['Ticker', 'Company_Name'])
        company_names = dict(zip(named['Ticker'], named['Company_Name']))

    # Analyze sentiment
    sentiment_df = analyze_multiple_stocks(tickers, news_api_key, company_names)

    # Rename columns for clarity
    rename_map = {
        'ticker': 'Ticker',
        'overall_sentiment': 'News_Sentiment',
        'sentiment_score': 'Sentiment_Score',
        'positive_count': 'Positive_Articles',
        'negative_count': 'Negative_Articles',
        'neutral_count': 'Neutral_Articles',
        'total_articles': 'Total_Articles',
        'confidence': 'Sentiment_Confidence'
    }

    # reindex (rather than plain column selection) tolerates a result frame
    # that lacks some columns, e.g. when nothing was analyzed; the missing
    # columns are filled with NaN instead of raising KeyError.
    sentiment_df_renamed = (
        sentiment_df.reindex(columns=list(rename_map))
        .rename(columns=rename_map)
        .drop_duplicates(subset='Ticker')
    )

    # Merge with original DataFrame
    df_with_sentiment = df_stocks.merge(sentiment_df_renamed, on='Ticker', how='left')

    return df_with_sentiment

In [4]:
# Example usage
if __name__ == "__main__":
    import os

    # Read the NewsAPI key from the environment so the credential never lives
    # in source control. Any key that was previously committed to this file
    # should be treated as leaked and revoked.
    API_KEY = os.environ.get('NEWSAPI_KEY')
    if not API_KEY:
        raise SystemExit("Set the NEWSAPI_KEY environment variable to your NewsAPI key.")
    
    # Example: Analyze single stock
    analyzer = StockSentimentAnalyzer(API_KEY)
    result = analyzer.get_stock_sentiment('AAPL', 'Apple Inc')
    print("Single Stock Analysis:")
    print(result)
    
    # Example: Analyze multiple stocks
    test_tickers = ['AAPL', 'GOOGL', 'MSFT', 'TSLA']
    test_company_names = {
        'AAPL': 'Apple Inc',
        'GOOGL': 'Google',
        'MSFT': 'Microsoft',
        'TSLA': 'Tesla'
    }
    
    sentiment_df = analyze_multiple_stocks(test_tickers, API_KEY, test_company_names)
    print("\nMultiple Stocks Analysis:")
    print(sentiment_df[['ticker', 'overall_sentiment', 'sentiment_score', 'total_articles']])

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use mps:0


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

INFO:__main__:FinBERT model loaded successfully
INFO:__main__:Starting sentiment analysis for AAPL
INFO:__main__:Fetched 100 articles for AAPL


Single Stock Analysis:
{'overall_sentiment': 'neutral', 'sentiment_score': np.float64(-0.071), 'positive_count': 23, 'negative_count': 30, 'neutral_count': 47, 'total_articles': 100, 'confidence': np.float64(0.356), 'ticker': 'AAPL', 'company_name': 'Apple Inc', 'analysis_date': '2025-06-27 20:26:39', 'articles_analyzed': 100}


Device set to use mps:0
INFO:__main__:FinBERT model loaded successfully
INFO:__main__:Starting sentiment analysis for AAPL
INFO:__main__:Fetched 100 articles for AAPL
INFO:__main__:Starting sentiment analysis for GOOGL
INFO:__main__:Fetched 100 articles for GOOGL
INFO:__main__:Starting sentiment analysis for MSFT
INFO:__main__:Fetched 100 articles for MSFT
INFO:__main__:Starting sentiment analysis for TSLA
INFO:__main__:Fetched 100 articles for TSLA



Multiple Stocks Analysis:
  ticker overall_sentiment  sentiment_score  total_articles
0   AAPL           neutral           -0.071             100
1  GOOGL           neutral            0.037             100
2   MSFT           neutral           -0.075             100
3   TSLA          negative           -0.173             100
