In [1]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime, timedelta
import os
from dotenv import load_dotenv
import json

In [2]:
load_dotenv()

# Set up configuration
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
LLM_MODEL = "gpt-4-turbo"
MAX_FILTERED_ARTICLES = 5
ARTICLE_RECENCY_DAYS = 90  # Only consider articles from the last 90 days

# Set API keys
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [3]:
# relevance_template = """
#     You are a financial analyst assistant. Evaluate if the following article 
#     or snippet is relevant for short-term stock investment analysis for {ticker} ({company_name}).
    
#     An article is relevant if it contains information about:
#     1. Earnings reports or financial results
#     2. Analyst recommendations or price targets
#     3. Major business developments (new products, partnerships, etc.)
#     4. Market sentiment or stock performance trends
#     5. Competitive landscape changes
    
#     Article or snippet:
#     {article}
    
#     Respond with:
#     - "RELEVANT" if the article contains useful information for short-term investment decisions
#     - "NOT_RELEVANT" if the article lacks substantial financial insights
#     - Briefly explain your decision in 1-2 sentences.
#     """
    
# relevance_prompt = PromptTemplate(
#     input_variables=["article", "ticker", "company_name"],
#     template=relevance_template
# )
# relevance_prompt

In [4]:
# llm = ChatOpenAI(temperature=0, model=LLM_MODEL)

# # relevance_chain = LLMChain(
# #             llm=llm,
# #             prompt=relevance_prompt
# #         )
# relevance_chain = relevance_prompt | llm | StrOutputParser()
# relevance_chain

In [5]:
# result = relevance_chain.invoke({
#     "article": article[:1500],  # Limit to first 1500 chars
#     "ticker": ticker,
#     "company_name": company_name
# })

# is_relevant = "RELEVANT" in result['text']
# explanation = result['text'].replace("RELEVANT", "").replace("NOT_RELEVANT", "").strip()

# print (is_relevant, explanation)

In [6]:
class FilteringSystem:
    def __init__(self):
        """
        Initialize the Filtering System with necessary components.
        """
        # Initialize LLM
        self.llm = ChatOpenAI(temperature=0, model=LLM_MODEL)
        
        # Create relevance checker prompt
        relevance_template = """
        You are a financial analyst assistant. Evaluate if the following article 
        or snippet is relevant for short-term stock investment analysis for {ticker} ({company_name}).
        
        An article is relevant if it contains information about:
        1. Earnings reports or financial results
        2. Analyst recommendations or price targets
        3. Major business developments (new products, partnerships, etc.)
        4. Market sentiment or stock performance trends
        5. Competitive landscape changes
        
        Article or snippet:
        {article}
        
        Respond with:
        - "RELEVANT" if the article contains useful information for short-term investment decisions
        - "NOT_RELEVANT" if the article lacks substantial financial insights
        - Briefly explain your decision in 1-2 sentences.
        """
        
        self.relevance_prompt = PromptTemplate(
            input_variables=["article", "ticker", "company_name"],
            template=relevance_template
        )
        
        # self.relevance_chain = LLMChain(
        #     llm=self.llm,
        #     prompt=self.relevance_prompt
        # )
    
    def fetch_article_content(self, url):
        """
        Fetch and extract text content from a URL.
        
        Args:
            url (str): Article URL
            
        Returns:
            str: Extracted text content
        """
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Try to get the article date
            date_str = None
            date_tags = soup.find_all(['time', 'meta'], attrs={'datetime': True})
            if date_tags:
                date_str = date_tags[0].get('datetime')
            
            # Check if the article is recent enough
            if date_str:
                try:
                    article_date = datetime.fromisoformat(date_str.split('T')[0])
                    cutoff_date = datetime.now() - timedelta(days=ARTICLE_RECENCY_DAYS)
                    if article_date < cutoff_date:
                        return None, "Article too old"
                except (ValueError, IndexError):
                    pass  # If date parsing fails, continue with content extraction
            
            # Remove script and style elements
            for script in soup(["script", "style"]):
                script.extract()
            
            # Get text
            text = soup.get_text()
            
            # Clean text
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = '\n'.join(chunk for chunk in chunks if chunk)
            
            return text, None
        except Exception as e:
            return None, f"Error fetching {url}: {str(e)}"
    
    def check_relevance(self, article, ticker, company_name):
        """
        Check if an article is relevant for stock analysis.
        
        Args:
            article (str): Article text or snippet
            ticker (str): Stock ticker
            company_name (str): Company name
            
        Returns:
            bool: True if relevant, False otherwise
            str: Explanation
        """
        # Create input dictionary
        input_variables = {
            "article": article[:1500],  # Limit to first 1500 chars
            "ticker": ticker,
            "company_name": company_name
        }
        
        # Use the invoke method directly on the chained objects
        result = (self.relevance_prompt | self.llm).invoke(input_variables)
        
        # Extract the content from the AIMessage object
        content = result.content
        
        # Process the content
        is_relevant = "RELEVANT" in content
        explanation = content.replace("RELEVANT", "").replace("NOT_RELEVANT", "").strip()
        
        return is_relevant, explanation
    
    def filter(self, research_results, ticker, company_name):
        """
        Filter search results based on relevance.
        
        Args:
            research_results (dict): Results from ResearchAgent
            ticker (str): Stock ticker
            company_name (str): Company name
            
        Returns:
            list: Filtered articles with full content
        """
        filtered_results = []
        errors = []
        
        # Extract URLs from search results
        search_text = research_results["search_results"]
        
        if isinstance(search_text, str):
            # Extract URLs using regex
            urls = re.findall(r'https?://[^\s]+', search_text)
        else:
            # Assume structured results with URLs
            urls = [item.get('url') for item in search_text if 'url' in item]
        
        for url in urls[:20]:  # Limit to first 20 URLs for efficiency
            # Fetch article content
            content, error = self.fetch_article_content(url)
            
            if error:
                errors.append(error)
                continue
                
            if not content:
                continue
            
            # Check relevance using first part of the article
            is_relevant, explanation = self.check_relevance(content, ticker, company_name)
            
            if is_relevant:
                filtered_results.append({
                    'url': url,
                    'content': content[:1000] + "...",  # Truncate for display
                    'explanation': explanation
                })
            
            if len(filtered_results) >= MAX_FILTERED_ARTICLES:
                break
        
        return {
            "ticker": ticker,
            "company_name": company_name,
            "filtered_articles": filtered_results,
            "errors": errors
        }


In [None]:
# Define a simplified Research Results for testing
# (without requiring the full ResearchAgent)
def get_sample_research_results(ticker, company_name):
    """Create sample research results with some URLs for testing."""
    return {
        "ticker": ticker,
        "company_name": company_name,
        "search_results": f"""
        Here are some recent articles about {company_name} ({ticker}):
        
        1. https://finance.yahoo.com/quote/{ticker}
        2. https://www.marketwatch.com/investing/stock/{ticker}
        3. https://www.cnbc.com/quotes/{ticker}
        4. https://www.reuters.com/markets/companies/{ticker}.O
        5. https://www.bloomberg.com/quote/{ticker}:US
        """
    }

# Test the filtering system with sample data
filtering_system = FilteringSystem()
sample_results = get_sample_research_results("AAPL", "Apple Inc.")
filtered_results = filtering_system.filter(sample_results, "AAPL", "Apple Inc.")

print(f"Found {len(filtered_results['filtered_articles'])} relevant articles")
for i, article in enumerate(filtered_results['filtered_articles']):
    print(f"Article {i+1}: {article['url']}")
    print(f"Explanation: {article['explanation']}")
    print("---")


Found 2 relevant articles
Article 1: https://finance.yahoo.com/quote/AAPL
Explanation: "NOT_"

The provided snippet primarily lists various sections and topics covered by a financial news platform but does not contain specific, actionable information regarding AAPL's earnings, analyst recommendations, major business developments, market sentiment, or competitive landscape changes that would be relevant for short-term investment decisions in Apple Inc. (AAPL).
---
Article 2: https://www.cnbc.com/quotes/AAPL
Explanation: "NOT_"

The provided snippet is primarily a navigation menu from a financial news website and does not contain specific information about AAPL's earnings, analyst recommendations, major business developments, market sentiment, or competitive landscape changes that would be relevant for short-term investment analysis.
---


In [7]:
import sys
import os
from pathlib import Path

sys.path.append(str(Path.cwd().parent))

from agents.research import ResearchAgent

# Initialize agents
research_agent = ResearchAgent()
filtering_system = FilteringSystem()

# Get real research results
research_results = research_agent.research("MSFT", "Microsoft Corporation")
filtered_results = filtering_system.filter(research_results, "MSFT", "Microsoft Corporation")

print(f"Found {len(filtered_results['filtered_articles'])} relevant articles for Microsoft")
for i, article in enumerate(filtered_results['filtered_articles']):
    print(f"Article {i+1}: {article['url']}")
    print(f"Explanation: {article['explanation']}")
    print("---")




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `tavily_search_results_json` with `{'query': 'Microsoft Corporation MSFT recent quarterly results earnings analyst ratings business developments October 2023'}`


[0m[36;1m[1;3m[{'title': 'Microsoft Cloud strength fuels first quarter results - Stories', 'url': 'https://news.microsoft.com/2023/10/24/microsoft-cloud-strength-fuels-first-quarter-results-3/', 'content': 'REDMOND, Wash. — October 24, 2023 — Microsoft Corp. today announced the following results for the quarter ended September 30, 2023, as compared to the corresponding period of last fiscal year:\nRevenue was $56.5 billion and increased 13% (up 12% in constant currency)\nOperating income was $26.9 billion and increased 25% (up 24% in constant currency)\nNet income was $22.3 billion and increased 27% (up 26% in constant currency) [...] Skip to Main Content\nSkip to main content\nMicrosoft\nSource\nOur Company AI Innovation Digital Transformation Diversi

In [8]:
import sys
print(sys.path)

['/Users/whysocurious/miniforge3/lib/python310.zip', '/Users/whysocurious/miniforge3/lib/python3.10', '/Users/whysocurious/miniforge3/lib/python3.10/lib-dynload', '', '/Users/whysocurious/Documents/MLDSAIProjects/stock-sage-ai/stock-sage-ai/lib/python3.10/site-packages']


In [None]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

# Try to get the article date
date_str = None
date_tags = soup.find_all(['time', 'meta'], attrs={'datetime': True})
if date_tags:
    date_str = date_tags[0].get('datetime')