In [1]:
import os
import re
import requests
import mlflow
import tiktoken

from pydantic import BaseModel, Field
from typing import List
from langchain_core.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.chat_models import init_chat_model
from langchain.chains import LLMChain

# CONFIGURE MLflow & Vertex AI

mlflow.set_tracking_uri("http://20.75.92.162:5000/")
mlflow.set_experiment("market-sentiment-analyzer")

llm = init_chat_model("gemini-2.0-flash", model_provider="google_genai")

# Token utils

def count_tokens(text: str, model_name: str = "gemini-2.0-flash") -> int:
    try:
        encoding = tiktoken.encoding_for_model(model_name)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    return len(tokens)

def truncate_text_by_tokens(text: str, max_tokens: int, model_name: str = "gemini-2.0-flash") -> str:
    try:
        encoding = tiktoken.encoding_for_model(model_name)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    if len(tokens) <= max_tokens:
        return text
    truncated_tokens = tokens[:max_tokens]
    truncated_text = encoding.decode(truncated_tokens)
    return truncated_text

def clean_and_truncate_news(raw_news: str, max_tokens: int = 5000) -> str:
    # Remove URLs, emails, excessive whitespace, etc.
    text = raw_news
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s.,;:\'\"-]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    
    truncated = truncate_text_by_tokens(text, max_tokens)
    return truncated

# Step 1. Get Stock Code

def get_stock_code(company_name: str) -> str:
    url = f"https://query1.finance.yahoo.com/v1/finance/search?q={company_name}&lang=en-US&region=US"
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers)

    if resp.status_code != 200:
        raise Exception(f"Yahoo error: {resp.status_code}: {resp.text}")

    data = resp.json()
    try:
        return data["quotes"][0]["symbol"]
    except Exception as e:
        raise Exception("Unable to extract stock code") from e


# Step 2. Fetch & Summarize News Using NewsAPI.org

def fetch_company_news(company_name: str, api_key: str, max_articles=5) -> str:
    url = (
        f"https://newsapi.org/v2/everything?"
        f"q={company_name}&"
        f"language=en&"
        f"sortBy=relevance&"
        f"pageSize={max_articles}&"
        f"apiKey={api_key}"
    )
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers)

    if resp.status_code != 200:
        raise Exception(f"News API error: {resp.status_code} - {resp.text}")

    articles = resp.json().get("articles", [])
    news_list = [f"{article.get('title', '')}. {article.get('description', '')}" for article in articles]
    raw_news = "\n".join(news_list)

    return clean_and_truncate_news(raw_news, max_tokens=5000)


# Step 3. Define Output Format Model

class SentimentProfile(BaseModel):
    company_name: str
    stock_code: str
    newsdesc: List[str]
    sentiment: str
    people_names: List[str]
    places_names: List[str]
    other_companies_referred: List[str]
    related_industries: List[str]
    market_implications: str
    confidence_score: float

parser = PydanticOutputParser(pydantic_object=SentimentProfile)

# Step 4. LangChain Prompt Template with manual small format instructions


simple_format_instructions = """
Respond ONLY in valid JSON with the following fields:
- company_name (string)
- stock_code (string)
- newsdesc (list of strings)
- sentiment (string: Positive, Negative, or Neutral)
- people_names (list of strings)
- places_names (list of strings)
- other_companies_referred (list of strings)
- related_industries (list of strings)
- market_implications (string)
- confidence_score (float between 0 and 1)
"""

prompt_template = PromptTemplate(
    template="""
Analyze the following news for company sentiment and details.

Company: {company_name}
Stock Code: {stock_code}
News Summary: {news_summaries}

Please respond in this structured format:
{format_instructions}
""",
    input_variables=["company_name", "stock_code", "news_summaries"],
    partial_variables={"format_instructions": simple_format_instructions}
)


# Debug token counts before LLM call


def debug_token_counts(company_name, stock_code, news_summaries, prompt_template):
    prompt_without_news = prompt_template.template.format(
        company_name=company_name,
        stock_code=stock_code,
        news_summaries="",
        format_instructions=simple_format_instructions,
    )
    tokens_non_news = count_tokens(prompt_without_news)
    
    tokens_news = count_tokens(news_summaries)
    
    total_tokens = count_tokens(prompt_template.template.format(
        company_name=company_name,
        stock_code=stock_code,
        news_summaries=news_summaries,
        format_instructions=simple_format_instructions,
    ))

    print(f"[DEBUG] Tokens - Non-news parts: {tokens_non_news}")
    print(f"[DEBUG] Tokens - News summaries combined: {tokens_news}")
    print(f"[DEBUG] Tokens - Total prompt with news: {total_tokens}")

# Step 5. Main pipeline function


def analyze_company(company_name: str, news_api_key: str) -> dict:
    with mlflow.start_run(run_name=f"sentiment-{company_name}"):
        mlflow.log_param("company_name", company_name)

        stock_code = get_stock_code(company_name)
        mlflow.log_param("stock_code", stock_code)

        news_summary = fetch_company_news(company_name, news_api_key)
        mlflow.log_text(news_summary, "news_summary.txt")

        # Debug token counts
        debug_token_counts(company_name, stock_code, news_summary, prompt_template)

        # Prepare final prompt
        final_prompt = prompt_template.format(
            company_name=company_name,
            stock_code=stock_code,
            news_summaries=news_summary,
        )

        # Check final prompt tokens again and truncate if necessary
        total_tokens = count_tokens(final_prompt)
        max_allowed_tokens = 1048575  # Gemini max

        if total_tokens > max_allowed_tokens:
            # aggressively truncate news summary portion only
            allowed_news_tokens = max_allowed_tokens - count_tokens(prompt_template.template.format(
                company_name=company_name,
                stock_code=stock_code,
                news_summaries="",
                format_instructions=simple_format_instructions,
            ))
            truncated_news = truncate_text_by_tokens(news_summary, allowed_news_tokens)
            final_prompt = prompt_template.format(
                company_name=company_name,
                stock_code=stock_code,
                news_summaries=truncated_news,
            )
            #print(f"[DEBUG] Truncated news summary to fit token limit. New token count: {count_tokens(final_prompt)}")

        # Run LLM
        raw_output = llm.invoke(final_prompt)

        # print("LLM Output:\n", raw_output.content)

        # Parse output
        sentiment_profile = parser.parse(raw_output.content)

        # Log output
        mlflow.log_dict(sentiment_profile.dict(), "sentiment_profile.json")
        mlflow.log_metric("confidence_score", sentiment_profile.confidence_score)

        return sentiment_profile.dict()



# Run example

if __name__ == "__main__":
    company_name = input("Enter the company name: ").strip()
    #news_api_key = os.getenv("NEWSAPI_KEY") or input("Enter your NewsAPI.org API key: ").strip()
    news_api_key='fe3fd9191a6147dd961d7207f4d0716d'
    result = analyze_company(company_name, news_api_key)
    print("Sentiment Profile:")
    print(result)



[DEBUG] Tokens - Non-news parts: 131
[DEBUG] Tokens - News summaries combined: 287
[DEBUG] Tokens - Total prompt with news: 418


/tmp/ipykernel_4526/3526556275.py:221: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  mlflow.log_dict(sentiment_profile.dict(), "sentiment_profile.json")
/tmp/ipykernel_4526/3526556275.py:224: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return sentiment_profile.dict()


🏃 View run sentiment-Amazon at: http://20.75.92.162:5000/#/experiments/477038078762324239/runs/9fd793b48cf24cffadfbc915799bcd82
🧪 View experiment at: http://20.75.92.162:5000/#/experiments/477038078762324239
Sentiment Profile:
{'company_name': 'Amazon', 'stock_code': 'AMZN', 'newsdesc': ['Amazon s 2025 hardware event: the 8 biggest announcements.', 'Amazon just finished up its fall event, where it shared big updates across its entire hardware lineup.', 'In addition to revealing new Echo hardware and Kindle Scribe upgrades, Amazon also took the wraps off refreshed Fire TV devices and a whole bunch more.', 'Here Amazon announces a new Echo Studio.', 'Amazon revealed a new Echo Studio smart speaker at its fall 2025 hardware event on Tuesday.', 'A new speaker wasn t a total surprise, as Amazon s event invite hinted strongly that there might be new Echo devices revealed at the show.', 'The Studio is the most adva Everything Amazon Announced Today at Its Fall Hardware Event 2025 .', "Amazon'