In [14]:
import requests
from bs4 import BeautifulSoup
import re

def fetch_latest_10k(cik, timeout=30):
    """
    Fetch the URL of the latest 10-K filing for a given company using its CIK.

    Args:
        cik: The company's SEC Central Index Key, as a string or int. It is
            left-padded with zeros to 10 digits to build the submissions URL.
        timeout: Seconds to wait for the SEC server before giving up.

    Returns:
        The URL of the primary document of the first "10-K" entry in the
        company's recent filings, or None if the request fails, the response
        is not in the expected shape, or no 10-K is listed.
    """
    cik = str(cik).strip().zfill(10)
    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    headers = {"User-Agent": "Your Name (your@email.com)"}

    # Fetch recent filings; a timeout keeps a stalled connection from hanging forever.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print("Error fetching SEC data:", exc)
        return None

    if response.status_code != 200:
        print("Error fetching SEC data:", response.status_code)
        return None

    # The feed stores filings as parallel lists; walk them in lockstep.
    try:
        recent_filings = response.json()["filings"]["recent"]
        filings = zip(
            recent_filings["form"],
            recent_filings["accessionNumber"],
            recent_filings["primaryDocument"],
        )
    except (ValueError, KeyError, TypeError) as exc:
        print("Unexpected SEC response format:", exc)
        return None

    # The first match is treated as the latest filing
    # (assumes the feed lists newest filings first — confirm against SEC docs).
    for form, accession, primary_doc in filings:
        if form == "10-K":
            # Archive paths use the accession number without dashes.
            accession_num = accession.replace("-", "")
            ten_k_url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_num}/{primary_doc}"
            print(f"Latest 10-K URL: {ten_k_url}\n")
            return ten_k_url

    print("No recent 10-K filings found.")
    return None


def extract_10k_text(url, output_path="company_10k.txt", timeout=30):
    """
    Download a 10-K filing, extract its visible text, and save it to disk.

    Args:
        url: URL of the filing's HTML document.
        output_path: File the extracted text is written to (UTF-8). Defaults
            to "company_10k.txt" in the current working directory.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The extracted text, or None if the download fails.
    """
    headers = {"User-Agent": "Your Name (your@email.com)"}

    # A timeout keeps a stalled connection from hanging forever.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print("Error fetching 10-K filing:", exc)
        return None

    if response.status_code != 200:
        print("Error fetching 10-K filing:", response.status_code)
        return None

    # Strip the markup, putting each text node on its own line.
    soup = BeautifulSoup(response.text, "html.parser")
    filing_text = soup.get_text(separator="\n")

    # Save the 10-K document as a text file
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(filing_text)

    print(f"\nFull 10-K saved as '{output_path}'.")
    return filing_text


# XBRL member tags such as "amzn:XYZMember" or "us-gaap:CommonStockMember".
# The namespace prefix may contain hyphens/digits/underscores (e.g. "us-gaap"),
# and the local name may contain digits/underscores.
_XBRL_MEMBER_RE = re.compile(
    r"\b[A-Za-z][A-Za-z0-9_-]*:[A-Za-z][A-Za-z0-9_]*Member\b"
)

# A product-announcement keyword plus up to 200 following characters on the
# same line ("." does not cross newlines).
_PRODUCT_SNIPPET_RE = re.compile(
    r"(?:introduces|announces|launches|new product).{0,200}", re.IGNORECASE
)


def clean_text_for_llm(text):
    """
    Cleans SEC 10-K text to remove unnecessary structured data.

    XBRL member tags are stripped first. If the remaining text contains
    product-announcement keywords ("introduces", "announces", "launches",
    "new product"), only the snippets starting at those keywords are kept,
    one per line; otherwise the whole tag-free text is returned.

    Args:
        text: Raw text of the filing. Empty or None yields "".

    Returns:
        The newline-joined product snippets, or the tag-free text when no
        snippet is found.
    """
    if not text:
        return ""

    # Remove XBRL taxonomy tags (e.g., amzn:XYZMember)
    cleaned_text = _XBRL_MEMBER_RE.sub("", text)

    # Keep only relevant parts mentioning product announcements
    product_related_sections = _PRODUCT_SNIPPET_RE.findall(cleaned_text)

    # If product-related sections are found, use them; otherwise, return the cleaned text
    if product_related_sections:
        return "\n".join(product_related_sections)
    return cleaned_text


def query_ollama(text, question, model="deepseek-r1:1.5b", max_chars=5000, timeout=300):
    """
    Query an LLM (Deepseek-R1 by default) via a local Ollama server.

    Args:
        text: Context document; only its first `max_chars` characters are sent.
        question: Instruction appended after the context, separated by a blank line.
        model: Ollama model tag to run.
        max_chars: Maximum number of characters of `text` included in the
            prompt. None sends the whole text.
        timeout: Seconds to wait for the server before giving up. Generous by
            default because generation is not streamed.

    Returns:
        The model's response text, or a string starting with "Error" that
        describes the failure (non-200 status, connection problem, or a
        body that is not valid JSON).
    """
    ollama_url = "http://localhost:11434/api/generate"
    payload = {
        "model": model,
        "prompt": f"{text[:max_chars]}\n\n{question}",  # Send a bounded prefix only
        "stream": False
    }

    try:
        response = requests.post(ollama_url, json=payload, timeout=timeout)
        if response.status_code != 200:
            return f"Error: {response.status_code}, {response.text}"
        result = response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException: connection/timeout problems; ValueError: invalid JSON body.
        return f"Error querying Ollama: {str(e)}"

    return result.get("response", "No response from model")


def main():
    """
    Interactive entry point: ask for a CIK, download the company's latest
    10-K, reduce its text, and print the LLM's answer about new products.

    Stops silently after the helper's own message if the filing URL or the
    filing text could not be obtained.
    """
    company_cik = input("Enter the CIK of the company: ").strip()

    filing_url = fetch_latest_10k(company_cik)
    if not filing_url:
        return

    raw_text = extract_10k_text(filing_url)
    if not raw_text:
        return

    prompt_question = (
        "Identify any new products mentioned in this document. "
        "Remove structured tags (e.g., 'amzn:XYZMember') and provide a real product name "
        "Give me a short 150-character description about the product."
    )

    llm_answer = query_ollama(clean_text_for_llm(raw_text), prompt_question)
    print("\nLLM Response:\n", llm_answer)


# Run the interactive workflow only when this file is executed as a script,
# so importing it does not prompt for input.
if __name__ == "__main__":
    main()

Latest 10-K URL: https://www.sec.gov/Archives/edgar/data/0000789019/000095017024087843/msft-20240630.htm


Full 10-K saved as 'company_10k.txt'.

LLM Response:
 <think>
Okay, so I'm looking at this user query where they want to identify new products from a provided document, remove any structured tags like Amazon XYZ, and replace them with their real names and brief descriptions. Let's break it down step by step.

First, I need to parse through each sentence of the text provided. The goal is to extract all mentions of actual products. The user has given examples where they've already done this part successfully. 

Looking at some of the sentences, I see terms like "new product," "products and services," and specific companies or names that might be products. For example, in one sentence: "new products and services, we may not grow revenue...". This is a product name.

I should go through each mention of something non-structured tags, replace those with their real names, and create clea