In [None]:
import requests
from bs4 import BeautifulSoup
import re
import os
import json
import csv

########################
# Mappings
########################
# SEC Central Index Key (CIK) -> display name.
# Every CIK in both maps below is written in the same canonical form: a
# 10-digit, zero-padded string (the form used in the EDGAR submissions URL
# built by fetch_latest_10k). Keeping the two maps in the same form is what
# makes CIK_TO_COMPANY[TICKER_TO_CIK[ticker]] resolve.
CIK_TO_COMPANY = {
    "0000789019": "Microsoft",
    "0000320193": "Apple",
    "0001045810": "NVIDIA",
    "0001018724": "Amazon",
    "0001652044": "Alphabet",
    "0000040545": "General Electric",
    "0000050863": "Intel",
    "0001318605": "Tesla",
    "0001326801": "Meta Platforms",
    "0001067983": "Berkshire Hathaway",
    "0000104169": "Walmart",
    "0001403161": "Visa",
    "0000019617": "JPMorgan Chase",
    "0001141391": "Mastercard",
    "0000034088": "Exxon Mobil",
    "0000731766": "UnitedHealth Group",
    "0000909832": "Costco",
    "0000080424": "Procter & Gamble",
    "0001341439": "Oracle",
    "0000200406": "Johnson & Johnson",
    "0001065280": "Netflix",
    "0000354950": "Home Depot",
    "0000021344": "Coke",
    "0000070858": "Bank of America",
    "0000093410": "Chevron",
    "0000858877": "Cisco",
    "0000072971": "Wells Fargo",
    "0000077476": "Pepsi",
    "0001321655": "Palantir",
    "0000063908": "McDonald's",
    "0000732717": "AT&T",
    "0000732712": "Verizon",
    "0000895421": "Morgan Stanley",
    "0001744489": "Disney",
    "0000318154": "Amgen",
    "0000097476": "Texas Instruments",
    "0000018230": "Caterpillar",
    "0000002488": "AMD",
    "0000078003": "Pfizer",
    "0001283699": "T-Mobile",
    "0000101829": "RTX",
    "0000885725": "Boston Scientific",
    "0001166691": "Comcast",
    "0000012927": "Boeing",
    "0000936468": "Lockheed Martin",
    "0000320187": "Nike",
    "0000886982": "Goldman Sachs",
    "0001633917": "Paypal",
    "0001571996": "Dell",
    "0000037996": "Ford",
    "0000033185": "Equifax",
    "0000073309": "Nucor",
    "0000014272": "Bristol",
    "0001730168": "Broadcom",
    "0000804328": "Qualcomm",
    "0001551152": "Abbvie",
    "0001413329": "Philip Morris",
    "0000310158": "Merck Co.",
    "0001326160": "Duke Energy",
    "0001739940": "Cigna",
    "0000001800": "Abbott Laboratories",
    "0000066740": "3M",
    "0000021665": "Colgate-Palmolive",
    "0000796343": "Adobe",
    "0000875320": "Vertex Pharmaceuticals",
}

# Ticker symbol -> 10-digit zero-padded CIK (same form as the keys above).
TICKER_TO_CIK = {
    "MSFT": "0000789019", "AAPL": "0000320193", "NVDA": "0001045810", "AMZN": "0001018724",
    "GOOGL": "0001652044", "GE": "0000040545", "INTC": "0000050863", "TSLA": "0001318605",
    "META": "0001326801", "BRK": "0001067983", "WMT": "0000104169", "V": "0001403161",
    "JPM": "0000019617", "MA": "0001141391", "XOM": "0000034088", "UNH": "0000731766",
    "COST": "0000909832", "PG": "0000080424", "ORCL": "0001341439", "JNJ": "0000200406",
    "NFLX": "0001065280", "HD": "0000354950", "KO": "0000021344", "BAC": "0000070858",
    "CVX": "0000093410", "CSCO": "0000858877", "WFC": "0000072971", "PEP": "0000077476",
    "PLTR": "0001321655", "MCD": "0000063908", "T": "0000732717", "VZ": "0000732712",
    "MS": "0000895421", "DIS": "0001744489", "AMGN": "0000318154", "TXN": "0000097476",
    "CAT": "0000018230", "AMD": "0000002488", "PFE": "0000078003", "TMUS": "0001283699",
    "RTX": "0000101829", "BSX": "0000885725", "CMCSA": "0001166691", "BA": "0000012927",
    "LMT": "0000936468", "NKE": "0000320187", "GS": "0000886982", "PYPL": "0001633917",
    "DELL": "0001571996", "F": "0000037996", "EFX": "0000033185", "NUE": "0000073309",
    "BMY": "0000014272", "AVGO": "0001730168", "QCOM": "0000804328",
    "ABBV": "0001551152", "PM": "0001413329",
    "MRK": "0000310158", "DUK": "0001326160", "CI": "0001739940",
    "ABT": "0000001800", "MMM": "0000066740", "CL": "0000021665",
    "ADBE": "0000796343", "VRTX": "0000875320",
}

########################
# 1) Fetch + Parse 10-K
########################
def fetch_latest_10k(cik, timeout=30):
    """
    Find the first 10-K listed in a company's recent EDGAR filings.

    Requests the EDGAR submissions JSON for the CIK and walks the parallel
    arrays under filings -> recent, returning the first entry whose form
    type is exactly "10-K" (amendments such as "10-K/A" are not matched).
    The list is presumably ordered newest first, which is why the first hit
    is reported as the latest filing -- confirm against the SEC API docs.

    Args:
        cik: Central Index Key as a string or int, with or without leading
            zeros; it is zero-padded to 10 digits for the request.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        (ten_k_url, filing_date) for the matching filing, or (None, None)
        when the request fails, the response is not in the expected shape,
        or no 10-K is listed.
    """
    cik_padded = str(cik).strip().zfill(10)
    url = f"https://data.sec.gov/submissions/CIK{cik_padded}.json"
    # NOTE(review): placeholder contact details -- replace with a real
    # name/email before running against sec.gov.
    headers = {"User-Agent": "Your Name (your@email.com)"}

    # Without a timeout, requests waits indefinitely on a stalled connection.
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        print("Error fetching SEC data:", e)
        return None, None

    if r.status_code != 200:
        print("Error fetching SEC data:", r.status_code)
        return None, None

    try:
        filings = r.json()["filings"]["recent"]
        rows = zip(
            filings["form"],
            filings["accessionNumber"],
            filings["primaryDocument"],
            filings["filingDate"],
        )
    except (ValueError, KeyError, TypeError) as e:
        # Non-JSON body or a payload missing the expected keys.
        print("Error fetching SEC data:", e)
        return None, None

    for form_type, accession, primary_doc, filing_date in rows:
        if form_type != "10-K":
            continue
        # Archive paths use the accession number without its dashes.
        accession_num = accession.replace("-", "")
        ten_k_url = f"https://www.sec.gov/Archives/edgar/data/{cik_padded}/{accession_num}/{primary_doc}"
        print(f"Latest 10-K URL: {ten_k_url}")
        print(f"Filing Date: {filing_date}\n")
        return ten_k_url, filing_date

    print("No recent 10-K found.")
    return None, None

def extract_10k_text(url, ticker, timeout=60):
    """
    Download a filing document, strip its HTML, and save the plain text.

    The text is obtained with BeautifulSoup's get_text using a newline
    separator and written to '<ticker>_10k.txt' in the current working
    directory (overwriting any existing file).

    Args:
        url: Address of the filing document to download.
        ticker: Used only to build the output filename.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The extracted text, or None when the download fails or the server
        responds with a non-200 status.
    """
    # NOTE(review): placeholder contact details -- replace with a real
    # name/email before running against sec.gov.
    headers = {"User-Agent": "Your Name (your@email.com)"}

    # Without a timeout, requests waits indefinitely on a stalled connection.
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        print("Error fetching 10-K:", e)
        return None

    if resp.status_code != 200:
        print("Error fetching 10-K:", resp.status_code)
        return None

    soup = BeautifulSoup(resp.text, "html.parser")
    text_10k = soup.get_text(separator="\n")

    filename = f"{ticker}_10k.txt"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(text_10k)

    print(f"\nFull 10-K saved as '{filename}'.")
    return text_10k

###############################
# 2) Heuristic Extract (Optional)
###############################
def extract_product_section(text):
    """
    Keep only the lines that mention a product-related keyword.

    Matching is a case-insensitive substring test against a fixed keyword
    list. Matching lines are returned joined by newlines, in their original
    order; when no line matches, the input text is returned unchanged.
    """
    product_terms = ('new product', 'launch', 'introduced', 'announced', 'released')

    matching_lines = []
    for candidate in text.splitlines():
        lowered = candidate.lower()
        for term in product_terms:
            if term in lowered:
                matching_lines.append(candidate)
                break  # one hit is enough to keep the line

    if not matching_lines:
        print("No product-related lines found; using full text.")
        return text

    print(f"Extracted product-related section ({len(matching_lines)} lines).")
    return "\n".join(matching_lines)

###############################
# 3) JSON Fallback
###############################
def extract_json_from_response(raw_response):
    """
    Recover a JSON array from free-form LLM output.

    A single greedy "first '[' to last ']'" match breaks as soon as the model
    writes any other bracket (for example an echoed "[]" inside its
    <think>...</think> reasoning, or trailing commentary). Instead this:

      1. removes complete <think>...</think> blocks, then
      2. tries to decode a JSON value at each '[' in turn, skipping past any
         array that decoded successfully.

    Args:
        raw_response: The model's raw text output.

    Returns:
        The first decoded array that contains at least one object (dict);
        otherwise the first non-empty decoded array; otherwise [].
        Non-string or empty input also yields [].
    """
    if not isinstance(raw_response, str) or not raw_response:
        return []

    cleaned = re.sub(r"<think>.*?</think>", "", raw_response, flags=re.DOTALL)

    decoder = json.JSONDecoder()
    fallback = None       # first non-empty array that holds no dicts
    saw_bracket = False
    decoded_any = False

    pos = cleaned.find("[")
    while pos != -1:
        saw_bracket = True
        try:
            # raw_decode parses one JSON value starting at pos and ignores
            # whatever text follows it.
            value, end = decoder.raw_decode(cleaned, pos)
        except ValueError:
            next_pos = pos + 1
        else:
            decoded_any = True
            if any(isinstance(item, dict) for item in value):
                return value
            if value and fallback is None:
                fallback = value
            next_pos = end  # do not rescan brackets nested in this array
        pos = cleaned.find("[", next_pos)

    if fallback is not None:
        return fallback
    if saw_bracket and not decoded_any:
        print("Regex extraction error: no valid JSON array found in response")
    return []

###############################
# 4) LLM Query
###############################
def query_ollama_json(text, company_name, stock_name, filing_time,
                      model="deepseek-r1:1.5b", max_chars=5000,
                      ollama_url="http://localhost:11434/api/generate",
                      timeout=300):
    """
    Ask a local Ollama model to extract product announcements as JSON.

    Builds a prompt around the first `max_chars` characters of `text`, posts
    it to the Ollama generate endpoint (non-streaming), and parses the
    "response" field of the reply.

    Args:
        text: Filing excerpt to analyse; only text[:max_chars] is sent.
        company_name, stock_name, filing_time: Interpolated into the prompt.
        model: Ollama model name.
        max_chars: Maximum number of characters of `text` to include.
        ollama_url: Generate endpoint to post to.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A list parsed from the model output. A single JSON object is wrapped
        in a one-element list so callers can always iterate over items.
        Returns [] on any request, HTTP, or parsing failure.
    """
    prompt = (
        f"Below is a SEC 10-K excerpt for {company_name} (Ticker: {stock_name}, Filing: {filing_time}).\n"
        "Extract new product announcements (incl expansions or redesigns). Return an array of JSON objects with keys:\n"
        '  "company_name", "stock_name", "filing_time", "new_product", "product_description"\n'
        "If none found, return an empty JSON array []. Valid JSON only, no commentary.\n\n"
        f"Text:\n{text[:max_chars]}"
    )

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }

    try:
        r = requests.post(ollama_url, json=payload, timeout=timeout)
    except requests.RequestException as e:
        print("Error querying LLM:", e)
        return []

    if r.status_code != 200:
        print(f"Error {r.status_code}: {r.text}")
        return []

    try:
        resp = r.json()
    except ValueError as e:
        print("Error querying LLM:", e)
        return []

    raw_output = resp.get("response", "") if isinstance(resp, dict) else ""
    if not isinstance(raw_output, str):
        raw_output = ""

    # Reasoning models prefix the answer with a <think>...</think> block,
    # which makes a direct json.loads fail on every response; drop it first.
    answer = re.sub(r"<think>.*?</think>", "", raw_output, flags=re.DOTALL).strip()

    # Attempt direct parse
    try:
        out = json.loads(answer)
    except ValueError as e:
        print("Error parsing JSON:", e)
        print("Raw response was:", raw_output)
        # Fallback
        return extract_json_from_response(answer)

    if isinstance(out, list):
        return out
    if isinstance(out, dict):
        # A lone object would otherwise be iterated key-by-key downstream.
        return [out]
    return []

###############################
# 5) CSV Writer (Filter Non-Dict)
###############################
def write_products_to_csv(company_name, stock_name, filing_time, products, csv_filename="extracted_products.csv"):
    """
    Append extracted products to a CSV file, one row per product dict.

    Items that are not dicts are skipped (this avoids "'str' object has no
    attribute 'get'" when the model returns stray strings). A single dict is
    treated as one product, and None as no products. The header row is
    written when the file does not exist yet or exists but is empty.

    Args:
        company_name, stock_name, filing_time: Used for a row whenever the
            product dict has no usable value for the corresponding key.
        products: Iterable of product dicts (other item types are ignored),
            a single dict, or None.
        csv_filename: Path of the CSV file to append to.
    """
    if isinstance(products, dict):
        # Iterating a dict would yield its keys, which would all be dropped.
        products = [products]

    # Filter non-dict entries
    valid_dicts = [p for p in (products or []) if isinstance(p, dict)]

    # An existing zero-byte file still needs its header row.
    write_header = (
        not os.path.exists(csv_filename)
        or os.path.getsize(csv_filename) == 0
    )

    with open(csv_filename, "a", newline="", encoding="utf-8") as cf:
        writer = csv.writer(cf)
        if write_header:
            writer.writerow(["Company Name", "Stock Name", "Filing Time", "New Product", "Product Description"])

        for p in valid_dicts:
            writer.writerow([
                # `or` also covers keys the model returned as null or "".
                p.get("company_name") or company_name,
                p.get("stock_name") or stock_name,
                p.get("filing_time") or filing_time,
                p.get("new_product", ""),
                p.get("product_description", "")
            ])
    print(f"\nProducts appended to '{csv_filename}' successfully!")

###############################
# 6) Main Pipeline
###############################
def _company_name_for_cik(cik, default):
    """Return the CIK_TO_COMPANY name for `cik`, ignoring zero-padding differences."""
    wanted = str(cik).lstrip("0")
    for known_cik, name in CIK_TO_COMPANY.items():
        if known_cik.lstrip("0") == wanted:
            return name
    return default


def test_pipeline():
    """
    Run the whole extraction interactively for one ticker.

    Prompts for a ticker, resolves it to a CIK and company name, fetches the
    first listed 10-K, extracts its text, narrows it to product-related
    lines, asks the LLM for product announcements, and appends any results
    to the CSV. Each stage prints its own diagnostics; the function returns
    early (with None) as soon as a stage yields nothing usable.
    """
    ticker = input("Enter Ticker (e.g. MSFT, AAPL): ").strip().upper()

    cik = TICKER_TO_CIK.get(ticker)
    if not cik:
        print(f"Ticker {ticker} not found in TICKER_TO_CIK map.")
        return
    # The two maps do not always pad CIKs the same way, so an exact-string
    # lookup would silently fall back to the ticker for many companies.
    company_name = _company_name_for_cik(cik, ticker)

    ten_k_url, filing_date = fetch_latest_10k(cik)
    if not ten_k_url:
        return

    filing_text = extract_10k_text(ten_k_url, ticker)
    if not filing_text or len(filing_text) < 100:
        print("Failed to extract enough text.")
        return

    # Simple product section approach
    product_section = extract_product_section(filing_text)

    # LLM Query
    extracted_products = query_ollama_json(product_section, company_name, ticker, filing_date)
    print("\nExtracted Products:", extracted_products)

    # CSV Write (filter out strings)
    if extracted_products:
        write_products_to_csv(company_name, ticker, filing_date, extracted_products)
    else:
        print("No product details extracted.")

if __name__ == "__main__":
    test_pipeline()

Latest 10-K URL: https://www.sec.gov/Archives/edgar/data/0000875320/000087532025000053/vrtx-20241231.htm
Filing Date: 2025-02-13


Full 10-K saved as 'VRTX_10k.txt'.
Extracted product-related section (39 lines).
Error parsing JSON: Expecting value: line 1 column 1 (char 0)
Raw response was: <think>
Alright, so I need to extract product announcements from VRTX's SEC 10-K filing for the date January 25, 2025. The user provided a detailed excerpt with various news items related to their new products and developments.

First, I should understand what exactly is being requested. The task is to create an array of JSON objects with specific keys: company_name, stock_name, filing_time, new_product, and product_description. If there are no announcements found, return an empty array. Also, only valid JSON without any commentary should be included.

Looking at the provided text, I see several announcements. The FDA approved JOURNAVX for moderate-to-severe acute pain in January 2025. Then, they're