In [2]:
# ===============================
# STEP 1: Install dependencies
# ===============================
!pip install -q docling langextract python-dotenv

# ===============================
# STEP 2: Setup API Key
# ===============================
import os
from google.colab import files

# 👉 Replace with your Gemini API key from Google AI Studio
os.environ["LANGEXTRACT_API_KEY"] = "AIzaSyD26IgxKCp24Vgw4n6vFCYJOtjxF3yfUa8"

# ===============================
# STEP 3: Upload a document
# ===============================
print("📂 Please upload a PDF, DOCX, PPTX, or HTML file...")
uploaded = files.upload()
doc_path = list(uploaded.keys())[0]

# ===============================
# STEP 4: Convert document to text with Docling
# ===============================
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
result = converter.convert(doc_path)
doc_text = result.document.export_to_markdown()

print("📑 Extracted Markdown Preview:\n")
print(doc_text[:800])   # preview first 800 chars

# ===============================
# STEP 5: Run LangExtract with Gemini
# ===============================
import textwrap
import langextract as lx

prompt = textwrap.dedent("""\
    Extract named entities useful for financial insights.
    Use exact spans from text (no paraphrasing).
    Classes: PERSON, ORG, LOCATION, DATE, MONEY, EVENT, LAW, PRODUCT.
    For each extraction, include attributes when possible:
    - MONEY: currency, amount, context (funding/revenue/expense/etc.)
    - DATE: normalized_iso if explicit
    - ORG: role (issuer, acquirer, regulator, etc.)
    Do not overlap entities.
""")

examples = [
    lx.data.ExampleData(
        text="On 12 July 2024, ACME Corp raised $15 million in a Series A led by Prime VC.",
        extractions=[
            lx.data.Extraction("DATE", "12 July 2024", {"normalized_iso": "2024-07-12"}),
            lx.data.Extraction("ORG", "ACME Corp", {"role": "issuer"}),
            lx.data.Extraction("MONEY", "$15 million", {"currency": "USD", "amount": "15000000", "context": "funding"}),
            lx.data.Extraction("EVENT", "Series A", {}),
            lx.data.Extraction("ORG", "Prime VC", {"role": "lead investor"}),
        ],
    )
]

result = lx.extract(
    text_or_documents=doc_text,
    prompt_description=prompt,
    examples=examples,
    model_id="gemini-2.5-flash"   # or "gemini-2.5-pro" for deeper tasks
)

# ===============================
# STEP 6: Save outputs
# ===============================
lx.io.save_annotated_documents([result], output_name="extractions.jsonl", output_dir=".")

html = lx.visualize("extractions.jsonl")
with open("visualization.html", "w", encoding="utf-8") as f:
    f.write(html.data if hasattr(html, "data") else html)

print("✅ Done! Saved extractions.jsonl and visualization.html")

# ===============================
# STEP 7: View visualization inside Colab
# ===============================
from IPython.display import HTML
HTML(open("visualization.html", "r", encoding="utf-8").read())

📂 Please upload a PDF, DOCX, PPTX, or HTML file...


Saving Tesla Inc.pdf to Tesla Inc (1).pdf
📑 Extracted Markdown Preview:

Tesla Inc. reported its Q2 2024 financial results on July 20, 2024, announcing a total revenue of  $25.3  billion  and  a  net  profit  of  $3.1  billion.  The  company  highlighted  record  vehicle deliveries, with over 466,000 cars shipped worldwide. Tesla's energy storage division also generated $1.5 billion in revenue, marking a 35% year-over-year growth.

In contrast, Alphabet Inc. posted revenue of $74.6 billion, driven primarily by Google Cloud services, which grew 28% compared to the previous year. Alphabet's stock price closed at $138.75 on Nasdaq after the earnings call. Analysts from Goldman Sachs raised their target price for Alphabet shares to $150, citing strong advertising recovery and continued momentum in AIrelated investments.

Meanwhile, Bank of America announced Q2 2024


TypeError: Extraction.__init__() takes 3 positional arguments but 4 were given