In [None]:
!pip install PyMuPDF spacy

Collecting PyMuPDF
  Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.5


In [None]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m106.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# !pip install PyMuPDF spacy
!python -m spacy download en_core_web_lg  # Using a larger, more accurate model

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import re
import fitz  # PyMuPDF
import spacy

# Load the large English spaCy pipeline; if loading fails with OSError,
# download the model package once and load it again.
SPACY_MODEL_NAME = "en_core_web_lg"

try:
    nlp = spacy.load(SPACY_MODEL_NAME)
except OSError:
    # OSError is the failure this cell treats as "model not installed yet".
    print("Downloading 'en_core_web_lg' model. This may take a moment...")
    from spacy.cli import download
    download(SPACY_MODEL_NAME)
    nlp = spacy.load(SPACY_MODEL_NAME)

def extract_details_with_ml_fixed(pdf_path, company_keyword="Acme"):
    """
    Extract customer and company contact details from an invoice PDF.

    The customer is looked up only inside the "Bill To" block. Every e-mail
    address and phone-like number found anywhere in the document that is not
    the customer's is attributed to the issuing company.

    Args:
        pdf_path (str): The path to the PDF file.
        company_keyword (str): Non-empty name fragment used to recognise the
            issuing company. It is matched case-insensitively against ORG
            entities and, as a fallback, case-sensitively against the raw
            text. The default "Acme" reproduces the former hard-coded value.

    Returns:
        dict: ``{"customer": {...}, "company": {...}}``. The customer entry
        holds scalar ``name``/``email``/``phone`` values (``None`` when not
        found). The company entry holds ``name`` plus de-duplicated ``email``
        and ``phone`` lists in order of first appearance. An empty dict is
        returned when the PDF cannot be read.
    """
    print(f"\n--- Processing {pdf_path} with Fixed ML/NER ---")
    try:
        with fitz.open(pdf_path) as doc:
            full_text = "".join(page.get_text("text") for page in doc)
    except Exception as e:
        # Best effort: report the unreadable file and let the caller move on.
        print(f"Error reading {pdf_path}: {e}")
        return {}

    # Candidate phone numbers: four digit groups with optional single separators.
    # Separators are limited to space, tab, no-break space, '-' and '.' so a
    # candidate can never span a line break. With `\s`, two stacked table cells
    # such as '149.00\n298.00' were reported as one phone number.
    phone_regex = r'[\+\(]?\d{1,4}[ \t\xa0\-\.\)]?\d{2,4}[ \t\xa0\-\.]?\d{2,4}[ \t\xa0\-\.]?\d{2,5}'
    email_regex = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

    def has_enough_digits(candidate):
        # The regex can match as few as 7 digits; require more than 7 to drop
        # short sequences such as address or account fragments.
        return sum(ch.isdigit() for ch in candidate) > 7

    customer_info = {"name": None, "email": None, "phone": None}
    company_info = {"name": None, "email": [], "phone": []}

    # Isolate Customer Block ("Bill To"): everything up to a blank line or the
    # next section keyword.
    bill_to_match = re.search(
        r'Bill To[:\s\n]*((?:.|\n)+?)(?=\n{2,}|Invoice|Item|Description|Notes)',
        full_text,
        re.IGNORECASE,
    )
    if bill_to_match:
        bill_to_text = bill_to_match.group(1)
        doc_customer = nlp(bill_to_text)

        # The first PERSON entity in the block is taken as the customer name.
        for ent in doc_customer.ents:
            if ent.label_ == "PERSON":
                customer_info["name"] = ent.text.strip().replace('\n', ' ')
                break

        email_match = re.search(email_regex, bill_to_text)
        if email_match:
            customer_info["email"] = email_match.group(0)

        # Take the first candidate that passes the digit filter, rather than
        # giving up when the very first candidate is too short.
        for phone_match in re.finditer(phone_regex, bill_to_text):
            candidate = phone_match.group(0).strip()
            if has_enough_digits(candidate):
                customer_info["phone"] = candidate
                break

    # Find Company Info in the Full Document
    doc_full = nlp(full_text)

    keyword_lower = company_keyword.lower()
    for ent in doc_full.ents:
        if ent.label_ == "ORG" and keyword_lower in ent.text.lower():
            company_info["name"] = ent.text.strip().replace('\n', ' ')
            break
    if not company_info["name"] and company_keyword in full_text:
        # NER missed the company, but the keyword is literally in the text.
        company_info["name"] = company_keyword

    all_emails = re.findall(email_regex, full_text)
    all_phones = re.findall(phone_regex, full_text)

    # Everything that is not the customer's belongs to the company.
    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # result is stable between runs (a set is not).
    company_info["email"] = list(dict.fromkeys(
        email for email in all_emails if email != customer_info["email"]
    ))
    stripped_phones = (phone.strip() for phone in all_phones)
    company_info["phone"] = list(dict.fromkeys(
        phone for phone in stripped_phones
        if phone != customer_info["phone"] and has_enough_digits(phone)
    ))

    return {"customer": customer_info, "company": company_info}

# --- Main Execution ---
if __name__ == "__main__":
    # Invoice files are given as relative paths.
    pdf_files = ["invoice2.pdf", "invoice3.pdf", "invoice4.pdf"]

    for pdf in pdf_files:
        extracted_data = extract_details_with_ml_fixed(pdf)
        if not extracted_data:
            # Nothing to show for a falsy (empty) result.
            continue
        print(f"Customer Info: {extracted_data['customer']}")
        print(f"Company Info: {extracted_data['company']}")


--- Processing invoice2.pdf with Fixed ML/NER ---
Customer Info: {'name': None, 'email': None, 'phone': None}
Company Info: {'name': 'Acme', 'email': ['support@acme.com'], 'phone': ['149.00\n298.00', '+1 212-608-5983', '4444 555 555', '(10292838282']}

--- Processing invoice3.pdf with Fixed ML/NER ---
Customer Info: {'name': 'Casey Williams', 'email': 'casey@test.com', 'phone': None}
Company Info: {'name': 'Acme', 'email': ['support@acme.com', 'billing@acme.com'], 'phone': ['+1 4444-555-555', '0303443439492', '43823843']}

--- Processing invoice4.pdf with Fixed ML/NER ---
Customer Info: {'name': 'Casey Williams', 'email': 'casey@test.com', 'phone': '2126 085 983'}
Company Info: {'name': 'Acme', 'email': ['support@acme.com'], 'phone': ['4444 555 555']}


In [None]:
import os
import json
import re
import fitz  # PyMuPDF
import google.generativeai as genai
from google.colab import userdata

def extract_details_with_ai(pdf_path):
    """
    Extract structured company and customer data from an invoice PDF using
    Google's Gemini LLM.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        dict | None: The JSON value parsed from the model's reply (the prompt
        asks for an object with 'company' and 'customer' keys), or None when
        the API key is missing, the PDF cannot be read, the model call fails,
        or the reply contains no parseable JSON object.
    """
    print(f"\n--- Processing {pdf_path} with AI/LLM ---")

    # 1. Configure the API Key
    try:
        api_key = userdata.get("GOOGLE_API_KEY")
        if not api_key:
            raise ValueError("GOOGLE_API_KEY not found in Colab Secrets Manager.")
        genai.configure(api_key=api_key)
    except Exception as e:
        print(e)
        return None

    # 2. Extract Text from PDF
    try:
        with fitz.open(pdf_path) as doc:
            full_text = "".join(page.get_text("text") for page in doc)
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return None

    # 3. Craft the Advanced Prompt
    prompt = f"""
    You are an expert data extraction AI for invoices.
    From the text below, identify and extract the details for two distinct entities:
    1. The 'company' (the one issuing the invoice, e.g., Acme).
    2. The 'customer' (the one being billed, often under a 'Bill To' section).

    For each entity, extract the following information:
    - name: The full name for a person, or the company name.
    - email: The email address.
    - phone: The phone number.

    Provide the output as a single, clean JSON object with two keys: 'company' and 'customer'.
    Each key should map to an object containing 'name', 'email', and 'phone'.
    If a specific piece of information cannot be found for an entity, use the JSON value `null`.
    Do not include any explanatory text or markdown formatting like ```json before or after the JSON object.

    Text to analyze:
    ---
    {full_text}
    ---
    """

    # 4. Call the AI Model
    # raw_text stays None until the reply text has actually been read, so the
    # error handler never has to touch the response object again.
    raw_text = None
    try:
        model = genai.GenerativeModel('gemini-flash-latest')
        response = model.generate_content(prompt)
        raw_text = response.text

        # The model may wrap the JSON in markdown fences or add prose around
        # it despite the instructions. Decode the first JSON object in the
        # reply and ignore whatever surrounds it.
        json_start = raw_text.find("{")
        if json_start == -1:
            raise ValueError("no JSON object found in the model reply")
        parsed, _ = json.JSONDecoder().raw_decode(raw_text[json_start:])
        return parsed

    except Exception as e:
        print(f"An error occurred during AI processing for {pdf_path}: {e}")
        print(f"Raw AI Response was: {raw_text if raw_text is not None else 'No response received.'}")
        return None

# --- Main Execution ---
if __name__ == "__main__":
    # Invoice files are given as relative paths.
    pdf_files = ["invoice2.pdf", "invoice3.pdf", "invoice4.pdf"]

    for pdf in pdf_files:
        extracted_data_ai = extract_details_with_ai(pdf)
        if not extracted_data_ai:
            # Nothing to show for a falsy result (None on failure).
            continue
        print(json.dumps(extracted_data_ai, indent=2))


--- Processing invoice2.pdf with AI/LLM ---


ERROR:tornado.access:503 POST /v1beta/models/gemini-flash-latest:generateContent?%24alt=json%3Benum-encoding%3Dint (::1) 2552.44ms
ERROR:tornado.access:503 POST /v1beta/models/gemini-flash-latest:generateContent?%24alt=json%3Benum-encoding%3Dint (::1) 1239.22ms


{
  "company": {
    "name": "Acme",
    "email": "support@acme.com",
    "phone": "4444 555 555"
  },
  "customer": {
    "name": "Casey Williams",
    "email": null,
    "phone": "+1 212-608-5983"
  }
}

--- Processing invoice3.pdf with AI/LLM ---
{
  "company": {
    "name": "Acme",
    "email": "billing@acme.com",
    "phone": "+1 4444-555-555"
  },
  "customer": {
    "name": "Casey Williams",
    "email": "casey@test.com",
    "phone": null
  }
}

--- Processing invoice4.pdf with AI/LLM ---


ERROR:tornado.access:503 POST /v1beta/models/gemini-flash-latest:generateContent?%24alt=json%3Benum-encoding%3Dint (::1) 2325.67ms
ERROR:tornado.access:503 POST /v1beta/models/gemini-flash-latest:generateContent?%24alt=json%3Benum-encoding%3Dint (::1) 1594.32ms
ERROR:tornado.access:503 POST /v1beta/models/gemini-flash-latest:generateContent?%24alt=json%3Benum-encoding%3Dint (::1) 1039.07ms


{
  "company": {
    "name": "Acme",
    "email": "support@acme.com",
    "phone": "4444 555 555"
  },
  "customer": {
    "name": "Casey Williams",
    "email": "casey@test.com",
    "phone": "2126 085 983"
  }
}


In [None]:
import re
import fitz  # PyMuPDF
import spacy

# Load the large English spaCy pipeline; if loading fails with OSError,
# download the model package once and load it again.
SPACY_MODEL_NAME = "en_core_web_lg"

try:
    nlp = spacy.load(SPACY_MODEL_NAME)
except OSError:
    # OSError is the failure this cell treats as "model not installed yet".
    print("Downloading 'en_core_web_lg' model. This may take a moment...")
    from spacy.cli import download
    download(SPACY_MODEL_NAME)
    nlp = spacy.load(SPACY_MODEL_NAME)

def extract_details_with_ml_and_visualization(pdf_path, company_keyword="Acme"):
    """
    Extract contact details from an invoice PDF and print the spaCy entities
    that were found along the way.

    The customer is looked up only inside the "Bill To" block. Every e-mail
    address and phone-like number found anywhere in the document that is not
    the customer's is attributed to the issuing company. Debug listings of
    the recognised entities are printed for the "Bill To" block (all of them)
    and for the full document (first 10).

    Args:
        pdf_path (str): The path to the PDF file.
        company_keyword (str): Non-empty name fragment used to recognise the
            issuing company. It is matched case-insensitively against ORG
            entities and, as a fallback, case-sensitively against the raw
            text. The default "Acme" reproduces the former hard-coded value.

    Returns:
        dict: ``{"customer": {...}, "company": {...}}``. The customer entry
        holds scalar ``name``/``email``/``phone`` values (``None`` when not
        found). The company entry holds ``name`` plus de-duplicated ``email``
        and ``phone`` lists in order of first appearance. An empty dict is
        returned when the PDF cannot be read.
    """
    print(f"\n--- Processing {pdf_path} with ML/NER and Visualization ---")
    try:
        with fitz.open(pdf_path) as doc:
            full_text = "".join(page.get_text("text") for page in doc)
    except Exception as e:
        # Best effort: report the unreadable file and let the caller move on.
        print(f"Error reading {pdf_path}: {e}")
        return {}

    # Candidate phone numbers: four digit groups with optional single separators.
    # Separators are limited to space, tab, no-break space, '-' and '.' so a
    # candidate can never span a line break. With `\s`, two stacked table cells
    # such as '149.00\n298.00' were reported as one phone number.
    phone_regex = r'[\+\(]?\d{1,4}[ \t\xa0\-\.\)]?\d{2,4}[ \t\xa0\-\.]?\d{2,4}[ \t\xa0\-\.]?\d{2,5}'
    email_regex = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

    def has_enough_digits(candidate):
        # The regex can match as few as 7 digits; require more than 7 to drop
        # short sequences such as address or account fragments.
        return sum(ch.isdigit() for ch in candidate) > 7

    customer_info = {"name": None, "email": None, "phone": None}
    company_info = {"name": None, "email": [], "phone": []}

    # Isolate Customer Block ("Bill To"): everything up to a blank line or the
    # next section keyword.
    bill_to_match = re.search(
        r'Bill To[:\s\n]*((?:.|\n)+?)(?=\n{2,}|Invoice|Item|Description|Notes)',
        full_text,
        re.IGNORECASE,
    )
    if bill_to_match:
        bill_to_text = bill_to_match.group(1)
        doc_customer = nlp(bill_to_text)

        # --- VISUALIZATION PART 1: every entity in the customer block ---
        print("\n[DEBUG] Entities found by spaCy in the 'Bill To' section:")
        for ent in doc_customer.ents:
            print(f"  - Entity: '{ent.text.strip()}', Label: '{ent.label_}'")

        # The first PERSON entity in the block is taken as the customer name.
        for ent in doc_customer.ents:
            if ent.label_ == "PERSON":
                customer_info["name"] = ent.text.strip().replace('\n', ' ')
                break

        email_match = re.search(email_regex, bill_to_text)
        if email_match:
            customer_info["email"] = email_match.group(0)

        # Take the first candidate that passes the digit filter, rather than
        # giving up when the very first candidate is too short.
        for phone_match in re.finditer(phone_regex, bill_to_text):
            candidate = phone_match.group(0).strip()
            if has_enough_digits(candidate):
                customer_info["phone"] = candidate
                break

    # Find Company Info in the Full Document
    doc_full = nlp(full_text)

    # --- VISUALIZATION PART 2: a sample of the full-document entities ---
    print("\n[DEBUG] Some entities found by spaCy in the full document:")
    # Only the first 10 entities are printed to keep the listing short.
    for ent in list(doc_full.ents)[:10]:
        print(f"  - Entity: '{ent.text.strip()}', Label: '{ent.label_}'")

    keyword_lower = company_keyword.lower()
    for ent in doc_full.ents:
        if ent.label_ == "ORG" and keyword_lower in ent.text.lower():
            company_info["name"] = ent.text.strip().replace('\n', ' ')
            break
    if not company_info["name"] and company_keyword in full_text:
        # NER missed the company, but the keyword is literally in the text.
        company_info["name"] = company_keyword

    all_emails = re.findall(email_regex, full_text)
    all_phones = re.findall(phone_regex, full_text)

    # Everything that is not the customer's belongs to the company.
    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # result is stable between runs (a set is not).
    company_info["email"] = list(dict.fromkeys(
        email for email in all_emails if email != customer_info["email"]
    ))
    stripped_phones = (phone.strip() for phone in all_phones)
    company_info["phone"] = list(dict.fromkeys(
        phone for phone in stripped_phones
        if phone != customer_info["phone"] and has_enough_digits(phone)
    ))

    # This banner is printed after the debug info; the caller prints the data.
    print("\n--- FINAL EXTRACTED DATA ---")
    return {"customer": customer_info, "company": company_info}

# --- Main Execution ---
if __name__ == "__main__":
    # Invoice files are given as relative paths.
    pdf_files = ["invoice2.pdf", "invoice3.pdf", "invoice4.pdf"]

    for pdf in pdf_files:
        extracted_data = extract_details_with_ml_and_visualization(pdf)
        if not extracted_data:
            # Nothing to show for a falsy (empty) result.
            continue
        print(f"Customer Info: {extracted_data['customer']}")
        print(f"Company Info: {extracted_data['company']}")


--- Processing invoice2.pdf with ML/NER and Visualization ---

[DEBUG] Some entities found by spaCy in the full document:
  - Entity: '249.00', Label: 'MONEY'
  - Entity: '149.00', Label: 'MONEY'
  - Entity: '298.00', Label: 'CARDINAL'
  - Entity: '200.00', Label: 'MONEY'
  - Entity: '10.00', Label: 'MONEY'
  - Entity: '10%', Label: 'PERCENT'
  - Entity: '15.00', Label: 'MONEY'
  - Entity: '300.00', Label: 'MONEY'
  - Entity: 'Acme', Label: 'ORG'
  - Entity: '787', Label: 'CARDINAL'

--- FINAL EXTRACTED DATA ---
Customer Info: {'name': None, 'email': None, 'phone': None}
Company Info: {'name': 'Acme', 'email': ['support@acme.com'], 'phone': ['149.00\n298.00', '+1 212-608-5983', '4444 555 555', '(10292838282']}

--- Processing invoice3.pdf with ML/NER and Visualization ---

[DEBUG] Entities found by spaCy in the 'Bill To' section:
  - Entity: 'Casey Williams', Label: 'PERSON'
  - Entity: '57', Label: 'CARDINAL'
  - Entity: 'New York', Label: 'GPE'
  - Entity: 'USA', Label: 'GPE'
  - En