In [113]:
import os
import json
import re
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
import ollama
import whisper
from PyPDF2 import PdfReader
from PIL import Image
import pytesseract

In [123]:
CONTROL_CHARS = ''.join(map(chr, range(0, 32)))
CONTROL_CHAR_RE = re.compile('[%s]' % re.escape(CONTROL_CHARS))

# Control characters that are never valid JSON whitespace: everything below
# 0x20 except tab (0x09), line feed (0x0A) and carriage return (0x0D).
_JUNK_CONTROL_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")

# strict=False accepts literal newlines/tabs inside string values, so they do
# not have to be stripped out of the text (which would glue words together).
_LENIENT_DECODER = json.JSONDecoder(strict=False)


def _balanced_object_span(text: str):
    """Return the first brace-balanced ``{...}`` substring of ``text``, or None.

    Braces are counted without regard to string literals; this is only used to
    pick a candidate for the repair path after real JSON decoding has failed.
    """
    start = text.find("{")
    while start != -1:
        depth = 0
        for pos in range(start, len(text)):
            ch = text[pos]
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    return text[start:pos + 1]
        # This "{" never closes; try the next one.
        start = text.find("{", start + 1)
    return None


def safe_json_loads(raw: str):
    """Parse the first JSON object found in an LLM reply.

    Markdown code fences and stray control characters are removed first. The
    text starting at the first ``{`` is then decoded with a real JSON decoder,
    which copes with arbitrarily deep nesting and with braces inside string
    values. If that fails, the first brace-balanced span is repaired (single
    quotes -> double quotes, trailing commas dropped) and decoded again.

    Args:
        raw: Text returned by the model; may contain prose or fences around
            the JSON.

    Returns:
        The decoded JSON value (a dict for well-formed replies).

    Raises:
        ValueError: If no ``{...}`` span can be found at all.
        json.JSONDecodeError: If a candidate was found but is still invalid
            after the repairs (a subclass of ValueError).
    """
    raw = raw.strip()

    # Remove markdown fences
    raw = raw.replace("```json", "").replace("```", "").strip()

    # Drop control characters that can never be part of valid JSON
    raw = _JUNK_CONTROL_RE.sub('', raw)

    # Fast path: let the JSON decoder find where the object ends.
    start = raw.find("{")
    if start != -1:
        try:
            parsed, _end = _LENIENT_DECODER.raw_decode(raw, start)
            return parsed
        except json.JSONDecodeError:
            pass

    # Fallback: take the first brace-balanced span and try to repair it.
    json_str = _balanced_object_span(raw)
    if json_str is None:
        raise ValueError("No JSON object found in LLM output:\n" + raw)

    try:
        return _LENIENT_DECODER.decode(json_str)
    except json.JSONDecodeError:
        pass

    # Attempt repairs (heuristic: apostrophes inside values are converted too)
    json_str = json_str.replace("'", '"')
    json_str = re.sub(r",\s*}", "}", json_str)
    json_str = re.sub(r",\s*]", "]", json_str)

    return _LENIENT_DECODER.decode(json_str)


In [124]:
def list_files(directory: str):
    """Return the paths (as strings) of the supported documents in ``directory``.

    Only regular files directly inside the directory are considered, and the
    extension check is case-insensitive.
    """
    supported = [".pdf", ".m4a", ".mp3", ".wav", ".png", ".jpg", ".jpeg"]
    found = []
    for entry in Path(directory).iterdir():
        if not entry.is_file():
            continue
        if entry.suffix.lower() in supported:
            found.append(str(entry))
    return found


def read_pdf(filepath: str) -> str:
    """Concatenate the extracted text of every page of a PDF.

    Pages whose extraction yields nothing are skipped. Any exception is caught
    and reported as a ``[PDF_ERROR] ...`` string instead of being raised.
    """
    try:
        page_texts = (page.extract_text() for page in PdfReader(filepath).pages)
        return "".join(chunk for chunk in page_texts if chunk)
    except Exception as err:
        return f"[PDF_ERROR] {err}"


def transcribe_audio(filepath: str) -> str:
    """Transcribe an audio file with the Whisper "base" model.

    Returns the ``"text"`` entry of the transcription result. Any exception is
    caught and reported as an ``[AUDIO_ERROR] ...`` string instead of being
    raised.
    """
    try:
        speech_model = whisper.load_model("base")
        return speech_model.transcribe(filepath)["text"]
    except Exception as err:
        return f"[AUDIO_ERROR] {err}"


def ocr_image(filepath: str) -> str:
    """Run Tesseract OCR on an image file and return the recognised text.

    The image is opened in a ``with`` block so its file handle is released as
    soon as OCR finishes, rather than whenever the object is garbage-collected.

    Args:
        filepath: Path of the image to read.

    Returns:
        The text produced by ``pytesseract.image_to_string``, or an
        ``[IMAGE_ERROR] ...`` string describing the exception if anything fails.
    """
    try:
        with Image.open(filepath) as img:
            return pytesseract.image_to_string(img)
    except Exception as e:
        return f"[IMAGE_ERROR] {e}"


def read_template(path: str) -> str:
    """Return the entire contents of the UTF-8 text file at ``path``."""
    return Path(path).read_text(encoding="utf-8")


def extract_placeholders(template: str):
    """List the names found between ``{`` and ``}`` in ``template``.

    Names are returned in order of appearance, duplicates included. A
    placeholder cannot span a line break.
    """
    return [hit.group(1) for hit in re.finditer(r"{(.*?)}", template)]


def fill_template(template: str, data: dict) -> str:
    """Substitute every ``{name}`` placeholder in ``template`` from ``data``.

    The substitution is a single pass over the template, so text inserted for
    one placeholder is never scanned again. A value that happens to contain
    ``{other}`` is therefore kept literally instead of being expanded.

    Args:
        template: Text containing ``{name}`` placeholders.
        data: Mapping from placeholder name to value. List values are rendered
            one item per line; other values are converted with ``str``.

    Returns:
        The template with each placeholder replaced. Names missing from
        ``data`` become ``"No <name> found"``.
    """
    def _render(match):
        key = match.group(1)
        value = data.get(key, f"No {key} found")
        if isinstance(value, list):
            value = "\n".join(str(v) for v in value)
        # A function replacement is inserted verbatim (no backslash handling).
        return str(value)

    return re.sub(r"{(.*?)}", _render, template)


def save_markdown(filename: str, content: str, out_dir: str) -> str:
    """Write ``content`` to ``<out_dir>/<slug>.md`` and return the path.

    The slug is ``filename`` lower-cased, with every run of characters other
    than ``a-z`` and ``0-9`` collapsed to a single hyphen and leading/trailing
    hyphens removed. If nothing usable remains (empty, ``None`` or symbols
    only), ``"untitled"`` is used so the result is never the hidden file
    ``.md``.

    Args:
        filename: Suggested base name, without extension.
        content: Markdown text to write (UTF-8).
        out_dir: Output directory; created, with parents, if it does not exist.

    Returns:
        The path of the written file as a string. An existing file with the
        same name is overwritten.
    """
    out_dir_path = Path(out_dir)
    out_dir_path.mkdir(parents=True, exist_ok=True)

    # `filename or ""` keeps a missing (None) name from becoming "none".
    slug = re.sub(r"[^a-z0-9]+", "-", str(filename or "").lower()).strip("-")
    if not slug:
        slug = "untitled"

    out_path = out_dir_path / f"{slug}.md"
    out_path.write_text(content, encoding="utf-8")

    return str(out_path)



In [125]:
# System prompt sent by interpret_task: asks the model to reply with a single
# JSON object holding "should_process", "task_type" and "task_description".
TASK_SYSTEM = """
You interpret the user's instruction and decide whether the agent should process documents.

Return ONLY valid JSON:

{
  "should_process": true/false,
  "task_type": "<string or null>",
  "task_description": "<string or null>"
}

Rules:
- If the instruction is a greeting, small talk, or unrelated to document processing → should_process = false.
- If the instruction describes a document-processing task (summarize, extract key points, list actions, etc.) → should_process = true.
- task_description must be a short natural-language description of what to do.
- No extra text. No markdown. Only JSON.
"""

def interpret_task(user_instruction: str) -> dict:
    """Ask the phi3 model to classify the user's instruction.

    The instruction is sent together with the TASK_SYSTEM prompt, and the
    model's reply is parsed with safe_json_loads and returned.
    """
    reply = ollama.chat(
        model="phi3:3.8b",
        messages=[
            {"role": "system", "content": TASK_SYSTEM},
            {"role": "user", "content": user_instruction},
        ],
    )
    return safe_json_loads(reply["message"]["content"].strip())

# System prompt sent by call_llm_tool_choice: maps file extensions to the tool
# that should read the file and asks for a one-key JSON reply.
# Every extension is written with its leading dot, matching list_files.
TOOL_CHOICE_SYSTEM = """
You decide which tool to use for a file.

Tools:
- read_pdf → .pdf
- ocr_image → .png, .jpg, .jpeg
- transcribe_audio → .mp3, .wav, .m4a

Return ONLY:
{"tool": "<read_pdf|ocr_image|transcribe_audio>"}
"""

def call_llm_tool_choice(filename: str) -> str:
    """Ask the phi3 model which tool should read ``filename``.

    The reply is parsed with safe_json_loads and its ``"tool"`` entry is
    returned.
    """
    conversation = [
        dict(role="system", content=TOOL_CHOICE_SYSTEM),
        dict(role="user", content=filename),
    ]
    answer = ollama.chat(model="phi3:3.8b", messages=conversation)
    decision = safe_json_loads(answer["message"]["content"].strip())
    return decision["tool"]

# System prompt sent by clean_text: instructs the model to repair
# OCR/transcription noise and to return nothing but the cleaned text.
CLEAN_TEXT_SYSTEM = """
Clean the text. Follow these rules:
- Output only the cleaned text.
- No explanations, notes, lists, commentary, or summaries.
- Do not change the meaning or add missing information.
Your task:
- Fix obvious OCR/transcription errors.
- Correct spelling when the intent is clear.
- Restore punctuation, spacing, and broken words/lines.
- Normalise dates/numbers when unambiguous.
- Remove garbage symbols and artefacts.
- Preserve the original meaning.
Return only the cleaned text.
"""


def clean_text(raw_text: str) -> str:
    """Have the phi3 model tidy up OCR/transcription output.

    Args:
        raw_text: Text produced by one of the file readers.

    Returns:
        The model's reply with surrounding whitespace stripped. Empty or
        whitespace-only input returns ``""`` without calling the model, since
        there is nothing to clean and the model would only invent content.
    """
    if not raw_text or not raw_text.strip():
        return ""

    messages = [
        {"role": "system", "content": CLEAN_TEXT_SYSTEM},
        {"role": "user", "content": raw_text}
    ]
    resp = ollama.chat(model="phi3:3.8b", messages=messages)
    return resp["message"]["content"].strip()

# System prompt sent by call_llm_extract: asks for a JSON object with the
# extracted "fields" plus a "suggested_filename" for the output document.
EXTRACT_SYSTEM = """
You extract structured information from text and suggest filenames.

Return ONLY:
{
  "fields": { ... },
  "suggested_filename": "..."
}

FILENAME RULES:
Use key extracted fields to create a short, descriptive, lowercase, hyphenated filename with no extension. Remove special characters. If useful fields are missing, generate a simple topic‑based name.
EXAMPLES:
invoice-2024-04-12-abc-ltd
meeting-q1-strategy
legal-letter-john-doe
project-alpha-status

GENERAL BEHAVIOUR:
- Never add commentary.
- Never output markdown.
- Never include text outside the JSON.

"""

def call_llm_extract(text: str, fields: list[str], template: str, task_description: str) -> dict:
    """Ask the llama3.1 model to extract ``fields`` from ``text``.

    The user message is a JSON document carrying the field names, the first
    3000 characters of the text, the template and the task description. The
    model's reply is parsed with safe_json_loads and returned.
    """
    request_payload = json.dumps({
        "fields": fields,
        "text": text[:3000],
        "template": template,
        "task": task_description,
    })
    reply = ollama.chat(
        model="llama3.1",
        messages=[
            {"role": "system", "content": EXTRACT_SYSTEM},
            {"role": "user", "content": request_payload},
        ],
    )
    return safe_json_loads(reply["message"]["content"].strip())

In [126]:
def process_file(filepath: str, template: str, out_dir: str, task_description: str):
    """Read one document, extract the template fields and save the result.

    Steps: ask the model which reader to use, run that reader, clean the text
    with the model, extract the template's placeholder fields, fill the
    template and write it to ``out_dir`` as Markdown.

    Args:
        filepath: Path of the document to process.
        template: Template text containing ``{name}`` placeholders.
        out_dir: Directory the Markdown file is written to.
        task_description: Description of the task, forwarded to extraction.

    Raises:
        ValueError: If the model names a tool other than ``read_pdf``,
            ``ocr_image`` or ``transcribe_audio``.
    """
    suffix = Path(filepath).suffix.lower()
    filename = os.path.basename(filepath)

    # LLM decides tool
    tool = call_llm_tool_choice(filename)
    print(f"Detect {suffix} file -> Use {tool} to process the file.")

    # Execute tool; an unrecognised name is reported explicitly rather than
    # surfacing later as an unbound `raw_text`.
    readers = {
        "read_pdf": read_pdf,
        "ocr_image": ocr_image,
        "transcribe_audio": transcribe_audio,
    }
    reader = readers.get(tool) if isinstance(tool, str) else None
    if reader is None:
        raise ValueError(f"Unknown tool {tool!r} chosen for {filename}")
    raw_text = reader(filepath)

    # The readers report failures as "[..._ERROR] ..." strings; do not feed
    # those to the model as if they were document content.
    if raw_text.startswith(("[PDF_ERROR]", "[AUDIO_ERROR]", "[IMAGE_ERROR]")):
        print(f"Skipping {filename}: {raw_text}")
        return

    # Extract from the cleaned text; fall back to the raw text if cleaning
    # returned nothing.
    cleaned_text = clean_text(raw_text)
    text_for_extraction = cleaned_text or raw_text

    # Extract fields + filename
    fields = extract_placeholders(template)
    print(fields)

    result = call_llm_extract(text_for_extraction, fields, template, task_description)

    # Tolerate a reply that omits either key.
    field_values = result.get("fields") or {}
    suggested_filename = result.get("suggested_filename") or Path(filepath).stem
    print(field_values)

    # Fill template
    content = fill_template(template, field_values)

    # Save
    saved_path = save_markdown(suggested_filename, content, out_dir)
    print(f"Saved: {saved_path}")


def run_agent(documents_dir: str, template_path: str, out_dir: str, user_instruction: str):
    """Interpret the instruction and, if it asks for it, process every document.

    Nothing is processed when the interpreted task has a falsy
    ``should_process``. Otherwise each supported file in ``documents_dir`` is
    handed to process_file on a pool of four worker threads, and any exception
    raised by a worker is re-raised here.
    """
    task = interpret_task(user_instruction)
    if not task["should_process"]:
        print("User instruction does not request document processing. Exiting.")
        return

    # Pass task_description into extraction
    task_description = task["task_description"]
    template = read_template(template_path)
    files = list_files(documents_dir)

    print(f"Found {len(files)} files:")
    for path in files:
        print(" -", path)

    # Parallel processing
    with ThreadPoolExecutor(max_workers=4) as pool:
        pending = []
        for path in files:
            pending.append(
                pool.submit(process_file, path, template, out_dir, task_description)
            )
        for job in pending:
            job.result()  # propagate errors



In [127]:
# Project root: the folder containing this file when run as a script, or the
# current working directory when __file__ is not defined (e.g. in a notebook).
BASE_DIR = Path(__file__).parent if "__file__" in globals() else Path.cwd()

# Default locations, all directly under BASE_DIR.
DEFAULT_DOCUMENTS_DIR, DEFAULT_TEMPLATE_PATH, DEFAULT_OUTPUT_DIR = (
    str(BASE_DIR / leaf) for leaf in ("documents", "template.md", "output")
)

In [129]:
# Entry point: prompt for an instruction, then run the agent against the
# default documents folder, template file and output folder.
if __name__ == "__main__":
    # Blocks until the user types an instruction.
    user_instruction = input("What would you like me to do?\n> ")

    # Kept as separate names so they can be changed before the call below.
    documents_dir = DEFAULT_DOCUMENTS_DIR
    template_path = DEFAULT_TEMPLATE_PATH
    output_dir = DEFAULT_OUTPUT_DIR

    run_agent(documents_dir, template_path, output_dir, user_instruction)

What would you like me to do?
>  extract the key info from audio


Found 1 files:
 - C:\Users\Bunny Liu\Desktop\azzurroassociates\documents\team_sync.m4a
Detect .m4a file -> Use transcribe_audio to process the file.
['participants', 'overview', 'key_points', 'next_steps']
{'participants': ['Smith', 'Priya Patel', 'Marcus Lee', 'Sarah Chen'], 'overview': 'the team reveal current progress of project Aurora', 'key_points': ['Development of the new analytics dashboard is 80% complete', 'Backend integration schedule to finish by the end of next week', 'Finalized dashboard backend integration by 2020 January'], 'next_steps': ['Deliver update mobile layouts by 70 January', 'Prepare alternative messaging. Angles for reveal', 'API error handling module', 'Validate data accuracy against the latest dataset', 'Coordinate with marketing on launch assets']}
Saved: C:\Users\Bunny Liu\Desktop\azzurroassociates\output\project-aurora-status-update.md
