<div style="background-color: #ADD8E6; border: 1px solid gray; padding: 3px">
    This notebook consists of two agentic workflows:
        <h3>Data Generation Workflow</h3>
        <ul>
        <li><b>Data Augmentation</b>: Augments the provided image dataset.</li>
        </ul>
        <h3>Validation Workflow</h3>
        <ul>
        <li><b>Image Validator</b>: Identifies whether a valid driver's license exists in the given image.</li>
        <li><b>Data Extractor</b>: Extracts relevant metadata from the image.</li>
        <li><b>Application Validator</b>: Given the extracted metadata associated with the application, uses a set of predefined rules to validate the driver's license application.</li>
        </ul>
</div>

In [None]:
##############################################################################
# Imports
##############################################################################
# import pysqlite3 as sqlite3
# import sys
# sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
import operator
from typing import Annotated, TypedDict, List, Optional, Literal
from langgraph.graph import StateGraph, END, START
from langchain_core.messages import HumanMessage, AIMessage, BaseMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from PIL import Image
import pytesseract
import io
import json
from datetime import datetime
import re
import os
import requests
from flow_extensions import CustomLLMMultimodalBlock, CustomDeleteColumnsBlock
from io import BytesIO
from dotenv import load_dotenv
import mimetypes
import base64
from urllib.parse import urlparse
from PIL import Image
from io import BytesIO
import requests
load_dotenv()
import traceback
from openai import OpenAI
import instructor
from pydantic import BaseModel, Field, TypeAdapter
from more_itertools import chunked
import utils
from datasets import load_dataset, DatasetDict, Dataset
from sdg_hub.core.flow import FlowRegistry, Flow
import pandas as pd
from typing import Any, Optional
import asyncio
import nest_asyncio
nest_asyncio.apply()

In [None]:
##############################################################################
# LLMs
##############################################################################

# Vision client: an OpenAI-compatible endpoint (key and base URL come from the
# environment) wrapped by `instructor`, so that completions can be requested
# with a `response_model`.
vision_llm = instructor.from_openai(
    OpenAI(
        api_key=os.environ.get('LLAMASCOUT4_LLM_KEY'),
        base_url=os.environ.get('LLAMASCOUT4_LLM_BASE'),
    )
)

In [None]:
##############################################################################
# Structured Output
##############################################################################

class DriversLicenseField(BaseModel):
    # One attribute read from a driver's license, together with the reason it
    # could not be read (if any) and the outcome of checking it against the
    # submitted application.
    #
    # `#` comments are used here instead of a class docstring on purpose:
    # pydantic exposes a model's docstring as the schema description, and this
    # model is part of the schema handed to the LLM as `response_model`.

    # The extracted value itself (e.g. the owner's name or a date).
    # The description previously read "Name of field", which contradicted the
    # extraction prompt, where "value" holds the extracted content.
    value: str = Field("", description="Extracted value of the field")

    # Left blank when the field was found in the image.
    missing_error_reason: str = Field("", description="Reason for missing field")

    # None means "not validated yet"; the flag is per field, not per license.
    is_valid: Optional[bool] = Field(None, description="Indicates whether the field is valid.")

    application_value: str = Field("", description="Value of the corresponding field in the application")

    invalid_error_reason: str = Field("", description="Reason for invalid field")

    

class DriversLicenseMetadata(BaseModel):
    # Structured result of reading one driver's license image. It is passed as
    # `response_model` to the vision LLM call in the test cell of this notebook.
    # Every license attribute is a DriversLicenseField and has no default, so
    # all six must be present for validation to succeed.
    # (Documented with `#` comments rather than a docstring so the generated
    # schema stays unchanged.)

    # Identifier of the application this license belongs to; defaults to "".
    application_id: str = Field("", description="Unique identifier")

    # Name of the model that produced this record; defaults to "".
    model: str = Field("", description="Name of LLM used to generate metadata")
    
    name: DriversLicenseField = Field(description="Name of driver's license owner")
    
    date_of_birth: DriversLicenseField = Field(description="Date of birth of driver's license owner")
    
    expiration_date: DriversLicenseField = Field(description="Expiration date of driver's license")
    
    state_issued: DriversLicenseField = Field(description="State where the license was issued")
    
    issuance_date: DriversLicenseField = Field(description="Date when the license was issued")

    # Reported as a field too: whether the license appears skewed in the photo.
    photo_orientation: DriversLicenseField = Field(description="The skew of the license in the photo")

class LicenseApplication(BaseModel):
    # One submitted application: an identifier, the location of the license
    # image, and the application data supplied with it. All three fields are
    # required. Later cells build lists of these by validating the output of
    # `utils.group_files_by_id` through a pydantic TypeAdapter.
    # (Documented with `#` comments rather than a docstring so the generated
    # schema stays unchanged.)

    application_id: str = Field(description="Unique identifier")

    # NOTE(review): presumably a local path or an http(s) URL — confirm against
    # what `utils.group_files_by_id` returns.
    image_path: str = Field(description="Image path")

    application_data: dict = Field(description="Submitted application data")
    

In [None]:
##############################################################################
# Tools
##############################################################################

def image_to_base64(image_path, encode_image_bytes=False, timeout=30):
    """Check that ``image_path`` names an image and optionally inline its bytes.

    The mime type is guessed from the path/URL extension only; the content is
    not inspected.

    Args:
        image_path: Local file path or http(s) URL of the image.
        encode_image_bytes: When False (default) the path is returned unchanged
            once its mime type has been accepted. When True the image is read
            (downloaded for http(s) URLs, opened from disk otherwise) and
            returned as a ``data:<mime>;base64,<payload>`` URL.
        timeout: Seconds to wait for the HTTP download before giving up.

    Returns:
        The original path, a base64 data URL, or ``None`` when the mime type is
        unknown / not an image or when loading fails. Failures are printed
        (with a traceback) rather than raised.
    """

    def is_valid_http_url(input_path):
        """Returns whether or not the input is a valid URL."""
        parsed_url = urlparse(input_path)
        return parsed_url.scheme in ('http', 'https') and bool(parsed_url.netloc)

    try:
        start_time = datetime.now()

        mime_type, _ = mimetypes.guess_type(image_path)

        # guess_type() yields None for unknown extensions; test for that first
        # so the caller sees "Mime type None not supported" instead of an
        # unrelated TypeError from `"image" in None`.
        if mime_type is None or "image" not in mime_type:
            raise Exception(f"Mime type {mime_type} not supported")

        if encode_image_bytes:
            if is_valid_http_url(image_path):
                # A timeout keeps an unresponsive host from hanging the cell.
                response = requests.get(image_path, timeout=timeout)
                response.raise_for_status()
                raw_bytes = response.content
            else:
                with open(image_path, "rb") as image_file:
                    raw_bytes = image_file.read()

            encoded = base64.b64encode(raw_bytes).decode("utf-8")
            result = f"data:{mime_type};base64,{encoded}"
        else:
            result = image_path

        # Previously unreachable: every branch above returned or raised first.
        processing_time = (datetime.now() - start_time).total_seconds()
        print(f"Image loaded: time: {processing_time:.2f}s")

        return result

    except Exception as e:
        # Best-effort helper: report the problem and fall through to None.
        error_msg = f"Image loading error: {str(e)}"
        print(f"- {error_msg}")
        traceback.print_exc()

    return None

In [None]:
##############################################################################
# Prompts
##############################################################################
# System prompt for field extraction. This is LITERAL text (it is sent as-is in
# the vision_llm test call), so the JSON example uses plain braces throughout.
# Escape the braces before placing it inside a ChatPromptTemplate.
# Fixed: the example previously mixed "{{"/"}}" with bare braces and ended with
# a trailing comma, so it was neither a valid template nor valid JSON.
content_extractor_system_prompt = """You are an expert at extracting information from U.S. driver's licenses.

Given the above image and context, extract the following data from the image. If you cannot extract the specified data, say that the field or characteristic was missing or unclear.

Return JSON:
{
  "name": {
    "value": "The driver license owner name",
    "missing_error_reason": "Explanation of missing name, if applicable, or blank if the driver license owner is present"
   },
   "date_of_birth": {
    "value": "The date of birth of the driver's license owner",
    "missing_error_reason": "Explanation of missing date of birth, if applicable, or blank if the date of birth is present"
   },
   "expiration_date": {
    "value": "The expiration date of the driver's license",
    "missing_error_reason": "Explanation of missing expiration date, if applicable, or blank if the expiration date is present"
   },
   "issuance_date": {
    "value": "The issuance date of the driver's license",
    "missing_error_reason": "Explanation of missing issuance date, if applicable, or blank if the issuance date is present"
   },
   "state_issued": {
    "value": "The driver license state",
    "missing_error_reason": "Explanation of missing state, if applicable, or blank if the state is present"
   },
   "photo_orientation": {
    "value": "Whether or not the license in the image is skewed",
    "missing_error_reason": "Explanation if the skew could not be detected, or blank if the skew could be detected"
   }
}
"""

# Human-turn template for extraction; `{image}` is the only template variable.
# Fixed: a stray leading double quote was removed.
content_extractor_human_prompt = """
Image:\n\n{image}\n\nExtracted data:
"""

# Validation prompts are consumed through ChatPromptTemplate, so the literal
# JSON braces are escaped as "{{" / "}}".
image_validation_system_prompt = """
You are a data quality expert. Validate this driver's license data.

Return JSON:
{{
  "is_valid": true/false,
  "completeness_score": 0-100,
  "confidence_score": 0-100,
  "critical_issues": ["list critical problems"],
  "warnings": ["list minor issues"],
  "missing_fields": ["list missing required fields"],
  "recommendations": ["improvement suggestions"]
}}"""

# Fixed: the template now contains the `{data}` variable that the validation
# node supplies; without it the extracted data never reached the model.
image_validation_human_prompt = """You are a data quality expert. Validate this driver's license data.

Data:
{data}

Check for:
1. Critical fields present (license_number, name, date_of_birth)
2. Date format correctness and logical consistency
3. State codes are valid (2 letters)
4. ZIP codes are 5 digits
5. Gender codes are valid (M, F, X)
6. Expiration date is after issue date
7. Issue date is not in the future
8. Person is old enough to drive (check date_of_birth)

Return JSON:
{{
  "is_valid": true/false,
  "completeness_score": 0-100,
  "confidence_score": 0-100,
  "critical_issues": ["list critical problems"],
  "warnings": ["list minor issues"],
  "missing_fields": ["list missing required fields"],
  "recommendations": ["improvement suggestions"]
}}"""

In [None]:
##############################################################################
# Test
##############################################################################
# Smoke test: run the extraction prompts against a single hosted license image
# and print the structured result.
image_path = "https://raw.githubusercontent.com/agapebondservant/dla_poc/refs/heads/main/notebooks/data2/DENVER-25CAP-00000-04SVL-ID.jpeg"

# With encode_image_bytes=False the helper returns the URL itself once the
# mime type has been accepted, or None on failure.
base64_data_url = image_to_base64(image_path, encode_image_bytes=False)

if base64_data_url:
    response = vision_llm.chat.completions.create(
        model=os.environ.get('LLAMASCOUT4_LLM_NAME'),
        messages=[
            dict(
                role="system",
                content=[dict(type="text", text=content_extractor_system_prompt)],
            ),
            dict(
                role="user",
                content=[
                    dict(type="text", text=content_extractor_human_prompt),
                    dict(type="image_url", image_url=dict(url=base64_data_url, detail="high")),
                ],
            ),
        ],
        # instructor parses the completion into this pydantic model
        response_model=DriversLicenseMetadata,
        max_tokens=8192,
        temperature=0,
    )
    print(response)

In [None]:
# Commented-out manual parsing of a raw completion (strip code fences, load the
# JSON, validate it into DriversLicenseMetadata). The call in the previous cell
# uses `response_model`, so `response` is inspected directly below.
# completion_content = response.choices[0].message.content
# completion_content.replace("```json", "").replace("```", "")
# content = json.loads(response.choices[0].message.content)
# metadata = DriversLicenseMetadata.model_validate(content)
# metadata.name.field
# content
# Show the extracted `name` field of the response from the previous cell.
# `response` only exists if that cell's image check succeeded.
response.name

In [None]:
##############################################################################
# State Definitions
##############################################################################

class LicenseState(TypedDict):
    """Enhanced state definition

    Shared state schema handed to ``StateGraph`` when the workflow is built.

    NOTE(review): the node functions in this notebook also read and write keys
    that are not declared here ("extracted_data", "validation_result",
    "errors", "warnings", "result", "retry_count"). Confirm whether those keys
    should be added to this schema or the nodes moved to the fields below.
    """
    # Applications to process.
    image_applications: List[LicenseApplication]
    
    # Annotated with operator.add as the reducer for this key.
    image_data: Annotated[List[DriversLicenseMetadata], operator.add]
    
    # Annotated with operator.add as the reducer for this key.
    messages: Annotated[List[BaseMessage], operator.add]

In [None]:
##############################################################################
# LLMs
##############################################################################

# Chat model used by the graph nodes; model name, key and endpoint are read
# from the environment. Temperature is 0 and output is capped at 8192 tokens.
main_llm = ChatOpenAI(
    model=os.environ.get('LLAMASCOUT4_LLM_NAME'),
    api_key=os.environ.get('LLAMASCOUT4_LLM_KEY'),
    base_url=os.environ.get('LLAMASCOUT4_LLM_BASE'),
    max_tokens=8192,
    temperature=0,
)

In [None]:
##############################################################################
# Nodes
##############################################################################

def extract_license_data_with_llm(state: LicenseState) -> LicenseState:
    """Node: Extract structured data

    Sends the extraction prompts to ``main_llm``, parses the JSON reply (with
    or without a Markdown code fence) and stores it under
    ``state["extracted_data"]``. The raw reply is appended to
    ``state["messages"]``.

    On any failure the error text is appended to ``state["errors"]`` and
    ``state["extracted_data"]`` is set to ``{}``; nothing is raised.

    Args:
        state: Workflow state; ``state["image_data"]`` is passed to the prompt
            as the ``image`` variable.

    Returns:
        The same state object, updated in place.

    NOTE(review): ``messages`` is declared with an ``operator.add`` reducer in
    LicenseState; returning the whole state may cause existing messages to be
    added again by the graph — confirm.
    """
    print("✓ STEP 1: AI-Powered Data Extraction")

    print("="*60)

    try:
        start_time = datetime.now()

        # The system prompt carries a literal JSON example. ChatPromptTemplate
        # treats single braces as variables, so undo any existing "{{"/"}}"
        # escaping first and then escape every brace exactly once.
        system_template = (
            content_extractor_system_prompt
            .replace("{{", "{").replace("}}", "}")
            .replace("{", "{{").replace("}", "}}")
        )

        # Fixed: this used the undefined name `extract_data_system_prompt`.
        extraction_prompt = ChatPromptTemplate.from_messages([
            ("system", system_template),
            ("human", content_extractor_human_prompt),
        ])

        chain = extraction_prompt | main_llm

        response = chain.invoke({"image": state.get("image_data")})

        response_text = response.content if hasattr(response, 'content') else str(response)

        print(response_text)

        # Strip a ```json ... ``` or ``` ... ``` fence if the model added one.
        json_text = response_text
        if "```json" in response_text:
            json_text = response_text.split("```json")[1].split("```")[0]
        elif "```" in response_text:
            json_text = response_text.split("```")[1].split("```")[0]

        extracted_data = json.loads(json_text.strip())

        state["extracted_data"] = extracted_data

        processing_time = (datetime.now() - start_time).total_seconds()

        non_null_fields = sum(1 for v in extracted_data.values() if v is not None and v != "")

        total_fields = len(extracted_data)

        print(f"✅ Extraction completed")
        print(f"   Fields extracted: {non_null_fields}/{total_fields}")
        print(f"   Time: {processing_time:.2f}s")
        print(f"\n   Extracted data:")
        print(f"   {'-'*56}")

        # Show at most ten non-empty fields, truncated to 40 characters.
        for key, value in list(extracted_data.items())[:10]:
            if value:
                display_value = str(value)[:40]
                print(f"   {key}: {display_value}")

        print(f"   {'-'*56}")

        # setdefault: the key may be absent from the incoming state.
        state.setdefault("messages", []).append(AIMessage(content=response_text))

    except Exception as e:
        error_msg = f"Extraction error: {str(e)}"

        # setdefault: "errors" is not guaranteed to exist, and a KeyError here
        # would hide the original failure.
        state.setdefault("errors", []).append(error_msg)

        print(f"❌ {error_msg}")

        state["extracted_data"] = {}

    return state


def validate_extracted_data(state: LicenseState) -> LicenseState:
    """Node: Validate extracted data

    Sends ``state["extracted_data"]`` (as indented JSON, under the ``data``
    prompt variable) to ``main_llm`` with the validation prompts, parses the
    JSON reply and stores it under ``state["validation_result"]``.

    On any failure the error text is appended to ``state["errors"]`` and a
    fallback result (invalid, zero scores, "Manual review required") is
    stored; nothing is raised.

    Returns:
        The same state object, updated in place.
    """
    print("✓ STEP 3: Data Validation")

    print("="*60)

    def _as_number(raw_score):
        """Coerce a model-supplied score to float; 0.0 when it is not numeric."""
        try:
            return float(raw_score)
        except (TypeError, ValueError):
            return 0.0

    validation_prompt = ChatPromptTemplate.from_messages([
        ("system", image_validation_system_prompt),
        ("human", image_validation_human_prompt)
    ])

    try:
        start_time = datetime.now()

        chain = validation_prompt | main_llm
        response = chain.invoke({
            # .get: the key is absent if extraction never ran.
            "data": json.dumps(state.get("extracted_data", {}), indent=2)
        })

        response_text = response.content if hasattr(response, 'content') else str(response)

        # Strip a ```json ... ``` or ``` ... ``` fence if the model added one.
        json_text = response_text
        if "```json" in response_text:
            json_text = response_text.split("```json")[1].split("```")[0]
        elif "```" in response_text:
            json_text = response_text.split("```")[1].split("```")[0]

        validation_result = json.loads(json_text.strip())

        state["validation_result"] = validation_result

        processing_time = (datetime.now() - start_time).total_seconds()

        # Scores are coerced before formatting: a non-numeric score used to
        # raise inside the f-string and replace a successfully parsed result
        # with the failure fallback below.
        completeness = _as_number(validation_result.get('completeness_score', 0))
        confidence = _as_number(validation_result.get('confidence_score', 0))

        print(f"✅ Validation completed")
        print(f"   Valid: {validation_result.get('is_valid', False)}")
        print(f"   Completeness: {completeness:.0f}%")
        print(f"   Confidence: {confidence:.0f}%")
        print(f"   Time: {processing_time:.2f}s")

        if validation_result.get('critical_issues'):
            print(f"\n   ⚠️  Critical Issues:")
            for issue in validation_result['critical_issues']:
                print(f"      • {issue}")

        if validation_result.get('warnings'):
            print(f"\n   ⚡ Warnings:")
            for warning in validation_result['warnings']:
                print(f"      • {warning}")

    except Exception as e:
        error_msg = f"Validation error: {str(e)}"

        # setdefault: "errors" is not guaranteed to exist, and a KeyError here
        # would hide the original failure.
        state.setdefault("errors", []).append(error_msg)

        print(f"❌ {error_msg}")

        state["validation_result"] = {
            "is_valid": False,
            "completeness_score": 0,
            "confidence_score": 0,
            "critical_issues": ["Validation process failed"],
            "warnings": [],
            "missing_fields": [],
            "recommendations": ["Manual review required"]
        }

    return state


def compile_final_result(state: LicenseState) -> LicenseState:
    """Node: Compile final results

    Derives an overall status and stores a summary under ``state["result"]``.

    Status rules, checked in order:
        * ``"error"``        - at least one entry in ``state["errors"]``
        * ``"needs_review"`` - validation not valid, or completeness below 50
        * ``"partial"``      - completeness below 80
        * ``"success"``      - otherwise

    Missing state keys are treated as empty, and a non-numeric completeness
    score counts as 0, so this node does not fail when an earlier step was
    skipped or produced nothing.

    Returns:
        The same state object, updated in place.
    """
    print("✓ STEP 4: Compiling Final Results")

    print("="*60)

    # Earlier steps may not have populated every key, so read defensively.
    errors = state.get("errors") or []
    warnings = state.get("warnings") or []
    validation = state.get("validation_result") or {}
    extracted = state.get("extracted_data") or {}

    is_valid = validation.get("is_valid", False)

    try:
        completeness = float(validation.get("completeness_score", 0))
    except (TypeError, ValueError):
        # The score comes from an LLM reply; a non-numeric value counts as 0.
        completeness = 0.0

    if errors:
        status = "error"
    elif not is_valid or completeness < 50:
        status = "needs_review"
    elif completeness < 80:
        status = "partial"
    else:
        status = "success"

    state["result"] = {
        "status": status,
        "extracted_data": extracted,
        "validation": validation,
        "metadata": {
            "processing_timestamp": datetime.now().isoformat(),
            "retry_count": state.get("retry_count", 0)
        },
        "errors": errors,
        "warnings": warnings
    }

    print(f"✅ Status: {status.upper()}")

    print(f"   Errors: {len(errors)}")

    print(f"   Warnings: {len(warnings)}")

    return state

In [None]:
##############################################################################
# Graph
##############################################################################
def create_license_extraction_graph():
    """Build and compile the LangGraph workflow over ``LicenseState``.

    Pipeline: START -> load_image -> extract_data -> compile_result -> END.
    The validation node is not part of the graph at the moment; extraction
    feeds straight into result compilation.

    NOTE(review): ``load_and_preprocess_image`` is not defined in the cells
    visible here — confirm it is provided before this function is called.
    """
    graph_builder = StateGraph(LicenseState)

    # (node name, callable) pairs, registered in pipeline order
    node_table = (
        ("load_image", load_and_preprocess_image),
        ("extract_data", extract_license_data_with_llm),
        ("compile_result", compile_final_result),
    )
    for node_name, node_fn in node_table:
        graph_builder.add_node(node_name, node_fn)

    # Linear chain of edges from START to END
    edge_table = (
        (START, "load_image"),
        ("load_image", "extract_data"),
        ("extract_data", "compile_result"),
        ("compile_result", END),
    )
    for source, target in edge_table:
        graph_builder.add_edge(source, target)

    return graph_builder.compile()

### Execute License Extraction Flow
Execute the flow!

In [None]:
##############################################################################
# Execute the Flow
##############################################################################
# (image URL, application JSON path) pairs; the run loop at the end of this
# cell unpacks each tuple in that order.
paths = [
    ("https://raw.githubusercontent.com/agapebondservant/dla_poc/refs/heads/main/notebooks/data2/DENVER-25CAP-00000-04SVL-ID.jpeg", "data2/DENVER-25CAP-00000-04SVL-ID.json")
]

# Commented-out alternative: discover the applications from the repository
# folder instead of listing them by hand.
# applications_raw = utils.group_files_by_id(
#             "https://github.com/agapebondservant/dla_poc", 
#             "notebooks/data2")

def convert_to_license_applications(repo_url: str, folder_path: str, repo_branch: str = "main") -> list[LicenseApplication]:
    """Load the applications stored in a repository folder as validated models.

    The original signature placed the required ``folder_path`` after the
    defaulted ``repo_branch``, which is a SyntaxError; the required parameters
    now come first.

    Args:
        repo_url: URL of the Git repository holding the application files.
        folder_path: Folder inside the repository to scan.
        repo_branch: Branch name. Currently unused, because
            ``utils.group_files_by_id`` is called with only the repository URL
            and folder path.

    Returns:
        The grouped file records validated as ``LicenseApplication`` objects.
    """
    # Retrieve repo file data
    applications_raw = utils.group_files_by_id(repo_url, folder_path)

    # Validate the whole list in one pass.
    adapter = TypeAdapter(list[LicenseApplication])

    return adapter.validate_python(applications_raw)
    

def extract_license_info(model: str, application: LicenseApplication) -> dict:
    """
    Main extraction function

    Runs the license extraction graph once for ``application`` and returns the
    compiled result.

    Fixes over the previous version: the annotation used the undefined name
    ``string``; the body referenced the undefined names ``repo_url`` and
    ``my_list``; it ignored both parameters, re-downloaded every application
    and returned from inside nested loops, so only the first iteration ever
    ran.

    Args:
        model: Name of the model this run is attributed to.
            NOTE(review): the visible graph nodes call the module-level
            ``main_llm``, so this value is only printed and carried in the
            initial state — confirm how per-model routing should work.
        application: The application whose license image is processed.

    Returns:
        The ``"result"`` entry of the final graph state, or ``{}`` when the
        graph produced none.
    """
    start_time = datetime.now()

    print(f"Application: {application.application_id}")

    print(f"Model: {model}")

    print(f"Started: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")

    app = create_license_extraction_graph()

    initial_state = {
        "image_path": application.image_path,

        "model": model,

        # An empty list rather than None: LicenseState declares this key as a
        # list combined with operator.add.
        "image_data": [],

        "extracted_data": {},

        "validation_result": {},

        "retry_count": 0,

        "messages": [],

        "result": {},

        "errors": [],

        "warnings": [],

        "processing_time": 0.0
    }

    final_state = app.invoke(initial_state)

    total_time = (datetime.now() - start_time).total_seconds()

    print(f"  EXTRACTION COMPLETE")

    print(f"   Total time: {total_time:.2f}s")

    print("="*60 + "\n")

    # .get: the final state may not contain a "result" entry.
    return final_state.get("result", {})

# Run the extraction for every (image, application JSON) pair and append one
# JSON document per line to a timestamped .jsonl file.
output_file = f"license_extraction_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"

with open(output_file, "a") as f:

    for image_path, application_path in paths:

        # Load the submitted application data if the JSON file is available
        # locally; otherwise continue with an empty payload.
        application_data = {}

        if os.path.exists(application_path):

            with open(application_path) as application_file:

                application_data = json.load(application_file)

        # NOTE(review): the id is derived from the image file name — confirm
        # this matches the ids produced by utils.group_files_by_id.
        application = LicenseApplication(
            application_id=os.path.splitext(os.path.basename(image_path))[0],
            image_path=image_path,
            application_data=application_data,
        )

        # Fixed: extract_license_info takes (model, application); it used to be
        # called with the image path only.
        result = extract_license_info(os.getenv('LLAMASCOUT4_LLM_NAME'), application)

        print("FINAL RESULTS")

        print("="*60)

        print(json.dumps(result, indent=2))

        # Fixed: JSON Lines needs one document per line; json.dump() alone
        # wrote consecutive results back to back.
        f.write(json.dumps(result) + "\n")

print(f"\nResults saved to: {output_file}")

In [None]:
######
# Flow
#####
items_adapter = TypeAdapter(list[LicenseApplication])

# Fetch the application records and turn them into a Hugging Face Dataset,
# leaving out the submitted application payload and tagging each row with the
# model name.
applications = utils.group_files_by_id(
    "https://github.com/agapebondservant/dla_poc",
    "notebooks/data2",
)
df = pd.DataFrame(applications).drop(columns='application_data')
df["model_name"] = os.environ.get("LLAMASCOUT4_LLM_NAME")
dataset = Dataset.from_pandas(df)

# Load the validation flow definition and point it at the configured model.
flow_path = "flows/drivers_license_validation/flow.yaml"
flow = Flow.from_yaml(flow_path)
flow.set_model_config(
    model=os.environ.get("LLAMASCOUT4_LLM_NAME"),
    api_base=os.environ.get("LLAMASCOUT4_LLM_BASE"),
    api_key=os.environ.get("LLAMASCOUT4_LLM_KEY"),
    temperature=0,
    max_tokens=8192,
    response_format={"type": "json_object"},
    top_k=1,
)

# Run the flow over the dataset and persist the output as JSON Lines.
converted_dataset = flow.generate(dataset, max_concurrency=10)
converted_dataset.to_json("dataset_test.jsonl")