# Pixtral Model Evaluation

This notebook evaluates the Pixtral-12B model's performance across different quantization levels
and prompt strategies for invoice data extraction.

## Environment Setup
### Import system dependencies

In [None]:

import os
import sys
import subprocess
from pathlib import Path
import logging
import json

# Install dependencies
#
# This is the first code cell to execute, so it cannot rely on the `logger`
# and `ROOT_DIR` globals that later cells define; both are resolved locally.
def _find_project_root():
    """Return the directory expected to hold requirements.txt.

    Resolution order: an already-defined ROOT_DIR (the cell is being
    re-run), the script location when ``__file__`` exists, and finally the
    nearest ancestor of the working directory that contains both ``src``
    and ``notebooks``.

    Raises:
        RuntimeError: if no such directory can be found.
    """
    if "ROOT_DIR" in globals():
        return ROOT_DIR
    if "__file__" in globals():
        return Path(__file__).parent.parent
    here = Path.cwd()
    # The filesystem root itself is skipped, matching the root-detection cell.
    for candidate in (here, *here.parents)[:-1]:
        if (candidate / "src").exists() and (candidate / "notebooks").exists():
            return candidate
    raise RuntimeError("Could not find project root directory. Make sure you're running from within the project structure.")


print("Installing dependencies...")
_install_root = _find_project_root()
try:
    # Install base requirements first
    print("Installing base requirements...")
    subprocess.check_call([
        sys.executable, "-m", "pip", "install", "-q",
        "-r", str(_install_root / "requirements.txt"),
    ])
    print("Base requirements installed successfully.")

    # Install PyTorch dependencies separately: they come from the CUDA 11.8
    # wheel index rather than the default package index.
    print("Installing PyTorch dependencies...")
    subprocess.check_call([
        sys.executable, "-m", "pip", "install", "-q",
        "torch==2.1.0",
        "torchvision==0.16.0",
        "torchaudio==2.1.0",
        "--index-url", "https://download.pytorch.org/whl/cu118"
    ])
    print("PyTorch dependencies installed successfully.")

    # Install AI-specific dependencies
    print("Installing AI-specific dependencies...")
    subprocess.check_call([
        sys.executable, "-m", "pip", "install", "-q",
        "transformers==4.50.3",
        "accelerate>=0.26.0",
        "bitsandbytes==0.45.5",
        "huggingface_hub>=0.20.3",
        "flash-attn==2.5.0"
    ])
    print("AI-specific dependencies installed successfully.")
except subprocess.CalledProcessError as e:
    # The module-level `logger` does not exist yet on a first run, so fetch
    # one directly instead of masking the pip failure with a NameError.
    logging.getLogger(__name__).error(f"Error installing dependencies: {e}")
    raise




### Configure Logging

In [None]:
# Configure logging
# Root logger emits INFO and above; every record carries a timestamp, the
# emitting logger's name, and the level ahead of the message.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Logger shared by the remaining cells of this notebook.
logger = logging.getLogger(__name__)

### Determine Root

In [None]:
# Determine root directory
try:
    # Script execution: __file__ exists and the project root is two levels up.
    ROOT_DIR = Path(__file__).parent.parent
except NameError:
    # Notebook execution: __file__ is undefined, so climb from the working
    # directory until a folder holding both `src` and `notebooks` is found.
    start_dir = Path.cwd()
    # The final entry (the filesystem root) is deliberately not inspected.
    for candidate in (start_dir, *start_dir.parents)[:-1]:
        if all((candidate / marker).exists() for marker in ('src', 'notebooks')):
            ROOT_DIR = candidate
            break
    else:
        raise RuntimeError("Could not find project root directory. Make sure you're running from within the project structure.")

# Make the project's packages importable from this notebook.
sys.path.append(str(ROOT_DIR))

### Setup Paths

In [None]:
# Setup Paths
try:
    # Each project folder sits directly under ROOT_DIR and is exposed in
    # `env` under the key "<folder>_dir".
    folder_names = ('data', 'models', 'logs', 'results', 'prompts', 'config')
    env = {f'{folder}_dir': ROOT_DIR / folder for folder in folder_names}

    # Validate that every expected key is present
    required_paths = [f'{folder}_dir' for folder in folder_names]
    missing_paths = [key for key in required_paths if key not in env]
    if missing_paths:
        raise RuntimeError(f"Missing required paths in environment: {missing_paths}")

    # Create any directory that does not exist yet (existing ones are left alone)
    for key in required_paths:
        directory = env[key]
        directory.mkdir(parents=True, exist_ok=True)
        logger.info(f"Created/verified directory: {directory}")

    # Results are split into raw and processed subdirectories
    for subfolder in ('raw_results', 'processed_results'):
        (env['results_dir'] / subfolder).mkdir(parents=True, exist_ok=True)

    logger.info("All required directories have been set up successfully")

except Exception as e:
    # Record the failure, then re-raise so the notebook stops at this cell.
    logger.error(f"Error setting up environment: {str(e)}")
    raise

### Import Local Code

In [None]:

# Import project modules
from src import execution
from src.environment import setup_environment, download_model
from src.config import load_yaml_config
from src.models.llama_vision import load_model, process_image_wrapper, download_llama_vision_model
from src.prompts import load_prompt_template
from src.results_logging import track_execution, log_result, ResultStructure, evaluate_model_output
from src.validation import validate_results
from src.data_utils import DataConfig, setup_data_paths

### Configure Systems

In [None]:

# Load configuration
config_path = ROOT_DIR.joinpath("config", "models", "pixtral.yaml")
if not config_path.exists():
    raise FileNotFoundError(f"Configuration file not found: {config_path}")

try:
    config = load_yaml_config(str(config_path))

    # The rest of the notebook reads these top-level sections, so their
    # absence is reported here in one message.
    required_sections = ['name', 'loading', 'quantization', 'prompt', 'inference']
    missing_sections = []
    for section_name in required_sections:
        if section_name not in config:
            missing_sections.append(section_name)
    if missing_sections:
        raise ValueError(f"Configuration missing required sections: {missing_sections}")
except Exception as e:
    # Record the failure, then re-raise so the notebook stops at this cell.
    logger.error(f"Error loading configuration: {str(e)}")
    raise

# Setup data configuration
try:
    # Collected in one place so the image constraints are easy to scan.
    data_path_options = {
        'env_config': env,
        'image_extensions': ['.jpg', '.jpeg', '.png'],
        'max_image_size': 1120,
        'supported_formats': ['RGB', 'L'],
    }
    data_config = setup_data_paths(**data_path_options)
    logger.info("Data configuration setup successfully")
except Exception as e:
    # Record the failure, then re-raise so the notebook stops at this cell.
    logger.error(f"Error setting up data configuration: {str(e)}")
    raise

# Load model configuration
try:
    # `config` has already been loaded and checked for its top-level
    # sections, so the values are read from it directly. Keys that the
    # section check does not cover (e.g. `repo_id`, `quantization.options`)
    # surface as KeyError below if they are absent.
    model_config = {
        'name': config['name'],
        'path': config['repo_id'],
        'quantization_levels': list(config['quantization']['options'].keys())
    }

    prompt_config = {
        'format': config['prompt']['format'],
        'image_placeholder': config['prompt']['image_placeholder'],
        'default_field': config['prompt']['default_field']
    }

    # Validate model configuration
    required_model_fields = ['name', 'path', 'quantization_levels']
    missing_fields = [field for field in required_model_fields if field not in model_config]
    if missing_fields:
        raise ValueError(f"Model configuration missing required fields: {missing_fields}")

except KeyError as e:
    logger.error(f"Missing required configuration section: {e}")
    raise
except Exception as e:
    logger.error(f"Error loading model configuration: {str(e)}")
    raise

# Set model for this notebook
MODEL_NAME = "pixtral"
TEST_MATRIX_PATH = str(ROOT_DIR / "config" / "test_matrix.json")
EXECUTION_LOG_PATH = env['logs_dir'] / f"{MODEL_NAME}_execution.log"

# Reported last: the message interpolates MODEL_NAME, so the name has to be
# assigned first (printing before the assignment raised NameError).
print(f"✓ Model configuration loaded successfully for {MODEL_NAME}")

# Validate test matrix exists and is valid
try:
    matrix_file = Path(TEST_MATRIX_PATH)
    if not matrix_file.exists():
        raise FileNotFoundError(f"Test matrix file not found: {TEST_MATRIX_PATH}")

    # Load the matrix
    with matrix_file.open('r') as handle:
        test_matrix = json.load(handle)

    # The top level must carry the list of cases
    if 'test_cases' not in test_matrix:
        raise ValueError("Test matrix must contain 'test_cases' array")

    # Every case must provide all of these keys; the first incomplete case
    # aborts validation.
    required_fields = ['model_name', 'field_type', 'prompt_type', 'quant_level', 'image_path']
    for case in test_matrix['test_cases']:
        absent = [name for name in required_fields if name not in case]
        if absent:
            raise ValueError(f"Test case missing required fields: {absent}")

    # Collect every quantization level outside the accepted set so that all
    # offenders are reported together.
    valid_quantization = [4, 8, 16, 32]
    invalid_quantization = []
    for case in test_matrix['test_cases']:
        level = case['quant_level']
        if level not in valid_quantization:
            invalid_quantization.append(level)
    if invalid_quantization:
        raise ValueError(f"Invalid quantization values found: {invalid_quantization}")

except Exception as e:
    # Record the failure, then re-raise so the notebook stops at this cell.
    logger.error(f"Error validating test matrix: {str(e)}")
    raise

## Test Execution


### Configure the Test

In [None]:
# User Configuration
print("Configure your test parameters:")

# Get quantization level: keep asking until the answer is an integer from the
# accepted set.
accepted_levels = (4, 8, 16, 32)
while True:
    try:
        QUANTIZATION_LEVEL = int(input("Enter quantization level (4, 8, 16, or 32): "))
    except ValueError:
        print("Please enter a valid number.")
        continue
    if QUANTIZATION_LEVEL in accepted_levels:
        break
    print("Invalid quantization level. Please enter 4, 8, 16, or 32.")

# Get prompt type
available_prompts = [
    "basic_extraction",  # From basic_extraction.yaml
    "detailed",          # From detailed.yaml
    "few_shot",          # From few_shot.yaml
    "locational",        # From locational.yaml
    "step_by_step",      # From step_by_step.yaml
]

# Show a 1-based menu of the prompt types
print("\nAvailable prompt types:")
for position, prompt_name in enumerate(available_prompts, 1):
    print(f"{position}. {prompt_name}")

# Keep asking until the answer is a valid menu number
while True:
    try:
        choice = int(input("\nEnter the number of your chosen prompt type: "))
    except ValueError:
        print("Please enter a valid number.")
        continue
    if choice < 1 or choice > len(available_prompts):
        print(f"Please enter a number between 1 and {len(available_prompts)}")
        continue
    PROMPT_TYPE = available_prompts[choice - 1]
    break

# Set other required variables
MODEL_NAME = "pixtral"
FIELD_TYPES = ["work_order_number", "total_cost"]
TEST_MATRIX_PATH = env['config_dir'] / "test_matrix.json"

# Each model/quantization/prompt combination gets its own results folder
run_label = f"{MODEL_NAME}_{QUANTIZATION_LEVEL}bit_{PROMPT_TYPE}"
RAW_RESULTS_DIR = env['results_dir'] / "raw_results" / run_label
RAW_RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Echo the chosen configuration back to the user
print(
    "\nTest Configuration:",
    f"Model: {MODEL_NAME}",
    f"Quantization: {QUANTIZATION_LEVEL} bits",
    f"Prompt Type: {PROMPT_TYPE}",
    f"Field Types: {', '.join(FIELD_TYPES)}",
    f"Results will be saved to: {RAW_RESULTS_DIR}",
    sep="\n",
)

### Run the Test

In [None]:
# Run test suite
try:
    # Create test matrix for this run: one case per field type, each paired
    # with a consecutively numbered image starting at 1017.jpg.
    test_matrix = {
        "test_cases": [
            {
                "model_name": MODEL_NAME,
                "field_type": field_type,
                "prompt_type": PROMPT_TYPE,
                "quant_level": QUANTIZATION_LEVEL,
                "image_path": str(env['data_dir'] / f"{1017 + i}.jpg")  # Start from 1017.jpg
            }
            for i, field_type in enumerate(FIELD_TYPES)
        ]
    }

    # Check the images this run will actually use. This must happen after the
    # matrix is built: checking the previously loaded matrix would inspect
    # paths that are about to be replaced.
    missing_images = [
        case['image_path']
        for case in test_matrix['test_cases']
        if not Path(case['image_path']).exists()
    ]
    if missing_images:
        raise FileNotFoundError(f"Image not found: {', '.join(missing_images)}")

    # Save test matrix
    # NOTE(review): TEST_MATRIX_PATH points at config/test_matrix.json, the
    # same file the configuration cell loads and validates, so this write
    # replaces it. Confirm that overwriting is intended.
    with open(TEST_MATRIX_PATH, 'w') as f:
        json.dump(test_matrix, f, indent=2)

    # Run raw test suite
    # NOTE(review): run_raw_test_suite is not among the names imported in the
    # "Import Local Code" cell; confirm which module provides it.
    run_raw_test_suite(
        model_loader=load_model,
        test_matrix=test_matrix['test_cases'],
        output_dir=RAW_RESULTS_DIR,
        prompts_dir=env['prompts_dir']
    )

    print(f"✓ Raw test execution completed successfully for:")
    print(f"- Model: {MODEL_NAME}")
    print(f"- Quantization: {QUANTIZATION_LEVEL} bits")
    print(f"- Prompt Type: {PROMPT_TYPE}")
    print(f"- Field Types: {', '.join(FIELD_TYPES)}")

except Exception as e:
    # Record the failure, then re-raise so the notebook stops at this cell.
    logger.error(f"Error in raw test execution: {str(e)}")
    raise

## Phase 2: Result Processing

Process the raw results and generate analysis.

In [None]:
# Process results
# Calls process_results with `results` and PROCESSED_RESULTS_DIR (as
# output_dir) and keeps the return value in `summary`, which the metrics
# display cell reads.
# NOTE(review): process_results, results, and PROCESSED_RESULTS_DIR are not
# defined or imported in any visible cell of this notebook; confirm where
# they are expected to come from before running this cell.
try:
    summary = process_results(
        results=results,
        output_dir=PROCESSED_RESULTS_DIR
    )
    print("✓ Result processing completed successfully")
except Exception as e:
    # Record the failure, then re-raise so the notebook stops at this cell.
    logger.error(f"Error in result processing: {str(e)}")
    raise

## Analysis and Visualization

Display the processed results.

In [None]:
# Display metrics
print("\nPerformance Metrics:")

# Plain counters are printed as-is, in this order.
counter_rows = (
    ("Total Tests", "total_tests"),
    ("Successful Tests", "successful_tests"),
    ("Failed Tests", "failed_tests"),
)
for label, summary_key in counter_rows:
    print(f"{label}: {summary[summary_key]}")

# The average time is shown with two decimals and an "s" suffix.
average_time = summary['average_processing_time']
print(f"Average Processing Time: {average_time:.2f}s")

# Per-field accuracy as a percentage, followed by success/total counts.
print("\nField Accuracy:")
for field_name, stats in summary['field_accuracy'].items():
    accuracy = stats['accuracy']
    hits = stats['success']
    attempts = stats['total']
    print(f"{field_name}: {accuracy:.2%} ({hits}/{attempts})")