In [4]:

from azure.ai.ml import MLClient, command, Output
from azure.ai.ml.entities import Environment
from azure.identity import DefaultAzureCredential


# Connect to AML workspace
ml_client = MLClient.from_config(DefaultAzureCredential())


# Define environment
donut_env = Environment(
    name="donut-lora-env",
    image="mcr.microsoft.com/azureml/curated/acpt-pytorch-2.0-cuda11.7:latest",
    conda_file="environment.yaml"
)

# Register environment
ml_client.environments.create_or_update(donut_env)


job = command(
    code="./src",
    command="python train.py --data_dir ./data --output_dir ${{outputs.model_output}}",
    environment=donut_env,
    compute="anishswiss1",
    display_name="donut-lora-train",
    experiment_name="donut-lora-exp",
    outputs={
        "model_output": Output(type="uri_folder", mode="upload")
    },
)


# Submit
returned_job = ml_client.jobs.create_or_update(job)
print(f"Submitted job: {returned_job.name}")




Found the config file in: /config.json
Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Overriding of current MeterProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored


Submitted job: patient_zebra_hlpm8t7gt2


In [1]:
%pip install torch torchvision torchaudio


from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import os

download_path = "./donut_qa_model"


model_folder = os.path.join(download_path, "donutQA/outputs/donut-lora")  # may need adjustment

files_only = [f for f in os.listdir(model_folder) if os.path.isfile(os.path.join(model_folder, f))]
print("FILES .... ")
print(files_only)

processor = DonutProcessor.from_pretrained(model_folder)
model = VisionEncoderDecoderModel.from_pretrained(model_folder)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/anaconda/envs/azureml_py310_sdkv2/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
FILES .... 
['.amlignore', '.amlignore.amltmp', 'added_tokens.json', 'config.json', 'generation_config.json', 'model.safetensors', 'preprocessor_config.json', 'sentencepiece.bpe.model', 'special_tokens_map.json', 'tokenizer.json', 'tokenizer_config.json']


  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [2]:
image = Image.open("test_pay_stub.jpg").convert("RGB")
question = "What is the net pay?"

prompt = f"<s_docvqa><s_question>{question}</s_question><s_answer>"


In [3]:
# Prepare inputs
pixel_values = processor(image, return_tensors="pt").pixel_values
decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids

# Generate prediction
outputs = model.generate(
    pixel_values,
    decoder_input_ids=decoder_input_ids,
    max_length=model.decoder.config.max_position_embeddings,
    early_stopping=True,
    pad_token_id=processor.tokenizer.pad_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
    use_cache=True,
    bad_words_ids=[[processor.tokenizer.unk_token_id]],
    return_dict_in_generate=True,
)

# Decode answer
sequence = processor.batch_decode(outputs.sequences)[0]
answer = sequence.split("<s_answer>")[1].split("</s_answer>")[0]

print(f"Predicted answer: {answer}")


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Predicted answer:  $853.30


In [1]:
#conda env update -f environment.yaml

import sys
print(sys.executable)

/anaconda/envs/azureml_py38/bin/python


In [None]:
# Deploy to Azure Container Instances (ACI) - Option 1: No quota issues!
from azure.ai.ml import MLClient
from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment, Environment
from azure.identity import DefaultAzureCredential
from datetime import datetime

# Create ml_client if it doesn't exist
if 'ml_client' not in globals():
    print("Creating MLClient connection...")
    ml_client = MLClient.from_config(DefaultAzureCredential())

# Get or create donut_env if it doesn't exist
if 'donut_env' not in globals():
    print("Creating environment definition...")
    donut_env = Environment(
        name="donut-lora-env",
        image="mcr.microsoft.com/azureml/curated/acpt-pytorch-2.0-cuda11.7:latest",
        conda_file="environment.yaml"
    )
    ml_client.environments.create_or_update(donut_env)

# Get the registered environment
env_version = 22
try:
    registered_env = ml_client.environments.get(donut_env.name, version=str(env_version))
    env_ref = registered_env
    print(f"Using registered environment: {registered_env.name}:{registered_env.version}")
except Exception as e:
    print(f"Warning: Could not get environment version {env_version}. Error: {e}")
    env_ref = f"{donut_env.name}:{env_version}"
    print(f"Using environment reference: {env_ref}")

# Create ManagedOnlineEndpoint (ACI is handled via instance type)
# Use smaller instance types that typically have more quota available
endpoint_name = f"donutqa-aci-{datetime.now().strftime('%m%d%H%M')}"
endpoint = ManagedOnlineEndpoint(
    name=endpoint_name,
    auth_mode="key"
)
ml_client.online_endpoints.begin_create_or_update(endpoint).result()
print(f"Created endpoint: {endpoint_name}")

# Deploy model with smaller instance type to avoid quota issues
# Try Standard_B2ms (2 vCPUs) or Standard_B4ms (4 vCPUs) - these often have more quota
deployment = ManagedOnlineDeployment(
    name="aci",
    endpoint_name=endpoint_name,
    model="donutQA:1",
    environment=env_ref,
    code_path="src",
    scoring_script="score.py",
    instance_type="Standard_B2ms",  # 2 vCPUs, 8GB RAM - smaller, more likely to have quota
    instance_count=1
)
ml_client.online_deployments.begin_create_or_update(deployment).result()
print("Deployment to ACI created successfully")

# Route traffic
endpoint.traffic = {"aci": 100}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()
print("Traffic routed to deployment")

# Get endpoint details
endpoint = ml_client.online_endpoints.get(endpoint_name)
print(f"\n‚úÖ Endpoint deployed successfully!")
print(f"Endpoint name: {endpoint_name}")
print(f"Scoring URI: {endpoint.scoring_uri}")
print(f"Status: {endpoint.provisioning_state}")
print(f"\n‚ö†Ô∏è  Note: Using Standard_B2ms instance (2 vCPUs). If quota error persists, try Standard_B1ms or request quota increase.")
print(f"\nüìù To call the endpoint:")
print(f"   POST {endpoint.scoring_uri}")
print(f"   Headers: {{'Authorization': 'Bearer <key>', 'Content-Type': 'application/json'}}")
print(f"   Body: {{'image': '<base64_encoded_image>', 'question': '<your_question>'}}")



In [None]:
# Alternative: Batch Endpoint - No quota issues, uses your existing compute
# Good for processing multiple documents offline/async
from azure.ai.ml.entities import BatchEndpoint, BatchDeployment
from datetime import datetime

# Create ml_client if it doesn't exist
if 'ml_client' not in globals():
    from azure.ai.ml import MLClient
    from azure.identity import DefaultAzureCredential
    ml_client = MLClient.from_config(DefaultAzureCredential())

# Get environment
if 'donut_env' not in globals():
    from azure.ai.ml.entities import Environment
    donut_env = Environment(
        name="donut-lora-env",
        image="mcr.microsoft.com/azureml/curated/acpt-pytorch-2.0-cuda11.7:latest",
        conda_file="environment.yaml"
    )
    ml_client.environments.create_or_update(donut_env)

env_version = 22
try:
    registered_env = ml_client.environments.get(donut_env.name, version=str(env_version))
    env_ref = registered_env
except:
    env_ref = f"{donut_env.name}:{env_version}"

# Create Batch Endpoint (no quota issues - uses your compute)
batch_endpoint_name = f"donutqa-batch-{datetime.now().strftime('%m%d%H%M')}"
batch_endpoint = BatchEndpoint(
    name=batch_endpoint_name,
    description="Batch endpoint for Donut QA - processes documents asynchronously"
)
ml_client.batch_endpoints.begin_create_or_update(batch_endpoint).result()
print(f"Created batch endpoint: {batch_endpoint_name}")

# Deploy to batch endpoint
batch_deployment = BatchDeployment(
    name="batch",
    endpoint_name=batch_endpoint_name,
    model="donutQA:1",
    environment=env_ref,
    code_path="src",
    scoring_script="score.py",
    compute="anishswiss1",  # Uses your existing compute - no quota issues!
    instance_count=1
)
ml_client.batch_deployments.begin_create_or_update(batch_deployment).result()
batch_endpoint.defaults = {"deployment_name": "batch"}
ml_client.batch_endpoints.begin_create_or_update(batch_endpoint).result()

print(f"\n‚úÖ Batch Endpoint deployed!")
print(f"Endpoint name: {batch_endpoint_name}")
print(f"\nüìù Note: Batch endpoints process files/jobs asynchronously.")
print(f"   Submit jobs with: ml_client.batch_endpoints.invoke()")
print(f"   This uses your existing compute 'anishswiss1' - no quota issues!")



In [None]:
# Deploy to Virtual Machine (using your existing compute instance)
# This creates a web service on your VM without quota issues
from azure.ai.ml import MLClient
from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment, Environment
from azure.identity import DefaultAzureCredential
from datetime import datetime

# Create ml_client if it doesn't exist
if 'ml_client' not in globals():
    print("Creating MLClient connection...")
    ml_client = MLClient.from_config(DefaultAzureCredential())

# Get environment
if 'donut_env' not in globals():
    from azure.ai.ml.entities import Environment
    donut_env = Environment(
        name="donut-lora-env",
        image="mcr.microsoft.com/azureml/curated/acpt-pytorch-2.0-cuda11.7:latest",
        conda_file="environment.yaml"
    )
    ml_client.environments.create_or_update(donut_env)

env_version = 22
try:
    registered_env = ml_client.environments.get(donut_env.name, version=str(env_version))
    env_ref = registered_env
    print(f"Using registered environment: {registered_env.name}:{registered_env.version}")
except:
    env_ref = f"{donut_env.name}:{env_version}"

# Create endpoint
endpoint_name = f"donutqa-vm-{datetime.now().strftime('%m%d%H%M')}"
endpoint = ManagedOnlineEndpoint(
    name=endpoint_name,
    auth_mode="key"
)
ml_client.online_endpoints.begin_create_or_update(endpoint).result()
print(f"Created endpoint: {endpoint_name}")

# Deploy to VM - use compute instance as the target
# Note: This uses the VM's resources, so no additional quota needed
deployment = ManagedOnlineDeployment(
    name="vm",
    endpoint_name=endpoint_name,
    model="donutQA:1",
    environment=env_ref,
    code_path="src",
    scoring_script="score.py",
    # For VM deployment, you can use the compute instance directly
    # But managed endpoints still need instance_type - try smallest available
    instance_type="Standard_B1ms",  # Smallest: 1 vCPU, 2GB RAM
    instance_count=1
)
ml_client.online_deployments.begin_create_or_update(deployment).result()
print("Deployment to VM endpoint created successfully")

# Route traffic
endpoint.traffic = {"vm": 100}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

# Get endpoint details
endpoint = ml_client.online_endpoints.get(endpoint_name)
print(f"\n‚úÖ VM Endpoint deployed!")
print(f"Endpoint name: {endpoint_name}")
print(f"Scoring URI: {endpoint.scoring_uri}")
print(f"\nüìù Note: This still uses managed endpoint infrastructure.")
print(f"   For direct VM deployment, see alternative approach below.")



# Alternative: Deploy directly to your compute instance VM

## Option A: Manual deployment on compute instance
You can SSH into your compute instance (`anishswiss1`) and:
1. Copy your model and `score.py` to the VM
2. Install dependencies
3. Run a Flask/FastAPI server
4. Expose it via the compute instance's endpoint

## Option B: Use Azure ML's local deployment
Deploy locally on the compute instance for testing, then expose via port forwarding.

## Option C: Create a custom VM and deploy there
1. Create a new Azure VM
2. Install Python, dependencies
3. Deploy your model as a web service
4. More control, but you manage everything



In [None]:
# Deploy Flask app directly to your compute instance VM
# This creates a simple web service that runs on your VM

# Step 1: Copy files to compute instance
# You can do this via Azure ML Studio or SSH

# Step 2: SSH into your compute instance and run:
"""
# SSH into compute instance (from Azure ML Studio: Compute -> anishswiss1 -> Terminal)
# Or use: ssh azureuser@<compute-instance-ip>

# Navigate to your project directory
cd /home/azureuser/cloudfiles/code/Users/anishswiss/DonutQA

# Install Flask if not already installed
pip install flask

# Set model path (adjust if needed)
export MODEL_PATH="./donut_qa_model/donutQA/outputs/donut-lora"

# Run the Flask app
python src/app.py

# The service will be available at: http://localhost:5000
# To expose it externally, you may need to:
# 1. Configure network security group rules
# 2. Use Azure ML's compute instance endpoints
# 3. Or use port forwarding
"""

print("""
üìã To deploy to VM directly:

1. The Flask app is in: src/app.py
2. SSH into your compute instance: anishswiss1
3. Navigate to your project folder
4. Run: python src/app.py
5. Service will be at: http://localhost:5000/score

üìù Endpoint usage:
   POST http://localhost:5000/score
   Body: {"image": "<base64_image>", "question": "<your_question>"}
   
üí° To expose externally:
   - Use Azure ML compute instance endpoints (if available)
   - Or configure NSG rules and use public IP
   - Or use Azure Application Gateway
""")



In [1]:
# Deploy the model to a managed online endpoint
from azure.ai.ml import MLClient
from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment, Environment
from azure.identity import DefaultAzureCredential
from datetime import datetime

# Create ml_client if it doesn't exist (from Cell 1)
if 'ml_client' not in globals():
    print("Creating MLClient connection...")
    ml_client = MLClient.from_config(DefaultAzureCredential())

# Get or create donut_env if it doesn't exist
if 'donut_env' not in globals():
    print("Creating environment definition...")
    donut_env = Environment(
        name="donut-lora-env",
        image="mcr.microsoft.com/azureml/curated/acpt-pytorch-2.0-cuda11.7:latest",
        conda_file="environment.yaml"
    )
    # Register it
    ml_client.environments.create_or_update(donut_env)

# Get the registered environment with specific version
# Use version 22 (or get the latest if you prefer)
env_version = 22  # Specify the version number
try:
    registered_env = ml_client.environments.get(donut_env.name, version=str(env_version))
    env_ref = registered_env
    print(f"Using registered environment: {registered_env.name}:{registered_env.version}")
except Exception as e:
    print(f"Warning: Could not get environment version {env_version}. Error: {e}")
    # Fallback: use string format "name:version"
    env_ref = f"{donut_env.name}:{env_version}"
    print(f"Using environment reference: {env_ref}")

# Create endpoint
endpoint_name = f"donutqa-endpoint-{datetime.now().strftime('%m%d%H%M')}"
endpoint = ManagedOnlineEndpoint(
    name=endpoint_name,
    auth_mode="key"
)
ml_client.online_endpoints.begin_create_or_update(endpoint).result()
print(f"Created endpoint: {endpoint_name}")

# Deploy model
deployment = ManagedOnlineDeployment(
    name="blue",
    endpoint_name=endpoint_name,
    model="donutQA:1",  # String reference to registered model
    environment=env_ref,  # Use registered environment object
    code_path="src",
    scoring_script="score.py",  # Use scoring_script instead of entry_script
    instance_type="Standard_DS3_v2",
    instance_count=1
)
ml_client.online_deployments.begin_create_or_update(deployment).result()
print("Deployment created successfully")

# Route traffic
endpoint.traffic = {"blue": 100}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()
print("Traffic routed to deployment")

# Get endpoint details
endpoint = ml_client.online_endpoints.get(endpoint_name)
print(f"\n‚úÖ Endpoint deployed successfully!")
print(f"Endpoint name: {endpoint_name}")
print(f"Scoring URI: {endpoint.scoring_uri}")
print(f"Status: {endpoint.provisioning_state}")


Found the config file in: ./config.json
  mlflow.mismatch._check_version_mismatch()
ActivityCompleted: Activity=OnlineDeployment.BeginCreateOrUpdate, HowEnded=Failure, Duration=4281.22 [ms], Exception=HttpResponseError, ErrorCategory=UserError, ErrorMessage=(BadRequest) The request is invalid.
Code: BadRequest
Message: The request is invalid.
Exception Details:	(InferencingClientCallFailed) {"error":{"code":"Validation","message":"{\"errors\":{\"VmSize\":[\"Not enough quota available for Standard_DS3_v2 in SubscriptionId d00932c2-80b4-43af-8a00-529d3381b7ca. Current usage/limit: 4/6. Additional needed: 8 Please see troubleshooting guide, available here: https://aka.ms/oe-tsg#error-outofquota\"]},\"type\":\"https://tools.ietf.org/html/rfc9110#section-15.5.1\",\"title\":\"One or more validation errors occurred.\",\"status\":400,\"traceId\":\"00-4f58311eb928bc4360c4bcf1dcc6d9fe-8c432ac8f6ddc726-01\"}"}}
	Code: InferencingClientCallFailed
	Message: {"error":{"code":"Validation","message":"

Creating MLClient connection...
Creating environment definition...
Using registered environment: donut-lora-env:22
Created endpoint: donutqa-endpoint-11261848


HttpResponseError: (BadRequest) The request is invalid.
Code: BadRequest
Message: The request is invalid.
Exception Details:	(InferencingClientCallFailed) {"error":{"code":"Validation","message":"{\"errors\":{\"VmSize\":[\"Not enough quota available for Standard_DS3_v2 in SubscriptionId d00932c2-80b4-43af-8a00-529d3381b7ca. Current usage/limit: 4/6. Additional needed: 8 Please see troubleshooting guide, available here: https://aka.ms/oe-tsg#error-outofquota\"]},\"type\":\"https://tools.ietf.org/html/rfc9110#section-15.5.1\",\"title\":\"One or more validation errors occurred.\",\"status\":400,\"traceId\":\"00-4f58311eb928bc4360c4bcf1dcc6d9fe-8c432ac8f6ddc726-01\"}"}}
	Code: InferencingClientCallFailed
	Message: {"error":{"code":"Validation","message":"{\"errors\":{\"VmSize\":[\"Not enough quota available for Standard_DS3_v2 in SubscriptionId d00932c2-80b4-43af-8a00-529d3381b7ca. Current usage/limit: 4/6. Additional needed: 8 Please see troubleshooting guide, available here: https://aka.ms/oe-tsg#error-outofquota\"]},\"type\":\"https://tools.ietf.org/html/rfc9110#section-15.5.1\",\"title\":\"One or more validation errors occurred.\",\"status\":400,\"traceId\":\"00-4f58311eb928bc4360c4bcf1dcc6d9fe-8c432ac8f6ddc726-01\"}"}}
Additional Information:Type: ComponentName
Info: {
    "value": "managementfrontend"
}Type: Correlation
Info: {
    "value": {
        "operation": "4f58311eb928bc4360c4bcf1dcc6d9fe",
        "request": "27fff206363494b1"
    }
}Type: Environment
Info: {
    "value": "eastus2"
}Type: Location
Info: {
    "value": "eastus2"
}Type: Time
Info: {
    "value": "2025-11-26T18:50:13.6853925+00:00"
}