# Train and Test the Form Recognizer service

Trains and tests the Form Recognizer service with your own forms. Note that there is also a prebuilt model for receipts; see [here](https://docs.microsoft.com/en-us/azure/cognitive-services/form-recognizer/quickstarts/python-receipts) for more information.

## Prerequisites

- Azure subscription
- [Form Recognizer account](https://aka.ms/FormRecognizerRequestAccess)
- [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest) (remember to update it if you haven't done so in a while)
- Train and Test files - see [here](https://docs.microsoft.com/en-us/azure/cognitive-services/form-recognizer/quickstarts/python-train-extract#train-a-form-recognizer-model) for current details
    * Training documents should have the same type/structure
    * At least 5 documents, or 2 documents plus an empty form
    * Max. 50 training pages

In [1]:
import random
import string
import re
import os
import glob

## Configuration

Note: Non-existing resources will be created.

In [2]:
# Local folders that hold the training and test documents.
local_train_files_directory = "Train"
local_test_files_directory = "Test"

azure_region = "westeurope"
resource_group = "form-recognizer-spike"


def make_storage_account_name(prefix, suffix_length=8):
    """Build a storage account name from a prefix plus a random suffix.

    Only lowercase ASCII letters and digits are kept, because Azure rejects
    storage account names containing anything else (uppercase letters,
    hyphens, underscores, ...).

    Args:
        prefix: Text the name is derived from; only its first 15 characters
            are considered, and invalid characters among them are dropped.
        suffix_length: Number of random characters appended to the prefix.

    Returns:
        The cleaned prefix followed by ``suffix_length`` random characters.
        With the default suffix length the result is 8 to 23 characters long.
    """
    allowed = string.ascii_lowercase + string.digits
    cleaned_prefix = "".join(ch for ch in prefix[:15].lower() if ch in allowed)
    random_suffix = "".join(random.choice(allowed) for _ in range(suffix_length))
    return cleaned_prefix + random_suffix


# The random suffix changes on every run of this cell, so each run targets a new account.
storage_account = make_storage_account_name(resource_group)
train_container = "train"

form_recognizer_endpoint = "https://<add your own Form Recognizer service name here>.cognitiveservices.azure.com/"
form_recognizer_subscription_key = "<add your own Form Recognizer key here>"

## Initialization

In [3]:
# URL of the training container on the storage account's blob endpoint
# (without any SAS token).
train_container_url = "https://{}.blob.core.windows.net/{}".format(
    storage_account,
    train_container,
)

## Document Upload
Note: This step is required. As of November 2019, Form Recognizer reads training documents only from a blob store. The upload can take a while.

In [4]:
# create the resource group in the configured region
# (IPython replaces $name with the value of the Python variable `name`)
!az group create --name $resource_group --location $azure_region

# create the storage account (kind StorageV2) inside that resource group,
# then the container that will receive the training files
!az storage account create --name $storage_account --kind StorageV2 --resource-group $resource_group --location $azure_region
!az storage container create --name $train_container --account-name $storage_account

{
  "id": "/subscriptions/e13f4dad-910a-4301-bcca-9f47839dccf3/resourceGroups/form-recognizer-spike",
  "location": "westeurope",
  "managedBy": null,
  "name": "form-recognizer-spike",
  "properties": {
    "provisioningState": "Succeeded"
  },
  "tags": null,
  "type": "Microsoft.Resources/resourceGroups"
}
{
  "accessTier": "Hot",
  "azureFilesIdentityBasedAuthentication": null,
  "creationTime": "2019-11-07T10:29:53.397013+00:00",
  "customDomain": null,
  "enableHttpsTrafficOnly": true,
  "encryption": {
    "keySource": "Microsoft.Storage",
    "keyVaultProperties": null,
    "services": {
      "blob": {
        "enabled": true,
        "lastEnabledTime": "2019-11-07T10:29:53.459533+00:00"
      },
      "file": {
        "enabled": true,
        "lastEnabledTime": "2019-11-07T10:29:53.459533+00:00"
      },
      "queue": null,
      "table": null
    }
  },
  "failoverInProgress": null,
  "geoReplicationStats": null,
  "id": "/subscriptions/e13f4dad-910a-4301-bcca-9f47839dccf3



{
  "created": true
}


In [5]:
# upload all files from the local training directory to the training container
# note: Jupyter sometimes hangs here. If that happens, upload the files manually, e.g. using Storage Explorer
!az storage blob upload-batch --source $local_train_files_directory --destination $train_container --account-name $storage_account

[
  {
    "Blob": "https://formrecognizervorkde08.blob.core.windows.net/train/Invoice_1.pdf",
    "Last Modified": "2019-11-07T10:30:22+00:00",
    "Type": "application/pdf",
    "eTag": "\"0x8D7636D7E888EDF\""
  },
  {
    "Blob": "https://formrecognizervorkde08.blob.core.windows.net/train/Invoice_2.pdf",
    "Last Modified": "2019-11-07T10:30:22+00:00",
    "Type": "application/pdf",
    "eTag": "\"0x8D7636D7E982220\""
  },
  {
    "Blob": "https://formrecognizervorkde08.blob.core.windows.net/train/Invoice_3.pdf",
    "Last Modified": "2019-11-07T10:30:22+00:00",
    "Type": "application/pdf",
    "eTag": "\"0x8D7636D7EB105E9\""
  },
  {
    "Blob": "https://formrecognizervorkde08.blob.core.windows.net/train/Invoice_4.pdf",
    "Last Modified": "2019-11-07T10:30:22+00:00",
    "Type": "application/pdf",
    "eTag": "\"0x8D7636D7EC1AACA\""
  },
  {
    "Blob": "https://formrecognizervorkde08.blob.core.windows.net/train/Invoice_5.pdf",
    "Last Modified": "2019-11-07T10:30:22+00:00",



1/5: "Invoice_1.pdf"[#################################################]  100.0000%
2/5: "Invoice_2.pdf"[#################################################]  100.0000%
3/5: "Invoice_3.pdf"[#################################################]  100.0000%
4/5: "Invoice_4.pdf"[#################################################]  100.0000%
5/5: "Invoice_5.pdf"[#################################################]  100.0000%
Finished[#############################################################]  100.0000%


In [6]:
# generate SAS for training files container
train_container_sas = !az storage container generate-sas --name $train_container --account-name $storage_account --https-only --permissions lr --expiry 2030-01-01
train_container_sas = re.sub(r"\"", "", train_container_sas[0])

## Training

In [7]:
def train_model(form_recognizer_endpoint, form_recognizer_subscription_key, train_container_url_sas):
    """Train a custom Form Recognizer model from documents in a blob container.

    Sends a POST request to the ``v1.0-preview`` ``custom/train`` route and
    prints the response status code and JSON body.

    Args:
        form_recognizer_endpoint: Base URL of the Form Recognizer resource,
            with or without a trailing slash.
        form_recognizer_subscription_key: Key sent in the
            ``Ocp-Apim-Subscription-Key`` header.
        train_container_url_sas: Container URL including its SAS token; sent
            as the ``source`` field of the request body.

    Returns:
        The ``modelId`` value of the response, or ``None`` if the response
        has no ``modelId`` or the request/parsing failed (the error message
        is printed in that case).
    """
    from requests import post as http_post
    import json

    # Strip trailing slashes: the endpoint is usually configured as "https://.../",
    # which would otherwise yield "https://...//formrecognizer/...".
    base_url = form_recognizer_endpoint.rstrip("/") + "/formrecognizer/v1.0-preview/custom"
    url = base_url + "/train"
    headers = {
        'Content-Type': 'application/json',
        'Ocp-Apim-Subscription-Key': form_recognizer_subscription_key,
    }
    body = {"source": train_container_url_sas}
    try:
        resp = http_post(url = url, json = body, headers = headers)
        # Print the status before parsing so it is visible even if the body is not JSON.
        print("Response status code: %d" % resp.status_code)
        resp_json = resp.json()
        print("Response body:")
        print(json.dumps(resp_json, indent=2))
        return resp_json["modelId"] if "modelId" in resp_json else None
    except Exception as e:
        # Best effort: report the problem and let the notebook continue.
        print(str(e))
        return None

# Append the SAS token as query string, then train a model on that container
# and keep the returned model id for the following cells.
train_container_url_sas = "{}?{}".format(
    train_container_url,
    train_container_sas,
)
form_recognizer_model_id = train_model(
    form_recognizer_endpoint=form_recognizer_endpoint,
    form_recognizer_subscription_key=form_recognizer_subscription_key,
    train_container_url_sas=train_container_url_sas,
)

Response status code: 200
Response body:
{
  "modelId": "f9da1958-1362-4e73-bb3e-8444314b9b61",
  "trainingDocuments": [
    {
      "documentName": "Invoice_1.pdf",
      "pages": 1,
      "errors": [],
      "status": "success"
    },
    {
      "documentName": "Invoice_2.pdf",
      "pages": 1,
      "errors": [],
      "status": "success"
    },
    {
      "documentName": "Invoice_3.pdf",
      "pages": 1,
      "errors": [],
      "status": "success"
    },
    {
      "documentName": "Invoice_4.pdf",
      "pages": 1,
      "errors": [],
      "status": "success"
    },
    {
      "documentName": "Invoice_5.pdf",
      "pages": 1,
      "errors": [],
      "status": "success"
    }
  ],
  "errors": []
}


## Testing

In [8]:
# define extraction function
def extract_data(form_recognizer_endpoint, form_recognizer_subscription_key, form_recognizer_model_id, file_path, file_type):
    """Send one local file to a trained custom model and return the analysis.

    Reads the file as bytes, posts it to the ``v1.0-preview``
    ``custom/models/<id>/analyze`` route and prints the file path, the
    response status code and the JSON body.

    Args:
        form_recognizer_endpoint: Base URL of the Form Recognizer resource,
            with or without a trailing slash.
        form_recognizer_subscription_key: Key sent in the
            ``Ocp-Apim-Subscription-Key`` header.
        form_recognizer_model_id: Id of the model to analyze the file with.
        file_path: Path of the local file to submit.
        file_type: Value sent as the ``Content-Type`` header.

    Returns:
        The parsed JSON response, or ``None`` if reading the file, the
        request or the parsing failed (the error message is printed).
    """
    from requests import post as http_post
    import json

    # Strip trailing slashes: the endpoint is usually configured as "https://.../",
    # which would otherwise yield "https://...//formrecognizer/...".
    base_url = form_recognizer_endpoint.rstrip("/") + "/formrecognizer/v1.0-preview/custom"
    headers = {
        # Request headers
        'Content-Type': file_type,
        'Ocp-Apim-Subscription-Key': form_recognizer_subscription_key,
    }

    try:
        url = base_url + "/models/" + form_recognizer_model_id + "/analyze"
        with open(file_path, "rb") as f:
            data_bytes = f.read()
        resp = http_post(url = url, data = data_bytes, headers = headers)
        print("File: %s" % file_path)
        print("Response status code: %d" % resp.status_code)
        print("Response body:")
        # Parse the body once and reuse it for both printing and the return value.
        resp_json = resp.json()
        print(json.dumps(resp_json, indent=2))
        return resp_json
    except Exception as e:
        # Best effort: report the problem and let the caller continue with the next file.
        print(str(e))
        return None

# Walk the test directory recursively and submit every supported file to Form Recognizer.
# The mapping gives the Content-Type header for each supported (lower-case) file extension.
# PNG and JPEG are image media types ("image/png", "image/jpeg"), not "application/...".
file_type_mapping = {
    "pdf": "application/pdf",
    "png": "image/png",
    "jpg": "image/jpeg",
    "jpeg": "image/jpeg"
}
for file_path in glob.iglob(local_test_files_directory + "/**/*", recursive=True):
    file_extension = os.path.splitext(file_path)[1][1:].lower()
    file_type = file_type_mapping.get(file_extension)
    if file_type is None:
        # Directories and files with an unsupported extension are skipped.
        continue
    extracted_data = extract_data(form_recognizer_endpoint, form_recognizer_subscription_key, form_recognizer_model_id, file_path, file_type)
    print("")

File: Test\Invoice_6.pdf
Response status code: 200
Response body:
{
  "status": "success",
  "pages": [
    {
      "number": 1,
      "height": 792,
      "width": 612,
      "clusterId": 0,
      "keyValuePairs": [
        {
          "key": [
            {
              "text": "Address:",
              "boundingBox": [
                57.3,
                683.0,
                100.5,
                683.0,
                100.5,
                673.7,
                57.3,
                673.7
              ]
            }
          ],
          "value": [
            {
              "text": "14564 Main St.",
              "boundingBox": [
                57.3,
                672.3,
                124.3,
                672.3,
                124.3,
                658.7,
                57.3,
                658.7
              ],
              "confidence": 0.86
            },
            {
              "text": "Saratoga, CA 94588",
              "boundingBox": [
          

## Cleanup

In [None]:
# define function to delete a model
def delete_model(form_recognizer_endpoint, form_recognizer_subscription_key, form_recognizer_model_id):
    """Delete a custom Form Recognizer model.

    Sends a DELETE request to the ``v1.0-preview`` ``custom/models/<id>``
    route and prints the response status code and, if present, the JSON body.

    Args:
        form_recognizer_endpoint: Base URL of the Form Recognizer resource,
            with or without a trailing slash.
        form_recognizer_subscription_key: Key sent in the
            ``Ocp-Apim-Subscription-Key`` header.
        form_recognizer_model_id: Id of the model to delete.

    Returns:
        The parsed JSON response if the response has a JSON body, otherwise
        ``None`` (also when the request failed; the error message is printed).
    """
    from requests import delete as http_delete
    import json

    # Strip trailing slashes: the endpoint is usually configured as "https://.../",
    # which would otherwise yield "https://...//formrecognizer/...".
    model_url = form_recognizer_endpoint.rstrip("/") + "/formrecognizer/v1.0-preview/custom/models/" + form_recognizer_model_id
    headers = {
        # Request headers
        'Ocp-Apim-Subscription-Key': form_recognizer_subscription_key
    }

    try:
        resp = http_delete(url = model_url, headers = headers)
        print("Response status code: %d" % resp.status_code)
        try:
            resp_json = resp.json()
        except ValueError:
            # The response carries no (valid) JSON body, so there is nothing to print or return.
            return None
        print("Response body:")
        print(json.dumps(resp_json, indent=2))
        return resp_json
    except Exception as e:
        # Best effort: report the problem and let the notebook continue.
        print(str(e))
        return None
        
# Delete the model that was trained earlier in this notebook.
delete_model(
    form_recognizer_endpoint=form_recognizer_endpoint,
    form_recognizer_subscription_key=form_recognizer_subscription_key,
    form_recognizer_model_id=form_recognizer_model_id,
)

In [9]:
# delete the resource group in Azure, including the storage account that was created in it;
# --yes skips the confirmation prompt
!az group delete --name $resource_group --yes