# Form Recognizer Python Example

6th April 2020

Microsoft Form Recognizer Python tutorial (train a custom model, then analyze a local form):


https://docs.microsoft.com/en-us/azure/cognitive-services/form-recognizer/quickstarts/python-train-extract

## 1. Load the Required Libraries

In [1]:
# import the libraries used throughout the notebook
import azure, json, os, requests
import pandas as pd
from dotenv import load_dotenv
import sys
import time
from requests import get, post

In [2]:
# Must run before any os.getenv(...) call below: later cells read SAS_KEY,
# SUBKEY and file_path from the environment.
# NOTE(review): presumably these are defined in a local .env file that
# load_dotenv picks up -- confirm the file exists and is not committed.
load_dotenv(verbose=True)

True

## 2. Train the Model 

In [3]:
########### Python Form Recognizer Labeled Async Train #############
# Submits a training request for a custom model. On success the service
# answers 201 and the "location" response header holds the URL of the new
# model, which is stored in `get_url` for the polling cell that follows.

# Endpoint URL
endpoint = r"https://formrecogniserapi.cognitiveservices.azure.com/"
# `endpoint` already ends with "/", so strip it before appending the path;
# otherwise the request URL contains "//formrecognizer".
post_url = endpoint.rstrip("/") + r"/formrecognizer/v2.0-preview/custom/models"

# Fail early with a readable message instead of a TypeError from
# str + None (SAS_KEY) or an unauthenticated request (SUBKEY).
sas_key = os.getenv("SAS_KEY")
subscription_key = os.getenv("SUBKEY")
if not sas_key or not subscription_key:
    raise RuntimeError("SAS_KEY and SUBKEY must both be set in the environment")

source = r"https://formsstorageamc.blob.core.windows.net/trainingforms?" + sas_key
prefix = ""
includeSubFolders = False
useLabelFile = False

headers = {
    # Request headers
    'Content-Type': 'application/json',
    'Ocp-Apim-Subscription-Key': subscription_key,
}

body = {
    "source": source,
    "sourceFilter": {
        "prefix": prefix,
        "includeSubFolders": includeSubFolders
    },
    "useLabelFile": useLabelFile
}

# Raise instead of calling quit(): an exception stops this cell (and a
# "Run All") with a traceback, without asking the kernel to shut down.
try:
    resp = post(url=post_url, json=body, headers=headers)
except requests.exceptions.RequestException as e:
    raise RuntimeError("POST model failed:\n%s" % str(e)) from e

if resp.status_code != 201:
    # resp.text rather than resp.json(): an error body is not guaranteed to be JSON.
    raise RuntimeError("POST model failed (%s):\n%s" % (resp.status_code, resp.text))

print("POST model succeeded:\n%s" % resp.headers)
get_url = resp.headers["location"]



POST model succeeded:
{'Content-Length': '0', 'Location': 'https://formrecogniserapi.cognitiveservices.azure.com/formrecognizer/v2.0-preview/custom/models/7d30db5f-5f1a-4db3-b658-67cfd7fae7e7', 'x-envoy-upstream-service-time': '48', 'apim-request-id': 'c08cc555-c579-43b0-b617-b67013580cf7', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains; preload', 'x-content-type-options': 'nosniff', 'Date': 'Tue, 07 Apr 2020 01:06:15 GMT'}


In [4]:
# Poll the model URL returned by the training POST until training finishes.
# On success the model ID is saved in `modelID`, which the analyze cell uses.
#
# `get_url` and `headers` come from the training POST cell. Both names are
# reassigned by the analyze cell further down, so re-run the training POST
# before re-running this cell out of order.

n_tries = 15
n_try = 0
wait_sec = 5
max_wait_sec = 60
while n_try < n_tries:
    # Raise instead of calling quit(): an exception stops this cell (and a
    # "Run All") with a traceback, without asking the kernel to shut down.
    try:
        resp = get(url=get_url, headers=headers)
    except requests.exceptions.RequestException as e:
        raise RuntimeError("GET model failed:\n%s" % str(e)) from e

    # Check the status code before parsing: an error body may not be JSON.
    if resp.status_code != 200:
        raise RuntimeError("GET model failed (%s):\n%s" % (resp.status_code, resp.text))

    resp_json = resp.json()
    model_status = resp_json["modelInfo"]["status"]
    if model_status == "ready":
        print("Training succeeded:\n%s" % json.dumps(resp_json, indent=4, sort_keys=True))
        modelID = resp_json["modelInfo"]["modelId"]
        print(modelID)
        break
    if model_status == "invalid":
        raise RuntimeError("Training failed. Model is invalid:\n%s" % json.dumps(resp_json))

    # Training still running. Wait and retry, doubling the delay up to max_wait_sec.
    time.sleep(wait_sec)
    n_try += 1
    wait_sec = min(2 * wait_sec, max_wait_sec)
else:
    # Reached only when every attempt was used without a `break`; previously
    # the loop ended silently and `modelID` was left undefined.
    raise TimeoutError("Train operation did not complete within the allocated time.")

Training succeeded:
{
    "modelInfo": {
        "createdDateTime": "2020-04-07T01:06:15Z",
        "lastUpdatedDateTime": "2020-04-07T01:06:40Z",
        "modelId": "7d30db5f-5f1a-4db3-b658-67cfd7fae7e7",
        "status": "ready"
    },
    "trainResult": {
        "errors": [],
        "trainingDocuments": [
            {
                "documentName": "SLNI 36 (SLNR 36) - Copy.pdf",
                "errors": [],
                "pages": 1,
                "status": "succeeded"
            },
            {
                "documentName": "SLNI 36 (SLNR 36).pdf",
                "errors": [],
                "pages": 1,
                "status": "succeeded"
            },
            {
                "documentName": "SLNR 37.pdf",
                "errors": [],
                "pages": 1,
                "status": "succeeded"
            },
            {
                "documentName": "VHC7 - Logicapp.pdf",
                "errors": [],
                "pages": 1,
                "

In [5]:
# Peek at the document path configured under "file_path" in the environment.
# The value is None when the variable is not set.
x = os.environ.get("file_path")
x

'C:/Users/ancibira/OneDrive - Microsoft/Customers/CSIRO/Plastics Illegal Fishing/FormsRecogniser/First Pages/VHC7.pdf'

## 3. Perform Model Inferencing on a Local Document

In [6]:
# Send a local PDF to the custom model for analysis. The service answers
# 202 and the "operation-location" response header holds the URL to poll
# for results, which is stored in `get_url`.

# local path of document
source = os.getenv("file_path")
apim_key = os.getenv("SUBKEY")
# Fail early with a readable message instead of open(None) raising a
# TypeError or the request going out without a subscription key.
if not source or not apim_key:
    raise RuntimeError("file_path and SUBKEY must both be set in the environment")

# Endpoint URL
model_id = modelID
# `endpoint` ends with "/", so strip it before appending the path;
# otherwise the request URL contains "//formrecognizer".
post_url = endpoint.rstrip("/") + "/formrecognizer/v2.0-preview/custom/models/%s/analyze" % model_id
params = {
    "includeTextDetails": True
}

headers = {
    # Request headers
    'Content-Type': 'application/pdf',
    'Ocp-Apim-Subscription-Key': apim_key,
}
with open(source, "rb") as f:
    data_bytes = f.read()

# Raise instead of calling quit(): an exception stops this cell (and a
# "Run All") with a traceback, without asking the kernel to shut down.
try:
    resp = post(url=post_url, data=data_bytes, headers=headers, params=params)
except requests.exceptions.RequestException as e:
    raise RuntimeError("POST analyze failed:\n%s" % str(e)) from e

if resp.status_code != 202:
    # resp.text rather than resp.json(): an error body is not guaranteed to be JSON.
    raise RuntimeError("POST analyze failed (%s):\n%s" % (resp.status_code, resp.text))

print("POST analyze succeeded:\n%s" % resp.headers)
get_url = resp.headers["operation-location"]
    


POST analyze succeeded:
{'Content-Length': '0', 'Operation-Location': 'https://formrecogniserapi.cognitiveservices.azure.com/formrecognizer/v2.0-preview/custom/models/7d30db5f-5f1a-4db3-b658-67cfd7fae7e7/analyzeresults/52c7e81b-bbac-47aa-8439-c4167166a713', 'x-envoy-upstream-service-time': '227', 'apim-request-id': 'ef3d4ff9-251b-461c-b044-6692d8fd73cd', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains; preload', 'x-content-type-options': 'nosniff', 'Date': 'Tue, 07 Apr 2020 01:06:54 GMT'}


In [7]:
# Poll the analyze operation URL (`get_url`, set by the analyze POST cell)
# until the service reports "succeeded" or "failed". On success the full
# result is left in `resp_json`.

n_tries = 15
n_try = 0
wait_sec = 5
max_wait_sec = 60
while n_try < n_tries:
    # Raise instead of calling quit(): an exception stops this cell (and a
    # "Run All") with a traceback, without asking the kernel to shut down.
    try:
        resp = get(url=get_url, headers={"Ocp-Apim-Subscription-Key": apim_key})
    except requests.exceptions.RequestException as e:
        raise RuntimeError("GET analyze results failed:\n%s" % str(e)) from e

    # Check the status code before parsing: an error body may not be JSON.
    if resp.status_code != 200:
        raise RuntimeError("GET analyze results failed (%s):\n%s" % (resp.status_code, resp.text))

    resp_json = resp.json()
    status = resp_json["status"]
    if status == "succeeded":
        print("Analysis succeeded:\n%s" % json.dumps(resp_json, indent=4, sort_keys=True))
        break
    if status == "failed":
        raise RuntimeError("Analysis failed:\n%s" % json.dumps(resp_json))

    # Analysis still running. Wait and retry, doubling the delay up to max_wait_sec.
    time.sleep(wait_sec)
    n_try += 1
    wait_sec = min(2 * wait_sec, max_wait_sec)
else:
    # Reached only when every attempt was used without a `break`; previously
    # the loop ended silently with no indication that analysis never finished.
    raise TimeoutError("Analyze operation did not complete within the allocated time.")

Analysis succeeded:
{
    "analyzeResult": {
        "documentResults": [],
        "errors": [],
        "pageResults": [
            {
                "clusterId": 0,
                "keyValuePairs": [
                    {
                        "confidence": 0.82,
                        "key": {
                            "boundingBox": [
                                0.5347,
                                3.6319,
                                1.7111,
                                3.6319,
                                1.7111,
                                3.7569,
                                0.5347,
                                3.7569
                            ],
                            "elements": [
                                "#/readResults/0/lines/24/words/0"
                            ],
                            "text": "Location/Municipality"
                        },
                        "value": {
                            "boundingBox