# Train and Test the Form Recognizer service

Trains and tests the Form Recognizer service with your own forms. Note that there is also a prebuilt model for receipts; see [the receipts quickstart](https://docs.microsoft.com/en-us/azure/cognitive-services/form-recognizer/quickstarts/python-receipts) for more information.

## Prerequisites

- Jupyter, Python 3.6, pandas, requests, tabulate
- Azure subscription
- [Form Recognizer account](https://aka.ms/FormRecognizerRequestAccess)
- [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest) (update it first if you have not done so in a while)
- Train and Test files - see [here](https://docs.microsoft.com/en-us/azure/cognitive-services/form-recognizer/quickstarts/python-train-extract#train-a-form-recognizer-model) for current details
    * Training documents should have the same type/structure
    * At least 5 filled-in documents, or 2 filled-in documents plus an empty form
    * Max. 50 training pages

In [None]:
import random
import string
import re
import os
import glob
import json
import pandas as pd
import tabulate
from requests import post as http_post
from requests import delete as http_delete
from tabulate import tabulate

## Configuration

Note: Resources that do not exist yet will be created.

In [None]:
# local folders holding the training and test documents
local_train_files_directory = "Train"
local_test_files_directory = "Test"

azure_region = "westeurope"
resource_group = "form-recognizer-spike"

# Storage account names may only contain lower-case letters and digits, so the
# prefix derived from the resource group name is lower-cased and stripped of
# every other character. A random 8-character suffix keeps the name unique;
# 15 + 8 characters stays within the 24-character limit.
storage_account_prefix = re.sub(r"[^a-z0-9]", "", resource_group[:15].lower())
storage_account_suffix = "".join(
    random.choice(string.ascii_lowercase + string.digits) for _ in range(8)
)
storage_account = storage_account_prefix + storage_account_suffix
train_container = "train"

form_recognizer_endpoint = "https://<add your own Form Recognizer service name here>.cognitiveservices.azure.com/"
form_recognizer_subscription_key = "<add your own Form Recognizer key here>"

## Initialization

In [None]:
# URL of the training container inside the storage account (without SAS token)
blob_container_url_template = "https://{}.blob.core.windows.net/{}"
train_container_url = blob_container_url_template.format(storage_account, train_container)

## Document Upload
Note: This step is required. As of now (Nov 2019), Form Recognizer can only be trained on documents stored in an Azure blob container. The upload can take a while.

In [None]:
# create resource group
# ($resource_group and $azure_region refer to the Python variables set in the configuration cell)
!az group create --name $resource_group --location $azure_region

# create storage account and training file container
# (the storage account is placed in the resource group and region created above)
!az storage account create --name $storage_account --kind StorageV2 --resource-group $resource_group --location $azure_region
!az storage container create --name $train_container --account-name $storage_account

In [None]:
# upload all files from the local training directory into the training container
# note: Jupyter sometimes gets stuck here. If that happens, upload the files manually, e.g. using Storage Explorer
!az storage blob upload-batch --source $local_train_files_directory --destination $train_container --account-name $storage_account

In [None]:
# generate SAS for training files container
# (HTTPS only, permissions "lr", expiry fixed to 2030-01-01)
train_container_sas = !az storage container generate-sas --name $train_container --account-name $storage_account --https-only --permissions lr --expiry 2030-01-01
# keep only the first line of the captured command output and remove the double quotes from it
# NOTE(review): assumes the SAS token is on the first output line - confirm that the CLI prints nothing before it
train_container_sas = re.sub(r"\"", "", train_container_sas[0])

## Training

In [None]:
def train_model(form_recognizer_endpoint, form_recognizer_subscription_key, train_container_url_sas):
    """Request training of a custom Form Recognizer model and return its ID.

    Sends a POST request to the ``v1.0-preview`` ``custom/train`` route with
    the training container URL as ``source``. The response status code and
    the JSON body are printed for inspection.

    Args:
        form_recognizer_endpoint: Base URL of the Form Recognizer resource,
            with or without a trailing slash.
        form_recognizer_subscription_key: Key sent in the
            ``Ocp-Apim-Subscription-Key`` header.
        train_container_url_sas: URL of the training container including the
            SAS query string.

    Returns:
        The ``modelId`` value from the response body, or ``None`` when the
        body has no ``modelId`` or the request could not be completed.
    """
    # Strip trailing slashes so the joined URL does not contain "//".
    base_url = form_recognizer_endpoint.rstrip("/") + "/formrecognizer/v1.0-preview/custom"
    url = base_url + "/train"
    headers = {
        'Content-Type': 'application/json',
        'Ocp-Apim-Subscription-Key': form_recognizer_subscription_key,
    }
    body = {"source": train_container_url_sas}

    try:
        resp = http_post(url=url, json=body, headers=headers)
        # Print the status first so it is visible even if the body is not JSON.
        print("Response status code: %d" % resp.status_code)
        resp_json = resp.json()
    except Exception as e:
        # Best-effort notebook helper: report the problem and signal failure.
        print(str(e))
        return None

    print("Response body:")
    print(json.dumps(resp_json, indent=2))
    if isinstance(resp_json, dict):
        return resp_json.get("modelId")
    return None

# append the SAS token as query string and start the training
sas_url_parts = (train_container_url, train_container_sas)
train_container_url_sas = "{}?{}".format(*sas_url_parts)
form_recognizer_model_id = train_model(
    form_recognizer_endpoint=form_recognizer_endpoint,
    form_recognizer_subscription_key=form_recognizer_subscription_key,
    train_container_url_sas=train_container_url_sas,
)

# show the ID of the trained model (None if no model ID was returned)
print("")
print("Form Recognizer Model ID: {}".format(form_recognizer_model_id))

## Testing

In [None]:
# define extraction function
def extract_data(form_recognizer_endpoint, form_recognizer_subscription_key, form_recognizer_model_id, file_path, file_type):
    """Send a local file to a trained custom model for analysis.

    Reads the file as bytes and posts it to the ``v1.0-preview``
    ``custom/models/<model id>/analyze`` route.

    Args:
        form_recognizer_endpoint: Base URL of the Form Recognizer resource,
            with or without a trailing slash.
        form_recognizer_subscription_key: Key sent in the
            ``Ocp-Apim-Subscription-Key`` header.
        form_recognizer_model_id: ID of the model to analyze the file with.
        file_path: Path of the local file to submit.
        file_type: Value sent in the ``Content-Type`` header.

    Returns:
        A ``(status_code, response_json)`` tuple. If the file cannot be read
        or the request fails, the error is printed and
        ``(None, {"error": {"message": <text>}})`` is returned, so callers
        can always unpack two values.
    """
    headers = {
        # Request headers
        'Content-Type': file_type,
        'Ocp-Apim-Subscription-Key': form_recognizer_subscription_key,
    }

    try:
        # Strip trailing slashes so the joined URL does not contain "//".
        base_url = form_recognizer_endpoint.rstrip("/") + "/formrecognizer/v1.0-preview/custom"
        url = base_url + "/models/" + form_recognizer_model_id + "/analyze"
        with open(file_path, "rb") as f:
            data_bytes = f.read()
        resp = http_post(url=url, data=data_bytes, headers=headers)
        return resp.status_code, resp.json()
    except Exception as e:
        # Best-effort notebook helper: report the problem, keep the return shape.
        print(str(e))
        return None, {"error": {"message": str(e)}}

# extracts the identified key-value pairs into a Pandas dataframe
def extract_key_value_pairs(response):
    """Collect the key-value pairs of an analysis response in a DataFrame.

    Args:
        response: Parsed JSON analysis response. Only responses whose
            ``status`` is ``"success"`` are evaluated; each entry of
            ``pages`` contributes its ``number`` and ``keyValuePairs``.

    Returns:
        A DataFrame with the columns ``Page``, ``Key`` and ``Value`` and one
        row per key-value pair. The text fragments of a key (or value) are
        joined with single spaces. The frame is empty when the status is not
        ``"success"`` or no pairs were found.
    """
    columns = ["Page", "Key", "Value"]
    rows = []

    if response.get("status") == "success":
        for page in response.get("pages", []):
            page_number = page["number"]
            for key_value_pair in page.get("keyValuePairs", []):
                key = " ".join(element["text"] for element in key_value_pair["key"])
                value = " ".join(element["text"] for element in key_value_pair["value"])
                rows.append((page_number, key, value))

    if not rows:
        # Keep the column dtypes well-defined even without any data.
        return pd.DataFrame(columns=columns).astype({"Page": int, "Key": str, "Value": str})

    # Build the frame once; DataFrame.append is no longer available in pandas 2.x.
    return pd.DataFrame(rows, columns=columns)

# yields the identified tables as dataframes, one per table
def extract_tables(response):
    """Generate a DataFrame for every table found in an analysis response.

    Nothing is yielded unless ``response["status"]`` equals ``"success"``.
    Each table column becomes a DataFrame column: the header fragments are
    joined with spaces to form the column name, and the fragments of each
    entry are joined with spaces to form the cell text. The page number is
    stored in a ``Page`` column that is moved to the first position.
    """
    if response["status"] != "success":
        return

    for current_page in response["pages"]:
        number = current_page["number"]
        for tbl in current_page["tables"]:
            # collect the cell texts per column title
            column_data = {}
            for col in tbl["columns"]:
                title = " ".join(part["text"] for part in col["header"])
                column_data[title] = [
                    " ".join(word["text"] for word in cell)
                    for cell in col["entries"]
                ]
            frame = pd.DataFrame(column_data)

            # add the page number and move that column to the front
            frame["Page"] = number
            ordered = ["Page"] + [name for name in frame.columns.tolist() if name != "Page"]

            yield frame.reindex(columns=ordered)

# iterate through all relevant files in the test directory and submit each file to Form Recognizer
# maps a lower-case file extension to the Content-Type header sent with the file
file_type_mapping = {
    "pdf": "application/pdf",
    "png": "image/png",
    "jpg": "image/jpeg",
    "jpeg": "image/jpeg",
}
for file_path in glob.iglob(local_test_files_directory + "/**/*", recursive=True):
    file_extension = os.path.splitext(file_path)[1][1:].lower()
    file_type = file_type_mapping.get(file_extension)
    if file_type is None:
        # not a supported document type (or a directory)
        continue

    print("File: {}".format(file_path))

    analysis = extract_data(form_recognizer_endpoint, form_recognizer_subscription_key,
        form_recognizer_model_id, file_path, file_type)
    if analysis is None:
        # the request could not be completed; extract_data has printed the reason
        print("")
        print("")
        continue
    status_code, extracted_data = analysis

    # remove comments to show full JSON response
    #print("Response:")
    #print(json.dumps(extracted_data, indent=2))

    if status_code == 200:
        print("")
        print("Extracted Key-Value Pairs:")
        key_value_pairs = extract_key_value_pairs(extracted_data)
        if key_value_pairs is None:
            # the analysis did not succeed; show the raw response instead
            print(json.dumps(extracted_data, indent=2))
        else:
            print(tabulate(key_value_pairs, headers='keys', tablefmt='psql', showindex="never"))

        print("")
        print("Extracted Tables:")
        for extracted_table in extract_tables(extracted_data):
            print(tabulate(extracted_table, headers='keys', tablefmt='psql', showindex="never"))
    else:
        print(json.dumps(extracted_data, indent=2))

    print("")
    print("")

## Cleanup

In [None]:
# define function to delete a model
def delete_model(form_recognizer_endpoint, form_recognizer_subscription_key, form_recognizer_model_id):
    """Delete a custom Form Recognizer model.

    Sends a DELETE request to the ``v1.0-preview`` ``custom/models/<model id>``
    route and prints the response status code and, if present, the JSON body.

    Args:
        form_recognizer_endpoint: Base URL of the Form Recognizer resource,
            with or without a trailing slash.
        form_recognizer_subscription_key: Key sent in the
            ``Ocp-Apim-Subscription-Key`` header.
        form_recognizer_model_id: ID of the model to delete. When it is
            ``None`` or empty, no request is sent.

    Returns:
        The parsed JSON response body, or ``None`` when there is no model ID,
        the request failed, or the response body is not JSON.
    """
    if not form_recognizer_model_id:
        # e.g. training did not return a model ID
        print("No model ID given; nothing to delete.")
        return None

    # model URL; trailing slashes are stripped so it does not contain "//"
    model_url = (form_recognizer_endpoint.rstrip("/")
                 + "/formrecognizer/v1.0-preview/custom/models/" + form_recognizer_model_id)
    headers = {
        # Request headers
        'Ocp-Apim-Subscription-Key': form_recognizer_subscription_key
    }

    try:
        resp = http_delete(url=model_url, headers=headers)
    except Exception as e:
        # Best-effort notebook helper: report the problem and signal failure.
        print(str(e))
        return None

    print("Response status code: %d" % resp.status_code)
    try:
        resp_json = resp.json()
    except ValueError:
        # the response has no JSON body
        return None

    print("Response body:")
    print(json.dumps(resp_json, indent=2))
    return resp_json
        
# remove the model that was trained above
delete_model(
    form_recognizer_endpoint=form_recognizer_endpoint,
    form_recognizer_subscription_key=form_recognizer_subscription_key,
    form_recognizer_model_id=form_recognizer_model_id,
)

In [None]:
# delete resource group in Azure
# ($resource_group is the resource group name set in the configuration cell;
#  --yes is passed so that the command does not ask for confirmation)
!az group delete --name $resource_group --yes