# Train and Test the Form Recognizer service

Trains and tests the Form Recognizer service with your own forms, using preview version 2.0.

Note that Form Recognizer has more features than shown here. Check the [documentation](https://docs.microsoft.com/en-us/azure/cognitive-services/form-recognizer) for a full overview.

## Prerequisites

- Jupyter, Python 3.6, pandas, requests, tabulate
- Azure subscription
- [Form Recognizer account](https://aka.ms/FormRecognizerRequestAccess)
- [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest) (remember to update it if you haven't done so in a while)
- Train and Test files - see the [input requirements](https://docs.microsoft.com/en-us/azure/cognitive-services/form-recognizer/overview#input-requirements) for more details.

In [None]:
import random
import string
import time
import re
import os
import glob
import json
import pandas as pd
import tabulate
from requests import post as http_post
from requests import delete as http_delete
from requests import get as http_get
from tabulate import tabulate

## Configuration

Note: Azure resources that do not exist yet will be created.

In [None]:
# Local folders holding the training and the test documents.
local_train_files_directory = "Train"
local_test_files_directory = "Test"

azure_region = "westeurope"
resource_group = "form-recognizer-spike"

# The storage account name is derived from the first 15 characters of the
# resource group name plus 8 random characters, so that repeated runs do not
# collide. Only lowercase letters and digits are kept: "\w" would also let
# "_" and uppercase letters through, which storage account names do not allow.
storage_account = (
    re.sub(r"[^a-z0-9]", "", resource_group[:15].lower())
    + "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8))
)
train_container = "train"

form_recognizer_endpoint = "https://<add your own Form Recognizer service name here>.cognitiveservices.azure.com/"
form_recognizer_subscription_key = "<add your own Form Recognizer key here>"

## Initialization

In [None]:
# URL of the training blob container (without SAS token), built from the
# storage account and container names chosen in the configuration cell.
train_container_url = "https://{}.blob.core.windows.net/{}".format(storage_account, train_container)

## Document Upload
Note: This step is required. As of Nov 2019, Form Recognizer can only be trained on documents stored in a blob store. The upload can take a while.

In [None]:
# create resource group
# ($name inserts the value of the Python variable `name` into the shell command)
!az group create --name $resource_group --location $azure_region

# create storage account and training file container
!az storage account create --name $storage_account --kind StorageV2 --resource-group $resource_group --location $azure_region
!az storage container create --name $train_container --account-name $storage_account

In [None]:
# upload all files from the local training directory to the training container
# note: Jupyter sometimes hangs here. If that happens, upload the files manually, e.g. using Storage Explorer
!az storage blob upload-batch --source $local_train_files_directory --destination $train_container --account-name $storage_account

In [None]:
# generate SAS for training files container
# (HTTPS only, permissions "lr", valid until the fixed expiry date below)
train_container_sas = !az storage container generate-sas --name $train_container --account-name $storage_account --https-only --permissions lr --expiry 2030-01-01
# the command output is indexed like a list; take its first entry and remove the double quotes around the token
train_container_sas = re.sub(r"\"", "", train_container_sas[0])

## Training

In [None]:
def train_model(form_recognizer_endpoint, form_recognizer_subscription_key, train_container_url_sas):
    """Train a custom Form Recognizer model and block until training has finished.

    Args:
        form_recognizer_endpoint: Base URL of the Form Recognizer resource,
            with or without a trailing slash.
        form_recognizer_subscription_key: Key sent in the
            ``Ocp-Apim-Subscription-Key`` header.
        train_container_url_sas: URL of the training container including the
            SAS token; sent to the service as the training ``source``.

    Returns:
        The ``modelId`` reported by the service once the model status is
        ``ready``.

    Raises:
        Exception: If the training request is not answered with 201, a status
            query is not answered with 200, or the model status becomes
            ``invalid``. Every exception is printed before it is re-raised.
    """
    try:
        # trigger training
        print("Triggering model training...")
        # Strip trailing slashes first so the URL is well-formed whether or
        # not the configured endpoint ends with "/".
        base_url = form_recognizer_endpoint.rstrip("/") + "/formrecognizer/v2.0-preview/"
        target_url = base_url + "custom/models"
        headers = {
            'Content-Type': 'application/json',
            'Ocp-Apim-Subscription-Key': form_recognizer_subscription_key,
        }
        body = {
            "source": train_container_url_sas
        }
        trigger_model_training_response = http_post(url=target_url, json=body, headers=headers)
        if trigger_model_training_response.status_code != 201:
            raise Exception("Model training failed. Got wrong status code: {}. Expected was: 201.".format(
                trigger_model_training_response.status_code))

        # wait until training is completed
        print("Waiting for completion...")
        model_status_url = trigger_model_training_response.headers["location"]
        while True:
            model_status_response = http_get(url=model_status_url, headers=headers)
            if model_status_response.status_code != 200:
                # Report the raw body: an error response is not guaranteed to
                # be JSON, and a decoding error would hide the HTTP status.
                raise Exception("Could not query model training status. Status Code: %s. Message:\n%s" %
                                (model_status_response.status_code, model_status_response.text))

            model_status_response_json = model_status_response.json()
            model_status = model_status_response_json["modelInfo"]["status"]

            if model_status == "ready":
                print("Training succeeded:\n%s" % json.dumps(model_status_response_json, indent=2))
                return model_status_response_json["modelInfo"]["modelId"]

            if model_status == "invalid":
                raise Exception("Model training failed. Response:\n%s" % json.dumps(model_status_response_json, indent=2))

            # any other status: training is still in progress, poll again shortly
            time.sleep(2)
    except Exception as e:
        print(str(e))
        raise

# Attach the SAS token to the container URL as its query string, train a
# model on that container and report the resulting model ID.
train_container_url_sas = "{}?{}".format(train_container_url, train_container_sas)
form_recognizer_model_id = train_model(
    form_recognizer_endpoint=form_recognizer_endpoint,
    form_recognizer_subscription_key=form_recognizer_subscription_key,
    train_container_url_sas=train_container_url_sas,
)
print("")
print("Form Recognizer Model ID: {}".format(form_recognizer_model_id))

## Testing

In [None]:
# define function for the form analysis
def analyze_form(form_recognizer_endpoint, form_recognizer_subscription_key, form_recognizer_model_id, file_path, file_type):
    """Submit a local file to a custom model and block until the analysis has finished.

    Args:
        form_recognizer_endpoint: Base URL of the Form Recognizer resource,
            with or without a trailing slash.
        form_recognizer_subscription_key: Key sent in the
            ``Ocp-Apim-Subscription-Key`` header.
        form_recognizer_model_id: ID of the model used for the analysis.
        file_path: Path of the local file; its bytes are sent as request body.
        file_type: Value sent as ``Content-Type`` header for the file.

    Returns:
        The parsed JSON of the status response once its ``status`` is
        ``succeeded``.

    Raises:
        Exception: If the submission is not answered with 202, a status query
            is not answered with 200, or the analysis status is ``failed``.
            Every exception (including errors while reading the file) is
            printed before it is re-raised.
    """
    request_url = "{endpoint}/formrecognizer/v2.0-preview/custom/models/{modelId}/analyze".format(
        # strip trailing slashes so "https://host/" does not yield "//formrecognizer"
        endpoint=form_recognizer_endpoint.rstrip("/"),
        modelId=form_recognizer_model_id,
    )
    headers = {
        'Content-Type': file_type,
        'Ocp-Apim-Subscription-Key': form_recognizer_subscription_key,
    }

    try:
        with open(file_path, "rb") as f:
            data_bytes = f.read()
        analyze_form_response = http_post(url=request_url, data=data_bytes, headers=headers)

        if analyze_form_response.status_code != 202:
            raise Exception("Analysis of form failed. Got wrong status code: {}. Expected was: 202.".format(
                analyze_form_response.status_code))

        # the service answers with the URL under which the result can be polled
        analyze_form_status_url = analyze_form_response.headers["operation-location"]
        while True:
            analyze_form_status_response = http_get(url=analyze_form_status_url, headers=headers)
            if analyze_form_status_response.status_code != 200:
                # Report the raw body: an error response is not guaranteed to
                # be JSON, and a decoding error would hide the HTTP status.
                raise Exception("Could not analyze form. Status Code: %s. Message:\n%s" %
                                (analyze_form_status_response.status_code, analyze_form_status_response.text))

            analyze_form_status_response_json = analyze_form_status_response.json()
            analyze_form_status = analyze_form_status_response_json["status"]

            if analyze_form_status == "succeeded":
                return analyze_form_status_response_json

            if analyze_form_status == "failed":
                raise Exception("Analysis of form failed. Response:\n%s" % json.dumps(analyze_form_status_response_json, indent=2))

            # any other status: analysis is still in progress, poll again shortly
            time.sleep(1)

    except Exception as e:
        print(str(e))
        raise

# extracts the identified key-value pairs into a Pandas dataframe
def extract_key_value_pairs(response):
    """Collect the key-value pairs of an analysis response in one dataframe.

    Args:
        response: Parsed analysis response. ``status`` is always read; for a
            succeeded response ``analyzeResult.pageResults`` is read, and for
            each page its ``page`` number and ``keyValuePairs``.

    Returns:
        A dataframe with the columns ``Page`` (int), ``Key``, ``Value`` and
        ``Confidence`` (float), one row per key-value pair; empty if no pairs
        were found. ``None`` if the response status is not ``succeeded``.
    """
    if response["status"] != "succeeded":
        return None

    columns = ["Page", "Key", "Value", "Confidence"]

    # Gather plain rows first and build the dataframe once: DataFrame.append
    # copied the whole frame per row and no longer exists in pandas 2.x.
    rows = []
    for page_result in response["analyzeResult"]["pageResults"]:
        page_number = page_result["page"]
        # tolerate pages that carry no (or a null) "keyValuePairs" entry
        for key_value_pair in page_result.get("keyValuePairs") or []:
            rows.append({
                "Page": page_number,
                "Key": key_value_pair["key"]["text"],
                "Value": key_value_pair["value"]["text"],
                "Confidence": key_value_pair["confidence"],
            })

    result = pd.DataFrame(rows, columns=columns)
    # fix the numeric dtypes so that an empty result has them as well
    return result.astype({"Page": int, "Confidence": float})

# extracts the identified tables into a list of dataframes
def extract_tables(response):
    """Yield one dataframe per table found in an analysis response.

    Args:
        response: Parsed analysis response. ``status`` is always read; for a
            succeeded response ``analyzeResult.pageResults`` is read, and for
            each page its ``page`` number and ``tables``.

    Yields:
        For every table a dataframe with one row per cell and the columns
        ``Page``, ``RowIndex``, ``ColumnIndex``, ``Text``, ``IsHeader``,
        ``IsFooter``, ``RowSpan``, ``ColumnSpan`` and ``Confidence``. Fields a
        cell does not provide are left empty (NaN). Nothing is yielded if the
        response status is not ``succeeded``.
    """
    if response["status"] != "succeeded":
        return

    cell_fields = ["rowIndex", "columnIndex", "text", "isHeader", "isFooter", "rowSpan", "columnSpan", "confidence"]
    # column titles are the field names with an upper-case first letter
    column_titles = [field[0].upper() + field[1:] for field in cell_fields]

    for page_result in response["analyzeResult"]["pageResults"]:
        page_number = page_result["page"]
        # tolerate pages that carry no (or a null) "tables" entry
        for table in page_result.get("tables") or []:
            # reindex instead of [[...]]: selects the same columns in the same
            # order, but does not raise KeyError when a field is absent from
            # all cells or the table has no cells at all
            table_df = pd.DataFrame(table["cells"]).reindex(columns=cell_fields)
            table_df.columns = column_titles

            # add page number as first column
            table_df.insert(0, "Page", page_number)

            # yield table
            yield table_df

# iterate through all relevant files in the test directory and submit each file to Form Recognizer
# Maps a lower-case file extension to the Content-Type sent with the file.
# Images use the registered "image/*" media types ("application/png" and
# "application/jpeg" are not valid media types).
file_type_mapping = {
    "pdf": "application/pdf",
    "png": "image/png",
    "jpg": "image/jpeg",
    "jpeg": "image/jpeg",
}
for file_path in glob.iglob(local_test_files_directory + "/**/*", recursive=True):
    file_extension = os.path.splitext(file_path)[1][1:].lower()
    file_type = file_type_mapping.get(file_extension)
    if file_type is None:
        # directories and files with other extensions are skipped
        continue

    extracted_data = analyze_form(form_recognizer_endpoint, form_recognizer_subscription_key,
        form_recognizer_model_id, file_path, file_type)

    print("File: {}".format(file_path))

    # remove comments to show full JSON response
    #print("Response:")
    #print(json.dumps(extracted_data, indent=2))

    print("")
    print("Extracted Key-Value Pairs:")
    print(tabulate(extract_key_value_pairs(extracted_data), headers='keys', tablefmt='plain', showindex="never"))

    print("")
    print("Extracted Tables:")
    for extracted_table in extract_tables(extracted_data):
        print(tabulate(extracted_table, headers='keys', tablefmt='plain', showindex="never"))
        print("")

    print("")
    print("---")
    print("")

## Cleanup

In [None]:
# define function to delete a model
def delete_model(form_recognizer_endpoint, form_recognizer_subscription_key, form_recognizer_model_id):
    """Delete a single custom model; failures are printed, not raised.

    Args:
        form_recognizer_endpoint: Base URL of the Form Recognizer resource,
            with or without a trailing slash.
        form_recognizer_subscription_key: Key sent in the
            ``Ocp-Apim-Subscription-Key`` header.
        form_recognizer_model_id: ID of the model to delete.

    Returns:
        None. Cleanup is best effort: an error response or an exception while
        sending the request only results in a printed message.
    """
    # model URL (trailing slashes are stripped so "https://host/" does not yield "//formrecognizer")
    model_url = form_recognizer_endpoint.rstrip("/") + "/formrecognizer/v2.0-preview/custom/models/" + form_recognizer_model_id
    headers = {
        'Ocp-Apim-Subscription-Key': form_recognizer_subscription_key
    }

    try:
        delete_model_response = http_delete(url=model_url, headers=headers)
        # an HTTP error response does not raise by itself, so report it explicitly
        if not delete_model_response.ok:
            print("Could not delete model {}. Status Code: {}. Message:\n{}".format(
                form_recognizer_model_id, delete_model_response.status_code, delete_model_response.text))
    except Exception as e:
        print(str(e))
        
def delete_all_models(form_recognizer_endpoint, form_recognizer_subscription_key):
    """Delete every custom model listed by the service; failures are printed, not raised.

    Args:
        form_recognizer_endpoint: Base URL of the Form Recognizer resource,
            with or without a trailing slash.
        form_recognizer_subscription_key: Key sent in the
            ``Ocp-Apim-Subscription-Key`` header.

    Returns:
        None. ``delete_model`` is called for each entry of ``modelList`` in
        the list response. If the list request is not answered with 200, a
        message is printed and nothing is deleted.
    """
    list_models_url = "{endpoint}/formrecognizer/v2.0-preview/custom/models".format(
        # strip trailing slashes so "https://host/" does not yield "//formrecognizer"
        endpoint=form_recognizer_endpoint.rstrip("/"))
    headers = {
        'Ocp-Apim-Subscription-Key': form_recognizer_subscription_key
    }

    try:
        list_models_response = http_get(url=list_models_url, headers=headers)
        if list_models_response.status_code != 200:
            # without this check an error response only shows up as an
            # opaque KeyError / JSON decoding message
            print("Could not list models. Status Code: {}. Message:\n{}".format(
                list_models_response.status_code, list_models_response.text))
            return

        list_models_response_json = list_models_response.json()

        for model in list_models_response_json["modelList"]:
            delete_model(form_recognizer_endpoint, form_recognizer_subscription_key, model["modelId"])

    except Exception as e:
        print(str(e))
        
# remove the model that was trained in this notebook
delete_model(
    form_recognizer_endpoint=form_recognizer_endpoint,
    form_recognizer_subscription_key=form_recognizer_subscription_key,
    form_recognizer_model_id=form_recognizer_model_id,
)

## alternatively, delete all models from the service
## uncomment if needed
#delete_all_models(form_recognizer_endpoint, form_recognizer_subscription_key)

In [None]:
# delete resource group in Azure
# WARNING: this removes the whole resource group, not only what this notebook
# created in it; --yes skips the confirmation prompt
!az group delete --name $resource_group --yes