In [1]:
import copy
import os
from time import sleep

import pandas as pd
import PyPDF2
import yaml
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from azure.ai.documentintelligence.models import AnalyzeResult
from azure.core.credentials import AzureKeyCredential
from azure.storage.blob import BlobServiceClient
from pandas import DataFrame as df
from tqdm import tqdm


In [2]:
# Get relative paths from a given directory
def get_files(directory, extension):
    """
    Collect every file below a directory whose name ends with a given suffix.

    Parameters:
    directory (str): Root directory; it is walked recursively.
    extension (str): Suffix a file name must end with (for example ".pdf").

    Returns:
    A list of the matching file paths, each expressed relative to `directory`.
    """

    return [
        os.path.relpath(os.path.join(parent, name), directory)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith(extension)
    ]


In [3]:
def load_azure_config(directory="."):
    """
    Load the Azure configuration from `azure_config.yaml`.

    Parameters:
    directory (str): Directory that contains `azure_config.yaml`.

    Returns:
    The parsed content of the file, as produced by `yaml.safe_load`.

    Raises:
    yaml.YAMLError: If the file is not valid YAML. The error is printed and
        then re-raised; previously it was swallowed and the function failed
        afterwards with an unrelated UnboundLocalError.
    """
    filename = "azure_config.yaml"
    # os.path.join avoids the "/azure_config.yaml" result that plain string
    # concatenation produced for an empty directory.
    filepath = os.path.join(directory, filename)

    # Load Azure Config from azure_config.yaml
    with open(filepath, "r") as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
            raise

In [150]:
# Connect to the Azure Document Intelligence API
def get_doc_client(AzureKeys):
    """
    Build a client for the Azure Document Intelligence API.

    Parameters:
    AzureKeys (dict): Configuration mapping. `KEY_1` and `AZURE_ENDPOINT` are
        read from its "DocumentIntelligence" section.

    Returns:
    A DocumentIntelligenceClient created from the endpoint and an
    AzureKeyCredential wrapping the key.
    """
    doc_settings = AzureKeys["DocumentIntelligence"]
    key = doc_settings["KEY_1"]
    endpoint = doc_settings["AZURE_ENDPOINT"]

    # AZURE_REGION is no longer read here: the client is constructed from the
    # endpoint and key only, so requiring the region key just made otherwise
    # valid configurations fail with a KeyError.
    return DocumentIntelligenceClient(endpoint, AzureKeyCredential(key))

# Connect to Azure Blob Storage
def get_blob_client(AzureKeys):
    """
    Open the configured Azure Blob Storage container.

    Parameters:
    AzureKeys (dict): Configuration mapping. `CONNECTION_STRING` and
        `CONTAINER_NAME` are read from its "BlobStorage" section.

    Returns:
    The container client that the BlobServiceClient hands out for
    `CONTAINER_NAME`.
    """
    storage_settings = AzureKeys["BlobStorage"]
    connection_string = storage_settings["CONNECTION_STRING"]
    container_name = storage_settings["CONTAINER_NAME"]

    # Service client first, then narrow it down to the single container
    service = BlobServiceClient.from_connection_string(connection_string)
    return service.get_container_client(container_name)

In [5]:
# Upload document to blob storage
def upload_blob(blob_client:BlobServiceClient, file_path:str, blob_name:str):
    """
    Send one local file to blob storage and confirm that the blob exists.

    Parameters:
    blob_client (BlobServiceClient): Client used for the upload. The code calls
        `upload_blob(name=..., data=...)` and `get_blob_client(blob_name)` on it.
    file_path (str): Local path of the file to upload.
    blob_name (str): Name given to the created blob.

    Returns:
    `blob_name` when the blob cannot be found after the upload, otherwise None.
    """

    with open(file_path, "rb") as payload:
        blob_client.upload_blob(name=blob_name, data=payload)

        # Ask the service whether the blob is really there
        uploaded = blob_client.get_blob_client(blob_name).exists()
        if not uploaded:
            print("Blob not uploaded successfully.")
            return blob_name

        print("Blob uploaded successfully.")


# Upload multiple documents to blob storage
def upload_multiple_blobs(blob_client:BlobServiceClient, file_paths:list, blob_names:list):
    """
    Upload several local files to blob storage, one after another.

    Files and blob names are paired by position; a progress bar sized by
    `len(file_paths)` is shown while uploading.

    Parameters:
        blob_client (BlobServiceClient): Client passed through to `upload_blob`.
        file_paths (list): Local paths of the documents to upload.
        blob_names (list): Blob names to create, in the same order as `file_paths`.
    """
    path_name_pairs = zip(file_paths, blob_names)
    progress = tqdm(path_name_pairs, total=len(file_paths), desc="Uploading files to Azure Blob Storage")
    for local_path, target_name in progress:
        upload_blob(blob_client, local_path, target_name)

def download_blob(blob_client, blob_name, file_path):
    """
    Fetch one blob from storage and write its content to a local file.

    Parameters:
    blob_client: Client exposing `download_blob(blob_name)`; the object it
        returns must provide `readinto(stream)`.
    blob_name (str): The name of the blob to download.
    file_path (str): Local path the content is written to (opened in "wb" mode).

    Returns:
    None.
    """

    with open(file_path, "wb") as destination:
        downloader = blob_client.download_blob(blob_name)
        downloader.readinto(destination)

In [6]:
# Function to open a PDF file and save each page as a separate PDF locally
def split_pdf_file(load_file_path, save_directory_path, page_delay=0.5):
    """
    Split a PDF file into one single-page PDF per page.

    Each page is written to
    `<save_directory_path>/<source name>_page_<n>.pdf`, where `<n>` starts at 1
    and `<source name>` is the input file name without its final extension.

    Parameters:
    load_file_path (str): The path to the PDF file to split.
    save_directory_path (str): Directory for the page files; created if it
        does not exist.
    page_delay (float): Seconds to pause after each page is written. Defaults
        to 0.5, the pause this function has always applied; pass 0 to skip it.
    """

    # Strip only the final extension. Cutting at the first dot turned
    # "report.v1.pdf" and "report.v2.pdf" into the same "report" prefix, so
    # their page files overwrote each other.
    file_name = os.path.splitext(os.path.basename(load_file_path))[0]

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(save_directory_path, exist_ok=True)

    # The context manager closes the source file even if a page fails to write
    with open(load_file_path, "rb") as pdf_file:
        pdf = PyPDF2.PdfReader(pdf_file)

        # Save each page as a separate PDF file
        for page_num, page in enumerate(pdf.pages, start=1):
            pdf_writer = PyPDF2.PdfWriter()
            pdf_writer.add_page(page)
            output_filename = f"{save_directory_path}/{file_name}_page_{page_num}.pdf"
            with open(output_filename, "wb") as output_pdf:
                pdf_writer.write(output_pdf)

            # Close the PDF writer
            pdf_writer.close()

            if page_delay > 0:
                sleep(page_delay)

In [7]:
def split_pdf_files(file_paths: list, output_directory: str):
    """
    Split every PDF in `file_paths` into single-page PDFs.

    A progress bar is shown, and the function pauses 0.5 seconds before each
    file is handed to `split_pdf_file`.

    Parameters:
    file_paths (list): Paths of the PDF files to split.
    output_directory (str): Directory passed to `split_pdf_file` as the save location.
    """
    progress = tqdm(file_paths, desc="Splitting PDF files...", unit="file")
    for pdf_path in progress:
        sleep(0.5)
        split_pdf_file(pdf_path, output_directory)

In [8]:
def analyze_document(filepath, doc_client, AzureKeys):
    """
    Submit one local file for analysis and wait for the result.

    The file is opened in binary mode and passed as `analyze_request` with
    content type "application/octet-stream". The model is the one named by
    `AzureKeys["DocumentIntelligence"]["MODEL_ID"]`.

    Parameters:
    filepath (str): Path of the document to analyze.
    doc_client: Client providing `begin_analyze_document(...)`.
    AzureKeys (dict): Configuration mapping holding the model id.

    Returns:
    Whatever the poller's `result()` returns.
    """
    # Make sure your document's type is included in the list of document types the custom model can analyze
    with open(filepath, "rb") as document_stream:
        request_options = dict(
            model_id=AzureKeys["DocumentIntelligence"]["MODEL_ID"],
            analyze_request=document_stream,
            content_type="application/octet-stream",
        )
        analysis_poller = doc_client.begin_analyze_document(**request_options)

    # The result is collected only after the file has been closed
    return analysis_poller.result()

In [144]:
def load_taxonomy(taxonomy_dir="."):
    """
    Load the taxonomy from the taxonomy.yaml file.

    The mapping found under taxonomy -> schema -> results is attached to every
    entry of taxonomy -> fields as that entry's 'results' value. Each field
    receives its own deep copy, so writing a value into one field's results
    does not change the results of any other field.

    Parameters:
    taxonomy_dir (str): Directory that contains taxonomy.yaml.

    Returns:
    A dictionary containing the taxonomy (the full parsed file, with the
    per-field 'results' entries added).

    Raises:
    yaml.YAMLError: If the file is not valid YAML. The error is printed and
        then re-raised instead of being swallowed.
    """

    taxonomy_filename = "taxonomy.yaml"
    taxonomy_filepath = os.path.join(taxonomy_dir, taxonomy_filename)

    # Load taxonomy.yaml to dictionary
    with open(taxonomy_filepath, "r") as stream:
        try:
            results = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
            raise

    result_values = results['taxonomy']['fields']
    schema = results['taxonomy']['schema']['results']

    # Add the results schema to the result values. Assigning `schema` itself
    # would make every field share one dict object, so the last value written
    # for any field would show up under all of them.
    for field_key in result_values.keys():
        result_values[field_key]['results'] = copy.deepcopy(schema)

    return results


def process_doc_result(doc_result, taxonomy_dict):
    """
    Copy extracted values and confidences of the first analyzed document into
    the taxonomy container.

    For every field of `doc_result.documents[0]`, the value is read from the
    key "value" + capitalized field type (type "string" -> "valueString") and
    stored, together with the field's confidence, under
    taxonomy_dict['data']['taxonomy']['fields'][<field>]['results']['DOC_MODEL'].

    Parameters:
    doc_result: Analysis result; it is accessed both by attribute
        (`.documents[0].fields`) and by key (`['documents'][0]['fields']`).
    taxonomy_dict (dict): Result container that is updated in place.

    Returns:
    The same `taxonomy_dict` object. It is returned unchanged when the result
    contains no documents.
    """
    # Without this guard an empty or missing document list crashed on [0]
    if not doc_result.documents:
        print("No documents detected in doc model.")
        return taxonomy_dict

    for key in doc_result.documents[0].fields.keys():
        # Enter in doc model values
        try:
            field = doc_result['documents'][0]['fields'][key]
            data_type = field['type']
            data_field = "value" + data_type[0].upper() + data_type[1:]

            # Read the extracted value before touching the taxonomy so that a
            # field without a value reports the missing value key first.
            raw_value = field[data_field]
            model_slot = taxonomy_dict['data']['taxonomy']['fields'][key]['results']['DOC_MODEL']
            model_slot['RAW_EXTRACTED_VALUE'] = raw_value
            model_slot['CONFIDENCE'] = field['confidence']
        except KeyError as e:
            print("Key not detected: " + str(e) + " for key: " + key + " in doc model.")

    return taxonomy_dict

In [147]:
def ai_inference(file_paths, doc_client, AzureKeys):
    """
    Analyze every file in `file_paths` and collect one result per file.

    For each path a fresh taxonomy is loaded with `load_taxonomy()`, the
    document is analyzed with `analyze_document`, and the outcome is merged
    into the taxonomy by `process_doc_result`.

    Parameters:
    file_paths (list): Paths of the documents to analyze.
    doc_client: Client handed to `analyze_document`.
    AzureKeys (dict): Configuration mapping handed to `analyze_document`.

    Returns:
    A list with one dictionary per file, holding the file name (the part of
    the path after the last "/") under 'file' and the filled taxonomy under 'data'.
    """
    collected = []
    progress = tqdm(file_paths, desc="Analyzing documents...", unit="file")
    for path in progress:
        # New taxonomy container for this document
        entry = {'file': path.rsplit(r"/", 1)[-1],
                 'data': load_taxonomy() }

        # Run the analysis and merge its values into the container
        analysis = analyze_document(path, doc_client, AzureKeys)
        collected.append(process_doc_result(analysis, entry))

    return collected

In [154]:
def main():
    """
    Run the extraction pipeline over all source PDFs.

    Loads the Azure configuration, connects to the Azure services, collects
    every ".pdf" file below "./term sheets/source" and runs `ai_inference`
    on them.

    Returns:
    The list returned by `ai_inference` (one entry per PDF file). Previously
    the results were computed and then discarded.
    """
    # Load Azure Keys
    AzureKeys = load_azure_config()['AzureKeys']

    # Connect to Azure services
    doc_client = get_doc_client(AzureKeys)
    # NOTE(review): blob_client is not used by the steps below — confirm
    # whether this connection is still needed.
    blob_client = get_blob_client(AzureKeys)

    # Load data
    data_path = "./term sheets/source"          # Source PDF directory
    file_names = get_files(data_path, ".pdf")   # Get all PDF files in the source data directory
    file_paths = [data_path + "/" + file for file in file_names] # Get the full paths to the PDF files

    # Perform AI Inference
    return ai_inference(file_paths, doc_client, AzureKeys)


# MAIN function
if __name__ == "__main__":
    # Keep the results at module scope so later cells can inspect them
    results = main()

Analyzing documents...:  14%|█▍        | 1/7 [00:07<00:42,  7.06s/file]

Key not detected: 'valueString' for key: ID_ISIN in doc model.
Key not detected: 'valueString' for key: CURRENCY in doc model.
Key not detected: 'valueString' for key: FREQUENCY in doc model.
Key not detected: 'valueString' for key: TRADE_DATE in doc model.
Key not detected: 'valueString' for key: INTEREST_TYPE in doc model.


Analyzing documents...:  29%|██▊       | 2/7 [00:14<00:37,  7.47s/file]

Key not detected: 'INTEREST_TYPE' for key: INTEREST_TYPE in doc model.
Key not detected: 'valueString' for key: ID_ISIN in doc model.
Key not detected: 'valueString' for key: CURRENCY in doc model.
Key not detected: 'valueString' for key: PERCENTAGE in doc model.
Key not detected: 'valueString' for key: FREQUENCY in doc model.
Key not detected: 'valueString' for key: TRADE_DATE in doc model.


Analyzing documents...:  43%|████▎     | 3/7 [00:24<00:33,  8.43s/file]

Key not detected: 'INTEREST_TYPE' for key: INTEREST_TYPE in doc model.
Key not detected: 'valueString' for key: ID_ISIN in doc model.
Key not detected: 'valueString' for key: CURRENCY in doc model.
Key not detected: 'valueString' for key: LISTING in doc model.
Key not detected: 'valueString' for key: PERCENTAGE in doc model.
Key not detected: 'valueString' for key: FREQUENCY in doc model.
Key not detected: 'valueString' for key: TRADE_DATE in doc model.


Analyzing documents...:  57%|█████▋    | 4/7 [00:31<00:23,  7.96s/file]

Key not detected: 'valueString' for key: ID_ISIN in doc model.
Key not detected: 'valueString' for key: LISTING in doc model.
Key not detected: 'valueString' for key: PERCENTAGE in doc model.
Key not detected: 'valueString' for key: FREQUENCY in doc model.
Key not detected: 'valueString' for key: TRADE_DATE in doc model.
Key not detected: 'valueString' for key: INTEREST_TYPE in doc model.


Analyzing documents...:  71%|███████▏  | 5/7 [01:01<00:31, 15.97s/file]

Key not detected: 'INTEREST_TYPE' for key: INTEREST_TYPE in doc model.
Key not detected: 'valueString' for key: ID_ISIN in doc model.
Key not detected: 'valueString' for key: LISTING in doc model.
Key not detected: 'valueString' for key: PERCENTAGE in doc model.
Key not detected: 'valueString' for key: FREQUENCY in doc model.
Key not detected: 'valueString' for key: TRADE_DATE in doc model.
Key not detected: 'valueDate' for key: MATURITY_DATE in doc model.


: 

In [None]:
# Display the inference results.
# NOTE(review): assumes a module-level `results` exists when this cell runs — TODO confirm.
results

In [151]:
# Path of the first source PDF, used as a sample document.
# Relies on `data_path` and `file_names` already being defined in the kernel.
sampledoc = data_path + "/" + file_names[0]
sampledoc

'./term sheets/source/BBVA-Final-Terms-Series-164-Execution-version-1.pdf'

In [152]:
# Read the Azure configuration and connect to both services
AzureKeys = load_azure_config()['AzureKeys']
doc_client = get_doc_client(AzureKeys)
blob_client = get_blob_client(AzureKeys)

# Source PDF directory
data_path = "./term sheets/source"

# Paths relative to data_path, and the bare file names taken from them
file_paths = get_files(data_path, ".pdf")
file_names = [relative_path.rsplit(r"/", 1)[-1] for relative_path in file_paths]

# Master data container
results = {}

# Same listing again, this time joined with the source directory
filenames = get_files(data_path, ".pdf")
filepaths = [data_path + "/" + name for name in filenames]

# Only the first document is kept as a sample; the last expression is displayed
sample_paths = [filepaths[0]]
sample_paths
 
        

['./term sheets/source/BBVA-Final-Terms-Series-164-Execution-version-1.pdf']