### Convert ECB Guide to json files

1. Use the Adobe PDF Services Extract API to convert the PDF files to JSON format
2. Convert the JSON to pandas DataFrames
3. Clean up (with automated methods the results are borderline acceptable)
4. Convert body of the text and heading to embeddings

In [1]:
from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation

import os, logging, zipfile, glob, json
import pandas as pd
import utility_functions as uf
from openai import OpenAI
from tqdm import tqdm

# Display settings: widen the pandas limits so long text columns and wide
# frames are shown without truncation.
for option_name, option_value in (
    ('display.max_colwidth', 500),
    ('display.max_rows', 1000),
    ('display.max_columns', 1000),
):
    pd.set_option(option_name, option_value)

# Logging: timestamped messages at INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

In [2]:
def convert_pdf(folder, name):
    """Run the Adobe ExtractPDF operation (text elements only) on one PDF.

    Credentials are read from the ``PDF_SERVICES_CLIENT_ID`` and
    ``PDF_SERVICES_CLIENT_SECRET`` environment variables.

    Args:
        folder: Directory containing the source PDF. A trailing path
            separator is optional.
        name: File name of the PDF inside ``folder``.

    Returns:
        None. The operation result is saved as ``<name>.zip``, relative to
        the current working directory.

    Note:
        ``ServiceApiException``, ``ServiceUsageException`` and
        ``SdkException`` are logged with their traceback and not re-raised.
    """
    try:
        # Join instead of concatenating so that `folder` works with or
        # without a trailing separator.
        file_name = os.path.join(folder, name)

        # Initial setup, create credentials instance.
        credentials = (
            Credentials.service_principal_credentials_builder()
            .with_client_id(os.getenv('PDF_SERVICES_CLIENT_ID'))
            .with_client_secret(os.getenv('PDF_SERVICES_CLIENT_SECRET'))
            .build()
        )

        # Create an ExecutionContext using credentials and create a new operation instance.
        execution_context = ExecutionContext.create(credentials)
        extract_pdf_operation = ExtractPDFOperation.create_new()

        # Set operation input from a source file.
        source = FileRef.create_from_local_file(file_name)
        extract_pdf_operation.set_input(source)

        # Build ExtractPDF options (text elements only) and set them into the operation.
        extract_pdf_options: ExtractPDFOptions = (
            ExtractPDFOptions.builder()
            .with_element_to_extract(ExtractElementType.TEXT)
            .build()
        )
        extract_pdf_operation.set_options(extract_pdf_options)

        # Execute the operation.
        result: FileRef = extract_pdf_operation.execute(execution_context)

        # Save the result archive next to the notebook, named after the PDF.
        result.save_as(f"{name}.zip")

    except (ServiceApiException, ServiceUsageException, SdkException):
        logging.exception("Exception encountered while executing operation")

In [3]:
# Folder holding the source PDFs (the trailing separator is part of the value)
folder = 'C:\\projects\\ecb_guide_changes\\docs\\'

# Run the extraction for each edition of the guide
for pdf_name in (
    'ssm.guidetointernalmodels_201910.pdf',
    'ssm.supervisory_guides_202402.pdf',
):
    convert_pdf(folder, pdf_name)

# Manual follow-up: extract the json file from each archive, rename it and
# place it into the docs folder
print('Rename the files and place them into docs folder')

2024-03-09 08:00:28 - INFO - All validations successfully done. Beginning ExtractPDF operation execution
2024-03-09 08:02:24 - INFO - Downloading file to C:\Users\Watson\AppData\Local\Temp\sdk_result\fb39ed0fdde211ee8f394c5262069064.zip
2024-03-09 08:02:24 - INFO - Extract Operation Successful - Transaction ID: 84923832-7fbc-42dc-b6d9-5f3ccae0834f
2024-03-09 08:02:24 - INFO - Moving file at C:\Users\Watson\AppData\Local\Temp\sdk_result\fb39ed0fdde211ee8f394c5262069064.zip to target ssm.guidetointernalmodels_201910.pdf.zip
2024-03-09 08:02:24 - INFO - All validations successfully done. Beginning ExtractPDF operation execution
2024-03-09 08:04:50 - INFO - Downloading file to C:\Users\Watson\AppData\Local\Temp\sdk_result\523f7431dde311eeaf044c5262069064.zip
2024-03-09 08:04:50 - INFO - Extract Operation Successful - Transaction ID: f4628cf2-5037-4152-8bec-64747b93032f
2024-03-09 08:04:50 - INFO - Moving file at C:\Users\Watson\AppData\Local\Temp\sdk_result\523f7431dde311eeaf044c5262069064

Rename the files and place them into docs folder


In [4]:
# Get list of all files with full path names.
# glob returns matches in arbitrary order, so sort them to keep the row order
# of the resulting datasets reproducible between runs.
folder_path = 'C:\\projects\\ecb_guide_changes\\docs\\json_docs\\*'
files = sorted(glob.glob(folder_path))

# Per-file results are collected in lists and concatenated once after the
# loop; calling pd.concat inside the loop re-copies all earlier rows each time.
full_frames = []
clean_frames = []

# Process all files
for file_name in files:
    logging.info("Processing file: %s", file_name)

    # JSON is UTF-8 encoded; state it explicitly instead of relying on the
    # platform default encoding.
    with open(file_name, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Preprocessing (helpers live in utility_functions)
    elements = uf.create_elements(data)
    document = uf.initial_document(elements)
    flagged = uf.create_merge_flag(document)
    indexed = uf.create_merge_index(flagged)

    # Add a new column with the document name and move it to the front
    short_file_name, _ = os.path.splitext(os.path.basename(file_name))
    indexed['file_name'] = short_file_name
    cols = ['file_name'] + [col for col in indexed.columns if col != 'file_name']
    full_rows = uf.enrich_dataset(indexed[cols])

    # Clean columns: drop trailing whitespace from headings and collapse
    # whitespace runs in `source` (raw string avoids the invalid '\s' escape)
    full_rows['heading_1'] = full_rows['heading_1'].str.rstrip()
    full_rows['heading_2'] = full_rows['heading_2'].str.rstrip()
    full_rows['source'] = full_rows['source'].str.replace(r'\s+', ' ', regex=True)

    # Create final datasets
    merged_rows = uf.merge_rows(full_rows)
    clean_rows = uf.enrich_dataset(merged_rows)

    full_frames.append(full_rows)
    clean_frames.append(clean_rows)

# Combine the per-file results; fall back to empty frames when no file matched,
# because pd.concat raises on an empty list.
full_dataset = pd.concat(full_frames, axis=0, ignore_index=True) if full_frames else pd.DataFrame()
clean_dataset = pd.concat(clean_frames, axis=0, ignore_index=True) if clean_frames else pd.DataFrame()

2024-03-09 08:04:50 - INFO - Processing file: C:\projects\ecb_guide_changes\docs\json_docs\2019_ecb_guide.json
2024-03-09 08:04:51 - INFO - Processing file: C:\projects\ecb_guide_changes\docs\json_docs\2024_ecb_guide.json


In [5]:
# Review the quality of the datasets: mean of `correct_format` per source
# file, first for the merged (clean) rows and then for the unmerged (full) rows
for dataset in (clean_dataset, full_dataset):
    print(dataset.groupby('file_name')['correct_format'].mean())

file_name
2019_ecb_guide    0.684621
2024_ecb_guide    0.707921
Name: correct_format, dtype: float64
file_name
2019_ecb_guide    0.476695
2024_ecb_guide    0.494916
Name: correct_format, dtype: float64


In [6]:
# Embedding setup: register `progress_apply` on pandas objects and create the
# OpenAI client that `get_embedding` uses
tqdm.pandas(desc="Processing rows")
client = OpenAI()

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector of ``text`` from the OpenAI embeddings endpoint.

    Newlines in ``text`` are replaced with spaces before the request is sent.

    Args:
        text: String to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

# Embed the body text of every row; one request per row, with a progress bar
embedding_model = 'text-embedding-3-small'
body_texts = clean_dataset['body_of_the_text']
clean_dataset['body_embedding'] = body_texts.progress_apply(
    lambda body: get_embedding(body, model=embedding_model)
)

# Save the dataset, embeddings included, without the index column
clean_dataset.to_csv('ecb_guide_comparison.csv', index=False)

Processing rows: 100%|█████████████████████████████████████████████████████████████| 2695/2695 [12:45<00:00,  3.52it/s]


In [7]:
# Spot-check two randomly chosen rows of the final dataset
clean_dataset.sample(n=2)

Unnamed: 0,file_name,heading_1,heading_2,heading_3,heading_4,heading_5,heading_6,list_label,body_of_the_text,is_footnote,page_number,path,merge_id,span_section,index_counter,correct_format,word_count,extracted_numbers,source,footnote_number,body_embedding
2369,2024_ecb_guide,Counterparty credit risk,2 Trade coverage,,,,,8.0,2.2 Principles for ECB Banking Supervision,0.0,219.0,//Document/P[275],//Document/P[275],0.0,0.0,False,6,[2],Counterparty credit risk > 2 Trade coverage > nan,,"[-0.006183973979204893, -0.007030556444078684, 0.10100677609443665, 0.001735213096253574, -0.026440273970365524, 0.07902928441762924, 0.021618681028485298, 0.024220097810029984, -0.0032657887786626816, 0.016797086223959923, 0.03864002600312233, -0.0532393641769886, -0.014038686640560627, 0.022302674129605293, 0.005261704325675964, -0.03231589123606682, 0.005065476521849632, 0.02034039795398712, 0.02883985824882984, -0.019600339233875275, 0.03866245225071907, -0.007798647508025169, 0.03570221..."
599,2019_ecb_guide,Market risk,2 Scope of the internal model approach,,,,,8.0,"In addition, for each category listed in paragraphs (<>)6 and (<>)7, institutions should be able to indicate to what extent the corresponding positions are included within the scope of the internal model approach (IMA).",0.0,120.0,//Document/L[106]/LI[7]/LBody,//Document/L[106]/LI[7],0.0,0.0,True,28,[],Market risk > 2 Scope of the internal model approach > nan,,"[-0.06623228639364243, 0.05208510905504227, 0.041707247495651245, 0.005360262468457222, 0.008768557570874691, 0.012238042429089546, 0.008334106765687466, -0.001278875395655632, 0.030790915712714195, 0.0049411095678806305, 0.005507118999958038, -0.024329228326678276, 0.010665453970432281, 0.010139218531548977, 0.04207438975572586, -0.0004195353831164539, 0.046847227960824966, 0.006675851996988058, 0.0024124241899698973, 0.10935914516448975, 0.018797634169459343, 0.017708446830511093, -0.01692..."
