**Raw text extraction from PDFs with Document AI from Google**
Used for Airworthiness Directives (ADs).

In [2]:
import os
import json
from google.cloud import documentai
from google.api_core.client_options import ClientOptions
from tqdm import tqdm  # Import tqdm

# --- Configuration ---
# Identifiers of the Document AI processor that performs the OCR.
project_id = "mthesis-450913"  # Google Cloud project ID
location = "us"  # Processor location (e.g., "us" or "eu")
processor_id = "9aed4672acfd8a7a"  # Document AI processor ID
mime_type = "application/pdf"  # MIME type sent along with every document

# --- Define Input and Output Directories ---
# IMPORTANT: Replace these with the actual paths to your directories.
# NOTE(review): both paths are identical here, so the JSON results are written
# next to the source PDFs.
input_directory = r"C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A320\directives\sample dataset\llm_with_guidance\test_model_extraction"  # Directory containing your PDF files
output_directory = r"C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A320\directives\sample dataset\llm_with_guidance\test_model_extraction"  # Directory where JSON files will be saved

# Create the output directory if it is missing and report which path was created
# (the message previously ended after the colon without naming the directory).
if not os.path.exists(output_directory):
    os.makedirs(output_directory)
    print(f"Created output directory: {output_directory}")

def process_document_ocr_to_text(project_id: str, location: str, processor_id: str, file_path: str, mime_type: str) -> str | None:
    """
    Processes a single document using Document AI OCR and returns the extracted text.
    """
    try:
        # You must set the api_endpoint if you use a location other than "us".
        opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")  # Use f-string for endpoint
        client = documentai.DocumentProcessorServiceClient(client_options=opts)

        name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"  # Correctly format resource name

        with open(file_path, "rb") as f:
            image_content = f.read()

        raw_document = documentai.RawDocument(content=image_content, mime_type=mime_type)
        request = documentai.ProcessRequest(name=name, raw_document=raw_document)

        result = client.process_document(request=request)
        #print(f"Successfully processed: {os.path.basename(file_path)}")
        return result.document.text
    except Exception as e:
        print(f"Error processing document {os.path.basename(file_path)}: {e}") # Include error message
        return None

def main():
    """
    OCR every PDF in `input_directory` and save each text as `<name>.json`.

    For each file whose name ends with ".pdf" (case-insensitive) the text
    returned by `process_document_ocr_to_text` is written to
    `output_directory` as `{"text": ...}`. A file counts as failed when no
    text comes back (None or an empty string) or when writing the JSON raises
    IOError. The loop stops early once more than `max_failures` files have
    failed. A summary of both counters is printed at the end.
    """
    max_failures = 5  # abort once more than this many files have failed
    processed_files = 0
    failed_files = 0

    # os.listdir returns names in arbitrary order; sort so runs are reproducible
    # (this matters because the loop may stop early).
    pdf_files = sorted(
        filename for filename in os.listdir(input_directory) if filename.lower().endswith(".pdf")
    )

    # The progress bar is advanced manually, once per attempted file.
    with tqdm(total=len(pdf_files), desc="Processing PDF Files") as pbar:
        for filename in pdf_files:
            pdf_file_path = os.path.join(input_directory, filename)

            extracted_text = process_document_ocr_to_text(
                project_id, location, processor_id, pdf_file_path, mime_type
            )

            if extracted_text:
                json_data = {"text": extracted_text}

                # Output file keeps the PDF's base name with a .json extension.
                base_filename = os.path.splitext(filename)[0]
                json_filename = f"{base_filename}.json"
                json_file_path = os.path.join(output_directory, json_filename)

                try:
                    with open(json_file_path, "w", encoding="utf-8") as json_file:
                        json.dump(json_data, json_file, ensure_ascii=False, indent=4)
                    processed_files += 1
                except IOError as e:
                    print(f"Error writing JSON file {json_filename}: {e}")
                    failed_files += 1
            else:
                # Extraction errors are already printed by the OCR helper.
                failed_files += 1

            pbar.update(1)

            if failed_files > max_failures:
                # Say why the run ends early instead of stopping silently.
                print("Too many failures, stopping.")
                break

    print("\n--- Processing Complete ---")
    print(f"Successfully processed and saved: {processed_files} files.")
    print(f"Failed to process or save: {failed_files} files.")


if __name__ == "__main__":
    main()

Processing PDF Files: 100%|██████████| 3/3 [00:21<00:00,  7.21s/it]


--- Processing Complete ---
Successfully processed and saved: 3 files.
Failed to process or save: 0 files.





**Raw text extraction with Document AI — added per-document time tracking**

In [14]:
import os
import json
import time  # Import the time module
from google.cloud import documentai
from google.api_core.client_options import ClientOptions
from tqdm import tqdm  # Import tqdm

# --- Configuration ---
# Identifiers of the Document AI processor used by process_document_ocr_to_text.
project_id = "mthesis-450913"  # Your Google Cloud Project ID
location = "us"  # Processor Location (e.g., "us" or "eu")
processor_id = "9aed4672acfd8a7a"  # Your Document AI Processor ID
mime_type = "application/pdf"  # MIME type declared for every uploaded file

# --- Define Input and Output Directories ---
# IMPORTANT: Replace these with the actual paths to your directories
# NOTE(review): both variables hold the same path, so the *_ocr.json results
# are written into the folder that contains the PDFs.
input_directory = r"C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A320\directives\sample dataset\llm_with_guidance\test_model_extraction"  # Directory containing your PDF files
output_directory = r"C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A320\directives\sample dataset\llm_with_guidance\test_model_extraction" # Directory where JSON files will be saved

# Ensure output directory exists, create if it doesn't (and report the created path)
if not os.path.exists(output_directory):
    os.makedirs(output_directory)
    print(f"Created output directory: {output_directory}")

def process_document_ocr_to_text(project_id: str, location: str, processor_id: str, file_path: str, mime_type: str) -> dict | None:
    """
    Processes a single document using Document AI OCR and returns a dictionary
    containing the extracted text and the processing time.
    """
    try:
        start_time = time.time()  # Start timing the processing

        # You must set the api_endpoint if you use a location other than "us".
        opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")  # Use f-string for endpoint
        client = documentai.DocumentProcessorServiceClient(client_options=opts)

        name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"  # Correctly format resource name

        with open(file_path, "rb") as f:
            image_content = f.read()

        raw_document = documentai.RawDocument(content=image_content, mime_type=mime_type)
        request = documentai.ProcessRequest(name=name, raw_document=raw_document)

        result = client.process_document(request=request)
        end_time = time.time()  # Stop timing the processing
        processing_time = end_time - start_time
        processing_time = round(processing_time, 2)

        #print(f"Successfully processed: {os.path.basename(file_path)}")
        return {
            "text": result.document.text,
            "ocr_processing_time": processing_time
        }
    except Exception as e:
        print(f"Error processing document {os.path.basename(file_path)}: {e}")  # Include error message
        return None

def main():
    """
    OCR all PDFs found in `input_directory` and store one `<name>_ocr.json` each.

    Every file whose name ends with ".pdf" (case-insensitive) is passed to
    `process_document_ocr_to_text`; a truthy result is dumped as JSON into
    `output_directory`. A falsy result or an IOError while writing counts as
    a failure, and the loop stops once more than five failures have
    accumulated. Both counters are printed at the end.
    """
    saved_count = 0
    failure_count = 0

    pdf_names = [entry for entry in os.listdir(input_directory) if entry.lower().endswith(".pdf")]

    # Manual progress bar: advanced once per attempted file.
    with tqdm(total=len(pdf_names), desc="Processing PDF Files") as progress:
        for pdf_name in pdf_names:
            ocr_output = process_document_ocr_to_text(
                project_id,
                location,
                processor_id,
                os.path.join(input_directory, pdf_name),
                mime_type,
            )

            written = False
            if ocr_output:
                # Result file: PDF base name plus the "_ocr.json" suffix.
                stem, _ = os.path.splitext(pdf_name)
                target_path = os.path.join(output_directory, f"{stem}_ocr.json")
                try:
                    with open(target_path, "w", encoding="utf-8") as out_handle:
                        json.dump(ocr_output, out_handle, ensure_ascii=False, indent=4)
                except IOError as e:
                    print(f"Error writing JSON file for {pdf_name}: {e}")
                else:
                    written = True

            if written:
                saved_count += 1
            else:
                failure_count += 1

            progress.update(1)

            if failure_count > 5:
                print("Too many failures, stopping.")
                break

    print("\n--- Processing Complete ---")
    print(f"Successfully processed and saved: {saved_count} files.")
    print(f"Failed to process or save: {failure_count} files.")


if __name__ == "__main__":
    main()

Processing PDF Files: 100%|██████████| 2/2 [00:14<00:00,  7.09s/it]


--- Processing Complete ---
Successfully processed and saved: 2 files.
Failed to process or save: 0 files.





**Data Structuring from the raw text**

In [21]:
# === CONFIGURATION ===
# This directory should now contain your JSON files with the raw text.
# generate() iterates over every *.json file in INPUT_DIR and reads its "text" key.
# NOTE(review): the OCR cells above write into ...\test_model_extraction, while this
# path points at its "output" subfolder — presumably the files are moved by hand; confirm.
# NOTE(review): this cell uses `os` without importing it and relies on an earlier
# cell's `import os`.
INPUT_DIR = r"C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A320\directives\sample dataset\llm_with_guidance\test_model_extraction\output"
# INPUT_DIR = r"C:\Users\zdrop\PycharmProjects\BRUMBRUMWEEEE\Sample_of_a_sample"
# Structured results go into a subfolder of the input directory.
OUTPUT_DIR = os.path.join(INPUT_DIR, "output_structured")

# prompt_a: instruction text followed by five worked example JSON objects. generate()
# sends it as the first text part of the user message, before the raw AD text.
# NOTE(review): the examples disagree with the response_schema passed in generate():
#   - `identifier` is a list (or null) in the examples, but the schema declares a string;
#   - `applies_to_subset_models` is a plain string ("Group 1") in some examples, but the
#     schema declares an array of strings;
#   - each example is a single object, but the schema's top-level type is "array".
# NOTE(review): the last example's raw_text reads "FIN 6 HF" while its identifier list
# contains "62HF" — confirm against the source AD.
prompt_a = """

            You are an expert aviation maintenance specialist. Your task is to meticulously analyze raw text from an Airworthiness Directive (AD) and extract key information into a structured JSON format based on the provided schema.
            
            ### INSTRUCTIONS
            1.  **Think Step-by-Step:** First, perform a step-by-step analysis of the provided AD text. In your reasoning, identify and quote the specific text snippets that correspond to each field in the schema (`ad_number`, `status`, `affected_parts`, `applicability_groups`, etc.).
            2.  **Extract and Populate:** Based on your analysis, extract the relevant information and fill out the JSON schema.
            3.  **Strict Grounding:** Only use information explicitly stated in the provided AD text. Do not infer information or use any external knowledge.
            4.  **Handle Missing Information:**
                *   If information for a simple field (e.g., `identifier`, `location`) is not found, use the JSON value `null`.
                *   If information for a list field (e.g., `affected_parts`, `qualifiers`) is not found, use an empty list `[]`.
            5.  **Final Output:** Your final output must be **only the valid JSON object** and nothing else. Do not include your step-by-step thinking in the final output.
            
            ### EXAMPLES
            {
                "ad_number": "2019-0106",
                "status": "superseded",
                "ata_number": "ATA 25",
                "affected_parts": [
                  {
                    "raw_text": "Galleys, having a part number listed as “Old Part N°” in the applicable SB",   
                     "identifier": null
                    }
                ],
                "applicability_groups": [
                  {
                    "manufacturer": "Airbus, formerly Airbus Industrie",
                    "models": [
                      "A319-112", "A319-115", "A319-132", "A320-214", "A320-216", "A320-232", "A320-233", "A320-251N", "A320-271N", "A321-211", "A321-231", "A321-232", "A321-251N", "A321-253N"
                    ],
                    "serial_numbers": "all manufacturer serial numbers (MSN) as listed in the applicable SB (Airbus Service Bulletin (SB) A320-25-1BHG, SB A320-25-1BK9 Revision 01, or SB A320-25-1BKK, as applicable to aeroplane MSN).",
                    "qualifiers": []
                  }
                ]
            }
            {
                "ad_number": "2020-0040R1",
                "status": "active",
                "ata_number": "ATA 53",
                "affected_parts": [
                  {
                    "raw_text": "Overwing Emergency Exit Cut-Outs in Section 15",   
                     "identifier": null
                    }
                ],
                "applicability_groups": [
                  {
                    "manufacturer": "Airbus (formerly Airbus Industrie)",
                    "models": [
                      "A319-111", "A319-112", "A319-113", "A319-114", "A319-115", "A319-131", "A319-132", "A319-133", "A320-211", "A320-212", "A320-214", "A320-215", "A320-216", "A320-231", "A320-232", "A320-233"
                    ],
                    "serial_numbers": "all manufacturer serial numbers (MSN)",
                    "qualifiers": [
                      {
                        "type": "exclusion",
                        "raw_text": "A319 and A320 aeroplanes on which Airbus modification (mod) 160001 was embodied in production;",
                        "condition": { "identifiers": ["160001"], "requirement": "mod embodied", "applies_to_subset_models": ["A319", "A320"] }
                      },
                      {
                        "type": "exclusion",
                        "raw_text": "A319 and A320 aeroplanes on which Airbus Service Bulletin (SB) A320-57-1193 was embodied in service; and",
                        "condition": { "identifiers": ["A320-57-1193"], "requirement": "SB embodied", "applies_to_subset_models": ["A319", "A320"] }
                      },
                      {
                        "type": "exclusion",
                        "raw_text": "A319 aeroplanes on which Airbus mod 28238, mod 28162 and mod 28342 were embodied in production.",
                        "condition": { "identifiers": ["28238", "28162", "28342"], "requirement": "mod embodied", "applies_to_subset_models": ["A319"] }
                      }
                    ]
                  }
                ]
            }
            {
                "ad_number": "2020-0250",
                "status": "active",
                "ata_number": "ATA 71",
                "affected_parts": [
                  {
                    "raw_text": "Affected part: Forward (fwd) engine mount shackle assemblies, having Part Number (P/N) D7121513500xxx, where ‘xxx’ can be any numerical value. Serviceable part: Any fwd engine mount shackle assembly which is not an affected part, including those having P/N D7121515000xxx, where ‘xxx’ can be any numerical value. Affected engine mount: Fwd engine mount assemblies, having P/N D7121506500xxx (fitted with an affected part), where ‘xxx’ can be any numerical value. Serviceable engine mount: Any fwd engine mount assembly which is not an affected engine mount, including those having P/N D7121514900xxx (fitted with a serviceable part), where ‘xxx’ can be any numerical value.",   
                     "identifier": ["D7121513500xxx", "D7121515000xxx", "D7121506500xxx", " D7121514900xxx"
                    ]
                    }
                ],
                "applicability_groups": [
                  {
                    "manufacturer": "Airbus, formerly Airbus Industrie",
                    "models": [
                      "A319-171N",
                      "A320-271N",
                      "A320-272N",
                      "A320-273N",
                      "A321-271N",
                      "A321-272N",
                      "A321-271NX",
                      "A321-272NX"
                    ],
                    "serial_numbers": "all manufacturer serial numbers",
                    "qualifiers": [
                      {
                        "type": "condition",
                        "raw_text": "Group 1 aeroplanes are those that have an affected engine mount installed (Fwd engine mount assemblies, having P/N D7121506500xxx (fitted with an affected part), where ‘xxx’ can be any numerical value).",
                        "condition": {
                          "identifiers": [
                            "D7121506500xxx"
                          ],
                          "requirement": "affected engine mount installed",
                          "applies_to_subset_models": "Group 1"
                        }
                      },
                      {
                        "type": "condition",
                        "raw_text": "Group 2 aeroplanes are those that do not have an affected engine mount installed. An aeroplane having embodied Airbus modification (mod) 163278 in production is a Group 2 aeroplane, provided the fwd engine mounts and shackle assemblies have not been replaced since aeroplane date of manufacture. ",
                        "condition": {
                          "identifiers": [
                            "163278"
                          ],
                          "requirement": "mod embodied",
                          "applies_to_subset_models": "Group 2"
                        }
                      }
                    ]
                  }
                ]
              }
              {
                "ad_number": "2021-0002R1",
                "status": "active",
                "ata_number": "ATA 55",
                "affected_parts": [
                  {
                    "raw_text": "Affected part: Any rudder which has been modified in accordance with the instructions of the SB (A320-55-1052), and re-identified with a Part Number (P/N) as listed in Appendix 1 of this AD, except those which have passed (no defects found) a Special Detailed Inspection (SDI) in accordance with the instructions of the AOT, as defined in this AD, or have been repaired in accordance with the instructions for permanent repair of the AOT. Serviceable part: Any rudder, eligible for installation, which is not an affected part. This includes rudders which have been modified in accordance with the instructions of Airbus SB A320-55-1052 Revision 03 or later, or Airbus SB A320-55-1059 Revision 01 or later, as applicable.",   
                     "identifier": ["A320-55-1052", "A320-55-1052", "A320-55-1059"
                    ]
                    }
                ],
                "applicability_groups": [
                  {
                    "manufacturer": "Airbus, formerly Airbus Industrie",
                    "models": [
                      "A318-111",
                      "A318-112",
                      "A318-121",
                      "A318-122",
                      "A319-111",
                      "A319-112",
                      "A319-113",
                      "A319-114",
                      "A319-115",
                      "A319-131",
                      "A319-132",
                      "A319-133",
                      "A319-151N",
                      "A319-153N",
                      "A319-171N",
                      "A320-211",
                      "A320-212",
                      "A320-214",
                      "A320-215",
                      "A320-216",
                      "A320-231",
                      "A320-232",
                      "A320-233",
                      "A320-251N",
                      "A320-252N",
                      "A320-253N",
                      "A320-271N",
                      "A320-272N",
                      "A320-273N",
                      "A321-111",
                      "A321-112",
                      "A321-131",
                      "A321-211",
                      "A321-212",
                      "A321-213",
                      "A321-231",
                      "A321-232",
                      "A321-251N",
                      "A321-252N",
                      "A321-253N",
                      "A321-271N",
                      "A321-272N",
                      "A321-251NX",
                      "A321-252NX",
                      "A321-253NX",
                      "A321-271NX",
                      "A321-272NX"
                    ],
                    "serial_numbers": "all manufacturer serial numbers",
                    "qualifiers": [
                      {
                        "type": "condition",
                        "raw_text": " Group 1 aeroplanes are those that have an affected part installed. Aeroplanes on which the SB has been embodied are Group 1.",
                        "condition": {
                          "identifiers": null,
                          "requirement": "affected part installed",
                          "applies_to_subset_models": "Group 1"
                        }
                      },
                      {
                        "type": "condition",
                        "raw_text": "Group 2 aeroplanes are those that do not have an affected part installed. An aeroplane on which Airbus modification 156859 has been embodied in production is Group 2, provided it is determined that no affected part is installed on that aeroplane.",
                        "condition": {
                          "identifiers": [
                            "156859"
                          ],
                          "requirement": "mod embodied",
                          "applies_to_subset_models": "Group 2"
                        }
                      }
                    ]
                  }
                ]
              }
            {
                "ad_number": "2022-0147",
                "status": "superseded",
                "ata_number": "ATA 36",
                "affected_parts": [
                  {
                    "raw_text": " Overheat detection system (OHDS) sensing elements, also identified as ‘Continuous Fire Detector’, having a Part Number (P/N) and corresponding date code as listed in Section 1.A of the VSB, except those that passed an inspection (no discrepancies found; one face of the connector hex nut is marked) in accordance with the instructions of Section 3 of the VSB.; Affected position: Positions identified as Functional Item Number (FIN) 34HF, FIN 35HF, FIN 61HF and FIN 6 HF. ",   
                     "identifier": ["34HF", "35HF", "61HF", "62HF"]
                    }
                ],
                "applicability_groups": [
                  {
                    "manufacturer": "Airbus, formerly Airbus Industrie",
                    "models": [
                      "A318-111",
                      "A318-112",
                      "A318-121",
                      "A318-122",
                      "A319-111",
                      "A319-112",
                      "A319-113",
                      "A319-114",
                      "A319-115",
                      "A319-131",
                      "A319-132",
                      "A319-133",
                      "A319-151N",
                      "A319-153N",
                      "A319-171N",
                      "A320-211",
                      "A320-212",
                      "A320-214",
                      "A320-215",
                      "A320-216",
                      "A320-231",
                      "A320-232",
                      "A320-233",
                      "A320-251N",
                      "A320-252N",
                      "A320-253N",
                      "A320-271N",
                      "A320-272N",
                      "A320-273N",
                      "A321-111",
                      "A321-112",
                      "A321-131",
                      "A321-211",
                      "A321-212",
                      "A321-213",
                      "A321-231",
                      "A321-232",
                      "A321-251N",
                      "A321-251NX",
                      "A321-252N",
                      "A321-252NX",
                      "A321-253N",
                      "A321-253NX",
                      "A321-271N",
                      "A321-271NX",
                      "A321-272N",
                      "A321-272NX"
                    ],
                    "serial_numbers": "all manufacturer serial numbers (MSN)",
                    "qualifiers": [
                      {
                        "type": "condition",
                        "raw_text": "Group 1 aeroplanes are those that have an affected part installed at an affected position.",
                        "condition": {
                          "identifiers": null,
                          "requirement": "affected part at affected position",
                          "applies_to_subset_models": "Group 1"
                        }
                      },
                      {
                        "type": "condition",
                        "raw_text": "Group 2 aeroplanes are those that do not have an affected part installed at any affected position. An aeroplane having an MSN not listed in Section 1.A of the SB is Group 2, provided it is determined that no affected part has been installed on any affected position of that aeroplane since the aeroplane date of manufacture.",
                        "condition": {
                          "identifiers": null,
                          "requirement": "affected part not installed at affected position",
                          "applies_to_subset_models": "Group 2"
                        }
                      }
                    ]
                  }
                ]
              }
            """

In [12]:
import os
import json
from google import genai
from google.genai import types


# Create the folder for the structured results; exist_ok=True means re-running this
# cell does not raise when the folder is already there.
# NOTE(review): OUTPUT_DIR is defined in the configuration cell (In [21]) while this
# cell shows In [12], so the cells were last executed out of order — confirm that
# Restart & Run All works top to bottom.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def generate():
    """
    Processes JSON files containing raw text, extracts structured data using the Gemini API,
    and saves the results as new JSON files.
    """
    client = genai.Client(
        vertexai=True,
        project="mthesis-450913",
        location="us-central1",
    )

    for filename in os.listdir(INPUT_DIR):
        # Process only .json files
        if not filename.lower().endswith(".json"):
            continue

        input_path = os.path.join(INPUT_DIR, filename)
        print(f"\nProcessing: {filename}")

        # Load the JSON file and extract the raw text from the "text" key
        try:
            with open(input_path, "r", encoding="utf-8") as f:
                data = json.load(f)
                raw_text = data.get("text") # Get text from the JSON object
                if not raw_text:
                    print(f"⚠️  Skipping {filename}: No 'text' key found or content is empty.")
                    continue
        except (json.JSONDecodeError, IOError) as e:
            print(f"❌ Error reading or parsing {filename}: {e}")
            continue

        # Instruction for the model
        instruction = types.Part.from_text(text=prompt_a)
        
        # Create the document part from the extracted raw text
        document = types.Part.from_text(text=raw_text)

        contents = [
            types.Content(
                role="user",
                parts=[instruction, document]
            ),
        ]

        # Define the generation configuration and the desired JSON output schema
        generate_content_config = types.GenerateContentConfig(
            temperature=0.3,
            top_p=0.95,
            max_output_tokens=2048,
            response_modalities=["TEXT"],
            safety_settings=[
                types.SafetySetting(category="HARM_CATEGORY_HATE_SPEECH", threshold="OFF"),
                types.SafetySetting(category="HARM_CATEGORY_DANGEROUS_CONTENT", threshold="OFF"),
                types.SafetySetting(category="HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold="OFF"),
                types.SafetySetting(category="HARM_CATEGORY_HARASSMENT", threshold="OFF")
            ],
            response_mime_type="application/json",
            response_schema={
              "type": "array",
              "items": {
                "type": "object",
                "properties": {
                  "ad_number": {
                    "type": "string",
                    "description": "EASA AD Number (e.g., 2019-0056)"
                  },
                  "status": {
                    "type": "string",
                    "description": "Status of an AD (superseded, active). If not superseded then active.",
                    "enum": ["superseded", "active"]
                  },
                  "ata_number": {
                    "type": "string",
                    "description": "ATA number figuring in the AD (e.g ATA 32)"
                  },
                  "affected_parts": {
                    "type": "array",
                    "items": {
                      "type": "object",
                      "properties": {
                        "raw_text": {
                          "type": "string",
                          "description": "Direct quote or phrasing from the AD that describes the affected part"
                        },
                        "identifier": {
                          "type": "string",
                          "description": "Numbers or identifiers of an affected part (e.g D7121513500xxx or E21327307)"
                        },
                      },
                      "required": [
                        "raw_text",
                        "identifier"
                      ]
                    }
                  },
                  "applicability_groups": {
                    "type": "array",
                    "items": {
                      "type": "object",
                      "properties": {
                        "manufacturer": {
                          "type": "string",
                          "description": "Manufacturer name (e.g., Airbus)"
                        },
                        "models": {
                          "type": "array",
                          "items": {
                            "type": "string",
                            "description": "Specific aircraft model variant (e.g., A318-111, A319-112, A320-251N)"
                          }
                        },
                        "serial_numbers": {
                          "type": "string",
                          "description": "Text describing serial number applicability (e.g., all manufacturer serial numbers, all MSN, all manufacturer serial numbers (MSN) as listed in...)"
                        },
                        "qualifiers": {
                          "type": "array",
                          "items": {
                            "type": "object",
                            "properties": {
                              "type": {
                                "type": "string",
                                "description": "Type of qualifier (e.g., exclusion, condition)",
                                "enum": ["exclusion", "condition"]
                              },
                              "raw_text": {
                                "type": "string",
                                "description": "The exact text of the qualifier clause (e.g., except those on which Airbus modification (mod) 161306 has been embodied in production or Group 1 aeroplanes are those on which an affected part was replaced with a non-affected part)"
                              },
                              "condition": {
                                "type": "object",
                                "properties": {
                                  "identifiers": {
                                    "type": "array",
                                    "items": {
                                      "type": "string",
                                      "description": "Array of mod numbers, SB numbers, configuration names, etc."
                                    },
                                    "nullable": True
                                  },
                                  "requirement": {
                                    "type": "string",
                                    "description": "Describes the requirement (e.g., must be embodied, must not be embodied, that do not have, as listed in Table X, was embodied)",
                                    "nullable": True
                                  },
                                  "applies_to_subset_models": {
                                    "type": "array",
                                    "items": {
                                      "type": "string",
                                      "description": "If the qualifier applies only to a subset of models listed in the parent models array (e.g., A319, Group 2), list them here. Use null if it applies to all models in the group.",
                                      "nullable": True
                                    },
                                     "nullable": True
                                  }
                                },
                                "nullable": True
                              }
                            },
                            "required": [
                              "type",
                              "raw_text"
                            ]
                          }
                        }
                      },
                      "required": [
                        "manufacturer",
                        "models",
                        "serial_numbers"
                      ]
                    }
                  }
                },
                "required": [
                  "ad_number",
                  "status",
                  "ata_number",
                  "affected_parts",
                  "applicability_groups"
                ],
                "propertyOrdering": [
                  "ad_number",
                  "status",
                  "ata_number",
                  "affected_parts",
                  "applicability_groups"
                ]
              }
            }
                
        )

        # Call Gemini and collect the streaming response
        result_text = ""
        try:
            for chunk in client.models.generate_content_stream(
                model="gemini-2.0-flash-lite-001",
                contents=contents,
                config=generate_content_config,
            ):
                result_text += chunk.text
        except Exception as e:
            print(f"❌ An error occurred during API call for {filename}: {e}")
            continue

        # Parse the JSON response and save it to a file
        try:
            parsed = json.loads(result_text)
            output_filename = f"{os.path.splitext(filename)[0]}_extracted1.json"
            output_path = os.path.join(OUTPUT_DIR, output_filename)
            with open(output_path, "w", encoding="utf-8") as out_file:
                json.dump(parsed, out_file, indent=2)
            print(f"✅ Saved to {output_path}")
        except json.JSONDecodeError:
            print(f"❌ Failed to parse JSON response for {filename}")
            print("--- Model Output ---")
            print(result_text)
            print("--------------------")

# Run the extraction only when this code is executed as the main module
# (not when it is imported from another module).
if __name__ == "__main__":
    generate()


Processing: AD_2019-0056_1.json
✅ Saved to C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A320\directives\sample dataset\llm_with_guidance\test_model_extraction\output_structured\AD_2019-0056_1_extracted1.json

Processing: AD_2020-0148_1.json
✅ Saved to C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A320\directives\sample dataset\llm_with_guidance\test_model_extraction\output_structured\AD_2020-0148_1_extracted1.json


**Added progress bar and time tracking**

In [22]:
import os
import json
import time  # Import the time module
from google import genai
from google.genai import types
from tqdm import tqdm  # Import tqdm


# Create output directory if it doesn't exist
# NOTE(review): OUTPUT_DIR is not assigned anywhere in this cell, so this line
# relies on an earlier cell having defined it — confirm the notebook still works
# after Restart Kernel -> Run All.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def _build_generation_config():
    """
    Builds the Gemini generation settings, including the JSON response schema.

    Nothing in here depends on the file being processed, so generate() builds
    the configuration once and reuses it for every request.

    Returns:
        types.GenerateContentConfig: sampling parameters, safety settings and a
        response schema describing an array of AD objects.
    """
    return types.GenerateContentConfig(
        temperature=0.3,
        top_p=0.95,
        max_output_tokens=2048,
        response_modalities=["TEXT"],
        safety_settings=[
            types.SafetySetting(category="HARM_CATEGORY_HATE_SPEECH", threshold="OFF"),
            types.SafetySetting(category="HARM_CATEGORY_DANGEROUS_CONTENT", threshold="OFF"),
            types.SafetySetting(category="HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold="OFF"),
            types.SafetySetting(category="HARM_CATEGORY_HARASSMENT", threshold="OFF")
        ],
        response_mime_type="application/json",
        response_schema={
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "ad_number": {
                "type": "string",
                "description": "EASA AD Number (e.g., 2019-0056)"
              },
              "status": {
                "type": "string",
                "description": "Status of an AD (superseded, active). If not superseded then active.",
                "enum": ["superseded", "active"]
              },
              "ata_number": {
                "type": "string",
                "description": "ATA number figuring in the AD (e.g ATA 32)"
              },
              "affected_parts": {
                "type": "array",
                "items": {
                  "type": "object",
                  "properties": {
                    "raw_text": {
                      "type": "string",
                      "description": "Direct quote or phrasing from the AD that describes the affected part"
                    },
                    "identifier": {
                      "type": "string",
                      "description": "Numbers or identifiers of an affected part (e.g D7121513500xxx or E21327307)"
                    },
                  },
                  "required": [
                    "raw_text",
                    "identifier"
                  ]
                }
              },
              "applicability_groups": {
                "type": "array",
                "items": {
                  "type": "object",
                  "properties": {
                    "manufacturer": {
                      "type": "string",
                      "description": "Manufacturer name (e.g., Airbus)"
                    },
                    "models": {
                      "type": "array",
                      "items": {
                        "type": "string",
                        "description": "Specific aircraft model variant (e.g., A318-111, A319-112, A320-251N)"
                      }
                    },
                    "serial_numbers": {
                      "type": "string",
                      "description": "Text describing serial number applicability (e.g., all manufacturer serial numbers, all MSN, all manufacturer serial numbers (MSN) as listed in...)"
                    },
                    "qualifiers": {
                      "type": "array",
                      "items": {
                        "type": "object",
                        "properties": {
                          "type": {
                            "type": "string",
                            "description": "Type of qualifier (e.g., exclusion, condition)",
                            "enum": ["exclusion", "condition"]
                          },
                          "raw_text": {
                            "type": "string",
                            "description": "The exact text of the qualifier clause (e.g., except those on which Airbus modification (mod) 161306 has been embodied in production or Group 1 aeroplanes are those on which an affected part was replaced with a non-affected part)"
                          },
                          "condition": {
                            "type": "object",
                            "properties": {
                              "identifiers": {
                                "type": "array",
                                "items": {
                                  "type": "string",
                                  "description": "Array of mod numbers, SB numbers, configuration names, etc."
                                },
                                "nullable": True
                              },
                              "requirement": {
                                "type": "string",
                                "description": "Describes the requirement (e.g., must be embodied, must not be embodied, that do not have, as listed in Table X, was embodied)",
                                "nullable": True
                              },
                              "applies_to_subset_models": {
                                "type": "array",
                                "items": {
                                  "type": "string",
                                  "description": "If the qualifier applies only to a subset of models listed in the parent models array (e.g., A319, Group 2), list them here. Use null if it applies to all models in the group.",
                                  "nullable": True
                                },
                                "nullable": True
                              }
                            },
                            "nullable": True
                          }
                        },
                        "required": [
                          "type",
                          "raw_text"
                        ]
                      }
                    }
                  },
                  "required": [
                    "manufacturer",
                    "models",
                    "serial_numbers"
                  ]
                }
              }
            },
            "required": [
              "ad_number",
              "status",
              "ata_number",
              "affected_parts",
              "applicability_groups"
            ],
            "propertyOrdering": [
              "ad_number",
              "status",
              "ata_number",
              "affected_parts",
              "applicability_groups"
            ]
          }
        }
    )


def generate():
    """
    Processes JSON files containing raw text, extracts structured data using the Gemini API,
    and saves the results as new JSON files, including processing time.

    Every ``*.json`` file directly inside INPUT_DIR is loaded, the string stored
    under its "text" key is sent to the model together with ``prompt_a``, and the
    parsed response is written to OUTPUT_DIR as ``<input name>_extracted2.json``.
    Each object in the response gets a ``structure_processing_time`` field holding
    the duration of the API call in seconds, rounded to two decimals.

    A file that cannot be read, has no usable text, fails during the API call or
    yields a response that is not valid JSON is reported on stdout and skipped;
    the remaining files are still processed.

    Relies on INPUT_DIR, OUTPUT_DIR and prompt_a being defined at module level.

    Returns:
        None
    """
    client = genai.Client(
        vertexai=True,
        project="mthesis-450913",
        location="us-central1",
    )

    # os.listdir returns entries in arbitrary order; sort so every run processes
    # the files in the same sequence.
    json_files = sorted(f for f in os.listdir(INPUT_DIR) if f.lower().endswith(".json"))

    # The instruction and the generation settings are identical for every file,
    # so build them once instead of once per iteration.
    instruction = types.Part.from_text(text=prompt_a)
    generate_content_config = _build_generation_config()

    # Iterating over tqdm advances the bar on every path through the loop body,
    # including the early `continue`s.
    for filename in tqdm(json_files, desc="Processing JSON Files"):
        input_path = os.path.join(INPUT_DIR, filename)

        # Load the JSON file. ValueError covers both malformed JSON
        # (json.JSONDecodeError) and bytes that are not valid UTF-8
        # (UnicodeDecodeError); OSError covers file-system failures.
        try:
            with open(input_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (ValueError, OSError) as e:
            print(f"❌ Error reading or parsing {filename}: {e}")
            continue

        # Only a JSON object can carry a "text" key; a top-level list or scalar
        # is treated like a file without text instead of raising AttributeError.
        raw_text = data.get("text") if isinstance(data, dict) else None
        if not isinstance(raw_text, str) or not raw_text:
            print(f"⚠️  Skipping {filename}: No 'text' key found or content is empty.")
            continue

        contents = [
            types.Content(
                role="user",
                parts=[instruction, types.Part.from_text(text=raw_text)]
            ),
        ]

        # Call Gemini and collect the streaming response, timing the operation.
        # perf_counter is monotonic, so the measured interval cannot be distorted
        # by system clock adjustments.
        result_text = ""
        start_time = time.perf_counter()
        try:
            for chunk in client.models.generate_content_stream(
                model="gemini-2.0-flash-lite-001",
                contents=contents,
                config=generate_content_config,
            ):
                # A streamed chunk may carry no text (its .text is then None);
                # concatenating None would raise TypeError and discard the
                # text collected so far.
                if chunk.text:
                    result_text += chunk.text
        except Exception as e:
            print(f"❌ An error occurred during API call for {filename}: {e}")
            continue
        structure_processing_time = round(time.perf_counter() - start_time, 2)

        # Parse the JSON response
        try:
            parsed = json.loads(result_text)
        except json.JSONDecodeError:
            print(f"❌ Failed to parse JSON response for {filename}")
            print("--- Model Output ---")
            print(result_text)
            print("--------------------")
            continue

        # Add the processing time to every AD object. The schema asks for an
        # array, but a single object is accepted too; anything that is not a
        # JSON object is left untouched rather than crashing the batch.
        records = parsed if isinstance(parsed, list) else [parsed]
        for item in records:
            if isinstance(item, dict):
                item["structure_processing_time"] = structure_processing_time

        # Save the annotated result next to the other structured outputs
        output_filename = f"{os.path.splitext(filename)[0]}_extracted2.json"
        output_path = os.path.join(OUTPUT_DIR, output_filename)
        try:
            with open(output_path, "w", encoding="utf-8") as out_file:
                json.dump(parsed, out_file, indent=2)
        except OSError as e:
            print(f"❌ Error writing JSON file for {filename}: {e}")


# Run the batch extraction only when this code is executed as the main module
# (not when it is imported from another module).
if __name__ == "__main__":
    generate()

Processing JSON Files: 100%|██████████| 1/1 [00:10<00:00, 10.34s/it]
