Getting the list of AD (Airworthiness Directive) identifiers for the LLM prompt.

In [18]:
import os

# === CONFIGURATION ===
# Folder that get_ad_identifiers() scans for AD PDF files.
# NOTE(review): absolute, machine-specific path — adjust it before running elsewhere.
PDF_DIR = r"C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A320\directives\sample dataset\mini test data"

def get_ad_identifiers(pdf_dir=None):
    """Return the AD identifiers found in a folder of PDF files.

    An identifier is the file name without its extension, e.g.
    ``AD_2021-0236_1.pdf`` -> ``AD_2021-0236_1``. The ``.pdf`` suffix is
    matched case-insensitively; every other directory entry is ignored.

    Args:
        pdf_dir: Folder to scan. When omitted, the module-level ``PDF_DIR``
            is used.

    Returns:
        list[str]: The identifiers, sorted alphabetically.

    Raises:
        OSError: If the folder cannot be listed (propagated from
            ``os.listdir``).
    """
    if pdf_dir is None:
        pdf_dir = PDF_DIR
    # os.listdir() yields entries in arbitrary order, so sort to make the
    # list (and anything derived from it) reproducible across runs.
    return sorted(
        os.path.splitext(filename)[0]  # drop the ".pdf" extension
        for filename in os.listdir(pdf_dir)
        if filename.lower().endswith(".pdf")
    )

# Example usage: ad_ids = get_ad_identifiers()

Choosing 20 random ADs as the ground-truth dataset.

In [17]:
import random

# Candidate pool for sampling: every AD identifier returned by get_ad_identifiers().
items=get_ad_identifiers()
def choose_random_20(items, k=20, seed=None):
    """Pick ``k`` distinct elements of ``items`` at random (20 by default).

    Args:
        items: Sequence to sample from.
        k: Number of elements to draw.
        seed: Optional seed for a reproducible selection. When given, a
            private ``random.Random(seed)`` generator is used, so the same
            seed and the same ``items`` always give the same result. When
            ``None``, the shared module-level generator is used.

    Returns:
        list: A new list with ``k`` elements drawn without replacement.

    Raises:
        ValueError: If ``items`` has fewer than ``k`` elements.
    """
    if len(items) < k:
        raise ValueError(f"List has fewer than {k} elements.")
    # A dedicated generator keeps a seeded draw independent of the global
    # random state (and leaves that state untouched).
    rng = random if seed is None else random.Random(seed)
    return rng.sample(items, k)

# Draw the ground-truth sample. No seed is set in this cell, so the selection
# is not reproducible unless `random` was seeded elsewhere.
choose_random_20(items)

['AD_2021-0236_1',
 'AD_2019-0056_1',
 'AD_2019-0106_1',
 'AD_2020-0148_1',
 'AD_2021-0002R1_1',
 'AD_2022-0115_2',
 'AD_2020-0040R1_1',
 'AD_2021-0172_2',
 'AD_2022-0185_1',
 'AD_2020-0118_1',
 'AD_2022-0030_1',
 'AD_2019-0173_1',
 'AD_2022-0032R1_1',
 'AD_2020-0219_1',
 'AD_2019-0189_1',
 'AD_2021-0279_2',
 'AD_2022-0147_1',
 'AD_2021-0256_1',
 'AD_2020-0053_1',
 'AD_2020-0250_1']

Asks the LLM for all information regarding the applicability of an AD based only on its number, and saves the LLM output to JSON files.

In [20]:
from google import genai
from google.genai import types
import json
import os
import time  # ⏱️ for timing

# === CONFIGURATION ===
# NOTE(review): absolute, machine-specific path. In the code shown here INPUT_DIR
# is only used to derive OUTPUT_DIR.
INPUT_DIR = r"C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A320\directives\sample dataset\mini test data"
OUTPUT_DIR = INPUT_DIR  # save output JSONs in the same folder
# Create the output folder if needed; exist_ok=True makes this a no-op when it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)


def _collect_stream_text(chunks):
    """Join the text of streamed response chunks, skipping chunks without text."""
    # A chunk's ``text`` can be None or empty; "+=" on None would raise TypeError
    # and throw away everything already received for this AD.
    return "".join(chunk.text for chunk in chunks if chunk.text)


def _parse_model_output(result_text):
    """Turn the raw model reply into a dict that can take extra keys."""
    try:
        parsed = json.loads(result_text)
    except json.JSONDecodeError:
        # NOTE(review): replies wrapped in Markdown code fences or surrounded by
        # prose are not valid JSON and end up here.
        return {"raw_output": result_text}
    if not isinstance(parsed, dict):
        # Valid JSON that is not an object (list, string, number, ...) cannot
        # take the timing key directly, so wrap it.
        return {"parsed_output": parsed}
    return parsed


def generate():
    """Query the model about each AD's applicability and save one JSON file per AD.

    For every identifier returned by ``get_ad_identifiers()`` the function
    sends a text-only prompt containing the identifier, concatenates the
    streamed reply and writes ``<ad_id>_extracted.json`` into ``OUTPUT_DIR``.

    Each output file holds one JSON object:
      * the reply itself when it parses as a JSON object,
      * ``{"parsed_output": ...}`` when it parses as any other JSON value,
      * ``{"raw_output": <text>}`` when it is not valid JSON,
    plus ``processing_time_seconds``, the wall-clock time of the request.

    An exception raised while requesting or streaming the reply is printed
    and that AD is skipped; no file is written for it.

    Returns:
        None
    """
    client = genai.Client(
        vertexai=True,
        project="mthesis-450913",
        location="global",
    )

    # The generation settings are identical for every AD, so build them once.
    generate_content_config = types.GenerateContentConfig(
        temperature=1,
        top_p=0.95,
        max_output_tokens=8192,
        safety_settings=[
            types.SafetySetting(category="HARM_CATEGORY_HATE_SPEECH", threshold="OFF"),
            types.SafetySetting(category="HARM_CATEGORY_DANGEROUS_CONTENT", threshold="OFF"),
            types.SafetySetting(category="HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold="OFF"),
            types.SafetySetting(category="HARM_CATEGORY_HARASSMENT", threshold="OFF")
        ],
    )

    ad_identifiers = get_ad_identifiers()

    for ad_id in ad_identifiers:
        print(f"\n📄 Processing: {ad_id}")
        start_time = time.time()  # Start timer

        instruction = types.Part.from_text(
            text=f"""Extract all information regarding applicability from Airworthiness Directive {ad_id} in JSON format if possible."""
        )

        contents = [
            types.Content(
                role="user",
                parts=[instruction]
            ),
        ]

        try:
            result_text = _collect_stream_text(
                client.models.generate_content_stream(
                    model="gemini-2.0-flash-lite-001",
                    contents=contents,
                    config=generate_content_config,
                )
            )
        except Exception as e:
            # Best effort: report the failure and move on to the next AD.
            print(f"❌ Error with {ad_id}: {e}")
            continue

        elapsed = round(time.time() - start_time, 2)  # Time in seconds

        parsed = _parse_model_output(result_text)

        # ⏱️ Add time to output
        parsed["processing_time_seconds"] = elapsed

        output_path = os.path.join(OUTPUT_DIR, f"{ad_id}_extracted.json")
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(parsed, f, indent=2)

        print(f"✅ Output saved to: {output_path} ({elapsed} sec)")

# Start the extraction run. The guard is true both when this code is executed
# as a script and when it is run as a notebook cell.
if __name__ == "__main__":
    generate()


📄 Processing: AD_2018-0289R1_1
✅ Output saved to: C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A320\directives\sample dataset\mini test data\AD_2018-0289R1_1_extracted.json (9.37 sec)

📄 Processing: AD_2019-0122_1
✅ Output saved to: C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A320\directives\sample dataset\mini test data\AD_2019-0122_1_extracted.json (1.89 sec)

📄 Processing: AD_2022-0082_1
✅ Output saved to: C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A320\directives\sample dataset\mini test data\AD_2022-0082_1_extracted.json (6.64 sec)
