**Cancellation Notices handling**
- Writes one JSON file per cancellation notice with the information extracted from it
- The predecessor of an AD can also be derived from these files

In [None]:
import os
import base64
import json
from google import genai
from google.genai import types

# === CONFIGURATION ===
# Folder that is scanned for cancellation-notice PDFs.
PDF_DIR = r"C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A330\cancellation_notices"
# Alternative input folder, kept commented out for quick switching
# (presumably a small test sample, judging by its name).
# PDF_DIR = r"C:\Users\zdrop\PycharmProjects\BRUMBRUMWEEEE\Sample_of_a_sample"
# Results go to an "output" subfolder of PDF_DIR, created here if it is missing.
OUTPUT_DIR = os.path.join(PDF_DIR, "output")
os.makedirs(OUTPUT_DIR, exist_ok=True)

def generate():
    """Extract structured data from every cancellation-notice PDF in PDF_DIR.

    Each file in ``PDF_DIR`` whose name ends in ``.pdf`` (case-insensitive)
    is sent to Gemini on Vertex AI together with an extraction prompt and a
    JSON response schema. A parsable answer is written to
    ``OUTPUT_DIR/<pdf name without extension>.json``; an unparsable answer
    is printed instead and the loop moves on to the next file.

    Returns:
        None. Results are reported via files and ``print``.
    """
    client = genai.Client(
        vertexai=True,
        project="mthesis-450913",
        location="us-central1",
    )

    # The prompt and the generation settings are identical for every file,
    # so they are built once instead of once per PDF.
    instruction = types.Part.from_text(
        text="""From this cancellation notice, extract the following information: 
            - Cancelled AD (e.g., 2003-0208), one cancellation notice can cancel many ADs
            - Replaced by (if mentioned, usually a different AD number)
            - Reference publications (e.g., service bulletins or foreign ADs, without dates)
            - Effective date (if available). The date in the documents could be in different formats, unify to YYYY-MM-DD"""
    )

    # NOTE(review): the prompt says one notice can cancel many ADs, but
    # "cancelled_ad" is typed as a single string here - confirm whether an
    # array of strings is wanted before changing the output format.
    response_schema = {
        "type": "object",
        "properties": {
            "cancelled_ad": {
                "type": "string",
                "description": "The AD number(s) that this document cancels (e.g., 2010-0132)"
            },
            "replaced_by": {
                "type": "string",
                "nullable": True,
                "description": "The AD number that replaces the cancelled AD, if mentioned"
            },
            "reference_publications": {
                "type": "array",
                "items": {"type": "string"}
            },
            "effective_date": {
                "type": "string",
                "format": "date",
                "nullable": True,
                "description": "The effective date of the cancellation if available (e.g., 2023-08-15)"
            }
        },
        "required": ["cancelled_ad"],
        "propertyOrdering": [
            "cancelled_ad",
            "replaced_by",
            "reference_publications",
            "effective_date"
        ]
    }

    generate_content_config = types.GenerateContentConfig(
        temperature=0.3,
        top_p=0.95,
        max_output_tokens=2048,
        response_modalities=["TEXT"],
        safety_settings=[
            types.SafetySetting(category="HARM_CATEGORY_HATE_SPEECH", threshold="OFF"),
            types.SafetySetting(category="HARM_CATEGORY_DANGEROUS_CONTENT", threshold="OFF"),
            types.SafetySetting(category="HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold="OFF"),
            types.SafetySetting(category="HARM_CATEGORY_HARASSMENT", threshold="OFF")
        ],
        response_mime_type="application/json",
        response_schema=response_schema,
    )

    for filename in os.listdir(PDF_DIR):
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(PDF_DIR, filename)
        print(f"\nProcessing: {filename}")

        # Load PDF
        with open(pdf_path, "rb") as f:
            pdf_data = f.read()

        document = types.Part.from_bytes(
            data=pdf_data,
            mime_type="application/pdf",
        )

        contents = [
            types.Content(
                role="user",
                parts=[instruction, document]
            ),
        ]

        # Call Gemini and collect the streamed response. A chunk's text can
        # be None (no text part in that chunk); treating it as "" avoids a
        # TypeError during concatenation.
        result_text = "".join(
            chunk.text or ""
            for chunk in client.models.generate_content_stream(
                model="gemini-2.0-flash-lite-001",
                contents=contents,
                config=generate_content_config,
            )
        )

        # Parse; only the parsing step can raise JSONDecodeError.
        try:
            parsed = json.loads(result_text)
        except json.JSONDecodeError:
            print(f"❌ Failed to parse JSON for {filename}")
            print(result_text)
            continue

        # Save next to the other results, named after the source PDF.
        output_path = os.path.join(OUTPUT_DIR, f"{os.path.splitext(filename)[0]}.json")
        with open(output_path, "w", encoding="utf-8") as out_file:
            json.dump(parsed, out_file, indent=2)
        print(f"✅ Saved to {output_path}")

# Run the extraction for every PDF found in PDF_DIR.
generate()


**Removing non-English files**
- Moves all non-English PDFs into a separate directory

In [None]:
import os
import shutil
from langdetect import detect
import fitz  # PyMuPDF

# === CONFIGURATION ===
# Folder that is scanned for PDFs.
source_dir = r"C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A320"
# Destination for PDFs detected as non-English. This is an absolute path next
# to source_dir, not a subfolder of it, so it is assigned directly: wrapping
# an absolute path in os.path.join(source_dir, ...) discards source_dir.
non_english_dir = r"C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\non_english"
char_threshold = 1000  # Number of characters to use for detection

# Create target folder if it doesn't exist
os.makedirs(non_english_dir, exist_ok=True)

# === FUNCTION TO EXTRACT TEXT FROM PDF ===
def extract_text(pdf_path, max_chars=1000):
    """Return up to ``max_chars`` characters of text from the start of a PDF.

    Pages are read in order and reading stops as soon as enough text has
    been collected.

    Args:
        pdf_path: Path of the PDF file to read.
        max_chars: Maximum number of characters to return.

    Returns:
        The leading text of the document, truncated to ``max_chars``
        characters, or ``""`` if the file could not be opened or read (the
        error is printed).
    """
    try:
        pieces = []
        collected = 0
        # The context manager closes the document even when reading fails,
        # so the file handle is released before the caller moves the file.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                page_text = page.get_text()
                pieces.append(page_text)
                collected += len(page_text)
                if collected >= max_chars:
                    break
        return "".join(pieces)[:max_chars]
    except Exception as e:
        # Best effort: an unreadable file is reported and treated as empty.
        print(f"Error reading {pdf_path}: {e}")
        return ""

# === MAIN LOOP ===
# Move every PDF in source_dir whose leading text is not detected as English
# into non_english_dir; PDFs without extractable text are left in place.
# NOTE(review): langdetect results can vary between runs unless
# DetectorFactory.seed is set (per its README) - consider seeding so that the
# same files are moved on every run.
for filename in os.listdir(source_dir):
    if not filename.lower().endswith(".pdf"):
        continue

    file_path = os.path.join(source_dir, filename)
    # Use the configured sample size instead of extract_text's own default.
    sample_text = extract_text(file_path, max_chars=char_threshold)

    if not sample_text.strip():
        print(f"Skipping (empty or unreadable): {filename}")
        continue

    # Only the detection itself is guarded here, so that a failed move is
    # not misreported as a language-detection failure.
    try:
        lang = detect(sample_text)
    except Exception as e:
        print(f"Language detection failed for {filename}: {e}")
        continue

    if lang == "en":
        print(f"Detected English – keeping: {filename}")
        continue

    print(f"Detected {lang} – moving: {filename}")
    try:
        shutil.move(file_path, os.path.join(non_english_dir, filename))
    except OSError as e:
        # e.g. the file is locked or a file of that name already exists.
        print(f"Could not move {filename}: {e}")

**Extracting information without a response schema (draft only; the final solution will probably use a schema)**

In [13]:
# Without response schema, can use this as last scenario
from google import genai
from google.genai import types
import json

# Define the expected enum values for validation
# These mirror the choices listed in the prompt built by generate_ad_json.
# NOTE(review): nothing in the visible code reads these lists yet; the
# validation they are meant for is still a TODO.
status_enum = ["active", "cancelled", "superseded"]
language_enum = ["en", "other"]
document_type_enum = ["AD", "notice"]

def generate_ad_json(prompt):
  """Ask Gemini for a JSON description of an aviation document and print it.

  The given prompt is extended with instructions naming the JSON keys to
  return, sent to Gemini on Vertex AI without a response schema, and the
  answer is pretty-printed. If the answer is not valid JSON (or contains no
  text at all), an error message and the raw answer are printed instead.

  Args:
    prompt: Text containing the document to extract information from.

  Returns:
    None. The result is reported via ``print`` only.
  """
  client = genai.Client(
      vertexai=True,
      project="mthesis-450913",
      location="us-central1",
  )

  model = "gemini-2.0-flash-lite-001"
  contents = [
    types.Content(
      role="user",
      parts=[
        types.Part.from_text(text=prompt + """\n\nPlease return the information as a JSON object with the following keys: "ad_number", "aircraft_type", "status" (choose from 'active', 'cancelled', 'superseded'), "supersedes", "language" (choose 'en' or 'other'), "document_type" (choose 'AD' or 'notice'), "summary", and "full_text". Ensure the "supersedes" field is null if not applicable.""")
      ]
    )
  ]
  generate_content_config = types.GenerateContentConfig(
    temperature = 0.2,
    top_p = 0.95,
    max_output_tokens = 8192,
    response_mime_type="application/json", # Expecting JSON output
    # No response_schema is passed in this draft; only the MIME type
    # constrains the output format.
    safety_settings = [types.SafetySetting(
      category="HARM_CATEGORY_HATE_SPEECH",
      threshold="OFF"
    ),types.SafetySetting(
      category="HARM_CATEGORY_DANGEROUS_CONTENT",
      threshold="OFF"
    ),types.SafetySetting(
      category="HARM_CATEGORY_SEXUALLY_EXPLICIT",
      threshold="OFF"
    ),types.SafetySetting(
      category="HARM_CATEGORY_HARASSMENT",
      threshold="OFF"
    )],
  )

  response = client.models.generate_content(
    model = model,
    contents = contents,
    config = generate_content_config,
  )

  # response.text can be None when the answer has no text part;
  # json.loads(None) would raise TypeError, which the handler below does not
  # catch. An empty string is routed into the JSONDecodeError path instead.
  raw_text = response.text or ""
  try:
    structured_response = json.loads(raw_text)
  except json.JSONDecodeError:
    print("Error: Could not decode the model's response as JSON.")
    print("Raw response text:")
    print(response.text)
    return

  print(json.dumps(structured_response, indent=2))
  # In a real application, you would then validate the 'status', 'language',
  # and 'document_type' fields against the enums defined at the top.

# Example usage:
# Plain (non-f) string: the text has no placeholders, and without the f-prefix
# any braces in pasted document text are never interpreted as format fields.
prompt = """
Please extract information from the following aviation document:
Airworthiness Directive 2008-0017R1, issued on 17 June 2008, applies to all Airbus A330 and A340 aircraft. It addresses the risk of uncontained Auxiliary Power Unit (APU) generator failures caused by Drive End Bearing (DEB) collapse, which can lead to structural damage and potential fire hazards. The directive mandates repetitive inspections of the APU generator’s scavenge filter, drain plug, and seal plate fitting, followed by corrective actions if needed. It supersedes previous directives 2008-0017 and 2007-0188R1. The referenced documents for compliance include Airbus AOT A330-24A3044 Revision 02, A340-24A4057 Revision 03, and A340-24A5021 Revision 02.
"""

generate_ad_json(prompt)

[
  {
    "ad_number": "2008-0017R1",
    "aircraft_type": "Airbus A330 and A340",
    "status": "active",
    "supersedes": [
      "2008-0017",
      "2007-0188R1"
    ],
    "language": "en",
    "document_type": "AD",
    "summary": "Addresses the risk of uncontained Auxiliary Power Unit (APU) generator failures caused by Drive End Bearing (DEB) collapse, which can lead to structural damage and potential fire hazards.",
    "full_text": "Airworthiness Directive 2008-0017R1, issued on 17 June 2008, applies to all Airbus A330 and A340 aircraft. It addresses the risk of uncontained Auxiliary Power Unit (APU) generator failures caused by Drive End Bearing (DEB) collapse, which can lead to structural damage and potential fire hazards. The directive mandates repetitive inspections of the APU generator\u2019s scavenge filter, drain plug, and seal plate fitting, followed by corrective actions if needed. It supersedes previous directives 2008-0017 and 2007-0188R1. The referenced documents f