In [ ]:
#separating ADs and other files


In [18]:
#removing non-english files 

import os
import shutil
from langdetect import detect
import fitz  # PyMuPDF

# === CONFIGURATION ===
source_dir = r"C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A320"
non_english_dir = os.path.join(source_dir,r"C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\non_english")
char_threshold = 1000  # Number of characters to use for detection

# Create target folder if it doesn't exist
os.makedirs(non_english_dir, exist_ok=True)

# === FUNCTION TO EXTRACT TEXT FROM PDF ===
def extract_text(pdf_path, max_chars=1000):
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
            if len(text) >= max_chars:
                break
        return text[:max_chars]
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""

# === MAIN LOOP ===
for filename in os.listdir(source_dir):
    if filename.lower().endswith(".pdf"):
        file_path = os.path.join(source_dir, filename)
        sample_text = extract_text(file_path)

        if not sample_text.strip():
            print(f"Skipping (empty or unreadable): {filename}")
            continue

        try:
            lang = detect(sample_text)
            if lang != "en":
                print(f"Detected {lang} – moving: {filename}")
                shutil.move(file_path, os.path.join(non_english_dir, filename))
            else:
                print(f"Detected English – keeping: {filename}")
        except Exception as e:
            print(f"Language detection failed for {filename}: {e}")

Detected English – keeping: AD_2006-0069R1_1.pdf
Detected English – keeping: AD_2006-0108_1.pdf
Detected English – keeping: AD_2006-0112R1_1.pdf
Detected English – keeping: AD_2006-0135-CN_1.pdf
Detected English – keeping: AD_2006-0136_1.pdf
Detected English – keeping: AD_2006-0153R2_2.pdf
Detected English – keeping: AD_2006-0162_1.pdf
Detected English – keeping: AD_2006-0165_1.pdf
Detected English – keeping: AD_2006-0174_2.pdf
Detected English – keeping: AD_2006-0176_1.pdf
Detected English – keeping: AD_2006-0184_1.pdf
Detected English – keeping: AD_2006-0203_1.pdf
Detected English – keeping: AD_2006-0222_1.pdf
Detected English – keeping: AD_2006-0223_1.pdf
Detected English – keeping: AD_2006-0236R1_1.pdf
Detected English – keeping: AD_2006-0262_1.pdf
Detected English – keeping: AD_2006-0280_1.pdf
Detected English – keeping: AD_2007-0036R1_1.pdf
Detected English – keeping: AD_2007-0064R1_1.pdf
Detected English – keeping: AD_2007-0065R2_1.pdf
Detected English – keeping: AD_2007-0067R1_

In [6]:
#working connection
from google import genai
from google.genai import types
import base64

def generate():
  client = genai.Client(
      vertexai=True,
      project="mthesis-450913",
      location="us-central1",
  )


  model = "gemini-2.0-flash-lite-001"
  contents = [
    types.Content(
      role="user",
      parts=[
        types.Part.from_text(text="""Telle me valenis day joke on polish.""")
      ]
    )
  ]
  generate_content_config = types.GenerateContentConfig(
    temperature = 0.2,
    top_p = 0.95,
    max_output_tokens = 8192,
    response_modalities = ["TEXT"],
    safety_settings = [types.SafetySetting(
      category="HARM_CATEGORY_HATE_SPEECH",
      threshold="OFF"
    ),types.SafetySetting(
      category="HARM_CATEGORY_DANGEROUS_CONTENT",
      threshold="OFF"
    ),types.SafetySetting(
      category="HARM_CATEGORY_SEXUALLY_EXPLICIT",
      threshold="OFF"
    ),types.SafetySetting(
      category="HARM_CATEGORY_HARASSMENT",
      threshold="OFF"
    )],
  )

  for chunk in client.models.generate_content_stream(
    model = model,
    contents = contents,
    config = generate_content_config,
    ):
    print(chunk.text, end="")

generate()

Okay, here's a Valentine's Day joke in Polish, with an English translation:
**Polish Joke:**

> Dlaczego informatyk się oświadczył? Bo pomyślał, że to idealny moment na "merge request".

**English Translation:**

> Why did the IT guy propose? Because he thought it was the perfect moment for a "merge request."

**Explanation:**

*   **"Merge request"** is a term used in software development. When programmers work on different features of a program, they "merge" their code together to create the finished product.
*   The joke plays on the IT guy's love for technology and how he uses it to symbolize commitment.

I hope you liked it!


In [14]:
from google import genai
from google.genai import types
import json

response_schema = {
  "type": "OBJECT",
  "properties": {
    "ad_number": {
      "type": "STRING",
      "description": "The unique identifier for the airworthiness directive."
    },
    "aircraft_type": {
      "type": "STRING",
      "description": "The specific type or model of aircraft the directive applies to."
    },
    "status": {
      "type": "STRING",
      "description": "The current status of the airworthiness directive.",
      "enum": [
        "active",
        "cancelled",
        "superseded"
      ]
    },
    "language": {
      "type": "STRING",
      "description": "The language of the document.",
      "enum": [
        "en",
        "other"
      ]
    },
    "document_type": {
      "type": "STRING",
      "description": "The type of document.",
      "enum": [
        "AD",
        "notice"
      ]
    },
    "summary": {
      "type": "STRING",
      "description": "A brief overview or abstract of the airworthiness directive."
    },
    "full_text": {
      "type": "STRING",
      "description": "The complete text content of the airworthiness directive."
    }
  },
  "required": [
    "ad_number",
    "aircraft_type",
    "status",
    "language",
    "document_type",
    "summary",
    "full_text"
  ]
}

def generate_ad_json(prompt):
  client = genai.Client(
      vertexai=True,
      project="mthesis-450913",
      location="us-central1",
  )

  model = "gemini-2.0-flash-lite-001"
  contents = [
    types.Content(
      role="user",
      parts=[
        types.Part.from_text(text=prompt) # <--- HERE is where you put your prompt
      ]
    )
  ]
  generate_content_config = types.GenerateContentConfig(
    temperature = 0.2,
    top_p = 0.95,
    max_output_tokens = 8192,
    response_mime_type="application/json; schema=" + json.dumps(response_schema),
    response_schema=response_schema,      # Pass your response schema here
    safety_settings = [types.SafetySetting(
      category="HARM_CATEGORY_HATE_SPEECH",
      threshold="OFF"
    ),types.SafetySetting(
      category="HARM_CATEGORY_DANGEROUS_CONTENT",
      threshold="OFF"
    ),types.SafetySetting(
      category="HARM_CATEGORY_SEXUALLY_EXPLICIT",
      threshold="OFF"
    ),types.SafetySetting(
      category="HARM_CATEGORY_HARASSMENT",
      threshold="OFF"
    )],
  )

  response = client.models.generate_content(
    model = model,
    contents = contents,
    config = generate_content_config,
  )

  try:
    # Attempt to parse the response text as JSON and print it directly
    structured_response = json.loads(response.text)
    print(json.dumps(structured_response, indent=2))
  except json.JSONDecodeError:
    print("Error: Could not decode the model's response as JSON.")
    print("Raw response text:")
    print(response.text)

# Example usage:
ad_document_text = """
Airworthiness Directives; Boeing Model 737-800, -900, and -900ER Series Airplanes
AD Number: 2023-0176
Status: active
Supersedes: AD 2020-0045
Language: en
Document Type: AD
Summary: This AD requires repetitive inspections of the left and right wing lower skin for cracking and repair if necessary.
Full Text: (The full text of the AD would go here...)
"""

prompt = f"""
Please extract information from the following aviation document and structure it according to the provided JSON schema. Return only the JSON object:

{ad_document_text}
"""

generate_ad_json(prompt)

ClientError: 400 INVALID_ARGUMENT. {'error': {'code': 400, 'message': 'Found unsupported response mime type: \'application/json; schema={"type": "OBJECT", "properties": {"ad_number": {"type": "STRING", "description": "The unique identifier for the airworthiness directive."}, "aircraft_type": {"type": "STRING", "description": "The specific type or model of aircraft the directive applies to."}, "status": {"type": "STRING", "description": "The current status of the airworthiness directive.", "enum": ["active", "cancelled", "superseded"]}, "language": {"type": "STRING", "description": "The language of the document.", "enum": ["en", "other"]}, "document_type": {"type": "STRING", "description": "The type of document.", "enum": ["AD", "notice"]}, "summary": {"type": "STRING", "description": "A brief overview or abstract of the airworthiness directive."}, "full_text": {"type": "STRING", "description": "The complete text content of the airworthiness directive."}}, "required": ["ad_number", "aircraft_type", "status", "language", "document_type", "summary", "full_text"]}\' for a multi-modal generation request.', 'status': 'INVALID_ARGUMENT'}}

In [13]:
# Without response schema, can use this as last scenario
from google import genai
from google.genai import types
import json

# Define the expected enum values for validation
status_enum = ["active", "cancelled", "superseded"]
language_enum = ["en", "other"]
document_type_enum = ["AD", "notice"]

def generate_ad_json(prompt):
  client = genai.Client(
      vertexai=True,
      project="mthesis-450913",
      location="us-central1",
  )

  model = "gemini-2.0-flash-lite-001"
  contents = [
    types.Content(
      role="user",
      parts=[
        types.Part.from_text(text=prompt + """\n\nPlease return the information as a JSON object with the following keys: "ad_number", "aircraft_type", "status" (choose from 'active', 'cancelled', 'superseded'), "supersedes", "language" (choose 'en' or 'other'), "document_type" (choose 'AD' or 'notice'), "summary", and "full_text". Ensure the "supersedes" field is null if not applicable.""")
      ]
    )
  ]
  generate_content_config = types.GenerateContentConfig(
    temperature = 0.2,
    top_p = 0.95,
    max_output_tokens = 8192,
    response_mime_type="application/json", # Expecting JSON output
    # response_schema=response_schema,      # Remove response_schema for now
    safety_settings = [types.SafetySetting(
      category="HARM_CATEGORY_HATE_SPEECH",
      threshold="OFF"
    ),types.SafetySetting(
      category="HARM_CATEGORY_DANGEROUS_CONTENT",
      threshold="OFF"
    ),types.SafetySetting(
      category="HARM_CATEGORY_SEXUALLY_EXPLICIT",
      threshold="OFF"
    ),types.SafetySetting(
      category="HARM_CATEGORY_HARASSMENT",
      threshold="OFF"
    )],
  )

  response = client.models.generate_content(
    model = model,
    contents = contents,
    config = generate_content_config,
  )

  try:
    structured_response = json.loads(response.text)
    print(json.dumps(structured_response, indent=2))
    # In a real application, you would then validate the 'status', 'language',
    # and 'document_type' fields against the enums defined at the top.
  except json.JSONDecodeError:
    print("Error: Could not decode the model's response as JSON.")
    print("Raw response text:")
    print(response.text)

# Example usage:
prompt = f"""
Please extract information from the following aviation document:
Airworthiness Directive 2008-0017R1, issued on 17 June 2008, applies to all Airbus A330 and A340 aircraft. It addresses the risk of uncontained Auxiliary Power Unit (APU) generator failures caused by Drive End Bearing (DEB) collapse, which can lead to structural damage and potential fire hazards. The directive mandates repetitive inspections of the APU generator’s scavenge filter, drain plug, and seal plate fitting, followed by corrective actions if needed. It supersedes previous directives 2008-0017 and 2007-0188R1. The referenced documents for compliance include Airbus AOT A330-24A3044 Revision 02, A340-24A4057 Revision 03, and A340-24A5021 Revision 02.
"""

generate_ad_json(prompt)

[
  {
    "ad_number": "2008-0017R1",
    "aircraft_type": "Airbus A330 and A340",
    "status": "active",
    "supersedes": [
      "2008-0017",
      "2007-0188R1"
    ],
    "language": "en",
    "document_type": "AD",
    "summary": "Addresses the risk of uncontained Auxiliary Power Unit (APU) generator failures caused by Drive End Bearing (DEB) collapse, which can lead to structural damage and potential fire hazards.",
    "full_text": "Airworthiness Directive 2008-0017R1, issued on 17 June 2008, applies to all Airbus A330 and A340 aircraft. It addresses the risk of uncontained Auxiliary Power Unit (APU) generator failures caused by Drive End Bearing (DEB) collapse, which can lead to structural damage and potential fire hazards. The directive mandates repetitive inspections of the APU generator\u2019s scavenge filter, drain plug, and seal plate fitting, followed by corrective actions if needed. It supersedes previous directives 2008-0017 and 2007-0188R1. The referenced documents f