**Cancellation Notices handling**
- Writes one JSON file per cancellation notice with the extracted details (cancelled AD, replacing AD, reference publications, effective date)
- The predecessor of an AD can be derived from these files as well

In [None]:
import os
import base64
import json
from google import genai
from google.genai import types

# === CONFIGURATION ===
# Folder that is scanned for cancellation-notice PDFs.
PDF_DIR = r"C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A330\cancellation_notices"
# Alternative input folder (presumably a small test sample - confirm before use):
# PDF_DIR = r"C:\Users\zdrop\PycharmProjects\BRUMBRUMWEEEE\Sample_of_a_sample"
# Results are written to an "output" subfolder of PDF_DIR; it is created on
# first run and reused afterwards.
OUTPUT_DIR = os.path.join(PDF_DIR, "output")
os.makedirs(OUTPUT_DIR, exist_ok=True)

def generate():
    """Extract structured data from every cancellation-notice PDF in PDF_DIR.

    Each ``*.pdf`` file is sent to Gemini (through Vertex AI) together with an
    extraction instruction and a JSON response schema. The parsed answer is
    written to ``OUTPUT_DIR`` as ``<pdf name>.json``.

    A file that cannot be read, whose request fails, or whose answer is not
    valid JSON is reported on stdout and skipped, so a single bad document
    does not abort the whole batch.
    """
    client = genai.Client(
        vertexai=True,
        project="mthesis-450913",
        location="us-central1",
    )

    # The instruction and the generation config are the same for every
    # document, so they are built once instead of once per file.
    instruction = types.Part.from_text(
        text="""From this cancellation notice, extract the following information: 
            - Cancelled AD (e.g., 2003-0208), one cancellation notice can cancel many ADs
            - Replaced by (if mentioned, usually a different AD number)
            - Reference publications (e.g., service bulletins or foreign ADs, without dates)
            - Effective date (if available). The date in the documents could be in different formats, unify to YYYY-MM-DD"""
    )

    # NOTE(review): the prompt says one notice can cancel many ADs, but
    # "cancelled_ad" is a single string - confirm how multiple numbers are
    # expected to be encoded before changing the schema.
    generate_content_config = types.GenerateContentConfig(
        temperature=0.3,
        top_p=0.95,
        max_output_tokens=2048,
        response_modalities=["TEXT"],
        # All four harm-category filters are set to OFF.
        safety_settings=[
            types.SafetySetting(category="HARM_CATEGORY_HATE_SPEECH", threshold="OFF"),
            types.SafetySetting(category="HARM_CATEGORY_DANGEROUS_CONTENT", threshold="OFF"),
            types.SafetySetting(category="HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold="OFF"),
            types.SafetySetting(category="HARM_CATEGORY_HARASSMENT", threshold="OFF")
        ],
        response_mime_type="application/json",
        response_schema={
            "type": "object",
            "properties": {
                "cancelled_ad": {
                    "type": "string",
                    "description": "The AD number(s) that this document cancels (e.g., 2010-0132)"
                },
                "replaced_by": {
                    "type": "string",
                    "nullable": True,
                    "description": "The AD number that replaces the cancelled AD, if mentioned"
                },
                "reference_publications": {
                    "type": "array",
                    "items": {"type": "string"}
                },
                "effective_date": {
                    "type": "string",
                    "format": "date",
                    "nullable": True,
                    "description": "The effective date of the cancellation if available (e.g., 2023-08-15)"
                }
            },
            "required": ["cancelled_ad"],
            "propertyOrdering": [
                "cancelled_ad",
                "replaced_by",
                "reference_publications",
                "effective_date"
            ]
        }
    )

    for filename in os.listdir(PDF_DIR):
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(PDF_DIR, filename)
        print(f"\nProcessing: {filename}")

        # Load PDF
        try:
            with open(pdf_path, "rb") as f:
                pdf_data = f.read()
        except OSError as e:
            print(f"❌ Error reading {filename}: {e}")
            continue

        document = types.Part.from_bytes(
            data=pdf_data,
            mime_type="application/pdf",
        )

        contents = [
            types.Content(
                role="user",
                parts=[instruction, document]
            ),
        ]

        # Call Gemini and collect the streamed response. A chunk without any
        # text part exposes ``text`` as None, which must not be concatenated.
        try:
            result_text = "".join(
                chunk.text or ""
                for chunk in client.models.generate_content_stream(
                    model="gemini-2.0-flash-lite-001",
                    contents=contents,
                    config=generate_content_config,
                )
            )
        except Exception as e:
            # Batch boundary: report the failure and continue with the next file.
            print(f"❌ An error occurred during API call for {filename}: {e}")
            continue

        # Parse and save JSON
        try:
            parsed = json.loads(result_text)
        except json.JSONDecodeError:
            print(f"❌ Failed to parse JSON for {filename}")
            print(result_text)
            continue

        output_path = os.path.join(OUTPUT_DIR, f"{os.path.splitext(filename)[0]}.json")
        with open(output_path, "w", encoding="utf-8") as out_file:
            json.dump(parsed, out_file, indent=2)
        print(f"✅ Saved to {output_path}")

# Start the batch extraction as soon as this cell is executed.
generate()


**Removing non-English files**
- Moves every PDF detected as non-English into a separate directory

In [None]:
import os
import shutil
from langdetect import detect
import fitz  # PyMuPDF

# === CONFIGURATION ===
# Folder that is scanned for PDFs.
source_dir = r"C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A320"
# Destination for PDFs detected as non-English. Given as a plain absolute
# path: joining it onto source_dir only produced this folder because Windows
# discards the first argument when the second one is absolute.
non_english_dir = r"C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\non_english"
char_threshold = 1000  # Number of characters to use for detection

# Create target folder if it doesn't exist
os.makedirs(non_english_dir, exist_ok=True)

# === FUNCTION TO EXTRACT TEXT FROM PDF ===
def extract_text(pdf_path: str, max_chars: int = 1000) -> str:
    """Return up to ``max_chars`` characters of text from the start of a PDF.

    Pages are read in order and reading stops as soon as enough text has been
    collected. Any error while opening or reading the file is printed and an
    empty string is returned instead of raising.
    """
    try:
        # The context manager closes the document on every path, so the file
        # handle is released before the caller tries to move the file.
        with fitz.open(pdf_path) as doc:
            pieces = []
            collected = 0
            for page in doc:
                page_text = page.get_text()
                pieces.append(page_text)
                collected += len(page_text)
                if collected >= max_chars:
                    break
        return "".join(pieces)[:max_chars]
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""

# === MAIN LOOP ===
# Classify every PDF in source_dir by language and move the non-English ones
# to non_english_dir.

# langdetect is non-deterministic by default; a fixed seed makes repeated runs
# classify (and therefore move) the same files.
from langdetect import DetectorFactory

DetectorFactory.seed = 0

for filename in os.listdir(source_dir):
    if not filename.lower().endswith(".pdf"):
        continue

    file_path = os.path.join(source_dir, filename)
    # Use the configured sample size rather than extract_text's own default.
    sample_text = extract_text(file_path, max_chars=char_threshold)

    if not sample_text.strip():
        print(f"Skipping (empty or unreadable): {filename}")
        continue

    # Only the detection itself is guarded here, so that a failed move is not
    # misreported as a language-detection failure.
    try:
        lang = detect(sample_text)
    except Exception as e:
        print(f"Language detection failed for {filename}: {e}")
        continue

    if lang == "en":
        print(f"Detected English – keeping: {filename}")
        continue

    print(f"Detected {lang} – moving: {filename}")
    try:
        shutil.move(file_path, os.path.join(non_english_dir, filename))
    except OSError as e:
        # shutil.Error is an OSError subclass, so it is covered as well.
        print(f"Could not move {filename}: {e}")

**Updated cancellation-notice handling – input is raw text stored in JSON files**


In [3]:
import os
import json
from google import genai
from google.genai import types

# === CONFIGURATION ===
# This directory should now contain your JSON files with the raw text.
INPUT_DIR = r"C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A330\cancellation_notices\output_json_files"
# Alternative input folder (presumably a small test sample - confirm before use):
# INPUT_DIR = r"C:\Users\zdrop\PycharmProjects\BRUMBRUMWEEEE\Sample_of_a_sample"
# Structured results are written to an "output_structured" subfolder of
# INPUT_DIR; it is created on first run and reused afterwards.
OUTPUT_DIR = os.path.join(INPUT_DIR, "output_structured")
os.makedirs(OUTPUT_DIR, exist_ok=True)

def generate():
    """Extract structured data from the raw-text JSON files in INPUT_DIR.

    Every ``*.json`` file is expected to be an object with a ``"text"`` key
    holding the raw text of a cancellation notice. The text is sent to Gemini
    (through Vertex AI) with an extraction instruction and a JSON response
    schema, and the parsed answer is written to ``OUTPUT_DIR`` as
    ``<input name>_extracted.json``.

    Files that cannot be read, that have no usable ``"text"`` value, whose
    request fails, or whose answer is not valid JSON are reported on stdout
    and skipped.
    """
    client = genai.Client(
        vertexai=True,
        project="mthesis-450913",
        location="us-central1",
    )

    # The instruction and the generation config are the same for every
    # document, so they are built once instead of once per file.
    instruction = types.Part.from_text(
        text="""From this cancellation notice, extract the following information: 
            - Cancelled AD (e.g., 2003-0208), one cancellation notice can cancel many ADs
            - Replaced by (if mentioned, usually a different AD number)
            - Reference publications (e.g., service bulletins or foreign ADs, without dates)
            - Effective date (if available). The date in the documents could be in different formats, unify to YYYY-MM-DD"""
    )

    # Generation settings and the desired JSON output schema.
    # NOTE(review): the prompt says one notice can cancel many ADs, but
    # "cancelled_ad" is a single string - confirm how multiple numbers are
    # expected to be encoded before changing the schema.
    generate_content_config = types.GenerateContentConfig(
        temperature=0.3,
        top_p=0.95,
        max_output_tokens=2048,
        response_modalities=["TEXT"],
        # All four harm-category filters are set to OFF.
        safety_settings=[
            types.SafetySetting(category="HARM_CATEGORY_HATE_SPEECH", threshold="OFF"),
            types.SafetySetting(category="HARM_CATEGORY_DANGEROUS_CONTENT", threshold="OFF"),
            types.SafetySetting(category="HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold="OFF"),
            types.SafetySetting(category="HARM_CATEGORY_HARASSMENT", threshold="OFF")
        ],
        response_mime_type="application/json",
        response_schema={
            "type": "object",
            "properties": {
                "cancelled_ad": {
                    "type": "string",
                    "description": "The AD number(s) that this document cancels (e.g., 2010-0132)"
                },
                "replaced_by": {
                    "type": "string",
                    "nullable": True,
                    "description": "The AD number that replaces the cancelled AD, if mentioned"
                },
                "reference_publications": {
                    "type": "array",
                    "items": {"type": "string"}
                },
                "effective_date": {
                    "type": "string",
                    "format": "date",
                    "nullable": True,
                    "description": "The effective date of the cancellation if available (e.g., 2023-08-15)"
                }
            },
            "required": ["cancelled_ad"],
            "propertyOrdering": [
                "cancelled_ad",
                "replaced_by",
                "reference_publications",
                "effective_date"
            ]
        }
    )

    for filename in os.listdir(INPUT_DIR):
        # Process only .json files
        if not filename.lower().endswith(".json"):
            continue

        input_path = os.path.join(INPUT_DIR, filename)
        print(f"\nProcessing: {filename}")

        # Load the JSON file. ValueError covers both malformed JSON and
        # undecodable bytes; OSError covers file-system problems.
        try:
            with open(input_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (ValueError, OSError) as e:
            print(f"❌ Error reading or parsing {filename}: {e}")
            continue

        # Only a JSON object can carry a "text" key, and only a non-empty
        # string is usable as a text part for the request.
        raw_text = data.get("text") if isinstance(data, dict) else None
        if not isinstance(raw_text, str) or not raw_text:
            print(f"⚠️  Skipping {filename}: No 'text' key found or content is empty.")
            continue

        # Create the document part from the extracted raw text
        document = types.Part.from_text(text=raw_text)

        contents = [
            types.Content(
                role="user",
                parts=[instruction, document]
            ),
        ]

        # Call Gemini and collect the streamed response. A chunk without any
        # text part exposes ``text`` as None, which must not be concatenated.
        try:
            result_text = "".join(
                chunk.text or ""
                for chunk in client.models.generate_content_stream(
                    model="gemini-2.0-flash-lite-001",
                    contents=contents,
                    config=generate_content_config,
                )
            )
        except Exception as e:
            # Batch boundary: report the failure and continue with the next file.
            print(f"❌ An error occurred during API call for {filename}: {e}")
            continue

        # Parse the JSON response and save it to a file
        try:
            parsed = json.loads(result_text)
        except json.JSONDecodeError:
            print(f"❌ Failed to parse JSON response for {filename}")
            print("--- Model Output ---")
            print(result_text)
            print("--------------------")
            continue

        output_filename = f"{os.path.splitext(filename)[0]}_extracted.json"
        output_path = os.path.join(OUTPUT_DIR, output_filename)
        with open(output_path, "w", encoding="utf-8") as out_file:
            json.dump(parsed, out_file, indent=2)
        print(f"✅ Saved to {output_path}")

# Run the batch only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    generate()


Processing: AD_2010-0083-CN_1.json
✅ Saved to C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A330\cancellation_notices\output_json_files\output_structured\AD_2010-0083-CN_1_extracted.json

Processing: AD_2010-0132-CN_1.json
✅ Saved to C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A330\cancellation_notices\output_json_files\output_structured\AD_2010-0132-CN_1_extracted.json

Processing: AD_2013-0251-CN_1.json
✅ Saved to C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A330\cancellation_notices\output_json_files\output_structured\AD_2013-0251-CN_1_extracted.json

Processing: AD_2014-0257-CN_1.json
✅ Saved to C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A330\cancellation_notices\output_json_files\output_structured\AD_2014-0257-CN_1_extracted.json

Processing: AD_2016-0065-CN_1.json
✅ Saved to C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A330\cancellation_notices\output_json_files\output_structured\AD_2016-0065-CN_1_extracted.json

Processing: AD_2018-0034-CN_1