<a href="https://colab.research.google.com/github/benjawad/AI-for-Environmental-Regulation-Analysis/blob/main/commitment_register_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [124]:
# Install the notebook's dependencies: LLM client, PDF tooling and UI helpers.
!pip install google-generativeai tqdm reportlab PyPDF2 gradio pymupdf
# OCR stack plus Arabic text shaping helpers.
!pip install pytesseract Pillow arabic_reshaper python-bidi
# Scraping stack.
# NOTE(review): the PyPI package named `chromium` resolves to a 0.0.0 placeholder
# (see the install log below); the browser used by Playwright presumably comes
# from `playwright install` — confirm and drop `chromium` if it is unused.
!pip install selenium webdriver-manager beautifulsoup4 requests playwright  chromium
# Run Playwright's install command, discarding its output.
!playwright install > /dev/null 2>&1

Collecting playwright
  Downloading playwright-1.54.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting chromium
  Downloading chromium-0.0.0-py3-none-any.whl.metadata (615 bytes)
Collecting pyee<14,>=13 (from playwright)
  Downloading pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)
Downloading playwright-1.54.0-py3-none-manylinux1_x86_64.whl (45.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.9/45.9 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading chromium-0.0.0-py3-none-any.whl (2.4 kB)
Downloading pyee-13.0.0-py3-none-any.whl (15 kB)
Installing collected packages: pyee, chromium, playwright
Successfully installed chromium-0.0.0 playwright-1.54.0 pyee-13.0.0


# Web Scraping

In [126]:
import asyncio
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from playwright.async_api import async_playwright

# Page the scraper loads; relative PDF links are resolved against it.
BASE_URL = "https://environnement.gov.ma/fr/lois-et-reglementations/normes"
# Accumulates the absolute PDF URLs found by run(); read by the download cell.
pdf_links = set()
# NOTE(review): not referenced in any visible cell — confirm whether pagination was planned.
MAX_PAGES = 4

async def extract_pdfs_from_html(html):
    """Collect the absolute URL of every PDF hyperlink in an HTML document.

    Args:
        html: HTML markup to scan.

    Returns:
        A set with one absolute URL (resolved against ``BASE_URL``) for each
        ``<a href=...>`` whose target ends with ``.pdf``, case-insensitively.
    """
    anchors = BeautifulSoup(html, "html.parser").find_all("a", href=True)
    return {
        urljoin(BASE_URL, anchor["href"])
        for anchor in anchors
        if anchor["href"].lower().endswith(".pdf")
    }

async def run():
    """Scrape ``BASE_URL`` with headless Chromium and collect PDF links.

    The page is loaded, the function waits for the ``networkidle`` load state
    plus a fixed 5-second pause, then PDF links are harvested from the main
    document and from every frame of the page. Links are added to the
    module-level ``pdf_links`` set and a summary is printed.

    The browser is closed in a ``finally`` clause so that a failure during
    navigation or extraction does not leave the browser running.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            # Browsing context with a custom desktop user agent.
            context = await browser.new_context(user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/115.0 Safari/537.36"
            ))
            page = await context.new_page()

            print(f"🔗 Accès à la page : {BASE_URL}")
            await page.goto(BASE_URL)
            await page.wait_for_load_state("networkidle")
            await asyncio.sleep(5)  # Give dynamic content time to render

            # --- Option 1: links present in the main document ---
            print("🔍 Extraction depuis la page principale...")
            main_html = await page.content()
            new_links = await extract_pdfs_from_html(main_html)
            pdf_links.update(new_links)

            print(f"📄 {len(pdf_links)} liens PDF trouvés dans la page principale")

            # --- Option 2: look inside every frame for additional PDFs ---
            frames = page.frames
            print(f"🧩 {len(frames)} frames détectées")

            for frame in frames:
                print("   - Frame URL:", frame.url)
                try:
                    frame_html = await frame.content()
                    frame_links = await extract_pdfs_from_html(frame_html)
                    # Count only the links not already known before merging.
                    count = len(frame_links - pdf_links)
                    pdf_links.update(frame_links)
                    print(f"   ✅ {count} nouveaux PDFs trouvés dans cette frame")
                except Exception as e:
                    # A frame that cannot be read is reported and skipped.
                    print(f"   ⚠️ Impossible de lire la frame : {e}")
        finally:
            await browser.close()

        print(f"\n✅ Scraping terminé. Total de liens PDF trouvés : {len(pdf_links)}")
        for link in sorted(pdf_links):
            print(link)

# Entry point: run the scraper from the notebook cell.
import nest_asyncio
nest_asyncio.apply()  # NOTE(review): presumably lets asyncio code run inside the notebook's already-running loop — confirm it is still needed

await run()


🔗 Accès à la page : https://environnement.gov.ma/fr/lois-et-reglementations/normes
🔍 Extraction depuis la page principale...
📄 17 liens PDF trouvés dans la page principale
🧩 1 frames détectées
   - Frame URL: https://environnement.gov.ma/fr/lois-et-reglementations/normes
   ✅ 0 nouveaux PDFs trouvés dans cette frame

✅ Scraping terminé. Total de liens PDF trouvés : 17
https://environnement.gov.ma/PDFs/LETTRE_ROYALE.pdf
https://environnement.gov.ma/PDFs/decretCNE.pdf
https://environnement.gov.ma/images/Normes/Air/Normes_de_la_qualité_de_lair.pdf
https://environnement.gov.ma/images/Normes/Air/Seuils_dinformation_et_seuils_dalerte.pdf
https://environnement.gov.ma/images/Normes/Air/Valeurs_limites_générales_des_rejets_atmosphériques.pdf
https://environnement.gov.ma/images/Normes/Air/Valeurs_limites_sectorielles__céramique.pdf
https://environnement.gov.ma/images/Normes/Air/valeurs_limites_spécifiques_du_secteur_cimentier.pdf
https://environnement.gov.ma/images/Normes/Eau/Normes_qualité__des

In [132]:
import os
import fitz  # PyMuPDF
import requests

# Download every scraped PDF into /content/pdfs and keep a short text excerpt
# of each file that PyMuPDF can read.
os.makedirs("/content/pdfs", exist_ok=True)
pdf_texts = []  # (filename, first 1000 characters of extracted text)

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115 Safari/537.36"
}

for url in pdf_links:
    filename = os.path.basename(url)
    path = os.path.join("/content/pdfs", filename)

    try:
        r = requests.get(url, headers=headers, timeout=10)

        # Only keep responses that succeeded and are declared as PDF.
        content_type = r.headers.get("Content-Type", "").lower()
        if r.status_code == 200 and "application/pdf" in content_type:
            with open(path, "wb") as f:
                f.write(r.content)

            # Try to read the file with PyMuPDF; unreadable files are deleted.
            try:
                # The context manager closes the document handle even when a
                # page fails to extract, so the file can be removed safely.
                with fitz.open(path) as doc:
                    text = "".join(page.get_text() for page in doc)
                pdf_texts.append((filename, text[:1000]))  # Store an excerpt only
            except Exception:
                print(f"🗑️ PDF corrompu (lecture impossible): {filename} -> supprimé")
                os.remove(path)
        else:
            print(f"❌ Mauvais type de contenu pour {filename} ({content_type})")
    except Exception as e:
        print(f"⚠️ Erreur pour {filename}: {e}")

# Show the names of the first two extracted documents.
for name, content in pdf_texts[:2]:
    print(f"\n📄 {name}\n")

# NOTE(review): files are written under /content/pdfs but these entries start
# with "/pdfs/" — confirm that consumers prepend "/content".
pdf_names = ["/pdfs/" + name for name, _ in pdf_texts]




📄 Valeurs_limites_sectorielles__céramique.pdf


📄 Normes_de_la_qualité_de_lair.pdf



# PDF Parsing

In [133]:
import fitz  # PyMuPDF
import pandas as pd
import json
import re
import os
import io
import traceback
import numpy as np
from PIL import Image
import pytesseract
from concurrent.futures import ThreadPoolExecutor
import logging
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass

# Configuration du logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class TableValidationConfig:
    """Thresholds used to accept or reject a table extracted from a PDF page."""
    # A table with more columns than this is rejected.
    max_columns: int = 12
    # A table with fewer rows than this is rejected.
    min_rows: int = 2
    # Maximum tolerated fraction of null cells (0.4 means 40%).
    max_null_percentage: float = 0.4
    # Minimum required fraction of cells holding non-blank content.
    min_content_ratio: float = 0.3

class RobustPDFParser:
    """
    Parser PDF de production robuste qui adresse les problèmes identifiés :
    - Stratégie multi-approche pour les différents types de documents
    - Gestion robuste des mises en page complexes
    - OCR intégré avec détection automatique
    - Validation stricte des tableaux pour éviter les hallucinations
    - Classification intelligente des documents
    """

    # Configurations pour la détection des polluants (enrichie)
    POLLUTANT_MAPPING = {
        'SO2': ['dioxyde de soufre', 'so2', 'sulphur dioxide', 'anhydride sulfureux'],
        'NOx': ['oxydes d\'azote', 'nox', 'nitrogen oxides', 'monoxyde d\'azote', 'dioxyde d\'azote'],
        'COV': ['composés organiques volatils', 'cov', 'volatile organic compounds', 'voc'],
        'PM10': ['particules pm10', 'pm10', 'particulate matter 10', 'poussières pm10'],
        'PM2.5': ['particules pm2.5', 'pm2.5', 'particules fines'],
        'Hg': ['mercure', 'mercury', 'hg'],
        'Pb': ['plomb', 'lead', 'pb'],
        'Cd': ['cadmium', 'cd'],
        'O3': ['ozone', 'o3'],
        'CO': ['monoxyde de carbone', 'co', 'carbon monoxide'],
        'Benzène': ['benzène', 'benzene', 'c6h6'],
        'H2S': ['sulfure d\'hydrogène', 'h2s', 'hydrogen sulfide'],
        'NH3': ['ammoniac', 'nh3', 'ammonia'],
        'Fluorures': ['fluorures', 'fluorides', 'hf'],
        'Chlorures': ['chlorures', 'chlorides', 'hcl']
    }

    def __init__(self, pdf_path: str, config: Optional[TableValidationConfig] = None):
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"Fichier introuvable: {pdf_path}")

        self.pdf_path = pdf_path
        self.filename = os.path.basename(pdf_path)
        self.doc = fitz.open(pdf_path)
        self.config = config or TableValidationConfig()

        # Analyse préliminaire du document
        self.doc_analysis = self._analyze_document_structure()
        self.doc_type = self._classify_document()
        self.metadata = self._extract_metadata()

    def _analyze_document_structure(self) -> Dict[str, Any]:
        """Analyse la structure générale du document"""
        analysis = {
            'total_pages': len(self.doc),
            'scanned_pages': 0,
            'text_pages': 0,
            'table_pages': 0,
            'mixed_pages': 0,
            'avg_text_density': 0,
            'has_images': False
        }

        text_densities = []

        for page_num in range(min(5, len(self.doc))):  # Analyse des 5 premières pages
            page = self.doc.load_page(page_num)

            # Analyse du texte
            text = page.get_text()
            text_density = len(text.strip()) / (page.rect.width * page.rect.height) * 10000
            text_densities.append(text_density)

            # Détection du type de page
            if self._is_scanned_page(page):
                analysis['scanned_pages'] += 1
            elif self._has_complex_tables(page):
                analysis['table_pages'] += 1
            elif text_density > 5:
                analysis['text_pages'] += 1
            else:
                analysis['mixed_pages'] += 1

            # Détection d'images
            if page.get_images():
                analysis['has_images'] = True

        analysis['avg_text_density'] = np.mean(text_densities) if text_densities else 0
        return analysis

    def _classify_document(self) -> str:
        """Classification intelligente basée sur le nom et le contenu"""
        filename_lower = self.filename.lower()

        # Classification par nom de fichier
        filename_patterns = [
            (r'valeurs?.*limites?.*générales?.*atmosphérique', 'vlg_atmospherique'),
            (r'valeurs?.*limites?.*générales?.*liquide', 'vlg_liquide'),
            (r'valeurs?.*limites?.*sectorielles?.*ciment', 'vls_ciment'),
            (r'valeurs?.*limites?.*sectorielles?.*céramique', 'vls_ceramique'),
            (r'valeurs?.*limites?.*sectorielles?', 'vls_autre'),
            (r'normes?.*qualité.*air', 'normes_air'),
            (r'normes?.*qualité.*eau', 'normes_eau'),
            (r'seuils?.*information.*alerte', 'seuils'),
            (r'décret|decret', 'decret'),
            (r'lettre.*royale', 'lettre_royale'),
            (r'irrigation', 'irrigation')
        ]

        for pattern, doc_type in filename_patterns:
            if re.search(pattern, filename_lower):
                return doc_type

        # Classification par contenu (analyse des premières pages)
        content_keywords = self._extract_content_keywords()
        if 'valeur' in content_keywords and 'limite' in content_keywords:
            if 'atmosphérique' in content_keywords or 'air' in content_keywords:
                return 'vlg_atmospherique'
            elif 'liquide' in content_keywords or 'eau' in content_keywords:
                return 'vlg_liquide'

        return 'autre'

    def _extract_content_keywords(self) -> List[str]:
        """Extrait les mots-clés des premières pages pour la classification"""
        keywords = []
        for page_num in range(min(3, len(self.doc))):
            page = self.doc.load_page(page_num)
            text = page.get_text().lower()

            # Extraction de mots-clés pertinents
            key_terms = [
                'valeur', 'limite', 'norme', 'seuil', 'décret',
                'atmosphérique', 'air', 'liquide', 'eau',
                'ciment', 'céramique', 'textile', 'sucre',
                'pollution', 'émission', 'rejet', 'qualité'
            ]

            for term in key_terms:
                if term in text:
                    keywords.append(term)

        return keywords

    def _extract_metadata(self) -> Dict[str, Any]:
        """Extraction enrichie des métadonnées"""
        meta = self.doc.metadata
        return {
            "title": meta.get("title", ""),
            "author": meta.get("author", ""),
            "creation_date": self._parse_pdf_date(meta.get("creationDate")),
            "modification_date": self._parse_pdf_date(meta.get("modDate")),
            "keywords": meta.get("keywords", ""),
            "pages": len(self.doc),
            "file_size": os.path.getsize(self.pdf_path),
            "analysis": self.doc_analysis
        }

    def _parse_pdf_date(self, date_str: Optional[str]) -> str:
        """Convertit les dates PDF en format ISO"""
        if not date_str:
            return ""
        try:
            if date_str.startswith("D:") and len(date_str) >= 10:
                return f"{date_str[2:6]}-{date_str[6:8]}-{date_str[8:10]}"
        except:
            pass
        return date_str

    def _is_scanned_page(self, page) -> bool:
        """Détection améliorée des pages scannées"""
        # 1. Densité de texte très faible
        text = page.get_text()
        if len(text.strip()) > 500:  # Seuil plus élevé
            return False

        # 2. Présence d'images de grande taille
        images = page.get_images()
        if not images:
            return False

        for img in images:
            # Vérification des dimensions et de la résolution
            if img[2] > 400 and img[3] > 400:  # Largeur et hauteur minimales
                return True

        # 3. Rapport texte/surface très faible
        text_density = len(text.strip()) / (page.rect.width * page.rect.height) * 10000
        return text_density < 2

    def _has_complex_tables(self, page) -> bool:
        """Détecte si une page contient des tableaux complexes"""
        try:
            tables = page.find_tables()
            if not tables:
                return False

            # Vérification de la complexité des tableaux
            for table in tables:
                df = table.to_pandas()
                if len(df.columns) > 3 and len(df) > 3:
                    return True
            return False
        except:
            return False

    def _ocr_page(self, page) -> str:
        """OCR robuste avec gestion d'erreurs"""
        try:
            # Extraction avec haute résolution
            pix = page.get_pixmap(dpi=300)
            img_data = pix.tobytes()

            if not img_data:
                return ""

            img = Image.open(io.BytesIO(img_data))

            # Configuration OCR optimisée
            custom_config = r'--oem 3 --psm 6'

            # Essai avec différentes langues
            languages = ['fra', 'ara+fra', 'eng']

            for lang in languages:
                try:
                    text = pytesseract.image_to_string(
                        img,
                        lang=lang,
                        config=custom_config
                    )
                    if text.strip() and len(text.strip()) > 20:
                        return self._clean_text(text)
                except:
                    continue

            # Dernier recours sans spécification de langue
            text = pytesseract.image_to_string(img, config=custom_config)
            return self._clean_text(text)

        except Exception as e:
            logger.warning(f"Échec OCR page: {str(e)}")
            return ""

    def _clean_text(self, text: str) -> str:
        """Nettoyage approfondi du texte"""
        if not text:
            return ""

        # Remplacement des caractères problématiques
        replacements = {
            '\xad': '',      # Soft hyphen
            '\uf0b7': '•',   # Bullet
            'ﬁ': 'fi',
            'ﬂ': 'fl',
            '\u200b': '',    # Zero-width space
            '\u202f': ' ',   # Narrow no-break space
            '\ufeff': '',    # BOM
        }

        for char, replacement in replacements.items():
            text = text.replace(char, replacement)

        # Normalisation des espaces
        text = re.sub(r'\s+', ' ', text).strip()

        # Suppression des lignes très courtes (souvent du bruit OCR)
        lines = text.split('\n')
        cleaned_lines = [line for line in lines if len(line.strip()) > 2]

        return '\n'.join(cleaned_lines)

    def _extract_tables_robust(self, page) -> List[Dict[str, Any]]:
        """Extraction robuste des tableaux avec validation stricte"""
        tables = []

        try:
            # Tentative d'extraction des tableaux
            found_tables = page.find_tables(
                strategy="lines_strict",  # Plus strict pour éviter les hallucinations
                snap_tolerance=3.0
            )

            if not found_tables:
                # Tentative avec une stratégie alternative
                found_tables = page.find_tables(strategy="explicit")

            for i, table in enumerate(found_tables):
                try:
                    df = table.to_pandas()

                    # Validation stricte du tableau
                    if not self._validate_table_strict(df, page):
                        logger.debug(f"Tableau {i} rejeté - validation échoué")
                        continue

                    # Nettoyage et traitement des cellules fusionnées
                    df_cleaned = self._process_merged_cells(df)

                    # Structure finale du tableau
                    table_data = {
                        "table_id": i,
                        "bbox": table.bbox,
                        "header": df_cleaned.columns.tolist(),
                        "rows": df_cleaned.fillna("").values.tolist(),
                        "shape": df_cleaned.shape,
                        "confidence": self._calculate_table_confidence(df_cleaned)
                    }

                    tables.append(table_data)

                except Exception as e:
                    logger.warning(f"Erreur traitement tableau {i}: {str(e)}")
                    continue

        except Exception as e:
            logger.warning(f"Erreur extraction tableaux: {str(e)}")

        return tables

    def _validate_table_strict(self, df: pd.DataFrame, page) -> bool:
        """Validation stricte pour éviter les hallucinations de tableaux"""
        # 1. Vérification des dimensions
        if len(df.columns) > self.config.max_columns:
            logger.debug(f"Trop de colonnes: {len(df.columns)}")
            return False

        if len(df) < self.config.min_rows:
            logger.debug(f"Pas assez de lignes: {len(df)}")
            return False

        # 2. Vérification du taux de cellules vides
        null_ratio = df.isnull().sum().sum() / df.size
        if null_ratio > self.config.max_null_percentage:
            logger.debug(f"Trop de cellules vides: {null_ratio:.2%}")
            return False

        # 3. Vérification du contenu significatif
        text_cells = 0
        total_cells = df.size

        for col in df.columns:
            for value in df[col]:
                if pd.notna(value) and str(value).strip():
                    text_cells += 1

        content_ratio = text_cells / total_cells
        if content_ratio < self.config.min_content_ratio:
            logger.debug(f"Ratio de contenu trop faible: {content_ratio:.2%}")
            return False

        # 4. Vérification de la cohérence des colonnes
        if self._has_incoherent_columns(df):
            logger.debug("Colonnes incohérentes détectées")
            return False

        return True

    def _has_incoherent_columns(self, df: pd.DataFrame) -> bool:
        """Détecte les colonnes incohérentes (signe d'hallucination)"""
        for col in df.columns:
            # Vérification si une colonne contient principalement des fragments
            values = df[col].dropna().astype(str)
            if len(values) > 0:
                # Si plus de 80% des valeurs font moins de 3 caractères, c'est suspect
                short_values = sum(1 for v in values if len(v.strip()) < 3)
                if short_values / len(values) > 0.8:
                    return True

        return False

    def _process_merged_cells(self, df: pd.DataFrame) -> pd.DataFrame:
        """Traitement intelligent des cellules fusionnées"""
        df_copy = df.copy()

        # Forward fill pour les colonnes d'en-tête (première colonne généralement)
        if len(df_copy.columns) > 0:
            df_copy.iloc[:, 0] = df_copy.iloc[:, 0].ffill()

        # Traitement spécifique pour les tableaux de normes
        if self._is_standards_table(df_copy):
            df_copy = self._process_standards_table(df_copy)

        return df_copy

    def _is_standards_table(self, df: pd.DataFrame) -> bool:
        """Détecte si c'est un tableau de normes/valeurs limites"""
        # Recherche de mots-clés typiques
        keywords = ['polluant', 'limite', 'valeur', 'unité', 'µg/m³', 'mg/l']
        text_content = ' '.join([str(col) for col in df.columns]).lower()

        return any(keyword in text_content for keyword in keywords)

    def _process_standards_table(self, df: pd.DataFrame) -> pd.DataFrame:
        """Traitement spécialisé pour les tableaux de normes"""
        # Logic spécifique pour les tableaux de valeurs limites
        df_processed = df.copy()

        # Propagation des valeurs dans les cellules fusionnées
        for col_idx in range(len(df_processed.columns)):
            df_processed.iloc[:, col_idx] = df_processed.iloc[:, col_idx].ffill()

        return df_processed

    def _calculate_table_confidence(self, df: pd.DataFrame) -> float:
        """Calcule un score de confiance pour le tableau"""
        score = 1.0

        # Pénalité pour les cellules vides
        null_ratio = df.isnull().sum().sum() / df.size
        score -= null_ratio * 0.5

        # Bonus pour la cohérence des types de données
        for col in df.columns:
            values = df[col].dropna()
            if len(values) > 0:
                # Vérification de la cohérence des types
                numeric_count = sum(1 for v in values if str(v).replace('.', '').replace(',', '').isdigit())
                if numeric_count / len(values) > 0.7:  # Colonne majoritairement numérique
                    score += 0.1

        return max(0.0, min(1.0, score))

    def _extract_structured_text(self, page) -> Dict[str, Any]:
        """Extraction de texte structuré sans bruit excessif"""
        # Utilisation d'une approche par blocs plutôt que par span
        blocks = page.get_text("blocks")

        structured_content = {
            "title_text": "",
            "body_text": "",
            "headers": [],
            "paragraphs": []
        }

        for block in blocks:
            if len(block) >= 5:  # Structure valide
                text = block[4].strip()
                if not text:
                    continue

                # Classification basique du contenu
                if self._is_title_text(text, block):
                    structured_content["headers"].append(text)
                    if not structured_content["title_text"]:
                        structured_content["title_text"] = text
                elif len(text) > 50:  # Paragraphe substantiel
                    structured_content["paragraphs"].append(text)
                    structured_content["body_text"] += text + "\n"

        return structured_content

    def _is_title_text(self, text: str, block: tuple) -> bool:
        """Détecte si un texte est un titre"""
        # Heuristiques simples pour détecter les titres
        if len(text) > 100:  # Trop long pour être un titre
            return False

        if text.isupper() or text.istitle():
            return True

        # Détection basée sur des mots-clés
        title_keywords = ['article', 'chapitre', 'section', 'annexe', 'tableau']
        return any(keyword in text.lower() for keyword in title_keywords)

    def _detect_pollutants_enhanced(self, text: str) -> Dict[str, Dict[str, Any]]:
        """Détection améliorée des polluants avec contexte"""
        found_pollutants = {}
        text_lower = text.lower()

        for code, names in self.POLLUTANT_MAPPING.items():
            for name in names:
                if name in text_lower:
                    # Extraction du contexte autour du polluant
                    context = self._extract_context(text_lower, name)

                    found_pollutants[code] = {
                        "name": name,
                        "matched_term": name,
                        "context": context,
                        "has_limit_value": self._has_associated_limit(context)
                    }
                    break  # Un seul match par polluant

        return found_pollutants

    def _extract_context(self, text: str, term: str, window: int = 100) -> str:
        """Extrait le contexte autour d'un terme"""
        pos = text.find(term)
        if pos == -1:
            return ""

        start = max(0, pos - window)
        end = min(len(text), pos + len(term) + window)

        return text[start:end].strip()

    def _has_associated_limit(self, context: str) -> bool:
        """Vérifie si le contexte contient une valeur limite"""
        # Recherche de patterns numériques avec unités
        limit_pattern = r'\d+[\d\s,.]*\s*(µg/m³|mg/m³|mg/l|µg/l|ppm|ppb)'
        return bool(re.search(limit_pattern, context))

    def _extract_limit_values_enhanced(self, text: str) -> List[Dict[str, Any]]:
        """Extraction améliorée des valeurs limites"""
        # Pattern plus sophistiqué pour les valeurs limites
        patterns = [
            r'(\d+(?:[,.\s]\d+)*)\s*(µg/m³|mg/m³|mg/l|µg/l|ng/m³|ppm|ppb|°C|%)',
            r'(\d+(?:[,.\s]\d+)*)\s*(microgrammes?|milligrammes?|nanogrammes?)',
            r'(\d+(?:[,.\s]\d+)*)\s*(?:µg|mg|ng|ppm|ppb)'
        ]

        found_values = []

        for pattern in patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            for match in matches:
                value_str = match[0].replace(' ', '').replace(',', '.')
                unit = match[1] if len(match) > 1 else 'unité_non_spécifiée'

                try:
                    numeric_value = float(value_str)
                    found_values.append({
                        "raw_value": match[0],
                        "numeric_value": numeric_value,
                        "unit": unit,
                        "context": self._extract_context(text, match[0])
                    })
                except ValueError:
                    continue

        return found_values

    def _process_page_with_strategy(self, page_num: int) -> Dict[str, Any]:
        """Traite une page avec la stratégie appropriée"""
        page = self.doc.load_page(page_num)

        page_data = {
            "page_number": page_num + 1,
            "dimensions": {"width": page.rect.width, "height": page.rect.height},
            "strategy_used": "",
            "content": {},
            "confidence": 0.0
        }

        # Stratégie 1: Page scannée -> OCR
        if self._is_scanned_page(page):
            page_data["strategy_used"] = "ocr"
            ocr_text = self._ocr_page(page)

            if ocr_text:
                page_data["content"] = {
                    "text": ocr_text,
                    "pollutants": self._detect_pollutants_enhanced(ocr_text),
                    "limit_values": self._extract_limit_values_enhanced(ocr_text)
                }
                page_data["confidence"] = 0.6  # OCR moins fiable
            else:
                page_data["content"] = {"error": "Échec OCR"}
                page_data["confidence"] = 0.0

        # Stratégie 2: Page avec tableaux complexes
        elif self._has_complex_tables(page):
            page_data["strategy_used"] = "table_extraction"

            tables = self._extract_tables_robust(page)
            text_content = self._extract_structured_text(page)

            full_text = text_content["body_text"]

            page_data["content"] = {
                "tables": tables,
                "text_structure": text_content,
                "pollutants": self._detect_pollutants_enhanced(full_text),
                "limit_values": self._extract_limit_values_enhanced(full_text)
            }

            # Calcul de confiance basé sur les tableaux
            if tables:
                avg_confidence = np.mean([t["confidence"] for t in tables])
                page_data["confidence"] = avg_confidence
            else:
                page_data["confidence"] = 0.3

        # Stratégie 3: Page textuelle standard
        else:
            page_data["strategy_used"] = "text_extraction"
            text_content = self._extract_structured_text(page)
            full_text = text_content["body_text"]

            if full_text.strip():
                page_data["content"] = {
                    "text_structure": text_content,
                    "pollutants": self._detect_pollutants_enhanced(full_text),
                    "limit_values": self._extract_limit_values_enhanced(full_text)
                }
                page_data["confidence"] = 0.9
            else:
                page_data["content"] = {"error": "Aucun contenu textuel significatif"}
                page_data["confidence"] = 0.1

        return page_data

    def parse(self) -> Dict[str, Any]:
        """Analyse complète du document avec stratégies adaptatives"""
        logger.info(f"Début analyse: {self.filename}")

        doc_data = {
            "metadata": self.metadata,
            "document_type": self.doc_type,
            "filename": self.filename,
            "analysis_summary": {
                "total_pages": len(self.doc),
                "strategies_used": {},
                "avg_confidence": 0.0,
                "processing_errors": 0
            },
            "pages": []
        }

        # Traitement des pages
        confidences = []
        strategies = {}
        errors = 0

        for page_num in range(len(self.doc)):
            try:
                page_data = self._process_page_with_strategy(page_num)
                doc_data["pages"].append(page_data)

                # Collecte des statistiques
                confidences.append(page_data["confidence"])
                strategy = page_data["strategy_used"]
                strategies[strategy] = strategies.get(strategy, 0) + 1

            except Exception as e:
                logger.error(f"Erreur page {page_num + 1}: {str(e)}")
                doc_data["pages"].append({
                    "page_number": page_num + 1,
                    "error": str(e),
                    "strategy_used": "error",
                    "confidence": 0.0
                })
                errors += 1

        # Résumé de l'analyse
        doc_data["analysis_summary"].update({
            "strategies_used": strategies,
            "avg_confidence": np.mean(confidences) if confidences else 0.0,
            "processing_errors": errors
        })

        # Analyse globale des polluants
        doc_data["global_analysis"] = self._analyze_document_globally(doc_data)

        logger.info(f"Analyse terminée - Confiance: {doc_data['analysis_summary']['avg_confidence']:.2f}")

        return doc_data

    def _analyze_document_globally(self, doc_data: Dict[str, Any]) -> Dict[str, Any]:
        """Analyse globale du document"""
        global_pollutants = {}
        global_limits = []

        for page in doc_data["pages"]:
            if "content" in page and isinstance(page["content"], dict):
                # Agrégation des polluants
                if "pollutants" in page["content"]:
                    for code, info in page["content"]["pollutants"].items():
                        if code not in global_pollutants:
                            global_pollutants[code] = {
                                "name": info["name"],
                                "pages": [page["page_number"]],
                                "contexts": [info.get("context", "")],
                                "has_limits": info.get("has_limit_value", False)
                            }
                        else:
                            global_pollutants[code]["pages"].append(page["page_number"])
                            global_pollutants[code]["contexts"].append(info.get("context", ""))

                # Agrégation des valeurs limites
                if "limit_values" in page["content"]:
                    for limit in page["content"]["limit_values"]:
                        limit["page"] = page["page_number"]
                        global_limits.append(limit)

        return {
            "pollutants_summary": global_pollutants,
            "limit_values_summary": global_limits,
            "document_quality": self._assess_document_quality(doc_data),
            "extraction_recommendations": self._generate_recommendations(doc_data)
        }

    def _assess_document_quality(self, doc_data: Dict[str, Any]) -> Dict[str, Any]:
        """Évalue la qualité de l'extraction"""
        total_pages = len(doc_data["pages"])
        successful_pages = sum(1 for p in doc_data["pages"] if p.get("confidence", 0) > 0.5)

        quality_score = successful_pages / total_pages if total_pages > 0 else 0

        return {
            "overall_score": quality_score,
            "successful_pages": successful_pages,
            "total_pages": total_pages,
            "quality_level": "high" if quality_score > 0.8 else "medium" if quality_score > 0.5 else "low"
        }

    def _generate_recommendations(self, doc_data: Dict[str, Any]) -> List[str]:
        """Génère des recommandations pour améliorer l'extraction"""
        recommendations = []

        # Analyse des stratégies utilisées
        strategies = doc_data["analysis_summary"]["strategies_used"]
        errors = doc_data["analysis_summary"]["processing_errors"]

        if strategies.get("ocr", 0) > strategies.get("text_extraction", 0):
            recommendations.append("Document principalement scanné - considérer une version native si disponible")

        if errors > 0:
            recommendations.append(f"{errors} pages ont échoué - vérifier la qualité du PDF")

        if doc_data["analysis_summary"]["avg_confidence"] < 0.6:
            recommendations.append("Confiance faible - révision manuelle recommandée")

        return recommendations


def process_pdf_batch(pdf_files: List[str], config: Optional[TableValidationConfig] = None) -> Dict[str, Any]:
    """Parse a batch of PDF files and return per-file results plus batch statistics.

    Every path is parsed with ``RobustPDFParser``. A missing file, or any
    exception raised while parsing or while reading the report fields, is
    recorded as an ``{"error": ...}`` entry and counted as failed; the batch
    always runs to completion.

    Args:
        pdf_files: Paths of the PDF files to process.
        config: Optional table-validation settings forwarded to the parser.

    Returns:
        ``{"results": {path: parse result or error dict}, "summary": {...}}``.
        The summary holds ``total_files``, ``successful``, ``failed``,
        ``avg_confidence`` (mean over successful files, 0.0 when none
        succeeded) and ``processing_time`` in seconds.
    """
    import time

    results = {}
    processing_summary = {
        "total_files": len(pdf_files),
        "successful": 0,
        "failed": 0,
        "avg_confidence": 0.0,
        "processing_time": 0.0
    }

    start_time = time.time()

    for pdf_path in pdf_files:
        try:
            logger.info(f"Traitement: {pdf_path}")

            # Check that the file exists before handing it to the parser
            if not os.path.exists(pdf_path):
                logger.error(f"Fichier introuvable: {pdf_path}")
                results[pdf_path] = {"error": "Fichier introuvable"}
                processing_summary["failed"] += 1
                continue

            # Parse the PDF
            result = RobustPDFParser(pdf_path, config).parse()

            # Read every field needed for the statistics and the log report
            # BEFORE touching the counters: a malformed result must count as
            # one failure, not as both a success and a failure.
            analysis = result["analysis_summary"]
            confidence = float(analysis["avg_confidence"])
            strategies = analysis["strategies_used"]
            doc_type = result["document_type"]
            page_count = result["metadata"]["pages"]
            global_analysis = result["global_analysis"]
            pollutants_count = len(global_analysis["pollutants_summary"])
            recommendations = global_analysis["extraction_recommendations"]

        except Exception as e:
            logger.error(f"✗ Erreur critique: {pdf_path} - {str(e)}")
            results[pdf_path] = {"error": str(e), "traceback": traceback.format_exc()}
            processing_summary["failed"] += 1

        else:
            # Only reached when parsing and field extraction both succeeded
            # (the `continue` above skips this clause as well).
            results[pdf_path] = result
            processing_summary["successful"] += 1
            processing_summary["avg_confidence"] += confidence

            logger.info(f"✓ Succès: {page_count} pages")
            logger.info(f"  Type: {doc_type}")
            logger.info(f"  Stratégies: {strategies}")
            logger.info(f"  Confiance: {confidence:.2f}")
            logger.info(f"  Polluants: {pollutants_count}")

            if recommendations:
                logger.warning("  Recommandations:")
                for rec in recommendations:
                    logger.warning(f"    - {rec}")

    # Finalize the statistics: turn the confidence sum into a mean
    processing_summary["processing_time"] = time.time() - start_time
    if processing_summary["successful"] > 0:
        processing_summary["avg_confidence"] /= processing_summary["successful"]

    return {
        "results": results,
        "summary": processing_summary
    }


def validate_parsing_setup():
    """Check that the parsing environment (Tesseract, PyMuPDF, pandas) is usable.

    Returns:
        A dict with:
        - ``tesseract_available``: True when ``pytesseract`` could query the
          Tesseract version.
        - ``tesseract_languages``: languages reported by Tesseract; ``["eng"]``
          when Tesseract is present but the language query fails; ``[]`` when
          Tesseract is unavailable.
        - ``pymupdf_version`` / ``pandas_version``: version strings, or None
          when the corresponding module or attribute is not available.
        - ``recommendations``: install hints for whatever is missing.
    """
    validation_results = {
        "tesseract_available": False,
        "tesseract_languages": [],
        "pymupdf_version": None,
        "pandas_version": None,
        "recommendations": []
    }

    # Tesseract check
    try:
        pytesseract.get_tesseract_version()
    except Exception:
        validation_results["recommendations"].append(
            "Tesseract non trouvé - installer avec: apt-get install tesseract-ocr tesseract-ocr-fra"
        )
    else:
        validation_results["tesseract_available"] = True

        # Query the installed language packs; fall back to English only
        try:
            validation_results["tesseract_languages"] = pytesseract.get_languages(config='')
        except Exception:
            validation_results["tesseract_languages"] = ["eng"]

    # Dependency versions are probed independently so that a problem with one
    # library does not hide the version of the other. A missing module or
    # attribute simply leaves the entry at None.
    try:
        validation_results["pymupdf_version"] = fitz.__version__
    except (NameError, AttributeError):
        pass

    try:
        validation_results["pandas_version"] = pd.__version__
    except (NameError, AttributeError):
        pass

    # Additional recommendation: French language pack
    if "fra" not in validation_results["tesseract_languages"]:
        validation_results["recommendations"].append(
            "Pack français non trouvé - installer avec: apt-get install tesseract-ocr-fra"
        )

    return validation_results


# Example configurations for the different document types
class DocumentConfigs:
    """Predefined table-validation presets, keyed by document type."""

    @staticmethod
    def get_config(doc_type: str) -> TableValidationConfig:
        """Return the preset for ``doc_type``, or the default preset if unknown.

        Args:
            doc_type: One of "vlg_atmospherique", "vlg_liquide", "normes_air",
                "vls_ciment" or "default"; any other value selects "default".

        Returns:
            The ``TableValidationConfig`` built for the selected preset.
        """
        preset_params = {
            "vlg_atmospherique": dict(
                max_columns=8, min_rows=3, max_null_percentage=0.3, min_content_ratio=0.4
            ),
            "vlg_liquide": dict(
                max_columns=10, min_rows=5, max_null_percentage=0.2, min_content_ratio=0.5
            ),
            "normes_air": dict(
                max_columns=6, min_rows=4, max_null_percentage=0.1, min_content_ratio=0.6
            ),
            "vls_ciment": dict(
                max_columns=12, min_rows=3, max_null_percentage=0.4, min_content_ratio=0.3
            ),
            # No overrides: constructor defaults
            "default": dict(),
        }
        presets = {
            name: TableValidationConfig(**params)
            for name, params in preset_params.items()
        }

        if doc_type in presets:
            return presets[doc_type]
        return presets["default"]


if __name__ == '__main__':
    # --- Environment validation ---
    print("\n".join(["=" * 60, "VALIDATION DE L'ENVIRONNEMENT", "=" * 60]))

    validation = validate_parsing_setup()
    print("\n".join([
        f"Tesseract disponible: {validation['tesseract_available']}",
        f"Langues OCR: {validation['tesseract_languages']}",
        f"PyMuPDF version: {validation['pymupdf_version']}",
    ]))

    if validation["recommendations"]:
        print("\nRecommandations:")
    for rec in validation["recommendations"]:
        print(f"  - {rec}")

    # Tesseract binary location (adjust to the local installation)
    pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

    # Files to process: every name in `pdf_names`, prefixed with "/content"
    pdf_files = [f"/content{file}" for file in pdf_names]

    # --- Batch processing with a custom configuration ---
    print("\n" + "\n".join(["=" * 60, "DÉBUT DU TRAITEMENT DES DOCUMENTS PDF", "=" * 60]))

    # Global configuration (can be tuned per document)
    default_config = TableValidationConfig(
        **{
            "max_columns": 10,
            "min_rows": 2,
            "max_null_percentage": 0.4,
            "min_content_ratio": 0.3,
        }
    )

    batch_results = process_pdf_batch(pdf_files, default_config)

    # Persist the results; non-JSON values are stringified via default=str
    output_file = "robust_parsing_results.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(batch_results, f, indent=2, ensure_ascii=False, default=str)

    # --- Final report ---
    summary = batch_results["summary"]
    print("\n" + "\n".join([
        "=" * 60,
        "RAPPORT FINAL",
        "=" * 60,
        f"Fichiers traités: {summary['total_files']}",
        f"Succès: {summary['successful']}",
        f"Échecs: {summary['failed']}",
        f"Confiance moyenne: {summary['avg_confidence']:.2f}",
        f"Temps de traitement: {summary['processing_time']:.1f}s",
        f"Résultats sauvegardés: {output_file}",
        "=" * 60,
    ]))

    # --- Quality report: bucket each successfully parsed file by quality level ---
    quality_report = {
        "high_quality_docs": [],
        "medium_quality_docs": [],
        "low_quality_docs": []
    }

    for file_path, result in batch_results["results"].items():
        if "error" in result:
            continue
        quality = result["global_analysis"]["document_quality"]["quality_level"]
        filename = os.path.basename(file_path)

        # Anything that is neither "high" nor "medium" lands in the low bucket
        if quality in ("high", "medium"):
            quality_report[quality + "_quality_docs"].append(filename)
        else:
            quality_report["low_quality_docs"].append(filename)

    print("\n".join([
        "\nRAPPORT DE QUALITÉ:",
        f"Documents haute qualité ({len(quality_report['high_quality_docs'])}): {quality_report['high_quality_docs']}",
        f"Documents qualité moyenne ({len(quality_report['medium_quality_docs'])}): {quality_report['medium_quality_docs']}",
        f"Documents basse qualité ({len(quality_report['low_quality_docs'])}): {quality_report['low_quality_docs']}",
    ]))

VALIDATION DE L'ENVIRONNEMENT
Tesseract disponible: True
Langues OCR: ['eng', 'osd']
PyMuPDF version: 1.26.3

Recommandations:
  - Pack français non trouvé - installer avec: apt-get install tesseract-ocr-fra

DÉBUT DU TRAITEMENT DES DOCUMENTS PDF





RAPPORT FINAL
Fichiers traités: 17
Succès: 17
Échecs: 0
Confiance moyenne: 0.74
Temps de traitement: 39.4s
Résultats sauvegardés: robust_parsing_results.json

RAPPORT DE QUALITÉ:
Documents haute qualité (13): ['Valeurs_limites_sectorielles__céramique.pdf', 'Normes_de_la_qualité_de_lair.pdf', 'VLG_2018_des_rejets_industriels__liquides.pdf', 'valeurs_limites_spécifiques_du_secteur_cimentier.pdf', 'VLS_du_secteur_peinture_et_vernis.pdf', 'VLS__du_secteur_céramique.pdf', 'LETTRE_ROYALE.pdf', 'VLS_des_rejets_domestiques.pdf', 'decretCNE.pdf', 'VLS_du_secteur_cimentier.pdf', 'Seuils_dinformation_et_seuils_dalerte.pdf', 'VLS_du_secteur_textile.pdf', 'VLS_du__papier_cartons.pdf']
Documents qualité moyenne (2): ['Valeurs_limites_générales_des_rejets_atmosphériques.pdf', 'Normes_qualité__des_eaux_usées_épurées_detinees_a_l_irrigation.pdf']
Documents basse qualité (2): ['VLS__de_traitement_du_surface_1.pdf', 'VLS_de_lindustrie_du_sucre.pdf']


# LLM Processing

In [134]:
import os
from getpass import getpass

import google.generativeai as genai

# SECURITY: never hard-code the API key in the notebook. Any key that was
# committed to version control must be treated as leaked and revoked.
# The key is read from the GOOGLE_API_KEY environment variable; when that is
# not set, the user is prompted for it (input is not echoed).
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=GOOGLE_API_KEY)

In [135]:
import os
from reportlab.lib import colors
from reportlab.lib.pagesizes import landscape ,A3 , A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image ,Flowable
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch, cm , mm
from reportlab.pdfbase.pdfmetrics import stringWidth
from reportlab.lib.enums import TA_LEFT, TA_CENTER
from reportlab.platypus import PageBreak


In [136]:
import pandas as pd
import json
import google.generativeai as genai
import re
import time
from tqdm import tqdm

# --- 1. SETUP: Configure the Gemini API and DataFrame Structure ---

# GOOGLE_API_KEY must already be defined by the API-key cell above.
try:
    genai.configure(api_key=GOOGLE_API_KEY)
    print("Gemini API configured successfully.")
except Exception as e:
    print(f"Error configuring Gemini API. Please check your API key. Error: {e}")
    # `exit()` is the interactive-shell helper and reports success (status 0);
    # raise SystemExit explicitly with a failing status instead.
    raise SystemExit(1) from e

# Final 3-level hierarchical column structure of the commitment register.
# Each entry is (section, column, sub-column); the sub-column is '' for
# columns that are not subdivided.
header = (
    [('Commitment Register Overview', title, '') for title in (
        'Register Identifier',
        'Commitment Identifier',
        'Commitment or Obligation',
        'Description',
        'Project Phase',
    )]
    + [('Commitment Management', title, '') for title in (
        'Potential Impact on Scope?',
        'Status',
        'Commitment Deadline',
        'First Lead',
        'Second Lead',
        'Third Lead',
        'Primary Commitment Documentation',
        'Impact or Hazard Addressed',
        'Approving Agencies',
        'Other Stakeholders',
    )]
    + [('Commitment Management', 'Affected Areas or Processes', area) for area in (
        'Preparation/construction',
        'Operation',
        'Input Management',
        'Discharge management',
        'Off-Sites',
        'Other',
        'Fungibility',
    )]
    + [('Commitment Management', 'Impact', kind) for kind in (
        'CAPEX',
        'OPEX',
        'Health & Safety',
        'Social',
        'Economic',
        'Environmental',
        'Regulatory',
    )]
    + [('Commitment Management', title, '') for title in (
        'Comments',
        'Requires Change Order?',
    )]
)
columns = pd.MultiIndex.from_tuples(header)


# --- 2. DATA PREPARATION: Functions to load and search the knowledge base ---

def load_and_prepare_knowledge_base(json_file_path):
    """Load the parsing results file and flatten it into a searchable knowledge base.

    For every document under the top-level ``"results"`` key, the title and
    body text of each page that has a ``content.text_structure`` entry are
    concatenated (each followed by a newline) and lower-cased. Documents that
    yield no text are left out.

    Args:
        json_file_path: Path of the JSON file to read.

    Returns:
        A list of dicts with ``filename``, ``document_type`` (both defaulting
        to 'N/A') and ``content``; or None when the file does not exist.
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
    except FileNotFoundError:
        print(f"Error: The file {json_file_path} was not found.")
        return None

    documents = []
    for doc_record in parsed['results'].values():
        pieces = []
        for page in doc_record.get('pages', []):
            if 'content' not in page or 'text_structure' not in page['content']:
                continue
            structure = page['content']['text_structure']
            pieces.append(structure.get('title_text', '') + "\n")
            pieces.append(structure.get('body_text', '') + "\n")

        text = "".join(pieces)
        if not text:
            continue
        documents.append({
            "filename": doc_record.get('filename', 'N/A'),
            "document_type": doc_record.get('document_type', 'N/A'),
            "content": text.lower()
        })

    print(f"Knowledge base created with {len(documents)} documents.")
    return documents

def find_relevant_documents(commitment_description, knowledge_base):
    """Return the text of every knowledge-base document that shares a keyword with the description.

    Keywords are the distinct runs of at least four letters/digits in the
    lower-cased description. A document is selected when any keyword occurs as
    a substring of its lower-cased filename or of its content.

    Args:
        commitment_description: Free-text description of the commitment.
        knowledge_base: List of dicts with ``filename`` and ``content`` keys.

    Returns:
        The selected documents, each wrapped in START/END marker lines and
        joined with newlines; an empty string when nothing matches.
    """
    needle_text = commitment_description.lower()
    keywords = set(re.findall(r'\b[a-zA-Zçéàèùâêîôûæœ\d]{4,}\b', needle_text))

    def _matches(doc):
        name = doc['filename'].lower()
        return any(word in name or word in doc['content'] for word in keywords)

    return "\n".join(
        f"--- START OF RELEVANT DOCUMENT ({doc['filename']}) ---\n{doc['content']}\n--- END OF DOCUMENT ---\n"
        for doc in knowledge_base
        if _matches(doc)
    )


# --- 3. CORE LOGIC: Updated function to call Gemini with all context ---

def _strip_code_fence(raw_text):
    """Remove an optional Markdown code fence (``` or ```json) wrapped around a model reply."""
    text = raw_text.strip()
    if text.startswith("```"):
        text = text[3:]
        # Drop the optional language tag as a prefix (str.lstrip would strip a character set).
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def call_gemini_to_complete_row(commitment_row, relevant_texts, project_description):
    """Ask Gemini to fill in the analysis fields of one commitment-register row.

    Builds a prompt from the project description, the commitment's identifier
    and description, and the relevant legal texts, sends it to the
    'gemini-2.5-flash' model and parses the reply as JSON.

    Args:
        commitment_row: Row (or mapping) indexed by the 3-level column tuples;
            the 'Description' and 'Commitment Identifier' columns are read.
        relevant_texts: Legal evidence text inserted into the prompt.
        project_description: Project context text inserted into the prompt.

    Returns:
        The decoded JSON object as a dict, or None when the API call fails or
        the reply is not a JSON object. Failures are reported via ``tqdm.write``.
    """
    model = genai.GenerativeModel('gemini-2.5-flash')

    commitment_desc = commitment_row[('Commitment Register Overview', 'Description', '')]
    commitment_id = commitment_row[('Commitment Register Overview', 'Commitment Identifier', '')]

    prompt = f"""
    You are an expert Moroccan environmental and project management compliance analyst.
    Your task is to complete a row in a project's Commitment Register by synthesizing information from three sources: the project's description, the specific commitment, and relevant legal documents.

    **SOURCE 1: PROJECT DESCRIPTION**
    {project_description}

    **SOURCE 2: COMMITMENT CONTEXT**
    - Commitment ID: "{commitment_id}"
    - Commitment Description: "{commitment_desc}"

    **SOURCE 3: LEGAL EVIDENCE (Relevant Moroccan regulations and laws)**
    {relevant_texts}

    **TASK:**
    Based on ALL THREE sources provided, analyze how the commitment relates to the project phase, its objectives, and the legal requirements. Then, fill in the following fields. If information is not available, return an empty string "". Be concise and accurate. Output ONLY a valid JSON object, with no other text or markdown.

    **OUTPUT FORMAT (JSON ONLY):**
    {{
      "Impact or Hazard Addressed": "Identify the specific risk or hazard. Example: 'Risk of air pollution from emissions exceeding legal limits during the operational phase.'",
      "Approving Agencies": "List the relevant government bodies mentioned. Example: 'Ministry of Energy Transition, Authorities coordinated by the Customer'",
      "Comments": "Provide a brief analysis connecting the commitment to the law and project phase. Mention specific limit values if found. Example: 'As the project is in the FEED phase, this commitment ensures compliance with Law 13-03 is designed in from the start. VLG for SO2 is 500 mg/m3.'",
      "Affected_Preparation_Construction": "Enter 'x' if relevant, otherwise ''",
      "Affected_Operation": "Enter 'x' if relevant, otherwise ''",
      "Affected_Discharge_Management": "Enter 'x' if relevant, otherwise ''",
      "Impact_Health_Safety": "Enter 'x' if relevant, otherwise ''",
      "Impact_Environmental": "Enter 'x' if relevant, otherwise ''",
      "Impact_Regulatory": "Enter 'x' if relevant, otherwise ''"
    }}
    """

    try:
        response = model.generate_content(prompt)
        raw_text = response.text
        parsed = json.loads(_strip_code_fence(raw_text))
    except json.JSONDecodeError:
        # raw_text is always bound here: json.loads is the only call that can raise this.
        tqdm.write(f"Error: Gemini returned a non-JSON response for '{commitment_id}':\n{raw_text}")
        return None
    except Exception as e:
        tqdm.write(f"An unexpected error occurred with Gemini API for '{commitment_id}': {e}")
        return None

    # The caller reads the result with .get(); anything but a JSON object is unusable.
    if not isinstance(parsed, dict):
        tqdm.write(f"Error: Gemini returned a non-JSON response for '{commitment_id}':\n{raw_text}")
        return None
    return parsed


# --- 4. MAIN EXECUTION SCRIPT ---

if __name__ == "__main__":
    # Static project description passed to the model as context for every row
    project_description_context = """
    JESA has entered a reimbursable Work Order for elaborating the FEED (Evaluate+ Define) for this project.
    This revision of the work order will include the remaining activities and deliverables required to complete Evaluate phase and launch critical packages. It includes also ESIA preparation, architectural activities and deliverables for non-process building and master plan.
    Division of Responsibilities: JESA has an EPCM reimbursable scope, however, the Customer still has some responsibilities including coordination with authorities and OCP entities.
    Project Objectives: The key objectives of the current phase are Phase 2 (Evaluate) development study and a Class 4 estimate (+/- 20% to +/- 30%). Activities added to the evaluate phase for fast-tracking include: Civil early works, Geo scan, Storage building, Environmental deliverables, and Mechanical ITBs for LLIs.
    """

    knowledge_base = load_and_prepare_knowledge_base('robust_parsing_results.json')
    if knowledge_base is None:
        # `exit()` is the interactive-shell helper; raise SystemExit explicitly
        # so the failure is reported with a message and a non-zero status.
        raise SystemExit("Knowledge base could not be loaded; aborting.")

    # Only the first twelve columns are known up front; the remaining columns
    # are padded with empty strings so every row matches `columns`.
    known_commitment_fields = [
        ["Moroccan environmental regulation", "Law n° 13-03 relating to the fight against air pollution", "Legal obligation", "Controlling the atmospheric emissions during industrial operations while ensuring good air quality.", "Design/Operation", "High", "In Progress", "During operational phase", "HSE Client", "Environment Client", "", "Environmental Report"],
        ["OCP Group objectives and commitment", "Liquid effluents policy", "Commitment", "Complying with legal regulatory national and international requirements for liquid discharge to ensure the prevention and control of related environmental risks.", "Design/Construction/Operation", "High", "In Progress", "During Design, construction and operation phases", "Process Engineering", "Environmental Engineering", "Civil Engineering", "Environmental Design criteria; Liquid effluents policy (OCP)"],
        ["OCP Group objectives and commitment", "Waste management policy", "Commitment", "Responsibly manage the waste generated by the project, respecting national and internationally recognized guidelines.", "Design/Construction/Operation", "High", "In Progress", "During design Construction and operation phases", "Environmental Engineering", "HSE Client", "Environment-Construction Contractors", "Waste management plan"],
    ]
    initial_commitments_data = [
        known + [""] * (len(columns) - len(known))
        for known in known_commitment_fields
    ]
    df_initial = pd.DataFrame(initial_commitments_data, columns=columns)
    df_final = df_initial.copy()

    # JSON key returned by the model -> register column it fills
    llm_field_to_column = {
        "Impact or Hazard Addressed": ('Commitment Management', 'Impact or Hazard Addressed', ''),
        "Approving Agencies": ('Commitment Management', 'Approving Agencies', ''),
        "Comments": ('Commitment Management', 'Comments', ''),
        "Affected_Preparation_Construction": ('Commitment Management', 'Affected Areas or Processes', 'Preparation/construction'),
        "Affected_Operation": ('Commitment Management', 'Affected Areas or Processes', 'Operation'),
        "Affected_Discharge_Management": ('Commitment Management', 'Affected Areas or Processes', 'Discharge management'),
        "Impact_Health_Safety": ('Commitment Management', 'Impact', 'Health & Safety'),
        "Impact_Environmental": ('Commitment Management', 'Impact', 'Environmental'),
        "Impact_Regulatory": ('Commitment Management', 'Impact', 'Regulatory'),
    }

    # Wrap the loop with tqdm for a progress bar
    for index, row in tqdm(df_initial.iterrows(), total=len(df_initial), desc="Analyzing Commitments"):
        commitment_description = row[('Commitment Register Overview', 'Description', '')]

        relevant_texts = find_relevant_documents(commitment_description, knowledge_base)

        if not relevant_texts:
            tqdm.write(f"  -> No relevant documents for '{row[('Commitment Register Overview', 'Commitment Identifier', '')]}'. Skipping.")
            continue

        extracted_data = call_gemini_to_complete_row(row, relevant_texts, project_description_context)

        time.sleep(2) # Respect API rate limits

        if extracted_data:
            # Missing keys leave the cell empty
            for llm_field, target_column in llm_field_to_column.items():
                df_final.loc[index, target_column] = extracted_data.get(llm_field, "")
        else:
            tqdm.write(f"  -> Failed to get a valid response from Gemini for '{row[('Commitment Register Overview', 'Commitment Identifier', '')]}'.")

    print("\n\n### --- FINAL DYNAMICALLY COMPLETED COMMITMENT REGISTER --- ###")
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 2200)
    display(df_final)

Gemini API configured successfully.
Knowledge base created with 13 documents.


Analyzing Commitments:   0%|          | 0/3 [00:00<?, ?steps/s]

  -> No relevant documents for 'Law n° 13-03 relating to the fight against air pollution'. Skipping.


Analyzing Commitments: 100%|██████████| 3/3 [00:28<00:00,  9.63s/steps]



### --- FINAL DYNAMICALLY COMPLETED COMMITMENT REGISTER --- ###





Unnamed: 0_level_0,Commitment Register Overview,Commitment Register Overview,Commitment Register Overview,Commitment Register Overview,Commitment Register Overview,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management,Commitment Management
Unnamed: 0_level_1,Register Identifier,Commitment Identifier,Commitment or Obligation,Description,Project Phase,Potential Impact on Scope?,Status,Commitment Deadline,First Lead,Second Lead,Third Lead,Primary Commitment Documentation,Impact or Hazard Addressed,Approving Agencies,Other Stakeholders,Affected Areas or Processes,Affected Areas or Processes,Affected Areas or Processes,Affected Areas or Processes,Affected Areas or Processes,Affected Areas or Processes,Affected Areas or Processes,Impact,Impact,Impact,Impact,Impact,Impact,Impact,Comments,Requires Change Order?
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Preparation/construction,Operation,Input Management,Discharge management,Off-Sites,Other,Fungibility,CAPEX,OPEX,Health & Safety,Social,Economic,Environmental,Regulatory,Unnamed: 30_level_2,Unnamed: 31_level_2
0,Moroccan environmental regulation,Law n° 13-03 relating to the fight against air...,Legal obligation,Controlling the atmospheric emissions during i...,Design/Operation,High,In Progress,During operational phase,HSE Client,Environment Client,,Environmental Report,,,,,,,,,,,,,,,,,,,
1,OCP Group objectives and commitment,Liquid effluents policy,Commitment,Complying with legal regulatory national and i...,Design/Construction/Operation,High,In Progress,"During Design, construction and operation phases",Process Engineering,Environmental Engineering,Civil Engineering,Environmental Design criteria; Liquid effluent...,Risk of water pollution from liquid effluents/...,"Ministry of Environment (via the CNE), Nationa...",,x,x,,x,,,,,,x,,,x,x,As the project is in the FEED/Evaluate phase a...,
2,OCP Group objectives and commitment,Waste management policy,Commitment,Responsibly manage the waste generated by the ...,Design/Construction/Operation,High,In Progress,During design Construction and operation phases,Environmental Engineering,HSE Client,Environment-Construction Contractors,Waste management plan,Risk of uncontrolled waste generation leading ...,"Ministry of Environment (via CNE presidency), ...",,x,x,,x,,,,,,x,,,x,x,This commitment aligns with Morocco's foundati...,


In [137]:
# @title Commitment Table
import pandas as pd
from reportlab.platypus import SimpleDocTemplate, Paragraph, Table, TableStyle, Flowable
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.pagesizes import landscape, A3
from reportlab.lib.units import inch
from reportlab.lib import colors
from reportlab.lib.enums import TA_CENTER, TA_LEFT
from reportlab.pdfbase.pdfmetrics import stringWidth

# --- Your Custom VerticalText Class (Unchanged) ---
class VerticalText(Flowable):
    """Flowable that draws one line of text rotated by 90 degrees."""

    def __init__(self, text, font_name='Helvetica-Bold', font_size=6):
        """Store the text and the font used to draw and measure it."""
        super().__init__()
        self.text, self.font_name, self.font_size = text, font_name, font_size

    def draw(self):
        """Draw the text on the flowable's canvas inside a saved/restored graphics state."""
        cv = self.canv
        baseline_offset = -self.font_size - 2
        cv.saveState()
        cv.setFont(self.font_name, self.font_size)
        cv.rotate(90)
        cv.drawString(5, baseline_offset, self.text)
        cv.restoreState()

    def wrap(self, available_width, available_height):
        """Return (width, height): font size plus 4, and the text's string width plus 10."""
        run_length = stringWidth(self.text, self.font_name, self.font_size)
        box_width = self.font_size + 4
        box_height = run_length + 10
        return (box_width, box_height)

def generate_commitment_register_pdf(df, output_filename="commitment_register_final.pdf"):
    """
    Generate a landscape-A3 PDF containing the commitment register table.

    The three header rows and their cell spans are derived from the
    DataFrame's 3-level MultiIndex columns; every DataFrame row becomes one
    table row of wrapped paragraphs. The table is appended, after a page
    break, to the story returned by ``generate_commitment_register_second``.

    Args:
        df: DataFrame whose columns are a 3-level MultiIndex
            (section, column, sub-column). An empty sub-column marks a column
            that is not subdivided.
        output_filename: Path of the PDF file to write.

    Note:
        ``generate_commitment_register_second`` and ``header_footer`` are looked
        up in the notebook namespace when this function is called, so the cells
        defining them must have been run first.
    """
    # Local import so this cell does not depend on another cell's imports.
    from xml.sax.saxutils import escape

    pagesize = landscape(A3)
    doc = SimpleDocTemplate(
        output_filename,
        pagesize=pagesize,
        rightMargin=0.5 * inch,
        leftMargin=0.5 * inch,
        topMargin=1.3 * inch,
    )

    table_width = doc.width

    # --- Paragraph styles for cell content ---
    styles = getSampleStyleSheet()
    cell_style = ParagraphStyle(
        name='CellStyle',
        parent=styles['Normal'],
        alignment=TA_CENTER,
        fontSize=6,
        leading=8,
        spaceAfter=2,
        spaceBefore=2
    )
    header_style = ParagraphStyle(
        name='HeaderStyle',
        parent=styles['Normal'],
        fontName='Helvetica-Bold',
        fontSize=7,
        leading=8,
        alignment=TA_CENTER,
        textColor=colors.white,
    )

    # --- Helper functions ---
    # Paragraph text is parsed as XML-like markup. Cell text is plain text
    # (partly produced by an LLM), so '&', '<' and '>' are escaped before the
    # <br/> tags are inserted; otherwise a value such as "< 5 mg/m3" can break
    # the markup parser.
    def create_header_paragraph(text, style=header_style):
        return Paragraph(escape(text).replace('\n', '<br/>'), style)

    def create_data_paragraph(text, style=cell_style):
        if not isinstance(text, str):
            text = str(text)
        return Paragraph(escape(text).replace('\n', '<br/>').replace('* ', '• '), style)

    # --- Dynamic header generation ---
    header_row_1 = []
    header_row_2 = []
    header_row_3 = []

    # A label is written only in the first cell of a run of equal labels;
    # the following cells stay empty and are merged by the SPAN styles below.
    last_h1 = None
    last_h2 = None

    for l0, l1, l2 in df.columns:
        # Level 0 header (e.g. 'Commitment Management')
        if l0 != last_h1:
            header_row_1.append(create_header_paragraph(l0))
            last_h1 = l0
        else:
            header_row_1.append('')

        # Level 1 header (e.g. 'Affected Areas or Processes')
        if l1 != last_h2:
            header_row_2.append(create_header_paragraph(l1))
            last_h2 = l1
        else:
            header_row_2.append('')

        # Level 2 header, drawn as vertical text; only present for subdivided columns
        if l2:
            header_row_3.append(VerticalText(l2))
        else:
            header_row_3.append('')

    # --- Dynamic data row generation ---
    data_rows = []
    for index, row in df.iterrows():
        pdf_row = [create_data_paragraph(cell) for cell in row]
        data_rows.append(pdf_row)

    table_data = [header_row_1, header_row_2, header_row_3] + data_rows

    dynamic_styles = []

    # Merge each run of equal level-0 labels across header row 0.
    level0_headers = df.columns.get_level_values(0)
    start_col = 0
    for i in range(1, len(level0_headers)):
        if level0_headers[i] != level0_headers[i-1]:
            dynamic_styles.append(('SPAN', (start_col, 0), (i - 1, 0)))
            start_col = i
    dynamic_styles.append(('SPAN', (start_col, 0), (len(level0_headers) - 1, 0)))

    # Merge each run of equal (level 0, level 1) pairs across header row 1.
    level1_headers = df.columns.droplevel(2)
    start_col = 0
    for i in range(1, len(level1_headers)):
        if level1_headers[i] != level1_headers[i-1]:
            dynamic_styles.append(('SPAN', (start_col, 1), (i - 1, 1)))
            start_col = i
    dynamic_styles.append(('SPAN', (start_col, 1), (len(level1_headers) - 1, 1)))

    # Columns without sub-levels span vertically over the unused header rows.
    for i, (l0, l1, l2) in enumerate(df.columns):
        if not l1 and not l2: # Simple, non-nested column: spans all three header rows
            dynamic_styles.append(('SPAN', (i, 0), (i, 2)))
        elif l1 and not l2: # No sub-column: spans header rows 1 and 2
            dynamic_styles.append(('SPAN', (i, 1), (i, 2)))

    # Relative column widths tuned for the 31-column register layout.
    col_widths_proportions = [
        0.05, 0.05, 0.04, 0.12, 0.04, 0.04, 0.03, 0.04, 0.04, 0.04, 0.04, 0.08,
        0.06, 0.06, 0.04,
        0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, # Affected Areas
        0.012, 0.012, 0.012, 0.012, 0.012, 0.012, 0.012, # Impact
        0.06, 0.033
    ]
    n_cols = len(df.columns)
    if len(col_widths_proportions) == n_cols:
        col_widths = [p * table_width for p in col_widths_proportions]
    else:
        # Any other column count: share the available width equally instead of
        # passing a width list that does not match the table.
        col_widths = [table_width / n_cols] * n_cols if n_cols else None

    # --- Combined table style: static styling plus the computed SPANs ---
    final_style = TableStyle([
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
        ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
        ('GRID', (0, 0), (-1, -1), 0.5, colors.black),

        # Header row 1 (top)
        ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#002060')),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),

        # Header rows 2 & 3
        ('BACKGROUND', (0, 1), (-1, 2), colors.HexColor('#C00000')),
        ('TEXTCOLOR', (0, 1), (-1, 2), colors.white),
        ('FONTNAME', (0, 1), (-1, 2), 'Helvetica-Bold'),
    ] + dynamic_styles)

    # repeatRows=3 repeats the three header rows when the table splits across pages.
    table = Table(table_data, colWidths=col_widths, repeatRows=3)
    table.setStyle(final_style)
    story = generate_commitment_register_second(output_filename)
    doc.build( story +[PageBreak()]+ [table] , onFirstPage= header_footer , onLaterPages=header_footer)
    print(f"PDF successfully generated: {output_filename}")


# --- HOW TO USE THIS FUNCTION ---
# `df_final` is the completed register built in the "LLM Processing" section.
# Its column structure is the `header` list defined there; it is not repeated
# in this cell, because a second copy would only shadow that definition.
# NOTE(review): generate_commitment_register_pdf() looks up `header_footer`
# (defined in the "Header" cell below) and `generate_commitment_register_second`
# at call time - on a fresh kernel, run the cells defining them first.
generate_commitment_register_pdf(df_final)



PDF successfully generated: commitment_register_final.pdf


In [138]:
# @title Header
def header_footer(canvas, doc):
    """Paint the page header: two blue rules, the logo and two info tables.

    Args:
        canvas: Canvas to draw on; its state is saved on entry and restored
            on exit.
        doc: Document object supplying ``leftMargin``, ``rightMargin``,
            ``topMargin`` and ``width``.

    All vertical positions are computed against a landscape A3 page,
    whatever page size ``doc`` itself was created with.
    """
    jesa_blue = colors.Color(red=0/255, green=51/255, blue=102/255)
    header_offset = 0  # vertical shift applied to both rules; currently none

    canvas.saveState()
    page_w, page_h = landscape(A3)
    rule_right = page_w - doc.rightMargin

    # Two horizontal rules, upper one first. After the loop `line_y` holds the
    # lower rule's height, which anchors the logo and both tables below.
    for rise in (2.15 * cm, 0.5 * cm):
        line_y = page_h - doc.topMargin + rise - header_offset
        canvas.setStrokeColor(jesa_blue)
        canvas.setLineWidth(1)
        canvas.line(doc.leftMargin, line_y, rule_right, line_y)

    # --- Logo (image file if present, otherwise the name drawn as text) ---
    logo_path = 'jesa_logo.png'
    logo_w, logo_h = 4.0 * cm, 3.0 * cm
    logo_y = line_y + 0.2 * cm
    if not os.path.exists(logo_path):
        canvas.setFont('Helvetica-Bold', 30)
        canvas.setFillColor(jesa_blue)
        canvas.drawString(doc.leftMargin, logo_y + 0.4*cm, "JESA")
    else:
        Image(logo_path, width=logo_w, height=logo_h).drawOn(
            canvas, doc.leftMargin, logo_y
        )

    info_y = line_y - 0.2 * cm
    zero_padding = [
        ('LEFTPADDING', (0,0), (-1,-1), 0),
        ('RIGHTPADDING',(0,0), (-1,-1), 0),
    ]

    # --- Left table: project identification, placed right of the logo ---
    text_span = doc.width - logo_w - 0.5*cm
    left_tbl = Table(
        [
            ['Project Name:', 'Chemical additives plant'],
            ['Customer:',     'NOVADDIX'],
            ['Document Title:', 'Sustainable Project Delivery - Legal Register - Chemical additives plant'],
        ],
        colWidths=[text_span*0.1, text_span*0.8],
    )
    left_tbl.setStyle(TableStyle([
        ('FONTNAME',    (0,0), (0,-1),   'Helvetica-Bold'),
        ('FONTNAME',    (1,0), (1,-1),   'Helvetica'),
        ('FONTSIZE',    (0,0), (-1,-1),  6),
        ('VALIGN',      (0,0), (-1,-1), 'TOP'),
    ] + zero_padding))
    left_tbl.wrapOn(canvas, doc.width, doc.topMargin)
    left_tbl.drawOn(canvas, doc.leftMargin + logo_w + 0.5 * cm, info_y)

    # --- Right table: document number, revision and page number ---
    right_col_w = 3 * cm
    right_tbl = Table(
        [
            ['Q37440-00-EN-REG-00001'],
            ['REV A'],
            ['Page %d' % canvas.getPageNumber()],
        ],
        colWidths=[right_col_w],
    )
    right_tbl.setStyle(TableStyle([
        ('FONTNAME',    (0,0), (-1,-1),   'Helvetica-Bold'),
        ('FONTNAME',    (0,1), (0,1),     'Helvetica-Bold'),
        ('FONTSIZE',    (0,0), (-1,-1),    6),
        ('ALIGN',       (0,0), (-1,-1), 'RIGHT'),
    ] + zero_padding))
    right_tbl.wrapOn(canvas, doc.width, doc.topMargin)
    right_tbl.drawOn(canvas, rule_right - right_col_w, info_y)

    canvas.restoreState()


In [139]:
# @title Second page

def generate_commitment_register_second(output_filename):
    """Assemble the flowables for the explanatory page of the register.

    Args:
        output_filename: Path handed to ``SimpleDocTemplate``. The template is
            only used here to obtain the usable content width; this function
            never calls ``build`` on it.

    Returns:
        list: Paragraphs, a spacer and the abbreviations table, in reading
        order (purpose, definition, initiation, references, abbreviations).
    """
    # Landscape A3 template with equal side margins; only its width is read.
    doc = SimpleDocTemplate(
        output_filename,
        pagesize=landscape(A3),
        rightMargin=1.2 * inch,
        leftMargin = 1.2* inch,
        topMargin=2.25 * inch,
        bottomMargin=0.75 * inch,
    )
    usable_width = doc.width

    jesa_blue = colors.Color(red=0/255, green=51/255, blue=102/255)
    styles = getSampleStyleSheet()

    # Numbered section headings (blue).
    styles.add(ParagraphStyle(
        name='MainHeading',
        fontName='Helvetica-Bold',
        fontSize=8,
        leading=10,
        textColor=jesa_blue,
        spaceBefore=8,
        spaceAfter=4
    ))

    # Un-numbered headings inside a section (black).
    styles.add(ParagraphStyle(
        name='SubHeading',
        fontName='Helvetica-Bold',
        fontSize=7,
        leading=4,
        textColor=colors.black,
        spaceBefore=6,
        spaceAfter=0
    ))

    # The stock BodyText style is adjusted in place: small, justified text.
    body_style = styles['BodyText']
    body_style.fontName = 'Helvetica'
    body_style.fontSize = 7
    body_style.leading = 10
    body_style.alignment = 4
    body_style.spaceAfter = 4

    # List entries inherit the body style but carry no vertical spacing.
    styles.add(ParagraphStyle(
        name='ListItem',
        parent=body_style,
        leftIndent=0 * inch,
        spaceBefore=0,
        spaceAfter=0
    ))

    story = []

    def add(text, style_name):
        story.append(Paragraph(text, styles[style_name]))

    def heading(text):
        add(text, 'MainHeading')

    def subheading(text):
        add(text, 'SubHeading')

    def body(text):
        add(text, 'BodyText')

    def item(text):
        add(text, 'ListItem')

    # --- 1. Main purpose ---
    heading("1. Main purpose")
    body("""The Commitment Register is a system used to ensure commitments are incorporated into the appropriate part of engineering design, construction, procurement and/or operations, as required. Each commitment will be "closed out" in the Register before project phase completion, indicating that the commitment has been responsibly managed. A final Commitment Report is provided to the Customer at project phase completion outlining the inclusion of commitments into the various project documents and which commitments are compliant.""")

    # --- 2. Definition ---
    heading("2. Definition")
    body("An obligation is a requirement, under the law, necessary for compliance. Obligations and compliance are managed as part of Technical Integrity under SEAl. A commitment is a voluntary statement of action, or a goal, that goes beyond legal requirements. The Commitment Register for a project or contract lists the commitments made by the Customer in corporate or publicly available documentation. Typical sources include the Environmental Impact Assessment (EIA), Project Registers/Application or material published for the public in newspapers, open houses, etc.")
    body("The Commitment Register is a central place to document, communicate, and track the commitments so they will be understood and included in the project. This Commitment Register is part of SEAl Sustainable Design Planning, which is described in the SEAl Standard (MS-E9-STD-00017). The Commitment Register should be discussed with the Customer before use on a project or contract as part of SEAl Alignment, including how commitments are to be recorded and managed while executing a project.")
    body("As the project progresses, commitments may become obsolete or may not be feasible to implement within the project. The Commitment Register is used to track the status of all commitments including rationale for those commitments that become obsolete or are not feasible. These changes in status are tracked in the Commitment Register.")

    # --- 3. Initiation ---
    heading("3. Initiation")
    subheading("Initiating and Customizing the Commitment Register")
    body("The Project Manager / Project Engineering Manager or designate, shall:")
    item("- work with the Customer to populate the Register and classify the commitments.")
    item("- be responsible for ensuring commitments are registered and communicated to the appropriate party (e.g. the discipline lead responsible for incorporating a given commitment within the project scope of work).")
    body("The Commitment Register is designed to be customizable to suit the project's commitment tracking needs. Columns such as 'Affected areas or processes' should be customized to reflect the project.")

    subheading("Register Maintenance")
    body("The Project Manager, Project Engineering Manager or designate, shall work with the Discipline Leads to maintain an accurate status of each commitment on the register. The register shall be updated as needed and controlled properly so only the most recent version is available to the project team. Sufficient hours shall be included in the project budget for register maintenance.")
    subheading("Technical Review")
    body("The Commitment Register shall be reviewed by the Project Management Team and approved by the Customer at an agreed frequency for the project. After each review and approval the signed Commitment Register shall be converted to PDF and saved while updates continue in the live register.")
    subheading("Other Considerations")
    body("The commitments and other registers (Legal, Sustainable Solutions Database), are normally created in conjunction with the Sustainability Steering Committee (SSC). The SSC is comprised of sustainability stakeholders from the customer (e.g. public relations, environmental advisors, regulatory contacts, operations manager) and JESA (e.g. Sustainability Lead, environmental scientists).")

    # --- 4. References ---
    heading("4. References")
    references = (
        "Safe and Sustainable Engineering for Asset Lifecycle (SEAL) Standard (MS-E9-STD-00017)",
        "Sustainable Project Delivery - Legal Register (MS-E9-TEM-00053)",
        "Sustainable Solutions Standard (MS-FM-STD-00158)",
    )
    for reference in references:
        item(reference)
    story.append(Spacer(1, 0.2 * inch))

    # --- 5. Abbreviations: two (abbreviation, meaning) pairs per row ---
    heading("5. Abbreviations")
    final_table_data = [
        ['ABH', 'Agence du Bassin Hydraulique', 'EHS', 'Environment, Health & Safety'],
        ['BAT', 'Best Available Technologies', 'HR', 'Human Resources'],
        ['CRI', 'Centre Régional d\'Investissement', 'IASE', 'Health Safety & Environment'],
        ['ONG', 'Organisation Non Gouvernementale', 'PSE', 'Programme de Suivi et de Surveillance Environnemental'],
        ['OCP', 'Office Chérifien des Phosphates', 'SDG', 'Sustainable Development Goals'],
        ['', '', 'SEAL', 'Safe and Sustainable Engineering for Asset Lifecycle'],
    ]
    abbreviations_table = Table(
        final_table_data,
        colWidths=[usable_width * share for share in (0.10, 0.35, 0.10, 0.45)],
    )
    whole_table = ((0, 0), (-1, -1))
    abbreviations_table.setStyle(TableStyle([
        ('FONTNAME', *whole_table, 'Helvetica'),
        ('FONTSIZE', *whole_table, 6),
        ('ALIGN', *whole_table, 'LEFT'),
        ('VALIGN', *whole_table, 'MIDDLE'),
        ('GRID', *whole_table, 0.5, colors.black),
        # Abbreviation columns (first and third) are bold.
        ('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
        ('FONTNAME', (2, 0), (2, -1), 'Helvetica-Bold'),
        ('LEFTPADDING', *whole_table, 5),
        ('RIGHTPADDING', *whole_table, 5),
        ('TOPPADDING', *whole_table, 3),
        ('BOTTOMPADDING', *whole_table, 3),
    ]))
    story.append(abbreviations_table)
    return story


In [140]:
# @title Cover page
def generate_commitment_register_cover_page(PDF_PATH="commitment_register_cover_page.pdf"):
    """Build the A4 cover page of the register and write it to ``PDF_PATH``.

    The page contains, top to bottom: a title bar with the logo and a purpose
    statement, five labelled project fields, the originator / issue-date row,
    the document-status (revision) table and a copyright notice.

    Args:
        PDF_PATH: Destination file for the generated PDF.

    Side effects:
        Writes ``PDF_PATH``. If ``jesa_logo.png`` is missing, tries to create a
        placeholder image with Pillow; if that also fails, the header shows the
        company name as text instead of aborting the build.
    """
    LOGO_PATH = "jesa_logo.png"
    BRAND_BLUE_HEX = "#1F497D"
    brand_blue = colors.HexColor(BRAND_BLUE_HEX)
    PAGE_WIDTH, _ = A4

    # --- 1. Create a placeholder logo if none exists ---
    if not os.path.exists(LOGO_PATH):
        try:
            from PIL import Image as PILImage, ImageDraw, ImageFont
            img = PILImage.new('RGB', (240, 70), color='white')
            d = ImageDraw.Draw(img)
            # Use a common bold font if available, otherwise Pillow's default.
            try:
                font = ImageFont.truetype("arialbd.ttf", 50)
            except IOError:
                font = ImageFont.load_default()
            # Pillow needs a colour string or tuple; a ReportLab Color object
            # is rejected, which previously made this whole branch fail.
            d.text((10, 5), "JESA", fill=BRAND_BLUE_HEX, font=font)
            img.save(LOGO_PATH)
            print(f"Created a dummy logo: {LOGO_PATH}")
        except Exception as e:
            print(f"Warning: Could not create a dummy logo. Please provide {LOGO_PATH}. Error: {e}")

    # --- 2. Document setup ---
    doc = SimpleDocTemplate(
        PDF_PATH,
        pagesize=A4,
        rightMargin=0.5 * inch,
        leftMargin=0.5 * inch,
        topMargin=0.5 * inch,
        bottomMargin=0.75 * inch,
    )
    elements = []
    # Width available for content between the side margins.
    content_width = PAGE_WIDTH - doc.leftMargin - doc.rightMargin

    # --- 3. Paragraph styles ---
    # Title inside the blue bar at the top.
    header_style = ParagraphStyle(
        name="Header",
        fontName="Helvetica-Bold",
        fontSize=12,
        textColor=colors.white,
        leading=16,
        leftIndent=10
    )
    # "Purpose of this register..." text.
    subheader_style = ParagraphStyle(
        name="Subheader",
        fontName="Helvetica",
        fontSize=7,
        textColor=colors.black,
        leading=12,
        spaceAfter=6,
    )
    # White field labels on blue (e.g. "PROJECT No:").
    label_style = ParagraphStyle(
        name="Label",
        fontName="Helvetica-Bold",
        fontSize=6,
        textColor=colors.white,
        leftIndent=4,
        leading=12,
    )
    # Text inside the value boxes (e.g. "Q37440").
    value_style = ParagraphStyle(
        name="Value",
        fontName="Helvetica-bold",
        fontSize=6,
        textColor=colors.black,
        leading=12,
    )
    # Centered cell text for the bottom tables.
    table_style = ParagraphStyle(
        name="TableText",
        fontName="Helvetica",
        fontSize=8,
        textColor=colors.black,
        leading=10,
        alignment=1,
    )
    table_header_style = ParagraphStyle(
        name="TableHeader",
        fontName="Helvetica-Bold",
        fontSize=8,
        textColor=colors.white,
        leading=10,
        alignment=1,
    )

    # --- 4. Combined header and purpose box (one table, continuous border) ---
    title_para = Paragraph(
        "Sustainable Project Delivery - Legal Register - Chemical additives plant",
        header_style
    )
    if os.path.exists(LOGO_PATH):
        logo_cell = Image(LOGO_PATH, width=80, height=25)
    else:
        # No logo file and no placeholder could be made: show the name as
        # text, mirroring the fallback used by the page header.
        logo_cell = Paragraph("JESA", ParagraphStyle(
            name="LogoFallback",
            fontName="Helvetica-Bold",
            fontSize=14,
            textColor=brand_blue,
            leading=16,
            alignment=1,
        ))
    purpose_para = Paragraph(
        "Purpose of this register is to record the regulatory requirements that need to be complied with by the project. "
        "The register provides traceability of the action that has been taken to address the requirement.",
        subheader_style
    )

    combined_header_table = Table(
        [
            [title_para, logo_cell],    # Row 0: title and logo
            [purpose_para, None]        # Row 1: purpose text, spans both columns
        ],
        colWidths=[content_width - 88, 88],
        rowHeights=[12*mm, None]        # Fixed header height, auto purpose height
    )
    combined_header_table.setStyle(TableStyle([
        ('SPAN', (0, 1), (1, 1)),
        ('BACKGROUND', (0, 0), (0, 0), brand_blue),      # Blue behind the title
        ('BACKGROUND', (1, 0), (1, 0), colors.white),    # White behind the logo
        ('VALIGN', (0, 0), (-1, 0), 'MIDDLE'),
        ('ALIGN', (1, 0), (1, 0), 'CENTER'),
        ('BOX', (0, 0), (-1, -1), 1, colors.black),      # Outer border
        ('LINEBELOW', (0, 0), (-1, 0), 1, colors.black), # Rule under the header row
        ('LINEBEFORE', (1, 0), (1, 0), 1, colors.black), # Rule left of the logo
        ('TOPPADDING', (0, 1), (-1, 1), 5),
        ('BOTTOMPADDING', (0, 1), (-1, 1), 5),
        ('LEFTPADDING', (0, 1), (-1, 1), 5),
        ('RIGHTPADDING', (0, 1), (-1, 1), 5),
    ]))
    elements.append(combined_header_table)
    elements.append(Spacer(1, 8*mm))

    # --- 5. Project detail fields: blue label bar, then a half-width value box ---
    fields = [
        ("PROJECT No:", "Q37440"),
        ("PROJECT TITLE:", "Chemical additives plant"),
        ("JESA DOCUMENT No:", "Q37440-00-EN-REG-00001"),
        ("ELECTRONIC FILE LOCATION:", "N/A"),
        ("NOTES:", "N/A"),
    ]
    for label, val in fields:
        label_table = Table(
            [[Paragraph(label, label_style)]],
            colWidths=[content_width],
            rowHeights=[7*mm],
        )
        label_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, -1), brand_blue),
            ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
        ]))
        elements.append(label_table)
        elements.append(Spacer(1, 1.5*mm))  # Gap between label and value box

        value_box = Table([[Paragraph(val, value_style)]], colWidths=[content_width / 2])
        value_box.setStyle(TableStyle([
            ('BOX', (0, 0), (-1, -1), 0.5, colors.grey),
            ('LEFTPADDING', (0, 0), (-1, -1), 4),
            ('TOPPADDING', (0, 0), (-1, -1), 2),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 4),
        ]))
        elements.append(value_box)
        elements.append(Spacer(1, 5*mm))    # Gap before the next field

    # --- 6. Bottom block: originator row, status banner, revision table ---
    # Vertical space that pushes the bottom block down the page.
    elements.append(Spacer(1, 40*mm))

    # Grid and padding shared by the originator row and the revision table.
    grid_commands = [
        ('BOX', (0, 0), (-1, -1), 1, colors.black),
        ('INNERGRID', (0, 0), (-1, -1), 0.5, colors.black),
        ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
        ('LEFTPADDING', (0, 0), (-1, -1), 3),
        ('RIGHTPADDING', (0, 0), (-1, -1), 3),
        ('TOPPADDING', (0, 0), (-1, -1), 4),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 4),
    ]

    originator_cells = ("Originator:", "Y.Hosni", "Issue Date:", "18-Jun-25")
    originator_table = Table(
        [[Paragraph(text, table_style) for text in originator_cells]],
        colWidths=[content_width * share for share in (0.15, 0.35, 0.15, 0.35)]
    )
    originator_table.setStyle(TableStyle(grid_commands))
    elements.append(originator_table)

    status_header_table = Table(
        [[Paragraph("DOCUMENT STATUS", table_header_style)]],
        colWidths=[content_width]
    )
    status_header_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, -1), brand_blue),
        ('BOX', (0, 0), (-1, -1), 1, colors.black),
        ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
        ('TOPPADDING', (0, 0), (-1, -1), 6),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 6),
    ]))
    elements.append(status_header_table)

    # Revisions newest first; the column captions form the LAST row.
    status_rows = [
        ("B", "18-Jun-25", "Issued for Review (IFR)", "Y.Hosni", "S.El Alem", "J.Alaoui Sosse", "S. Paresh"),
        ("A", "11-Mar-25", "Issued for Internal Review (IIR)", "I.Issa Issaka", "S.El Alem", "J.Alaoui Sosse", "S. Salim"),
        ("REV", "DATE", "DESCRIPTION", "BY", "CHKD", "D.APPD", "P.APPD"),
    ]
    status_table_data = [
        [Paragraph(text, table_style) for text in row] for row in status_rows
    ]
    # Column shares: REV, DATE, DESCRIPTION, BY, CHKD, D.APPD, P.APPD.
    status_table = Table(
        status_table_data,
        colWidths=[
            content_width * share
            for share in (0.06, 0.12, 0.35, 0.15, 0.12, 0.12, 0.08)
        ]
    )
    caption_row = len(status_table_data) - 1
    status_table.setStyle(TableStyle(grid_commands + [
        # Light blue background for the caption row only.
        ('BACKGROUND', (0, caption_row), (-1, caption_row), colors.HexColor("#E6F3FF")),
    ]))
    elements.append(status_table)

    # Copyright notice
    copyright_para = Paragraph(
        "© Copyright 2021 JESA Group. No part of this document or the information it contains may be reproduced or transmitted in any form or by any means electronic or mechanical, including photocopying, recording, or by any information storage and retrieval system, without permission in writing from JESA. JESA.com",
        ParagraphStyle(
            name="Copyright",
            fontName="Helvetica-bold",
            fontSize=7,
            textColor=colors.black,
            leading=9,
            alignment=0,
            spaceAfter=0,
        )
    )
    elements.append(Spacer(1, 4*mm))
    elements.append(copyright_para)

    # --- 7. Render the PDF ---
    doc.build(elements)
    print(f"✅ PDF successfully created: {PDF_PATH}")
# Generate the cover page at its default output path.
generate_commitment_register_cover_page()


✅ PDF successfully created: commitment_register_cover_page.pdf


In [141]:
# @title  Final Commitment Register
from PyPDF2 import PdfMerger

def build_full_pdf(df_final ,output_name = "commitment_register.pdf" ):
    """Generate the cover page and the register, then merge them into one PDF.

    Args:
        df_final: Register data passed through to
            ``generate_commitment_register_pdf``.
        output_name: Path of the merged PDF to write.

    Side effects:
        Regenerates ``commitment_register_cover_page.pdf`` and
        ``commitment_register_final.pdf`` in the working directory, then
        writes ``output_name``.
    """
    # 1. Generate the cover page.
    generate_commitment_register_cover_page()

    # 2. Generate the register pages.
    generate_commitment_register_pdf(df_final)

    # 3. Merge both, cover first. The merger is closed even if a step fails.
    merger = PdfMerger()
    try:
        merger.append("commitment_register_cover_page.pdf")
        merger.append("commitment_register_final.pdf")
        merger.write(output_name)
    finally:
        merger.close()

    # Report the file actually written rather than a fixed name.
    print(f"✅ {output_name} successfully created")

# Build the merged register PDF from df_final at the default output name.
build_full_pdf(df_final)



✅ PDF successfully created: commitment_register_cover_page.pdf
PDF successfully generated: commitment_register_final.pdf
✅ full_commitment_register.pdf successfully created


In [None]:
import gradio as gr
import pandas as pd
import json
import os
import tempfile
import asyncio
from datetime import datetime
import zipfile
from io import BytesIO
import traceback
import requests
from pathlib import Path
import nest_asyncio


# Import your web scraping functions
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from playwright.async_api import async_playwright

# Apply nest_asyncio for Jupyter/Colab compatibility
nest_asyncio.apply()

class EnvironmentalComplianceProcessor:
    def __init__(self):
        self.pdf_links = set()
        self.knowledge_base = None
        self.parsed_results = None
        self.commitment_df = None

    async def scrape_pdfs_from_url(self, base_url, max_pages=4):
        """Collect absolute PDF links from ``base_url`` and its frames.

        Args:
            base_url: Page to load in a headless Chromium browser; also the
                base against which relative links are resolved.
            max_pages: Accepted for interface compatibility; not used by this
                implementation (only ``base_url`` itself is visited).

        Returns:
            list: De-duplicated absolute URLs whose ``href`` ends in ``.pdf``
            (case-insensitive).

        Raises:
            Exception: If loading or reading the main page fails; the original
                error is attached as ``__cause__``.
        """
        pdf_links = set()

        def extract_pdfs_from_html(html):
            # Pure HTML parsing, so no need for a coroutine here.
            soup = BeautifulSoup(html, "html.parser")
            return {
                urljoin(base_url, anchor["href"])
                for anchor in soup.find_all("a", href=True)
                if anchor["href"].lower().endswith(".pdf")
            }

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/115.0 Safari/537.36"
            ))
            page = await context.new_page()

            try:
                await page.goto(base_url, wait_until="networkidle", timeout=30000)
                # Extra settle time after the network goes idle.
                await asyncio.sleep(3)

                # Links in the main document.
                pdf_links.update(extract_pdfs_from_html(await page.content()))

                # Links inside frames; a frame that cannot be read is skipped.
                for frame in page.frames:
                    try:
                        pdf_links.update(extract_pdfs_from_html(await frame.content()))
                    except Exception:
                        # Catch Exception only, so task cancellation and
                        # KeyboardInterrupt are not silently swallowed.
                        continue

            except Exception as e:
                raise Exception(f"Error scraping {base_url}: {str(e)}") from e
            finally:
                await browser.close()

        return list(pdf_links)

    def download_pdfs(self, pdf_links, progress_callback=None):
        """Download PDFs from links to temporary directory"""
        temp_dir = tempfile.mkdtemp()
        downloaded_files = []

        for i, link in enumerate(pdf_links):
            try:
                if progress_callback:
                    progress_callback((i / len(pdf_links)) * 0.3, f"Downloading PDF {i+1}/{len(pdf_links)}")

                response = requests.get(link, timeout=30, headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                })
                response.raise_for_status()

                # Extract filename from URL or create one
                filename = os.path.basename(link.split('?')[0])
                if not filename.endswith('.pdf'):
                    filename = f"document_{i+1}.pdf"

                filepath = os.path.join(temp_dir, filename)

                with open(filepath, 'wb') as f:
                    f.write(response.content)

                downloaded_files.append(filepath)

            except Exception as e:
                print(f"Failed to download {link}: {str(e)}")
                continue

        return downloaded_files

    def parse_pdfs_to_knowledge_base(self, pdf_files, progress_callback=None):
        """Run the batch PDF parser and derive the knowledge base from it.

        Args:
            pdf_files: PDF paths handed to ``process_pdf_batch``.
            progress_callback: Optional ``callback(fraction, message)``; called
                at 0.3, 0.4 and 0.6.

        Returns:
            The batch result from ``process_pdf_batch``. It is also stored on
            ``self.parsed_results``, and the knowledge base built from it on
            ``self.knowledge_base``.
        """
        reporting = bool(progress_callback)

        if reporting:
            progress_callback(0.3, "Initializing PDF parser...")

        # Table validation thresholds passed to the parser.
        parser_settings = dict(
            max_columns=10,
            min_rows=2,
            max_null_percentage=0.4,
            min_content_ratio=0.3,
        )
        config = TableValidationConfig(**parser_settings)

        if reporting:
            progress_callback(0.4, f"Processing {len(pdf_files)} PDF files...")

        self.parsed_results = process_pdf_batch(pdf_files, config)

        if reporting:
            progress_callback(0.6, "Creating knowledge base...")

        self.knowledge_base = self.create_knowledge_base_from_results(self.parsed_results)
        return self.parsed_results

    def create_knowledge_base_from_results(self, batch_results):
        """Create knowledge base from parsing results"""
        knowledge_base = []

        for doc_path, doc_data in batch_results['results'].items():
            if 'error' in doc_data:
                continue

            full_text = ""
            for page in doc_data.get('pages', []):
                if 'content' in page:
                    if 'text_structure' in page['content']:
                        full_text += page['content']['text_structure'].get('title_text', '') + "\n"
                        full_text += page['content']['text_structure'].get('body_text', '') + "\n"
                    elif 'text' in page['content']:
                        full_text += page['content']['text'] + "\n"

            if full_text.strip():
                knowledge_base.append({
                    "filename": doc_data.get('filename', 'N/A'),
                    "document_type": doc_data.get('document_type', 'N/A'),
                    "content": full_text.lower()
                })

        return knowledge_base

    def process_commitments_with_ai(self, project_description, api_key, progress_callback=None):
        """Process commitment register with AI completion"""
        if not self.knowledge_base:
            raise Exception("No knowledge base available. Please process PDFs first.")

        if not api_key:
            raise Exception("Gemini API key is required")

        # Configure Gemini API
        import google.generativeai as genai
        genai.configure(api_key=api_key)

        if progress_callback:
            progress_callback(0.7, "Creating commitment register...")

        # Create default commitments DataFrame
        df = self.create_default_commitments_df()

        if progress_callback:
            progress_callback(0.75, "Analyzing commitments with AI...")

        # Process each commitment
        for index, row in df.iterrows():
            commitment_description = row[('Commitment Register Overview', 'Description', '')]
            commitment_id = row[('Commitment Register Overview', 'Commitment Identifier', '')]

            if progress_callback:
                progress = 0.75 + (0.15 * index / len(df))
                progress_callback(progress, f"Processing: {commitment_id}")

            # Find relevant documents
            relevant_texts = self.find_relevant_documents(commitment_description)

            if relevant_texts:
                try:
                    # Call Gemini to complete the row
                    extracted_data = call_gemini_to_complete_row(row, relevant_texts, project_description)

                    if extracted_data:
                        # Update DataFrame with AI results
                        self.update_commitment_row(df, index, extracted_data)

                    # Small delay to respect API limits
                    import time
                    time.sleep(1)

                except Exception as e:
                    print(f"Error processing commitment {commitment_id}: {str(e)}")
                    continue

        self.commitment_df = df
        return df

    def find_relevant_documents(self, commitment_description):
        """Find relevant documents from knowledge base"""
        if not self.knowledge_base:
            return ""

        commitment_description = commitment_description.lower()
        import re
        keywords = set(re.findall(r'\b[a-zA-Zçéàèùâêîôûæœ\d]{4,}\b', commitment_description))

        relevant_texts = []
        for doc in self.knowledge_base:
            if any(keyword in doc['filename'].lower() for keyword in keywords) or \
               any(keyword in doc['content'] for keyword in keywords):
                relevant_texts.append(f"--- START OF RELEVANT DOCUMENT ({doc['filename']}) ---\n{doc['content']}\n--- END OF DOCUMENT ---\n")

        return "\n".join(relevant_texts)

    def update_commitment_row(self, df, index, extracted_data):
        """Copy AI-extracted fields for one commitment into its register row.

        Every recognised key present in ``extracted_data`` is written to the
        matching three-level column of ``df`` at row label ``index``. Keys that
        are missing from ``extracted_data`` leave their cell untouched, and
        keys not listed below are ignored. ``df`` is modified in place.
        """
        section = 'Commitment Management'
        areas = 'Affected Areas or Processes'
        impact = 'Impact'

        # (key in the extracted data, target column tuple in the register)
        targets = (
            ("Impact or Hazard Addressed", (section, 'Impact or Hazard Addressed', '')),
            ("Approving Agencies", (section, 'Approving Agencies', '')),
            ("Comments", (section, 'Comments', '')),
            ("Affected_Preparation_Construction", (section, areas, 'Preparation/construction')),
            ("Affected_Operation", (section, areas, 'Operation')),
            ("Affected_Discharge_Management", (section, areas, 'Discharge management')),
            ("Impact_Health_Safety", (section, impact, 'Health & Safety')),
            ("Impact_Environmental", (section, impact, 'Environmental')),
            ("Impact_Regulatory", (section, impact, 'Regulatory')),
        )

        for field, column in targets:
            if field not in extracted_data:
                continue
            df.loc[index, column] = extracted_data[field]

    def create_default_commitments_df(self):
        """Build the starter commitment register with three pre-filled rows.

        The frame has a three-level column index (section, field, sub-field).
        Only the first twelve cells of each row carry text; every remaining
        cell is an empty string.

        Returns:
            A ``pandas.DataFrame`` with 3 rows and one column per header tuple.
        """
        overview = 'Commitment Register Overview'
        management = 'Commitment Management'
        areas = 'Affected Areas or Processes'
        impact = 'Impact'

        overview_fields = (
            'Register Identifier',
            'Commitment Identifier',
            'Commitment or Obligation',
            'Description',
            'Project Phase',
        )
        management_fields = (
            'Potential Impact on Scope?',
            'Status',
            'Commitment Deadline',
            'First Lead',
            'Second Lead',
            'Third Lead',
            'Primary Commitment Documentation',
            'Impact or Hazard Addressed',
            'Approving Agencies',
            'Other Stakeholders',
        )
        area_fields = (
            'Preparation/construction',
            'Operation',
            'Input Management',
            'Discharge management',
            'Off-Sites',
            'Other',
            'Fungibility',
        )
        impact_fields = (
            'CAPEX',
            'OPEX',
            'Health & Safety',
            'Social',
            'Economic',
            'Environmental',
            'Regulatory',
        )
        closing_fields = ('Comments', 'Requires Change Order?')

        header = (
            [(overview, field, '') for field in overview_fields]
            + [(management, field, '') for field in management_fields]
            + [(management, areas, field) for field in area_fields]
            + [(management, impact, field) for field in impact_fields]
            + [(management, field, '') for field in closing_fields]
        )

        # Leading cells of each seed row, in header order; the rest stay blank.
        seed_rows = [
            [
                "Moroccan environmental regulation",
                "Law n° 13-03 relating to the fight against air pollution",
                "Legal obligation",
                "Controlling the atmospheric emissions during industrial operations while ensuring good air quality.",
                "Design/Operation",
                "High",
                "In Progress",
                "During operational phase",
                "HSE Client",
                "Environment Client",
                "",
                "Environmental Report",
            ],
            [
                "OCP Group objectives and commitment",
                "Liquid effluents policy",
                "Commitment",
                "Complying with legal regulatory national and international requirements for liquid discharge to ensure the prevention and control of related environmental risks.",
                "Design/Construction/Operation",
                "High",
                "In Progress",
                "During Design, construction and operation phases",
                "Process Engineering",
                "Environmental Engineering",
                "Civil Engineering",
                "Environmental Design criteria; Liquid effluents policy (OCP)",
            ],
            [
                "OCP Group objectives and commitment",
                "Waste management policy",
                "Commitment",
                "Responsibly manage the waste generated by the project, respecting national and internationally recognized guidelines.",
                "Design/Construction/Operation",
                "High",
                "In Progress",
                "During design Construction and operation phases",
                "Environmental Engineering",
                "HSE Client",
                "Environment-Construction Contractors",
                "Waste management plan",
            ],
        ]

        width = len(header)
        rows = [filled + [""] * (width - len(filled)) for filled in seed_rows]

        return pd.DataFrame(rows, columns=pd.MultiIndex.from_tuples(header))

    def generate_output_files(self, progress_callback=None):
        """Write the commitment register to timestamped CSV and PDF files.

        Both files are created in the system temporary directory and share the
        stem ``commitment_register_<YYYYmmdd_HHMMSS>``.

        Args:
            progress_callback: Optional callable taking ``(value, description)``.
                It is invoked once, before any file is written.

        Returns:
            A ``(csv_path, pdf_path)`` tuple.

        Raises:
            Exception: If ``self.commitment_df`` is ``None``.
        """
        if self.commitment_df is None:
            raise Exception("No commitment data available")

        if progress_callback:
            progress_callback(0.95, "Generating output files...")

        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        out_dir = tempfile.gettempdir()
        stem = f"commitment_register_{stamp}"

        # CSV first, then the PDF built from the same DataFrame.
        csv_path = os.path.join(out_dir, f"{stem}.csv")
        self.commitment_df.to_csv(csv_path, index=False)

        pdf_path = os.path.join(out_dir, f"{stem}.pdf")
        build_full_pdf(self.commitment_df, pdf_path)

        return csv_path, pdf_path

    async def process_complete_workflow(self, url, project_description, api_key, max_pages=4, progress=gr.Progress()):
        """Complete workflow from URL to final outputs"""
        try:
            # Step 1: Scrape PDFs
            progress(0.0, "🔍 Scraping PDFs from website...")
            pdf_links = await self.scrape_pdfs_from_url(url, max_pages)

            if not pdf_links:
                return "❌ No PDF links found on the provided URL", None, None, None

            progress(0.1, f"📄 Found {len(pdf_links)} PDF links")

            # Step 2: Download PDFs
            progress(0.15, "⬇️ Downloading PDF files...")

            def download_progress(prog, desc):
                progress(0.15 + prog * 0.15, desc)

            pdf_files = self.download_pdfs(pdf_links, download_progress)

            if not pdf_files:
                return "❌ Failed to download any PDF files", None, None, None

            # Step 3: Parse PDFs
            progress(0.3, "🔬 Parsing PDF documents...")

            def parse_progress(prog, desc):
                progress(0.3 + prog * 0.4, desc)

            parsing_results = self.parse_pdfs_to_knowledge_base(pdf_files, parse_progress)

            # Step 4: Process with AI
            progress(0.7, "🤖 Processing commitments with AI...")

            def ai_progress(prog, desc):
                progress(0.7 + prog * 0.25, desc)

            commitment_df = self.process_commitments_with_ai(project_description, api_key, ai_progress)

            # Step 5: Generate outputs
            progress(0.95, "📊 Generating final outputs...")
            csv_path, pdf_path = self.generate_output_files(lambda p, d: progress(0.95 + p * 0.05, d))

            progress(1.0, "✅ Complete!")

            # Create summary
            summary = f"""
✅ **Workflow Complete!**

📊 **Processing Summary:**
- PDFs found: {len(pdf_links)}
- PDFs downloaded: {len(pdf_files)}
- PDFs successfully parsed: {parsing_results['summary']['successful']}
- Knowledge base documents: {len(self.knowledge_base)}
- Commitments processed: {len(commitment_df)}
- Average parsing confidence: {parsing_results['summary']['avg_confidence']:.2f}

🎯 **Results:**
- Commitment register completed with AI analysis
- Environmental compliance requirements identified
- Regulatory references integrated
- Impact assessments completed
            """

            return summary, csv_path, pdf_path, json.dumps(parsing_results, indent=2, ensure_ascii=False, default=str)

        except Exception as e:
            error_msg = f"❌ **Error in workflow:** {str(e)}\n\n**Traceback:**\n{traceback.format_exc()}"
            return error_msg, None, None, None

# Create the processor instance.
# It is module-level and the click handler further down calls it directly, so
# its knowledge_base / commitment_df attributes are shared by every run
# started from this UI.
processor = EnvironmentalComplianceProcessor()

# Define the streamlined Gradio interface.
# Components render in the order they are created inside the nested
# Row/Column contexts, so statement order here is the page layout.
with gr.Blocks(title="Environmental Compliance Processor", theme=gr.themes.Soft()) as demo:
    # Page banner
    gr.HTML("""
    <div style="text-align: center; padding: 20px; background: linear-gradient(90deg, #4CAF50, #2196F3); border-radius: 10px; margin: 10px;">
        <h1 style="color: white; margin: 0;">🌱 Environmental Compliance Processor</h1>
        <p style="color: white; margin: 5px 0;">Automated workflow: URL → PDF Scraping → Parsing → AI Analysis → Report Generation</p>
    </div>
    """)

    with gr.Row():
        # Left column (scale=2): everything the user supplies
        with gr.Column(scale=2):
            gr.Markdown("### 🔗 Input Configuration")

            # Start page for the scraper; pre-filled with the default site
            url_input = gr.Textbox(
                label="Website URL to Scrape",
                placeholder="https://environnement.gov.ma/fr/lois-et-reglementations/normes",
                value="https://environnement.gov.ma/fr/lois-et-reglementations/normes",
                info="Enter the URL containing environmental PDF documents"
            )

            with gr.Row():
                # Page limit handed to the workflow as max_pages (1-10, default 4)
                max_pages_input = gr.Number(
                    value=4,
                    label="Max Pages",
                    minimum=1,
                    maximum=10,
                    info="Maximum pages to scrape"
                )

                # type="password" so the key is entered as a masked field
                api_key_input = gr.Textbox(
                    label="Gemini API Key",
                    type="password",
                    placeholder="Enter your Google Gemini API key",
                    info="Required for AI-powered analysis"
                )

            # Free-text project context passed to the workflow as project_description;
            # ships with a sample description as its default value
            project_description = gr.Textbox(
                label="Project Description",
                lines=6,
                value="""JESA has entered a reimbursable Work Order for elaborating the FEED (Evaluate+ Define) for this project.
This revision includes ESIA preparation, architectural activities for non-process buildings and master plan.
Division of Responsibilities: JESA has an EPCM reimbursable scope, with Customer coordination with authorities.
Project Objectives: Phase 2 development study and Class 4 estimate (+/- 20% to +/- 30%).
Activities include: Civil early works, Geo scan, Storage building, Environmental deliverables, and Mechanical ITBs.""",
                info="Context for AI analysis of commitments"
            )

            # Single trigger for the whole workflow
            process_btn = gr.Button(
                "🚀 Start Complete Processing",
                variant="primary",
                size="lg"
            )

        # Right column (scale=3): everything the workflow returns
        with gr.Column(scale=3):
            gr.Markdown("### 📊 Results")

            # Receives the summary text, or the error message with traceback
            status_output = gr.Textbox(
                label="Processing Status",
                lines=15,
                show_copy_button=True
            )

            with gr.Row():
                # Filled with the path of the generated CSV file
                csv_download = gr.File(
                    label="📄 Download CSV Report",
                    visible=True
                )

                # Filled with the path of the generated PDF file
                pdf_download = gr.File(
                    label="📄 Download PDF Report",
                    visible=True
                )

            # Collapsed by default: the parsing results serialised as JSON
            with gr.Accordion("🔍 Detailed JSON Results", open=False):
                json_output = gr.Textbox(
                    label="Raw Parsing Results (JSON)",
                    lines=10,
                    show_copy_button=True
                )

    # Event handler for the main processing button
    async def process_workflow_wrapper(url, project_desc, api_key, max_pages, progress=gr.Progress()):
        """Click handler: run the complete workflow on the shared processor.

        Args:
            url: Page to scrape for PDF links.
            project_desc: Project context for the AI step.
            api_key: Gemini API key entered in the UI.
            max_pages: Value of the "Max Pages" number field.
            progress: Progress tracker. Declaring it here with a
                ``gr.Progress()`` default is Gradio's documented way to enable
                progress reporting for an event handler; it is forwarded so the
                workflow's stage updates go to this tracker.

        Returns:
            The 4-tuple produced by ``processor.process_complete_workflow``:
            status text, CSV path, PDF path, JSON string.
        """
        # A number field may deliver a float, or None when it is cleared.
        # The workflow expects a page count, so coerce to int and fall back to
        # process_complete_workflow's own default of 4.
        page_limit = int(max_pages) if max_pages is not None else 4
        return await processor.process_complete_workflow(
            url, project_desc, api_key, page_limit, progress=progress
        )

    # Wire the button to the handler.
    # `inputs` are delivered positionally to process_workflow_wrapper, so the
    # list order must follow its parameters (url, project_desc, api_key,
    # max_pages). `outputs` receive, in order, the four values the workflow
    # returns: status text, CSV path, PDF path, JSON string.
    process_btn.click(
        fn=process_workflow_wrapper,
        inputs=[url_input, project_description, api_key_input, max_pages_input],
        outputs=[status_output, csv_download, pdf_download, json_output]
    )

    # Add examples section (static help text, collapsed by default)
    with gr.Accordion("💡 Usage Examples", open=False):
        gr.Markdown("""
        ### Example URLs to try:

        - **Moroccan Environmental Regulations:** `https://environnement.gov.ma/fr/lois-et-reglementations/normes`
        - **Ministry of Energy:** `https://www.mem.gov.ma/fr/Pages/secteur.aspx?e=3`
        - **Water Resources:** `https://www.water.gov.ma/reglementation/`

        ### What the system does:

        1. **🔍 Web Scraping:** Automatically finds and downloads PDF documents from the provided URL
        2. **🔬 PDF Analysis:** Uses advanced parsing to extract text, tables, and regulatory information
        3. **🤖 AI Processing:** Leverages Gemini AI to analyze commitments against environmental regulations
        4. **📊 Report Generation:** Creates professional PDF and CSV reports with compliance analysis

        ### Requirements:
        - Valid Google Gemini API key
        - URL containing environmental PDF documents
        - Internet connection for scraping and AI processing
        """)

    # Static footer; the text is fixed and does not reflect any runtime state
    gr.HTML("""
    <div style="text-align: center; padding: 10px; color: #666;">
        <small>🔧 System Status: Ready for Environmental Compliance Processing</small>
    </div>
    """)

# Launch the interface.
# share=True publishes the app on a temporary public *.gradio.live URL.
# debug=True keeps the cell running so errors and logs stay visible in Colab;
# set debug=False to let the cell return.
# NOTE(review): the public link exposes a form that accepts an API key —
# confirm that sharing is intended before running outside a private session.
if __name__ == "__main__":
    demo.launch(
        share=True,
        debug=True,
    )

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://8a90f32c97671378bb.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


ERROR:asyncio:Task was destroyed but it is pending!
task: <Task pending name='Task-93' coro=<Queue.start_progress_updates() running at /usr/local/lib/python3.11/dist-packages/gradio/queueing.py:359> wait_for=<Future pending cb=[Task.__wakeup()]>>
ERROR:asyncio:Task was destroyed but it is pending!
task: <Task pending name='Task-88' coro=<_delete_state() running at /usr/local/lib/python3.11/dist-packages/gradio/route_utils.py:980> wait_for=<Future pending cb=[Task.__wakeup()]>>
ERROR:asyncio:Task was destroyed but it is pending!
task: <Task pending name='Task-92' coro=<Queue.start_processing() running at /usr/local/lib/python3.11/dist-packages/gradio/queueing.py:309> wait_for=<Future pending cb=[Task.__wakeup()]>>


✅ PDF successfully created: commitment_register_cover_page.pdf
PDF successfully generated: commitment_register_final.pdf
✅ full_commitment_register.pdf successfully created
