In [3]:
# %%capture
!pip install feedparser requests beautifulsoup4 lxml

import os
import re
import json
import time
import sqlite3
import hashlib
import threading
import concurrent.futures
from datetime import datetime
from urllib.parse import urljoin, urlparse

import requests
import feedparser
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import ipywidgets as widgets
from IPython.display import display, Markdown

# ----------------------------
# 1) Dictionnaire des flux ADEME
# ----------------------------
# Maps each ADEME theme label (shown in the dropdown) to the URL of its RSS feed.
# The labels were previously mis-encoded (UTF-8 read as Mac Roman); they are
# restored here so the UI shows the real French names.
FEEDS = {
    "Agriculture, alimentation, forêt, bioéconomie": "https://librairie.ademe.fr/rss/3516-thematique-agriculture-alimentation-foret-bioeconomie.xml",
    "Air": "https://librairie.ademe.fr/rss/3145-thematique-air.xml",
    "Bâtiment": "https://librairie.ademe.fr/rss/3153-thematique-batiment.xml",
    "Changement climatique": "https://librairie.ademe.fr/rss/3147-thematique-changement-climatique.xml",
    "Consommer autrement": "https://librairie.ademe.fr/rss/2906-thematique-consommer-autrement.xml",
    "Économie circulaire et Déchets": "https://librairie.ademe.fr/rss/3426-thematique-economie-circulaire-et-dechets.xml",
    "Énergies": "https://librairie.ademe.fr/rss/3149-thematique-energies.xml",
    "Industrie et production durable": "https://librairie.ademe.fr/rss/3503-thematique-industrie-et-production-durable.xml",
    "Institutionnel": "https://librairie.ademe.fr/rss/3157-thematique-institutionnel.xml",
    "Mobilité et transports": "https://librairie.ademe.fr/rss/2901-thematique-mobilite-et-transports.xml",
    "Recherche et innovation": "https://librairie.ademe.fr/rss/2930-thematique-recherche-et-innovation.xml",
    "Société et politiques publiques": "https://librairie.ademe.fr/rss/3544-thematique-societe-et-politiques-publiques.xml",
    "Urbanisme, territoires et sols": "https://librairie.ademe.fr/rss/3509-thematique-urbanisme-territoires-et-sols.xml",
}

# ----------------------------
# 2) Classe Harvester (avec patch Prestashop)
# ----------------------------
class ShadowMassPDFHarvester:
    """Scan web pages for PDF links, download them and log every attempt.

    One ``requests.Session`` with browser-like headers is used for all HTTP
    traffic. Every download attempt is written to the ``harvested_pdfs`` table
    of a SQLite database, and the URLs that succeeded / failed during the
    lifetime of the instance are kept in ``downloaded_urls`` / ``failed_urls``.
    """

    # Regexes applied to inline <script> text to spot PDF URLs. Compiled once
    # at class creation instead of once per <script> tag.
    _SCRIPT_PDF_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'["\'](https?://[^"\']+?\.pdf[^"\']*)["\']',
            r'(https?://[^\s<>"]+?\.pdf[^\s<>"]*)',
            r'pdfUrl[=:]\s*["\']([^"\']+\.pdf[^"\']*)["\']',
            r'download[^=]*=\s*["\']([^"\']+?\.pdf[^"\']*)["\']',
        )
    )

    # Upper bound on the title part of a generated filename, so that very
    # long article titles cannot exceed common filesystem name limits.
    _MAX_TITLE_CHARS = 150

    def __init__(self, max_workers=10, db_path='shadow_harvester.db'):
        """Create the HTTP session and open (or create) the SQLite log.

        Args:
            max_workers: Stored on the instance; no method of this class
                reads it.
            db_path: Path of the SQLite database file used for logging.
        """
        self.session = requests.Session()
        self.max_workers = max_workers
        self.downloaded_urls = set()
        self.failed_urls = set()
        self.db_path = db_path
        self.db_lock = threading.Lock()

        # HTTP headers imitating a desktop Chrome browser.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

        self.init_database()

    def init_database(self):
        """Open the SQLite connection and create the log table if missing.

        The connection is opened with ``check_same_thread=False`` so it can be
        shared between threads; writes are serialized with ``self.db_lock``.
        """
        self.conn = sqlite3.connect(self.db_path, check_same_thread=False)
        cursor = self.conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS harvested_pdfs (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT UNIQUE,
                filename TEXT,
                file_size INTEGER,
                file_hash TEXT,
                source_feed TEXT,
                harvest_date TIMESTAMP,
                status TEXT
            )
        ''')
        self.conn.commit()

    def close(self):
        """Close the SQLite connection and the HTTP session."""
        self.conn.close()
        self.session.close()

    # ---------- Generic discovery / parsing (for sitemap- or XML-driven input) ----------
    def load_feeds_from_xml(self, xml_content_or_url):
        """Parse an XML document and return every element that carries a URL.

        Args:
            xml_content_or_url: An ``http(s)`` URL to fetch, a path to an
                existing local file, or the XML content itself.

        Returns:
            A list of dicts as produced by ``_extract_feed_data``. Any error
            is printed and whatever was collected so far is returned.
        """
        feeds = []
        try:
            if isinstance(xml_content_or_url, str) and xml_content_or_url.startswith('http'):
                response = self.session.get(xml_content_or_url, timeout=15)
                response.raise_for_status()
                root = ET.fromstring(response.content)
            elif isinstance(xml_content_or_url, str) and os.path.exists(xml_content_or_url):
                root = ET.parse(xml_content_or_url).getroot()
            else:
                root = ET.fromstring(xml_content_or_url)

            for item in root.iter():
                feed_data = self._extract_feed_data(item)
                if feed_data and feed_data.get('url'):
                    feeds.append(feed_data)

        except Exception as e:  # best-effort: report and return what we have
            print(f"[SHADOW] Erreur parsing XML: {e}")
        return feeds

    def _extract_feed_data(self, element):
        """Collect url/title/description/date hints from one XML element.

        Sources are applied in this order, later ones overwriting earlier
        ones: the element's attributes, its own text, then its direct
        children with well-known tag names.

        Returns:
            The collected dict, or ``None`` when no URL was found.
        """
        feed_data = {}

        for attr, value in element.attrib.items():
            attr_name = attr.lower()
            if any(kw in attr_name for kw in ('url', 'link', 'href', 'src')):
                feed_data['url'] = value
            if 'title' in attr_name:
                feed_data['title'] = value

        text = (element.text or '').strip()
        if text:
            # Text that looks like a URL is the link, anything else the title.
            feed_data['url' if text.startswith('http') else 'title'] = text

        common_tags = {
            'link': 'url', 'url': 'url', 'guid': 'url',
            'title': 'title', 'description': 'description',
            'pubdate': 'date', 'updated': 'date'
        }

        for child in element:
            # Drop any "{namespace}" prefix from the tag name.
            tag = child.tag.lower().split('}')[-1]
            if tag in common_tags and child.text:
                feed_data[common_tags[tag]] = child.text.strip()

        return feed_data if feed_data.get('url') else None

    def discover_feeds_from_sitemap(self, sitemap_url, _visited=None):
        """Collect every ``<loc>`` URL from a sitemap, following sitemap indexes.

        Args:
            sitemap_url: URL of a sitemap or sitemap index.
            _visited: Internal set of sitemap URLs already explored; callers
                should leave it unset. It stops sitemap indexes that
                reference each other from recursing forever.

        Returns:
            A list of ``{'url': ..., 'source': 'sitemap'}`` dicts; an empty
            list when this sitemap cannot be fetched or parsed.
        """
        visited = set() if _visited is None else _visited
        if sitemap_url in visited:
            return []
        visited.add(sitemap_url)

        print(f"[SHADOW] Exploration du sitemap: {sitemap_url}")
        try:
            response = self.session.get(sitemap_url, timeout=20)
            response.raise_for_status()
            root = ET.fromstring(response.content)

            namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
            urls = []

            for url in root.findall('.//ns:url/ns:loc', namespace):
                if url.text:
                    urls.append({'url': url.text.strip(), 'source': 'sitemap'})

            for sm in root.findall('.//ns:sitemap/ns:loc', namespace):
                if sm.text:
                    urls.extend(self.discover_feeds_from_sitemap(sm.text.strip(), visited))
            return urls

        except Exception as e:
            print(f"[ERREUR] Sitemap exploration: {e}")
            return []

    # ---------- Extraction / download ----------
    def scan_url_for_pdfs(self, url, download_dir, title=""):
        """Fetch a page, find candidate PDF URLs and download the new ones.

        Args:
            url: Page to scan.
            download_dir: Directory receiving the files (created if needed).
            title: Optional title used for file naming and for the log.

        Returns:
            The list of PDF URLs downloaded successfully by this call; an
            empty list when the page itself cannot be fetched.
        """
        pdf_urls = set()
        try:
            response = self.session.get(url, timeout=20)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            extraction_methods = [
                self._extract_from_links,
                self._extract_from_scripts,
                self._extract_from_meta,
                self._extract_from_iframes,
                self._extract_from_data_attributes,
                self._extract_from_json_ld,
                self._extract_from_prestashop_scripts,  # Prestashop-specific extraction
            ]
            for method in extraction_methods:
                # One failing extractor must not prevent the others from running.
                try:
                    pdf_urls.update(method(soup, url))
                except Exception as e:
                    print(f"[WARN] Méthode {method.__name__} échouée: {e}")

            os.makedirs(download_dir, exist_ok=True)
            downloaded = []
            # Sorted so the download order is reproducible between runs.
            for pdf_url in sorted(pdf_urls):
                if pdf_url in self.downloaded_urls:
                    continue
                if self._download_pdf_advanced(pdf_url, download_dir, title):
                    self.downloaded_urls.add(pdf_url)
                    # A URL that failed earlier and now succeeds is no longer a failure.
                    self.failed_urls.discard(pdf_url)
                    downloaded.append(pdf_url)
                    self._log_to_database(pdf_url, title, status='success')
                else:
                    self.failed_urls.add(pdf_url)
                    self._log_to_database(pdf_url, title, status='failed')
            return downloaded

        except Exception as e:
            print(f"[ERREUR] Scan de {url}: {e}")
            return []

    def _build_attachment_url(self, base_url, attach_id):
        """Return the Prestashop attachment download URL for ``attach_id``."""
        return urljoin(base_url, f"/index.php?controller=attachment&id_attachment={attach_id}")

    def _extract_from_links(self, soup, base_url):
        """Return absolute URLs of ``<a href>`` links accepted by ``_is_pdf_link``."""
        pdf_urls = set()
        for a in soup.find_all('a', href=True):
            href = a['href'].strip()
            # _is_pdf_link already covers the Prestashop attachment endpoint.
            if self._is_pdf_link(href):
                pdf_urls.add(urljoin(base_url, href))
        return pdf_urls

    def _extract_from_scripts(self, soup, base_url):
        """Return PDF URLs found by regex inside inline ``<script>`` text."""
        pdf_urls = set()
        for script in soup.find_all('script'):
            text = script.string or ''
            for pattern in self._SCRIPT_PDF_PATTERNS:
                for match in pattern.findall(text):
                    pdf_urls.add(urljoin(base_url, match))
        return pdf_urls

    def _extract_from_meta(self, soup, base_url):
        """Return URLs built from ``<meta content>`` values containing ``.pdf``."""
        pdf_urls = set()
        for meta in soup.find_all('meta', content=True):
            content = meta.get('content', '')
            if isinstance(content, str) and '.pdf' in content.lower():
                pdf_urls.add(urljoin(base_url, content))
        return pdf_urls

    def _extract_from_iframes(self, soup, base_url):
        """Return URLs of ``<iframe src>`` values containing ``.pdf``."""
        pdf_urls = set()
        for iframe in soup.find_all('iframe', src=True):
            src = iframe['src']
            if '.pdf' in src.lower():
                pdf_urls.add(urljoin(base_url, src))
        return pdf_urls

    def _attachment_urls_from_json(self, raw, base_url):
        """Return attachment URLs described by a Prestashop-style JSON string.

        ``raw`` is expected to decode to a dict with an ``attachments`` list
        whose dict items carry ``id_attachment`` or ``id``. Anything that does
        not match that shape yields an empty set.
        """
        try:
            data = json.loads(raw)
        except ValueError:  # json.JSONDecodeError is a ValueError
            return set()
        if not isinstance(data, dict):
            return set()
        attachments = data.get('attachments')
        if not isinstance(attachments, list):
            return set()

        urls = set()
        for att in attachments:
            if not isinstance(att, dict):
                continue  # skip malformed entries, keep the valid ones
            att_id = att.get('id_attachment') or att.get('id')
            if att_id:
                urls.add(self._build_attachment_url(base_url, att_id))
        return urls

    def _extract_from_data_attributes(self, soup, base_url):
        """Scan the attribute values of every tag for PDF links.

        String values that look like a URL or absolute path are checked with
        ``_is_pdf_link``; string values that look like a JSON object are
        parsed for Prestashop attachments instead of being treated as a URL.
        """
        pdf_urls = set()

        # find_all(True) matches every tag. The previous find_all(attrs=True)
        # is interpreted by Beautiful Soup as a filter on the "class"
        # attribute, which skipped every tag without a class.
        for tag in soup.find_all(True):
            for value in tag.attrs.values():
                # Single-valued attribute (string).
                if isinstance(value, str):
                    v = value.strip()

                    # (1) Plausible URL or path: regular handling.
                    if v.startswith(('http', '/')) and self._is_pdf_link(v):
                        pdf_urls.add(urljoin(base_url, v))
                        continue

                    # (2) Looks like a JSON object: try to read attachment ids.
                    if v.startswith('{') and v.endswith('}'):
                        pdf_urls.update(self._attachment_urls_from_json(v, base_url))

                # Multi-valued attribute (list).
                elif isinstance(value, (list, tuple)):
                    for v in value:
                        if isinstance(v, str) and v.startswith(('http', '/')) and self._is_pdf_link(v):
                            pdf_urls.add(urljoin(base_url, v))

        return pdf_urls

    def _extract_from_json_ld(self, soup, base_url):
        """Return PDF URLs found in ``application/ld+json`` script blocks."""
        pdf_urls = set()
        for script in soup.find_all('script', type='application/ld+json'):
            try:
                data = json.loads(script.string or '{}')
                pdf_urls.update(self._find_pdf_in_json(data, base_url))
            except (ValueError, RecursionError):
                # Invalid or pathologically nested JSON: skip this block only.
                continue
        return pdf_urls

    def _extract_from_prestashop_scripts(self, soup, base_url):
        """Rebuild attachment URLs from ``id_attachment`` values in scripts.

        Only scripts whose text mentions ``attachments`` are inspected, and
        only the numeric ids are extracted (no full JSON parsing).
        """
        pdf_urls = set()

        for script in soup.find_all('script'):
            txt = script.string
            if not txt or 'attachments' not in txt:
                continue
            # Matches both the JSON form ("id_attachment": 12) and the
            # query-string form (id_attachment=12).
            ids = set(re.findall(r'id_attachment"\s*:\s*(\d+)', txt))
            ids |= set(re.findall(r'id_attachment=\s*(\d+)', txt))
            for att_id in ids:
                pdf_urls.add(self._build_attachment_url(base_url, att_id))

        return pdf_urls

    def _find_pdf_in_json(self, data, base_url):
        """Recursively collect string values containing ``.pdf`` from JSON data."""
        pdf_urls = set()
        if isinstance(data, dict):
            for value in data.values():
                if isinstance(value, str) and '.pdf' in value.lower():
                    pdf_urls.add(urljoin(base_url, value))
                else:
                    pdf_urls.update(self._find_pdf_in_json(value, base_url))
        elif isinstance(data, list):
            for item in data:
                pdf_urls.update(self._find_pdf_in_json(item, base_url))
        return pdf_urls

    def _is_pdf_link(self, href):
        """Tell whether ``href`` plausibly points to a PDF.

        Accepts Prestashop attachment endpoints, and URLs / absolute paths
        whose path ends with ``.pdf`` or whose query string contains ``.pdf``.
        Non-string values and other strings (e.g. JSON blobs) are rejected.
        """
        if not isinstance(href, str):
            return False
        href = href.strip()

        # Prestashop case: attachment endpoint.
        if 'controller=attachment' in href and 'id_attachment=' in href:
            return True

        # Plausible URLs or paths.
        if href.startswith(('http', '/')):
            parsed = urlparse(href)
            path_has_pdf = parsed.path.lower().endswith('.pdf')
            query_has_pdf = '.pdf' in (parsed.query or '').lower()
            return path_has_pdf or query_has_pdf

        return False

    def _build_filename(self, pdf_url, title=""):
        """Return the local filename to use for ``pdf_url``.

        A short hash of the URL is part of every generated name so that
        several PDFs belonging to the same article (same title, same second)
        do not overwrite each other. The result always ends with ``.pdf``.
        """
        url_tag = hashlib.md5(pdf_url.encode()).hexdigest()[:8]
        if title:
            safe_title = re.sub(r'[^\w\-. ]', '', title).strip()[:self._MAX_TITLE_CHARS].strip()
            safe_title = safe_title or "document"
            return f"{safe_title}_{int(time.time())}_{url_tag}.pdf"

        basename = os.path.basename(urlparse(pdf_url).path)
        if basename.lower().endswith('.pdf'):
            return basename
        # e.g. ".../index.php?controller=attachment&id_attachment=12"
        return f"{basename or 'document'}_{url_tag}.pdf"

    def _download_pdf_advanced(self, pdf_url, download_dir, title=""):
        """Download ``pdf_url`` into ``download_dir`` if it really is a PDF.

        The response must start with the ``%PDF`` signature and be at least
        1000 bytes long; otherwise nothing is written.

        Returns:
            ``True`` when the file was saved, ``False`` on rejection or error.
        """
        try:
            filename = self._build_filename(pdf_url, title)
            filepath = os.path.join(download_dir, filename)

            print(f"📥 Téléchargement: {pdf_url}")
            resp = self.session.get(pdf_url, timeout=30)
            resp.raise_for_status()

            content = resp.content
            if not content.startswith(b'%PDF'):
                print(f"⚠️  Non-PDF détecté (signature manquante): {pdf_url}")
                return False

            size = len(content)
            if size < 1000:
                # Checked before writing so no stub file is ever left on disk.
                print(f"⚠️  Fichier trop petit, supprimé: {filename}")
                return False

            os.makedirs(download_dir, exist_ok=True)
            with open(filepath, 'wb') as f:
                f.write(content)

            print(f"✅ Sauvegardé: {filename} ({size} bytes)")
            return True

        except Exception as e:
            print(f"❌ Échec téléchargement {pdf_url}: {e}")
            return False

    def _log_to_database(self, pdf_url, title, status):
        """Insert or replace the log row for ``pdf_url`` (guarded by the lock).

        NOTE: ``file_size`` is always stored as 0 and ``file_hash`` is the MD5
        of the URL, not of the file content. ``title`` is stored in the
        ``filename`` column.
        """
        try:
            with self.db_lock:
                cursor = self.conn.cursor()
                file_hash = hashlib.md5(pdf_url.encode()).hexdigest()
                # Stored as text explicitly: sqlite3's implicit datetime
                # adapter is deprecated since Python 3.12. The format matches
                # what that adapter used to write.
                harvest_date = datetime.now().isoformat(sep=' ')
                cursor.execute('''
                    INSERT OR REPLACE INTO harvested_pdfs 
                    (url, filename, file_size, file_hash, source_feed, harvest_date, status)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                ''', (pdf_url, title, 0, file_hash, 'rss_article_scan', harvest_date, status))
                self.conn.commit()
        except Exception as e:
            print(f"[DB ERROR] {e}")

    def generate_report(self):
        """Print per-status counts from the database and the session totals."""
        # Same lock as the writers, since the connection is shared.
        with self.db_lock:
            cursor = self.conn.cursor()
            cursor.execute('SELECT status, COUNT(*) FROM harvested_pdfs GROUP BY status')
            stats = dict(cursor.fetchall())
        print("\n" + "="*50)
        print("📊 RAPPORT SHADOW HARVESTER")
        print("="*50)
        for status, count in stats.items():
            print(f"   {status.upper()}: {count}")
        print(f"\n🎯 TOTAL PDFS (session): {len(self.downloaded_urls)}")
        print(f"💥 ÉCHECS (session): {len(self.failed_urls)}")
        print("="*50)


# ----------------------------
# 3) Jupyter interface + Harvester integration
# ----------------------------
# Dropdown listing the available themes (the keys of FEEDS).
# The widget labels below were previously mis-encoded (UTF-8 read as
# Mac Roman); they are restored to the intended French text and emoji.
theme_selector = widgets.Dropdown(
    options=list(FEEDS.keys()),
    description='Thème :',
    layout=widgets.Layout(width='60%')
)

# One button lists the feed's articles, the other scans them for PDFs.
btn_list = widgets.Button(description='Afficher les articles 📰', button_style='success')
btn_download = widgets.Button(description='Scanner & Télécharger les PDFs ⬇️', button_style='info')
# Everything printed by the click handlers is rendered inside this widget.
output = widgets.Output()

display(theme_selector, widgets.HBox([btn_list, btn_download]), output)

# Single harvester shared by every button click.
harvester = ShadowMassPDFHarvester(max_workers=10, db_path='shadow_harvester.db')
_last_entries = []  # entries of the last listed feed, reused by the download button

def _sanitize_dirname(name: str) -> str:
    return re.sub(r'[^A-Za-z0-9_\- ]', '_', name).strip().replace(' ', '_') or 'ademe'

def on_list_clicked(_):
    """Load the RSS feed of the selected theme and list its articles.

    The entries are stored in the module-level ``_last_entries`` so the
    download button can reuse them. When the feed has no entries the stored
    list is cleared, so a later download cannot silently work on the
    articles of a previously listed theme.

    Args:
        _: The clicked button (unused).
    """
    global _last_entries
    output.clear_output()
    theme = theme_selector.value
    url = FEEDS[theme]
    with output:
        print(f"🔗 Chargement du flux : {theme}")
        feed = feedparser.parse(url)
        if not feed.entries:
            _last_entries = []
            print("⚠️ Aucun article trouvé.")
            return
        print(f"✅ {len(feed.entries)} article(s) trouvés :\n")
        _last_entries = list(feed.entries)  # remembered for the download button
        for i, entry in enumerate(feed.entries, 1):
            title = entry.get("title", "")
            link = entry.get("link", "")
            date = entry.get("published", "")
            display(Markdown(f"**{i}. [{title}]({link})**"))
            if date:
                display(Markdown(f"_📅 {date}_"))
            print()

def on_download_clicked(_):
    """Scan every listed article for PDFs and download them.

    Works on the entries stored by the "list" button; when none are stored
    it only prints a hint. Files go to ``ademe_pdfs_<sanitized theme>``, and
    a summary plus the harvester report are printed at the end.

    Args:
        _: The clicked button (unused).
    """
    theme = theme_selector.value
    theme_slug = _sanitize_dirname(theme)
    download_dir = f"ademe_pdfs_{theme_slug}"
    with output:
        if not _last_entries:
            print("ℹ️ D’abord, clique sur « Afficher les articles » pour charger la liste.")
            return
        print(f"🚀 Scan et téléchargement des PDFs liés aux {theme_slug} …")
        total_found = 0
        for i, entry in enumerate(_last_entries, 1):
            title = entry.get("title", "") or f"article_{i}"
            link = entry.get("link", "")
            if not link:
                continue  # nothing to scan without a URL
            print(f"\n[{i}] 🔎 {title}\nURL: {link}")
            found = harvester.scan_url_for_pdfs(link, download_dir, title=title)
            print(f"➡️  PDFs trouvés: {len(found)}")
            total_found += len(found)

        # One blank line followed by a 30-dash rule. The previous
        # "\n—" * 30 repeated the newline too and printed 30 one-dash lines.
        print("\n" + "—" * 30)
        print(f"📦 Total PDFs téléchargés: {total_found} (dossier: {download_dir})")
        harvester.generate_report()
        print("✅ Terminé.")

# Register the click handlers: each button calls its function with itself as argument.
btn_list.on_click(on_list_clicked)
btn_download.on_click(on_download_clicked)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


Dropdown(description='Thème :', layout=Layout(width='60%'), options=('Agriculture, alimentation, forêt, bioéco…

HBox(children=(Button(button_style='success', description='Afficher les articles 📰', style=ButtonStyle()), But…

Output()