# requirements

In [1]:
!pip install selenium webdriver-manager beautifulsoup4 requests pymupdf  playwright  chromium > /dev/null 2>&1
!pip install -U bitsandbytes==0.43.1 > /dev/null 2>&1
!pip install -q faiss-cpu==1.8.0 sentence-transformers==3.0.0 > /dev/null 2>&1
!pip install -U langchain langchain-community huggingface_hub pdfplumber transformers accelerate > /dev/null 2>&1


In [2]:
!playwright install > /dev/null 2>&1

# importante libaries

In [3]:
import re
import torch
import json
import fitz  # 
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline, AutoTokenizer, T5ForConditionalGeneration

2025-08-04 12:07:45.503362: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754309265.707865      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754309265.759917      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# scrapping

In [4]:
import asyncio
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from playwright.async_api import async_playwright

BASE_URL = "https://www.sgg.gov.ma/BulletinOfficiel.aspx"
pdf_links = set()
MAX_PAGES = 4

async def extract_pdfs_from_frame(frame):
    content = await frame.content()
    soup = BeautifulSoup(content, "html.parser")
    new_links = set()

    for a in soup.find_all("a", href=True):
        href = a["href"]
        if href.lower().endswith(".pdf"):
            full_link = urljoin(BASE_URL, href)
            new_links.add(full_link)

    return new_links

async def run():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(BASE_URL)
        await page.wait_for_load_state("networkidle")

        # Find and switch to content iframe
        frame = None
        for f in page.frames:
            if "BulletinOfficiel" in f.url:
                frame = f
                break

        if not frame:
            print("⚠️ Content frame not found! Using main frame")
            frame = page.main_frame

        current_page = 1
        previous_links = set()

        while current_page <= MAX_PAGES:
            print(f"📄 Scraping page {current_page}...")

            # Extract PDF links
            new_links = await extract_pdfs_from_frame(frame)
            new_count = len(new_links - pdf_links)
            pdf_links.update(new_links)
            print(f"   ➕ Found {new_count} new PDFs (Total: {len(pdf_links)})")

            # Check for duplicate content (end of pages)
            if new_links and new_links == previous_links:
                print("🛑 Duplicate content detected - stopping pagination")
                break
            previous_links = new_links

            # Find next button by text
            next_buttons = await frame.query_selector_all("a")
            next_button = None

            for btn in next_buttons:
                text = await btn.text_content()
                if text and "Suivant" in text.strip():
                    next_button = btn
                    break

            if not next_button:
                print("⏹️ Next button not found - stopping pagination")
                break

            # Click next button and wait
            await next_button.click()
            await asyncio.sleep(2)  # Allow time for content load
            current_page += 1

        await browser.close()
        print(f"\n✅ Scraping completed. Total unique PDFs: {len(pdf_links)}")
        for link in sorted(pdf_links):
            print(link)

await run()


📄 Scraping page 1...
   ➕ Found 20 new PDFs (Total: 20)
📄 Scraping page 2...
   ➕ Found 10 new PDFs (Total: 30)
📄 Scraping page 3...
   ➕ Found 10 new PDFs (Total: 40)
📄 Scraping page 4...
   ➕ Found 10 new PDFs (Total: 50)

✅ Scraping completed. Total unique PDFs: 50
https://www.sgg.gov.ma/BO/FR/2873/2022/ECI_11_Fr.pdf
https://www.sgg.gov.ma/BO/FR/2873/2022/ECI_12_Fr.pdf
https://www.sgg.gov.ma/BO/FR/2873/2022/ECI_13_Fr.pdf
https://www.sgg.gov.ma/BO/FR/2873/2022/ECI_14_Fr.pdf
https://www.sgg.gov.ma/BO/FR/2873/2023/ECI_15_Fr.pdf
https://www.sgg.gov.ma/BO/FR/2873/2023/ECI_16_Fr.pdf
https://www.sgg.gov.ma/BO/FR/2873/2023/ECI_17_Fr.pdf
https://www.sgg.gov.ma/BO/FR/2873/2023/ECI_18_Fr.pdf
https://www.sgg.gov.ma/BO/FR/2873/2024/BO_7280_Fr.pdf
https://www.sgg.gov.ma/BO/FR/2873/2024/BO_7284_Fr.pdf
https://www.sgg.gov.ma/BO/FR/2873/2024/BO_7288_fr.pdf
https://www.sgg.gov.ma/BO/FR/2873/2024/BO_7292_fr.pdf
https://www.sgg.gov.ma/BO/FR/2873/2024/BO_7296_fr.pdf
https://www.sgg.gov.ma/BO/FR/2873/202

In [5]:
import os
import fitz
import requests

os.makedirs("/pdfs", exist_ok=True)
pdf_texts = []
for url in pdf_links:
    filename = os.path.basename(url)
    path = os.path.join("/pdfs", filename)

    try:
        r = requests.get(url)
        with open(path, "wb") as f:
            f.write(r.content)

        doc = fitz.open(path)
        text = ""
        for page in doc:
            text += page.get_text()
        pdf_texts.append((filename, text[:1000]))
    except Exception as e:
        print(f"❌ Failed on {filename}: {e}")

# Preview sample content
pdf_names = []
for name, content in pdf_texts[:2]:
    print(f"\n📄 {name}")




📄 BO_7310_fr.pdf

📄 ECI_11_Fr.pdf


In [6]:
len(pdf_names)
pdf_names

[]

# Model data extraction

## --- Model Initialization ---

In [7]:
device = 0 if torch.cuda.is_available() else -1
device_name = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device_name}")

# Initialize models
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    model_kwargs={"device": device_name}
)

model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16
)

generator = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
)

Using device: cuda


  embedding_model = HuggingFaceEmbeddings(


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0
