In [None]:
import os
import json
import time
from PIL import Image, ImageDraw
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


In [None]:
# Full configuration: one entry per page-region class. Each entry lists the
# keywords looked for in an element's tag / CSS classes / id, and the colour
# used to outline that class on the annotated image.
CLASSES = dict(
    header=dict(keywords=["header", "navbar", "head"], color="green"),  # bright green
    footer=dict(keywords=["footer", "bottom", "copyright"], color="blue"),
    content=dict(keywords=["main", "content", "article"], color="orange"),
    media=dict(keywords=["img", "video", "picture"], color="pink"),
    sidebar=dict(keywords=["sidebar", "aside", "panel"], color="yellow"),
    ads=dict(keywords=["ad", "banner", "advert", "advertisement"], color="red"),  # bright red
)

In [None]:
def load_metadata(path="dataset_metadata.json"):
    """Load the dataset metadata from a JSON file.

    Args:
        path: Location of the metadata file. Defaults to
            ``dataset_metadata.json`` in the current working directory,
            which is the file this function always read before.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8; relying on the platform default encoding (cp1252 on
    # Windows) can corrupt or reject non-ASCII characters.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


In [7]:
pip install shapely

Collecting shapely
  Downloading shapely-2.0.7-cp313-cp313-win_amd64.whl.metadata (7.1 kB)
Downloading shapely-2.0.7-cp313-cp313-win_amd64.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.4 MB ? eta -:--:--
   ----------------------------- ---------- 1.0/1.4 MB 3.6 MB/s eta 0:00:01
   ----------------------------- ---------- 1.0/1.4 MB 3.6 MB/s eta 0:00:01
   ------------------------------------ --- 1.3/1.4 MB 1.6 MB/s eta 0:00:01
   ------------------------------------ --- 1.3/1.4 MB 1.6 MB/s eta 0:00:01
   ---------------------------------------- 1.4/1.4 MB 1.2 MB/s eta 0:00:00
Installing collected packages: shapely
Successfully installed shapely-2.0.7
Note: you may need to restart the kernel to use updated packages.


In [9]:
import os
import json
import time
from PIL import Image, ImageDraw
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from shapely.geometry import box as shapely_box
from shapely.ops import unary_union

# Full configuration: one entry per page-region class. Each entry lists the
# keywords looked for in an element's tag / CSS classes / id, and the colour
# used to outline that class on the annotated image.
CLASSES = {
    "header": {
        "keywords": ["header", "navbar", "head"],
        "color": "green"  # bright green
    },
    "footer": {
        "keywords": ["footer", "bottom", "copyright"],
        "color": "blue"
    },
    "content": {
        "keywords": ["main", "content", "article"],
        "color": "orange"
    },
    "media": {
        # "image" was previously misspelled "imahe" and therefore never matched.
        "keywords": ["img", "image", "video", "picture", "photo"],
        "color": "pink"
    },
    "sidebar": {
        "keywords": ["sidebar", "aside", "panel", "grid", "menu"],
        "color": "yellow"
    },
    "ads": {
        "keywords": ["ad", "banner", "advert", "advertisement", "pub"],
        "color": "red"  # bright red
    }
}

def load_metadata(path="dataset_metadata.json"):
    """Load the dataset metadata from a JSON file.

    Args:
        path: Location of the metadata file. Defaults to
            ``dataset_metadata.json`` in the current working directory,
            which is the file this function always read before.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8; relying on the platform default encoding (cp1252 on
    # Windows) can corrupt or reject non-ASCII characters.
    with open(path, encoding="utf-8") as f:
        return json.load(f)

def analyze_page(driver, url, wait_seconds=3):
    """Load ``url`` in ``driver`` and collect the page size and element boxes.

    Args:
        driver: Object exposing ``get(url)`` and ``execute_script(js)``
            (a Selenium WebDriver in this notebook).
        url: Address of the page to analyse.
        wait_seconds: Pause after navigation before the DOM is measured.
            The default of 3 keeps the previous fixed delay.

    Returns:
        A tuple ``(dom_size, elements)``:

        * ``dom_size`` - ``{"width", "height"}``: the larger of the body and
          root element scroll sizes.
        * ``elements`` - one dict per element with a non-empty bounding box:
          ``tag``, ``classes`` (list), ``id`` (string or ``None``) and
          ``rect`` (``x``, ``y``, ``width``, ``height``) expressed in
          document coordinates.
    """
    driver.get(url)
    if wait_seconds > 0:
        # Fixed pause so that late, script-driven rendering can settle.
        time.sleep(wait_seconds)

    # Size of the whole scrollable document. document.body can be null
    # (e.g. non-HTML documents), so each source is guarded.
    dom_size = driver.execute_script("""
        const body = document.body;
        const root = document.documentElement;
        return {
            width: Math.max(body ? body.scrollWidth : 0, root ? root.scrollWidth : 0),
            height: Math.max(body ? body.scrollHeight : 0, root ? root.scrollHeight : 0)
        };
    """)

    # Bounding box of every rendered element. getBoundingClientRect() is
    # relative to the viewport, so the current scroll offset is added to get
    # coordinates in the same (document) space as dom_size.
    elements = driver.execute_script("""
        const elements = [];
        const offsetX = window.scrollX || 0;
        const offsetY = window.scrollY || 0;

        document.querySelectorAll('*').forEach(el => {
            try {
                const rect = el.getBoundingClientRect();
                if (rect.width > 0 && rect.height > 0) {
                    elements.push({
                        tag: el.tagName,
                        classes: Array.from(el.classList),
                        // getAttribute always yields a string or null; the
                        // el.id property can be shadowed on <form> elements
                        // by a control named "id".
                        id: el.getAttribute('id') || null,
                        rect: {
                            x: rect.left + offsetX,
                            y: rect.top + offsetY,
                            width: rect.width,
                            height: rect.height
                        }
                    });
                }
            } catch(e) {}
        });

        return elements;
    """)

    return dom_size, elements

def match_element(element, classes):
    """Return the first class whose keywords match the element.

    Classes are tried in the iteration order of ``classes``. For each class
    the element is tested by tag name (exact, lower-cased), then by each CSS
    class, then by id.

    Keyword matching against CSS class names and ids:

    * keywords of four or more characters match as case-insensitive
      substrings;
    * shorter keywords (such as "ad", "img", "pub") must equal a whole word
      of the name, optionally followed by a plural "s". Words are delimited
      by non-alphanumeric characters or camelCase boundaries. This stops
      "ad" from matching names like "loading", "padding" or "shadow".

    Args:
        element: Dict with ``tag``, ``classes`` and ``id`` entries. Values
            that are not strings are ignored instead of raising.
        classes: Mapping of class name to a config dict holding
            ``keywords`` (list of str) and ``color``.

    Returns:
        ``(class_name, color)`` for the first match, else ``(None, None)``.
    """
    import re  # local import keeps this definition self-contained

    def words(name):
        # "topAd-slot_2" -> {"top", "ad", "slot", "2"}
        spaced = re.sub(r"([a-z0-9])([A-Z])", r"\1 \2", name)
        return {w for w in re.split(r"[^A-Za-z0-9]+", spaced.lower()) if w}

    def keyword_hits(keyword, name):
        kw = keyword.lower()
        if len(kw) >= 4:
            return kw in name.lower()
        tokens = words(name)
        return kw in tokens or (kw + "s") in tokens

    raw_tag = element.get('tag')
    tag = raw_tag.lower() if isinstance(raw_tag, str) else ""
    css_names = [c for c in (element.get('classes') or []) if isinstance(c, str)]
    raw_id = element.get('id')
    element_id = raw_id if isinstance(raw_id, str) else ""

    for class_name, config in classes.items():
        keywords = config['keywords']

        # Match on the tag name
        if tag in keywords:
            return class_name, config['color']

        # Match on any CSS class
        if any(keyword_hits(kw, css) for css in css_names for kw in keywords):
            return class_name, config['color']

        # Match on the id
        if element_id and any(keyword_hits(kw, element_id) for kw in keywords):
            return class_name, config['color']

    return None, None

def _classify_elements(dom_elements, scale_x, scale_y, img_width, img_height):
    """Group matched DOM elements by class, with boxes scaled to image pixels.

    Elements that match no class, or whose scaled box lies entirely outside
    the image, are dropped.

    Returns:
        Dict mapping every key of ``CLASSES`` to a list of
        ``{"box": [x1, y1, x2, y2], "color": ...}`` entries.
    """
    classified = {cls: [] for cls in CLASSES}
    for element in dom_elements:
        class_name, color = match_element(element, CLASSES)
        if not class_name:
            continue
        rect = element['rect']
        x1 = int(rect['x'] * scale_x)
        y1 = int(rect['y'] * scale_y)
        x2 = int((rect['x'] + rect['width']) * scale_x)
        y2 = int((rect['y'] + rect['height']) * scale_y)
        if x1 < img_width and y1 < img_height and x2 > 0 and y2 > 0:
            classified[class_name].append({"box": [x1, y1, x2, y2], "color": color})
    return classified


def _draw_labeled_box(draw, box, label, color):
    """Outline ``box`` and write ``label`` just inside its top-left corner."""
    draw.rectangle(box, outline=color, width=3)
    draw.text((box[0] + 5, box[1] + 5), label, fill=color)


def _merge_overlapping_boxes(boxes):
    """Fuse overlapping boxes and return the bounding box of each fused group.

    Args:
        boxes: List of dicts with a ``"box"`` entry ``[x1, y1, x2, y2]``.

    Returns:
        List of ``[minx, miny, maxx, maxy]`` integer boxes, one per connected
        part of the union.
    """
    # Zero-area boxes (possible after integer truncation) add nothing to the
    # union and can leave an empty geometry whose bounds are not convertible
    # to ints, so they are skipped up front.
    geometries = [
        shapely_box(*b["box"])
        for b in boxes
        if b["box"][2] > b["box"][0] and b["box"][3] > b["box"][1]
    ]
    if not geometries:
        return []

    merged = unary_union(geometries)
    # A single part has no .geoms; multi-part results expose their parts.
    parts = list(merged.geoms) if hasattr(merged, "geoms") else [merged]

    merged_boxes = []
    for geom in parts:
        if geom.is_empty:
            continue
        minx, miny, maxx, maxy = geom.bounds
        merged_boxes.append([int(minx), int(miny), int(maxx), int(maxy)])
    return merged_boxes


def _draw_annotations(draw, classified):
    """Draw the selected boxes of every class onto ``draw``."""
    # Header: only the one closest to the top (smallest y1).
    if classified["header"]:
        top_header = min(classified["header"], key=lambda e: e["box"][1])
        _draw_labeled_box(draw, top_header["box"], "header", top_header["color"])

    # Footer: only the one reaching furthest down (largest y2).
    if classified["footer"]:
        bottom_footer = max(classified["footer"], key=lambda e: e["box"][3])
        _draw_labeled_box(draw, bottom_footer["box"], "footer", bottom_footer["color"])

    # Content: only the box with the largest area.
    if classified["content"]:
        largest_content = max(
            classified["content"],
            key=lambda e: (e["box"][2] - e["box"][0]) * (e["box"][3] - e["box"][1]),
        )
        _draw_labeled_box(draw, largest_content["box"], "content", largest_content["color"])

    # Media and ads: every box is drawn.
    for cls in ["media", "ads"]:
        for el in classified[cls]:
            _draw_labeled_box(draw, el["box"], cls, el["color"])

    # Sidebar: overlapping boxes are fused before drawing.
    if classified["sidebar"]:
        for box in _merge_overlapping_boxes(classified["sidebar"]):
            _draw_labeled_box(draw, box, "sidebar", CLASSES["sidebar"]["color"])


def annotate_image(max_images=2):
    """Annotate dataset screenshots with the page regions found in the live DOM.

    For each of the first ``max_images`` metadata entries, the screenshot in
    ``dataset_images/`` is opened, the entry's ``url`` is loaded in Chrome,
    matching elements are scaled from DOM size to image size, and the result
    is saved under ``annotated/`` with the same file name. A failure on one
    image is printed and the loop continues with the next one.

    Args:
        max_images: Number of metadata entries to process (``None`` for all).
            Defaults to 2, the number previously hard-coded.
    """
    metadata = load_metadata()
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)

    try:
        os.makedirs("annotated", exist_ok=True)
        test_images = list(metadata.items())[:max_images]

        for img_name, info in test_images:
            try:
                print(f"\nTraitement de {img_name}...")

                img_path = os.path.join("dataset_images", img_name)
                output_path = os.path.join("annotated", img_name)

                # The context manager releases the image file handle even
                # when the annotation fails part-way.
                with Image.open(img_path) as img:
                    img_width, img_height = img.size

                    dom_size, dom_elements = analyze_page(driver, info['url'])
                    if not dom_size['width'] or not dom_size['height']:
                        raise ValueError(f"Dimensions du DOM nulles pour {info['url']}")

                    # Ratio between screenshot pixels and DOM units.
                    scale_x = img_width / dom_size['width']
                    scale_y = img_height / dom_size['height']

                    classified_elements = _classify_elements(
                        dom_elements, scale_x, scale_y, img_width, img_height
                    )
                    _draw_annotations(ImageDraw.Draw(img), classified_elements)

                    img.save(output_path)
                print(f"✅ Annotation réussie -> {output_path}")

            except Exception as e:
                # Best effort: report the failure and move on to the next image.
                print(f"❌ Erreur sur {img_name}: {str(e)}")
    finally:
        # Always shut the browser down, even on interruption.
        driver.quit()

    print("\nTraitement terminé !")

# Entry point: run the annotation pipeline when this code is executed as the
# main module (which is the case when the notebook cell is run).
if __name__ == "__main__":
    annotate_image()


Traitement de blog_OR_politics_OR_news_0_1743280479.jpg...
✅ Annotation réussie -> annotated\blog_OR_politics_OR_news_0_1743280479.jpg

Traitement de blog_OR_politics_OR_news_1_1743280499.jpg...
✅ Annotation réussie -> annotated\blog_OR_politics_OR_news_1_1743280499.jpg

Traitement terminé !


# Ajuster la position des boîtes englobantes (bounding boxes)

In [15]:
import os
import json
import time
from PIL import Image, ImageDraw
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from shapely.geometry import box as shapely_box
from shapely.ops import unary_union

# Full configuration: one entry per page-region class. Each entry lists the
# keywords looked for in an element's tag / CSS classes / id, and the colour
# used to outline that class on the annotated image.
CLASSES = {
    "header": {
        "keywords": ["header", "navbar", "head"],
        "color": "green"  # bright green
    },
    "footer": {
        "keywords": ["footer", "bottom", "copyright"],
        "color": "blue"
    },
    "content": {
        "keywords": ["main", "content", "article"],
        "color": "orange"
    },
    "media": {
        # "image" was previously misspelled "imahe" and therefore never matched.
        "keywords": ["img", "image", "video", "picture", "photo"],
        "color": "pink"
    },
    "sidebar": {
        "keywords": ["sidebar", "aside", "panel", "grid", "menu"],
        "color": "yellow"
    },
    "ads": {
        "keywords": ["ad", "banner", "advert", "advertisement", "pub"],
        "color": "red"  # bright red
    }
}

def load_metadata(path="dataset_metadata.json"):
    """Load the dataset metadata from a JSON file.

    Args:
        path: Location of the metadata file. Defaults to
            ``dataset_metadata.json`` in the current working directory,
            which is the file this function always read before.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8; relying on the platform default encoding (cp1252 on
    # Windows) can corrupt or reject non-ASCII characters.
    with open(path, encoding="utf-8") as f:
        return json.load(f)

def analyze_page(driver, url, wait_seconds=3):
    """Load ``url`` in ``driver`` and collect the page size and element boxes.

    Args:
        driver: Object exposing ``get(url)`` and ``execute_script(js)``
            (a Selenium WebDriver in this notebook).
        url: Address of the page to analyse.
        wait_seconds: Pause after navigation before the DOM is measured.
            The default of 3 keeps the previous fixed delay.

    Returns:
        A tuple ``(dom_size, elements)``:

        * ``dom_size`` - ``{"width", "height"}``: the larger of the body and
          root element scroll sizes.
        * ``elements`` - one dict per element with a non-empty bounding box:
          ``tag``, ``classes`` (list), ``id`` (string or ``None``) and
          ``rect`` (``x``, ``y``, ``width``, ``height``) expressed in
          document coordinates.
    """
    driver.get(url)
    if wait_seconds > 0:
        # Fixed pause so that late, script-driven rendering can settle.
        time.sleep(wait_seconds)

    # Size of the whole scrollable document. document.body can be null
    # (e.g. non-HTML documents), so each source is guarded.
    dom_size = driver.execute_script("""
        const body = document.body;
        const root = document.documentElement;
        return {
            width: Math.max(body ? body.scrollWidth : 0, root ? root.scrollWidth : 0),
            height: Math.max(body ? body.scrollHeight : 0, root ? root.scrollHeight : 0)
        };
    """)

    # Bounding box of every rendered element. getBoundingClientRect() is
    # relative to the viewport, so the current scroll offset is added to get
    # coordinates in the same (document) space as dom_size.
    elements = driver.execute_script("""
        const elements = [];
        const offsetX = window.scrollX || 0;
        const offsetY = window.scrollY || 0;

        document.querySelectorAll('*').forEach(el => {
            try {
                const rect = el.getBoundingClientRect();
                if (rect.width > 0 && rect.height > 0) {
                    elements.push({
                        tag: el.tagName,
                        classes: Array.from(el.classList),
                        // getAttribute always yields a string or null; the
                        // el.id property can be shadowed on <form> elements
                        // by a control named "id".
                        id: el.getAttribute('id') || null,
                        rect: {
                            x: rect.left + offsetX,
                            y: rect.top + offsetY,
                            width: rect.width,
                            height: rect.height
                        }
                    });
                }
            } catch(e) {}
        });

        return elements;
    """)

    return dom_size, elements

def match_element(element, classes):
    """Return the first class whose keywords match the element.

    Classes are tried in the iteration order of ``classes``. For each class
    the element is tested by tag name (exact, lower-cased), then by each CSS
    class, then by id.

    Keyword matching against CSS class names and ids:

    * keywords of four or more characters match as case-insensitive
      substrings;
    * shorter keywords (such as "ad", "img", "pub") must equal a whole word
      of the name, optionally followed by a plural "s". Words are delimited
      by non-alphanumeric characters or camelCase boundaries. This stops
      "ad" from matching names like "loading", "padding" or "shadow".

    Args:
        element: Dict with ``tag``, ``classes`` and ``id`` entries. Values
            that are not strings are ignored instead of raising.
        classes: Mapping of class name to a config dict holding
            ``keywords`` (list of str) and ``color``.

    Returns:
        ``(class_name, color)`` for the first match, else ``(None, None)``.
    """
    import re  # local import keeps this definition self-contained

    def words(name):
        # "topAd-slot_2" -> {"top", "ad", "slot", "2"}
        spaced = re.sub(r"([a-z0-9])([A-Z])", r"\1 \2", name)
        return {w for w in re.split(r"[^A-Za-z0-9]+", spaced.lower()) if w}

    def keyword_hits(keyword, name):
        kw = keyword.lower()
        if len(kw) >= 4:
            return kw in name.lower()
        tokens = words(name)
        return kw in tokens or (kw + "s") in tokens

    raw_tag = element.get('tag')
    tag = raw_tag.lower() if isinstance(raw_tag, str) else ""
    css_names = [c for c in (element.get('classes') or []) if isinstance(c, str)]
    raw_id = element.get('id')
    element_id = raw_id if isinstance(raw_id, str) else ""

    for class_name, config in classes.items():
        keywords = config['keywords']

        # Match on the tag name
        if tag in keywords:
            return class_name, config['color']

        # Match on any CSS class
        if any(keyword_hits(kw, css) for css in css_names for kw in keywords):
            return class_name, config['color']

        # Match on the id
        if element_id and any(keyword_hits(kw, element_id) for kw in keywords):
            return class_name, config['color']

    return None, None

def _classify_elements(dom_elements, scale_x, scale_y, img_width, img_height):
    """Group matched DOM elements by class, with boxes scaled to image pixels.

    Elements that match no class, or whose scaled box lies entirely outside
    the image, are dropped.

    Returns:
        Dict mapping every key of ``CLASSES`` to a list of
        ``{"box": [x1, y1, x2, y2], "color": ...}`` entries.
    """
    classified = {cls: [] for cls in CLASSES}
    for element in dom_elements:
        class_name, color = match_element(element, CLASSES)
        if not class_name:
            continue
        rect = element['rect']
        x1 = int(rect['x'] * scale_x)
        y1 = int(rect['y'] * scale_y)
        x2 = int((rect['x'] + rect['width']) * scale_x)
        y2 = int((rect['y'] + rect['height']) * scale_y)
        if x1 < img_width and y1 < img_height and x2 > 0 and y2 > 0:
            classified[class_name].append({"box": [x1, y1, x2, y2], "color": color})
    return classified


def _draw_labeled_box(draw, box, label, color):
    """Outline ``box`` and write ``label`` just inside its top-left corner."""
    draw.rectangle(box, outline=color, width=3)
    draw.text((box[0] + 5, box[1] + 5), label, fill=color)


def _merge_overlapping_boxes(boxes):
    """Fuse overlapping boxes and return the bounding box of each fused group.

    Args:
        boxes: List of dicts with a ``"box"`` entry ``[x1, y1, x2, y2]``.

    Returns:
        List of ``[minx, miny, maxx, maxy]`` integer boxes, one per connected
        part of the union.
    """
    # Zero-area boxes (possible after integer truncation) add nothing to the
    # union and can leave an empty geometry whose bounds are not convertible
    # to ints, so they are skipped up front.
    geometries = [
        shapely_box(*b["box"])
        for b in boxes
        if b["box"][2] > b["box"][0] and b["box"][3] > b["box"][1]
    ]
    if not geometries:
        return []

    merged = unary_union(geometries)
    # A single part has no .geoms; multi-part results expose their parts.
    parts = list(merged.geoms) if hasattr(merged, "geoms") else [merged]

    merged_boxes = []
    for geom in parts:
        if geom.is_empty:
            continue
        minx, miny, maxx, maxy = geom.bounds
        merged_boxes.append([int(minx), int(miny), int(maxx), int(maxy)])
    return merged_boxes


def _draw_annotations(draw, classified):
    """Draw the selected boxes of every class onto ``draw``."""
    # Header: only the one closest to the top (smallest y1).
    if classified["header"]:
        top_header = min(classified["header"], key=lambda e: e["box"][1])
        _draw_labeled_box(draw, top_header["box"], "header", top_header["color"])

    # Footer: only the one reaching furthest down (largest y2).
    if classified["footer"]:
        bottom_footer = max(classified["footer"], key=lambda e: e["box"][3])
        _draw_labeled_box(draw, bottom_footer["box"], "footer", bottom_footer["color"])

    # Content: only the box with the largest area.
    if classified["content"]:
        largest_content = max(
            classified["content"],
            key=lambda e: (e["box"][2] - e["box"][0]) * (e["box"][3] - e["box"][1]),
        )
        _draw_labeled_box(draw, largest_content["box"], "content", largest_content["color"])

    # Media and ads: every box is drawn.
    for cls in ["media", "ads"]:
        for el in classified[cls]:
            _draw_labeled_box(draw, el["box"], cls, el["color"])

    # Sidebar: overlapping boxes are fused before drawing.
    if classified["sidebar"]:
        for box in _merge_overlapping_boxes(classified["sidebar"]):
            _draw_labeled_box(draw, box, "sidebar", CLASSES["sidebar"]["color"])


def annotate_image(max_images=5):
    """Annotate dataset screenshots with the page regions found in the live DOM.

    For each of the first ``max_images`` metadata entries, the screenshot in
    ``dataset_images/`` is opened, the entry's ``url`` is loaded in Chrome,
    matching elements are scaled from DOM size to image size, and the result
    is saved under ``annotated/`` with the same file name. A failure on one
    image is printed and the loop continues with the next one.

    Args:
        max_images: Number of metadata entries to process (``None`` for all).
            Defaults to 5, the number previously hard-coded.
    """
    metadata = load_metadata()
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)

    try:
        os.makedirs("annotated", exist_ok=True)
        test_images = list(metadata.items())[:max_images]

        for img_name, info in test_images:
            try:
                print(f"\nTraitement de {img_name}...")

                img_path = os.path.join("dataset_images", img_name)
                output_path = os.path.join("annotated", img_name)

                # The context manager releases the image file handle even
                # when the annotation fails part-way.
                with Image.open(img_path) as img:
                    img_width, img_height = img.size

                    dom_size, dom_elements = analyze_page(driver, info['url'])
                    if not dom_size['width'] or not dom_size['height']:
                        raise ValueError(f"Dimensions du DOM nulles pour {info['url']}")

                    # Ratio between screenshot pixels and DOM units.
                    scale_x = img_width / dom_size['width']
                    scale_y = img_height / dom_size['height']

                    classified_elements = _classify_elements(
                        dom_elements, scale_x, scale_y, img_width, img_height
                    )
                    _draw_annotations(ImageDraw.Draw(img), classified_elements)

                    img.save(output_path)
                print(f"✅ Annotation réussie -> {output_path}")

            except Exception as e:
                # Best effort: report the failure and move on to the next image.
                print(f"❌ Erreur sur {img_name}: {str(e)}")
    finally:
        # Always shut the browser down, even on interruption.
        driver.quit()

    print("\nTraitement terminé !")

# Entry point: run the annotation pipeline when this code is executed as the
# main module (which is the case when the notebook cell is run).
if __name__ == "__main__":
    annotate_image()


Traitement de blog_OR_politics_OR_news_0_1743280479.jpg...
✅ Annotation réussie -> annotated\blog_OR_politics_OR_news_0_1743280479.jpg

Traitement de blog_OR_politics_OR_news_1_1743280499.jpg...
✅ Annotation réussie -> annotated\blog_OR_politics_OR_news_1_1743280499.jpg

Traitement de blog_OR_politics_OR_news_2_1743280516.jpg...
✅ Annotation réussie -> annotated\blog_OR_politics_OR_news_2_1743280516.jpg

Traitement de blog_OR_politics_OR_news_3_1743280531.jpg...
✅ Annotation réussie -> annotated\blog_OR_politics_OR_news_3_1743280531.jpg

Traitement de blog_OR_politics_OR_news_4_1743280540.jpg...
✅ Annotation réussie -> annotated\blog_OR_politics_OR_news_4_1743280540.jpg

Traitement terminé !
