In [None]:
# -*- coding: utf-8 -*-
import os, re, json, time
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager

# ------------------------------ Configuration ------------------------------
# Entry page of the agency directory and the brand label written to the CSVs.
BASE_LIST_URL = "https://www.credit-agricole.fr/professionnel/agence.html"
BRAND = "Crédit Agricole"

# Files produced by the scraper.
OUTPUT_DIR = "agence_ca/regions"  # one CSV file per region is written here
CITY_COUNT_FILE = "agence_ca/credit_agricole_city_counts.csv"
CHECKPOINT_FILE = "agence_ca/ca_checkpoint.json"

# Browser behaviour.
HEADLESS = False  # set to True to run Chrome in headless mode
PAGELOAD_TIMEOUT = 60  # seconds, passed to driver.set_page_load_timeout
WAIT = 12  # seconds, timeout of the explicit waits
# ----------------------------------------------------------------------------


# ---------- Driver ----------
def setup_driver():
    """Build the Chrome WebDriver used by the scraper.

    The headless flag is added only when ``HEADLESS`` is true; the other
    Chrome arguments are always applied.  The chromedriver binary is obtained
    through ``ChromeDriverManager().install()``.

    Returns:
        The ``webdriver.Chrome`` instance, with its page-load timeout set to
        ``PAGELOAD_TIMEOUT``.
    """
    chrome_args = [
        "--disable-gpu",
        "--window-size=1400,1000",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ]
    if HEADLESS:
        chrome_args.insert(0, "--headless=new")

    options = webdriver.ChromeOptions()
    for arg in chrome_args:
        options.add_argument(arg)

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    driver.set_page_load_timeout(PAGELOAD_TIMEOUT)
    return driver

def click_js(driver, element):
    """Click *element* by executing a JavaScript ``click()`` through *driver*."""
    script = "arguments[0].click();"
    driver.execute_script(script, element)


# ---------- Utils ----------
# French weekday names, Monday first.
DAY_WORDS = [
    "Lundi",
    "Mardi",
    "Mercredi",
    "Jeudi",
    "Vendredi",
    "Samedi",
    "Dimanche",
]

def clean_phone(txt: str) -> str:
    """Normalise the visible text of a phone link.

    Removes the "Appeler" label (case-insensitive), drops every character
    that is not a digit, ``+`` or a space, collapses runs of whitespace to a
    single space and trims the ends.  A falsy input yields ``""``.
    """
    if not txt:
        return ""
    without_label = re.sub(r"(?i)\bAppeler\s*", "", txt)
    allowed_only = re.sub(r"[^\d+ ]+", "", without_label)
    collapsed = re.sub(r"\s{2,}", " ", allowed_only)
    return collapsed.strip()

def looks_like_opening_line(s: str) -> bool:
    """Return True when *s* starts with one of the names in ``DAY_WORDS``.

    The comparison is case-sensitive.
    """
    return s.startswith(tuple(DAY_WORDS))

def accept_cookies(driver):
    """Dismiss the cookie banner when one of the known accept buttons appears.

    Each locator is tried in turn with a 3-second "clickable" wait.  The first
    button obtained is clicked through JavaScript, a short pause follows and
    the function returns.  Any exception raised while handling a locator is
    swallowed and the next locator is tried, so the call is best-effort: when
    no button is found, all five waits elapse before it returns.
    """
    candidate_locators = (
        (By.CSS_SELECTOR, "button[aria-label*='Accepter']"),
        (By.CSS_SELECTOR, "button[title*='Accepter']"),
        (By.XPATH, "//button[contains(., 'Tout accepter') or contains(., 'Accepter')]"),
        (By.CSS_SELECTOR, "#tc_privacy_button_2"),
        (By.CSS_SELECTOR, "button#didomi-notice-agree-button"),
    )
    for locator in candidate_locators:
        try:
            button = WebDriverWait(driver, 3).until(EC.element_to_be_clickable(locator))
            click_js(driver, button)
            time.sleep(0.4)
        except Exception:
            # This locator did not work out; move on to the next one.
            continue
        return

def load_checkpoint():
    """Load the resume state from ``CHECKPOINT_FILE``.

    Returns:
        The decoded JSON content of the file, with an empty
        ``"done_agency_urls"`` list added when that key is absent; or a fresh
        checkpoint with two empty lists when the file does not exist.
    """
    if not os.path.exists(CHECKPOINT_FILE):
        return {"done_city_urls": [], "done_agency_urls": []}

    with open(CHECKPOINT_FILE, "r", encoding="utf-8") as handle:
        checkpoint = json.load(handle)
    if "done_agency_urls" not in checkpoint:
        checkpoint["done_agency_urls"] = []
    return checkpoint

def save_checkpoint(cp):
    """Persist the checkpoint dict *cp* to ``CHECKPOINT_FILE`` as UTF-8 JSON.

    The JSON is first written to a sibling ``.tmp`` file which is then moved
    over the real checkpoint with ``os.replace``.  An interruption in the
    middle of the write (for instance Ctrl-C) therefore leaves the previous
    checkpoint intact instead of a truncated file that can no longer be
    loaded.
    """
    parent = os.path.dirname(CHECKPOINT_FILE)
    if parent:  # os.makedirs("") raises; a bare filename needs no directory
        os.makedirs(parent, exist_ok=True)

    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(cp, f, ensure_ascii=False, indent=2)
    os.replace(tmp_path, CHECKPOINT_FILE)

def to_fr_decimal(x: str) -> str:
    """Return *x* as text with every ``.`` replaced by ``,``.

    Falsy values (``""``, ``None``, ``0``) yield an empty string.
    """
    return str(x).replace(".", ",") if x else ""

def ensure_dir(path: str):
    """Create directory *path* (and its parents) if it does not exist yet.

    An empty *path* is a no-op: that is what ``os.path.dirname`` returns for
    a bare filename, and ``os.makedirs("")`` would raise FileNotFoundError.
    """
    if not path:
        return
    os.makedirs(path, exist_ok=True)

def slugify(s: str) -> str:
    """Turn *s* into a lowercase, dash-separated token usable in a filename.

    The text is trimmed and lower-cased; characters other than word
    characters, dashes and spaces are removed; spaces become dashes and runs
    of dashes are collapsed.  ``"region"`` is returned when nothing is left.
    """
    lowered = s.strip().lower()
    kept = re.sub(r"[^\w\- ]+", "", lowered, flags=re.U)
    dashed = kept.replace(" ", "-")
    slug = re.sub(r"-{2,}", "-", dashed)
    if slug:
        return slug
    return "region"

def append_city_count(row_dict):
    """Append one statistics row to ``CITY_COUNT_FILE``.

    The parent directory is created on demand.  The header row is written
    only when the file does not exist yet; later calls append data rows.
    """
    ensure_dir(os.path.dirname(CITY_COUNT_FILE))
    frame = pd.DataFrame([row_dict])
    if os.path.exists(CITY_COUNT_FILE):
        frame.to_csv(CITY_COUNT_FILE, mode="a", header=False, index=False, encoding="utf-8-sig")
        return
    frame.to_csv(CITY_COUNT_FILE, index=False, encoding="utf-8-sig")


# ---------- Écriture "un fichier par région" ----------
def region_output_path(region_name: str) -> str:
    """Return the CSV path for *region_name* inside ``OUTPUT_DIR``.

    ``OUTPUT_DIR`` is created if needed; the filename is built from the
    slugified region name.
    """
    ensure_dir(OUTPUT_DIR)
    slug = slugify(region_name)
    return os.path.join(OUTPUT_DIR, f"credit_agricole_{slug}.csv")

def append_one_row_fr_region(row_dict, region_name: str):
    """Immediately append ONE agency to the CSV file of its region.

    Columns, ``;``-separated:
    nom;adresse;code_postal;latitude;longitude;region_source

    Latitude and longitude go through ``to_fr_decimal`` (decimal comma).
    The header is written only when the region file is created.
    """
    path = region_output_path(region_name)

    record = {key: row_dict.get(key, "") for key in ("nom", "adresse", "code_postal")}
    for key in ("latitude", "longitude"):
        record[key] = to_fr_decimal(row_dict.get(key, ""))
    record["region_source"] = f"{BRAND} - {region_name}"
    frame = pd.DataFrame([record])

    is_new_file = not os.path.exists(path)
    frame.to_csv(path, mode="a", header=is_new_file, index=False, sep=";", encoding="utf-8-sig")


# ---------- Navigation (régions / villes) ----------
def get_region_links(driver):
    """Collect the region page URLs from the page currently loaded in *driver*.

    A CSS selector is tried first; if it yields no link, an XPath query is
    used instead.  Only hrefs containing ``/agence/`` are kept.

    Returns:
        The distinct URLs as a sorted list.
    """
    def agency_hrefs(anchors):
        found = set()
        for anchor in anchors:
            href = anchor.get_attribute("href")
            if href and "/agence/" in href:
                found.add(href)
        return found

    links = agency_hrefs(
        driver.find_elements(By.CSS_SELECTOR, "div.indexCR-Content ul > li > a")
    )
    if not links:
        links = agency_hrefs(
            driver.find_elements(By.XPATH, "//*[@id='content']//div[contains(@class,'indexCR-Content')]//ul/li/a")
        )
    return sorted(links)

def extract_region_name(driver, region_url) -> str:
    """Open *region_url* and return the display name of the region.

    The name is taken from the first <h1>/<h2> of the page: the text that
    follows "Nos agences ... Crédit Agricole", or failing that the text that
    follows "Crédit Agricole".  A heading or candidate name containing
    "FIN DE CONNEXION" is rejected.  Whenever no name is obtained, or any
    exception occurs, the name is derived from the last path segment of the
    URL.  The driver is left on the page it navigated to.
    """
    def name_from_slug() -> str:
        last_segment = region_url.rstrip("/").split("/")[-1]
        slug = last_segment.split(".")[0]  # e.g. alpes-provence
        return " ".join(word.capitalize() for word in slug.split("-"))

    heading_patterns = (
        r"Nos agences.*?Crédit Agricole\s+(.+)",
        # Otherwise try the plain "Crédit Agricole <Name>" form.
        r"Crédit Agricole\s+(.+)",
    )

    try:
        driver.get(region_url)
        WebDriverWait(driver, WAIT).until(EC.presence_of_element_located((By.ID, "content")))
        accept_cookies(driver)
        time.sleep(0.2)

        heading = BeautifulSoup(driver.page_source, "html.parser").find(["h1", "h2"])
        if heading:
            text = heading.get_text(" ", strip=True)
            # A heading that carries a technical message is not a region name.
            if "FIN DE CONNEXION" not in text.upper():
                for pattern in heading_patterns:
                    found = re.search(pattern, text, re.I)
                    if not found:
                        continue
                    candidate = found.group(1).strip(" .")
                    if candidate and "FIN DE CONNEXION" not in candidate.upper():
                        return candidate
    except Exception:
        pass
    # Last resort: rebuild the name from the URL slug.
    return name_from_slug()


def get_city_links_from_region(driver, region_url):
    """Collect city page URLs from the page currently loaded in *driver*.

    The body never navigates: *region_url* is not used, so the region page
    must already be open.  Three strategies are tried, each only when the
    previous one found nothing: the ``#A`` .. ``#Z`` letter lists, any anchor
    whose href contains ``/ville/``, then the same search limited to
    ``#content`` via XPath.

    Returns:
        The distinct URLs, in the order they were encountered.
    """
    links, seen = [], set()

    def remember(anchors, require_ville=False):
        for anchor in anchors:
            href = anchor.get_attribute("href")
            if not href or href in seen:
                continue
            if require_ville and "/ville/" not in href:
                continue
            seen.add(href)
            links.append(href)

    for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        remember(driver.find_elements(By.CSS_SELECTOR, f"#{letter} li a"), require_ville=True)
    if not links:
        remember(driver.find_elements(By.CSS_SELECTOR, "a[href*='/ville/']"))
    if not links:
        remember(driver.find_elements(By.XPATH, "//*[@id='content']//a[contains(@href,'/ville/')]"))
    return links


# ---------- Détection type de page ----------
def is_agency_detail_page(html):
    """Tell whether *html* is an agency detail page rather than a city listing.

    The decision is based on the first <h1>: it must contain "Agence"
    followed by "bancaire", "CA" or "Crédit Agricole" (case-insensitive).
    A page without <h1> is not a detail page.
    """
    heading = BeautifulSoup(html, "html.parser").find("h1")
    if not heading:
        return False
    title = heading.get_text(" ", strip=True)
    pattern = r"\bAgence\b\s+\b(bancaire|CA|Crédit Agricole)\b"
    return re.search(pattern, title, re.I) is not None


# ---------- Coords ----------
def coords_from_dataval_html(li_html: str) -> tuple[str,str]:
    """Read coordinates from the JSON stored in a card's ``data-val`` attribute.

    Args:
        li_html: Outer HTML of one ``<li>`` agency card.

    Returns:
        ``(latitude, longitude)`` as strings.  Each value is taken from the
        ``latitude``/``longitude`` key, falling back to ``lat``/``lng``, and
        is ``""`` when unavailable.  ``("", "")`` is returned when the card,
        the attribute or valid JSON is missing.
    """
    def first_present(data: dict, *keys: str) -> str:
        # A value counts as present unless it is None or "": this keeps a
        # legitimate 0 and never produces the literal string "None".
        for key in keys:
            value = data.get(key)
            if value is not None and value != "":
                return str(value)
        return ""

    if not li_html:
        return "", ""
    li = BeautifulSoup(li_html, "html.parser").find("li")
    if li is None:
        return "", ""
    data_val = li.get("data-val")
    if not data_val:
        return "", ""
    try:
        data = json.loads(data_val)
    except (TypeError, ValueError):  # JSONDecodeError is a ValueError
        return "", ""
    if not isinstance(data, dict):
        return "", ""
    return (
        first_present(data, "latitude", "lat"),
        first_present(data, "longitude", "lng"),
    )

def coords_from_jsonld(html) -> tuple[str,str]:
    """Read coordinates from the JSON-LD ``<script>`` blocks of a page.

    Every ``application/ld+json`` script is examined in document order.  The
    first object (top-level, or element of a top-level list) whose ``geo``
    member holds both ``latitude`` and ``longitude`` wins.

    Returns:
        ``(latitude, longitude)`` as strings, or ``("", "")`` when no block
        provides them.
    """
    if not html:
        return "", ""
    soup = BeautifulSoup(html, "html.parser")
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "{}")
        except (TypeError, ValueError):  # JSONDecodeError is a ValueError
            # A malformed block must not stop the scan of the following ones.
            continue
        candidates = data if isinstance(data, list) else [data]
        for item in candidates:
            if not isinstance(item, dict):
                continue
            geo = item.get("geo")
            if isinstance(geo, dict) and "latitude" in geo and "longitude" in geo:
                return str(geo["latitude"]), str(geo["longitude"])
    return "", ""


# ---------- Parsing : fiche agence ----------
def parse_agency_detail_html(html, url, region_url, city_hint=""):
    """Extract the fields of one agency from the HTML of its detail page.

    Args:
        html: Page source of the agency detail page.
        url: URL of that page; stored under ``source_url``.
        region_url: URL of the region page; stored under ``region_url``.
        city_hint: City name stored under ``ville``.

    Returns:
        A dict with the keys ``nom``, ``adresse``, ``code_postal``,
        ``telephone``, ``email``, ``statut``, ``horaires``, ``latitude``,
        ``longitude``, ``region_url``, ``city_url``, ``source_url`` and
        ``ville``.  ``latitude``, ``longitude`` and ``city_url`` are always
        empty strings here; fields that cannot be found are empty strings.
    """
    postal_code_re = r"\b(\d{5})\b"
    status_re = r"\b(Ouvert|Fermé|Fermée)\b"

    soup = BeautifulSoup(html, "html.parser")

    # Name: text of the first <h1>.
    h1 = soup.find("h1")
    nom = h1.get_text(" ", strip=True) if h1 else ""

    # --- Address (dedicated block) ---
    # <div class="npc-sl-strct-infos-ctct-adresse">
    #   <p>Cours du 4 Septembre</p>
    #   <p>13390 Auriol</p>
    # </div>
    adresse, cp = "", ""
    addr_div = soup.select_one(".npc-sl-strct-infos-ctct-adresse")
    if addr_div:
        addr_lines = [
            p.get_text(" ", strip=True)
            for p in addr_div.select("p")
            if p.get_text(strip=True)
        ]
        # Keep every line: with three or more <p> the trailing ones
        # (typically "postal code + city") must not be dropped.
        adresse = ", ".join(addr_lines)
        # Postal code: first 5-digit number found in the address lines.
        m = re.search(postal_code_re, " ".join(addr_lines))
        if m:
            cp = m.group(1)

    # Fallback when the dedicated block is absent: first <p> sibling after
    # the <h1> that contains a 5-digit number.
    if not adresse:
        for p in soup.select("h1 ~ p"):
            text = p.get_text(" ", strip=True)
            m = re.search(postal_code_re, text)
            if m:
                adresse = text
                cp = m.group(1)
                break

    # Phone: visible text of the first href="tel:" link.
    tel_a = soup.find("a", href=re.compile(r"^tel:", re.I))
    telephone = clean_phone(tel_a.get_text(" ", strip=True)) if tel_a else ""

    # Email: target of the first href="mailto:" link.
    mail_a = soup.find("a", href=re.compile(r"^mailto:", re.I))
    email = re.sub(r"(?i)^mailto:", "", mail_a.get("href", "")).strip() if mail_a else ""

    # Status: whole-word match, so "Fermer le bandeau…" is not mistaken for it.
    statut = ""
    badge = soup.find(string=re.compile(status_re, re.I))
    if badge:
        status_match = re.search(status_re, badge, re.I)
        if status_match:
            statut = status_match.group(1).capitalize()

    # Opening hours: only the lines that start with a weekday are kept.
    horaires = ""
    title = soup.find(string=re.compile(r"Horaires d'ouverture", re.I))
    if title:
        parent = getattr(title, "parent", None)
        container = parent.find_next_sibling() if parent else None
        if container:
            hour_lines = [s.strip() for s in container.get_text("\n", strip=True).splitlines()]
            hour_lines = [s for s in hour_lines if s and looks_like_opening_line(s)]
            if hour_lines:
                horaires = "\n".join(hour_lines)

    return {
        "nom": nom,
        "adresse": adresse,
        "code_postal": cp,
        "telephone": telephone,
        "email": email,
        "statut": statut,
        "horaires": horaires,
        "latitude": "",
        "longitude": "",
        "region_url": region_url,
        "city_url": "",
        "source_url": url,
        "ville": city_hint,
    }



# ---------- Parsing : page ville ----------
def wait_for_agency_cards(driver):
    """Wait until at least one agency card is present on the current page.

    The page is polled for ``li.js-storeLoc-agency[data-val]`` elements;
    between polls it is scrolled down by 800 px and the loop sleeps 0.25 s.

    Returns:
        True as soon as a card is found, False once ``WAIT`` seconds have
        elapsed without any.
    """
    card_selector = "li.js-storeLoc-agency[data-val]"
    give_up_at = time.time() + WAIT
    while True:
        if time.time() >= give_up_at:
            return False
        if driver.find_elements(By.CSS_SELECTOR, card_selector):
            return True
        driver.execute_script("window.scrollBy(0, 800);")
        time.sleep(0.25)

def extract_city_name_from_city_html(html, fallback_url):
    """Return a display name for the city whose page source is *html*.

    The first <h1>/<h2> is used when possible: the text following
    "Nos agences à" if the heading has that form, otherwise the whole
    heading.  Text containing "FIN DE CONNEXION" is rejected.  Failing that,
    the name is rebuilt from the last path segment of *fallback_url*, e.g.
    ``.../aix-en-provence-13090.html`` gives ``"Aix En Provence"``.
    """
    soup = BeautifulSoup(html, "html.parser")
    h = soup.find(["h1","h2"])
    if h:
        t = h.get_text(" ", strip=True)
        m = re.search(r"Nos agences à\s+(.+)", t, re.I)
        if m:
            name = m.group(1).strip(" .")
            if name and "FIN DE CONNEXION" not in name.upper():
                return name
        if t and "FIN DE CONNEXION" not in t.upper():
            return t.strip()

    # URL fallback: last path segment without query string or fragment.
    slug = fallback_url.rstrip("/").split("/")[-1].split("?")[0].split("#")[0]
    # Drop the page extension first; otherwise the last part is "13090.html"
    # and the trailing postal code below is never recognised.
    slug = re.sub(r"\.html?$", "", slug, flags=re.I)
    parts = slug.split("-")
    if parts and re.fullmatch(r"\d{5}", parts[-1]):
        parts = parts[:-1]
    return " ".join(p.capitalize() for p in parts)

def collect_agency_links_from_city(driver):
    """List the agency detail links shown on the current city page.

    Each ``li.js-storeLoc-agency[data-val]`` card contributes the href of its
    first ``.html`` link, provided it contains ``/agence/`` and was not seen
    before.  Cards without such a link are skipped.

    Returns:
        A list of ``{"href", "lat", "lng"}`` dicts in page order; the
        coordinates come from the card's ``data-val`` attribute.
    """
    collected, known = [], set()
    for card in driver.find_elements(By.CSS_SELECTOR, "li.js-storeLoc-agency[data-val]"):
        try:
            href = card.find_element(By.CSS_SELECTOR, "a[href$='.html']").get_attribute("href")
        except Exception:
            continue
        if href and href not in known and "/agence/" in href:
            known.add(href)
            latitude, longitude = coords_from_dataval_html(card.get_attribute("outerHTML"))
            collected.append({"href": href, "lat": latitude, "lng": longitude})
    return collected

def _save_new_agency(ag, agency_url, lat, lng, city_url, region_name, seen_agencies, cp):
    """Complete *ag*, append it to the region CSV and record it in the checkpoint."""
    ag["latitude"], ag["longitude"] = lat, lng
    ag["city_url"] = city_url

    append_one_row_fr_region(ag, region_name)  # one CSV file per region
    seen_agencies.add(agency_url)
    cp["done_agency_urls"].append(agency_url)
    save_checkpoint(cp)


def parse_city_page(driver, city_url, region_url, region_name, seen_agencies, cp):
    """Scrape one city URL and save every agency not seen before.

    Agencies are written one by one to the CSV of the region, and the
    checkpoint is saved after each of them.

    Args:
        driver: Selenium driver used for navigation.
        city_url: URL of the city page (it may itself be an agency page).
        region_url: URL of the region page, stored with each agency.
        region_name: Region name used to pick the output CSV.
        seen_agencies: Set of agency URLs already saved; updated in place.
        cp: Checkpoint dict; its ``done_agency_urls`` list is extended.

    Returns:
        ``(total_found, total_new, city_name)``.

    Raises:
        TimeoutException: when the city page itself does not load in time.
    """
    driver.get(city_url)
    WebDriverWait(driver, WAIT).until(EC.presence_of_element_located((By.ID, "content")))
    accept_cookies(driver)
    time.sleep(0.3)
    html = driver.page_source

    # Case 1: the URL already points at an agency detail page.
    if is_agency_detail_page(html):
        city_name = extract_city_name_from_city_html(html, city_url)
        if city_url in seen_agencies:
            return 1, 0, city_name

        ag = parse_agency_detail_html(html, city_url, region_url, city_hint=city_name)
        lat, lng = coords_from_jsonld(html)
        _save_new_agency(ag, city_url, lat, lng, city_url, region_name, seen_agencies, cp)
        return 1, 1, city_name

    # Case 2: real city page -> list the agency detail pages and visit them.
    wait_for_agency_cards(driver)
    city_name = extract_city_name_from_city_html(driver.page_source, city_url)

    detail_links = collect_agency_links_from_city(driver)
    new_cnt = 0

    for obj in detail_links:
        href = obj["href"]
        if href in seen_agencies:
            continue
        try:
            driver.get(href)
            WebDriverWait(driver, WAIT).until(EC.presence_of_all_elements_located((By.TAG_NAME, "h1")))
            time.sleep(0.2)
            detail_html = driver.page_source
        except TimeoutException:
            print(f"      ! Timeout agence: {href}")
            continue

        ag = parse_agency_detail_html(detail_html, href, region_url, city_hint=city_name)
        # Coordinates: data-val from the listing card first, JSON-LD otherwise.
        lat, lng = obj["lat"], obj["lng"]
        if not lat or not lng:
            lat2, lng2 = coords_from_jsonld(detail_html)
            lat = lat or lat2
            lng = lng or lng2

        _save_new_agency(ag, href, lat, lng, city_url, region_name, seen_agencies, cp)
        new_cnt += 1
        # No driver.back(): every link was collected up front as an absolute
        # URL, so going back would only reload the city page for nothing.

    return len(detail_links), new_cnt, city_name


# ---------- Main ----------
def scrape_credit_agricole():
    """Run the whole scrape: regions -> cities -> agencies.

    Progress is resumed from the checkpoint: cities listed in
    ``done_city_urls`` are skipped and agencies listed in
    ``done_agency_urls`` are not saved again.  For every processed city a
    statistics row is appended and the checkpoint is saved.  A city whose
    page times out is reported and left out of the checkpoint.

    The browser is always closed, including when the run is interrupted or
    fails, so no Chrome/chromedriver process is left behind.
    """
    driver = setup_driver()
    try:
        cp = load_checkpoint()
        done_cities   = set(cp.get("done_city_urls", []))
        done_agencies = set(cp.get("done_agency_urls", []))

        # 1) Page listing the regions.
        driver.get(BASE_LIST_URL)
        WebDriverWait(driver, WAIT).until(EC.presence_of_element_located((By.ID, "content")))
        accept_cookies(driver)
        region_links = get_region_links(driver)
        print(f"[INFO] Régions détectées: {len(region_links)}")

        total_new = 0
        for r_url in region_links:
            # Open the region, read its displayed name and stay on that page
            # so the city links can be collected from it.
            region_name = extract_region_name(driver, r_url)
            city_links = get_city_links_from_region(driver, r_url)
            print(f"  - {r_url} ({region_name}) → {len(city_links)} villes")

            region_new_sum = 0
            for c_url in city_links:
                if c_url in done_cities:
                    continue
                try:
                    total_found, new_cnt, city_name = parse_city_page(
                        driver, c_url, r_url, region_name, done_agencies, cp
                    )
                except TimeoutException:
                    print(f"    ! Timeout ville: {c_url}")
                    continue

                total_new += new_cnt
                region_new_sum += new_cnt

                append_city_count({
                    "region_url": r_url,
                    "ville_url": c_url,
                    "ville": city_name if city_name else "",
                    "agences_trouvees_page": total_found,
                    "agences_nouvelles_sauvegardees": new_cnt
                })
                print(f"    • {city_name} : {total_found} agences (nouveaux en base: {new_cnt})")

                done_cities.add(c_url)
                cp["done_city_urls"] = list(done_cities)
                save_checkpoint(cp)

            print(f"  = Nouveaux en base sur la région « {region_name} » : {region_new_sum}")
    finally:
        # Release the browser on every exit path (Ctrl-C, unexpected error).
        driver.quit()

    print(f"\n✅ Terminé. Nouvelles agences insérées: {total_new}")


# Run the full scrape only when this file is executed directly, not on import.
if __name__ == "__main__":
    scrape_credit_agricole()

[INFO] Régions détectées: 39
  - https://www.credit-agricole.fr/professionnel/agence/alpes-provence.html (FIN DE CONNEXION) → 110 villes
    • Aix En Provence 13090.html : 13 agences (nouveaux en base: 13)
    • Allauch 13190.html : 1 agences (nouveaux en base: 1)
    • Apt 84400.html : 1 agences (nouveaux en base: 1)
    • Arles 13200.html : 4 agences (nouveaux en base: 4)


KeyboardInterrupt: 