## COLLECTE DES DONNEES (API ADEME) NEUFS ET EXISTANTS

In [None]:
import os
import time
import requests
import pandas as pd
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import datetime as dt

## CONFIGURATION

In [2]:
DATA_DIR = "../data"
os.makedirs(DATA_DIR, exist_ok=True)

DATASETS = {
    "existants":"https://data.ademe.fr/data-fair/api/v1/datasets/dpe03existant/lines",
    "neufs":"https://data.ademe.fr/data-fair/api/v1/datasets/dpe02neuf/lines",
}

# D√©partement 69* ‚Üí codes postaux commen√ßant par 69
DEPT_CODE = "69"  # c'est le champ √† modifier pour choisir le d√©partement vis√©
CP_PATTERN = f"{DEPT_CODE}*" # sert √† formatter le Code d√©partement pour la requ√™te API
YEARS = range(2021, 2026)   # p√©riode test
PAGE_SIZE = 1500

OUT = {
    "existants": os.path.join(DATA_DIR,f"donnees_dpe_existants_{DEPT_CODE}.csv"),
    "neufs":     os.path.join(DATA_DIR,f"donnees_dpe_neufs_{DEPT_CODE}.csv"),
}


## Session

In [3]:
session = requests.Session()
retries = Retry(
    total=5,
    connect=3,
    read=3,
    backoff_factor=0.6,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["GET"],
)
session.mount("https://", HTTPAdapter(max_retries=retries))

## FONCTIONS UTILITAIRES

In [None]:
def fetch_first_page(base_url: str, year: int):
    """R√©cup√®re la premi√®re page pour l'ann√©e donn√©e."""
    borne1 = f"{year}-01-01"
    borne2 = f"{year}-12-31"
    params = {
        "size": PAGE_SIZE,
        "sort":"date_reception_dpe", # tri par date croissante
        "q": CP_PATTERN,
        "q_fields": "code_postal_ban",
        "qs": f"date_reception_dpe:[{borne1} TO {borne2}]",
    }
    r = session.get(base_url, params=params, timeout=60)
    r.raise_for_status()
    return r.json()

def fetch_next_page(next_url: str):
    """R√©cup√®re la page suivante √† partir du champ 'next'."""
    r = session.get(next_url, timeout=60)
    r.raise_for_status()
    return r.json()

def append_to_csv(df: pd.DataFrame, path: str, header_manager: dict):
    """√âcrit ou ajoute au CSV en respectant le sch√©ma initial."""
    if header_manager.get("columns") is None:
        header_manager["columns"] = list(df.columns)
        df = df.reindex(columns=header_manager["columns"])
        df.to_csv(path, index=False, mode="w", header=True)
    else:
        df = df.reindex(columns=header_manager["columns"])
        df.to_csv(path, index=False, mode="a", header=False)

## COLLECTE PRINCIPALE

In [None]:
def collect_dpe(label: str, base_url: str, out_csv: str):
    header_manager = {"columns": None}
    total_rows_written = 0
    print(f"\n=== COLLECTE [{label.upper()}] ===")

    # --- V√©rification existence du fichier ---
    if os.path.exists(out_csv):
        try:
            existing_df = pd.read_csv(out_csv, nrows=1)
            header_manager["columns"] = list(existing_df.columns)
            print(f"[INFO] Fichier existant d√©tect√© : les nouvelles donn√©es seront ajout√©es √† la suite ({out_csv}).")
        except Exception as e:
            print(f"[WARN] Impossible de lire le fichier existant ({e}), il sera recr√©√©.")

    for year in YEARS:
        print(f"\n--- Ann√©e {year} ---")
        start_time = time.time()
        page = 1
        js = fetch_first_page(base_url, year)
        next_url = js.get("next")

        while True:
            results = js.get("results", [])
            if not results:
                print(f"[INFO] Aucune donn√©e pour {year}, page {page}")
                break

            df_page = pd.DataFrame(results)
            append_to_csv(df_page, out_csv, header_manager)
            total_rows_written += len(df_page)

            # --- Estimation du temps total apr√®s la premi√®re page ---
            if page == 1:
                elapsed = time.time() - start_time  # start_time d√©fini avant la boucle ann√©e
                if js.get("total"):
                    estimated_total_time = (js["total"] / PAGE_SIZE) * elapsed
                    print(f"[INFO] Estimation de dur√©e pour {year}: {dt.timedelta(seconds=int(estimated_total_time))}")

            print(f"Page {page:>3} | lignes: {len(df_page):>4} | total cumul√©: {total_rows_written:,}")

            if not next_url:
                break  # plus de pages

            # Pause douce pour √©viter le throttling
            time.sleep(0.3)

            # Page suivante
            js = fetch_next_page(next_url)
            next_url = js.get("next")
            page += 1

    print(f"\n‚úÖ Termin√© [{label}] : {out_csv} | {total_rows_written:,} lignes totales.\n")

## EXECUTION

In [None]:
#neufs
collect_dpe("neufs",DATASETS["neufs"],OUT["neufs"])
print(f"\nüéØ Collecte neufs ({DEPT_CODE}) termin√©e")


=== COLLECTE [NEUFS] ===

--- Ann√©e 2021 ---
Page   1 | lignes: 1500 | total cumul√©: 1,500
Page   2 | lignes: 1500 | total cumul√©: 3,000
Page   3 | lignes: 1262 | total cumul√©: 4,262

--- Ann√©e 2022 ---
Page   1 | lignes: 1500 | total cumul√©: 5,762
Page   2 | lignes: 1500 | total cumul√©: 7,262
Page   3 | lignes: 1500 | total cumul√©: 8,762
Page   4 | lignes: 1500 | total cumul√©: 10,262
Page   5 | lignes: 1500 | total cumul√©: 11,762
Page   6 | lignes: 1500 | total cumul√©: 13,262
Page   7 | lignes: 1500 | total cumul√©: 14,762
Page   8 | lignes: 1500 | total cumul√©: 16,262
Page   9 | lignes:  547 | total cumul√©: 16,809

--- Ann√©e 2023 ---
Page   1 | lignes: 1500 | total cumul√©: 18,309
Page   2 | lignes: 1500 | total cumul√©: 19,809
Page   3 | lignes: 1500 | total cumul√©: 21,309
Page   4 | lignes: 1500 | total cumul√©: 22,809
Page   5 | lignes: 1500 | total cumul√©: 24,309
Page   6 | lignes: 1500 | total cumul√©: 25,809
Page   7 | lignes: 1083 | total cumul√©: 26,892

--- 

In [None]:
#existants
collect_dpe("existants",DATASETS["existants"],OUT["existants"])

print(f"\nüéØ Collecte existants ({DEPT_CODE}) termin√©e")


=== COLLECTE [EXISTANTS] ===

--- Ann√©e 2021 ---
Page   1 | lignes: 1500 | total cumul√©: 1,500
Page   2 | lignes: 1500 | total cumul√©: 3,000
Page   3 | lignes: 1500 | total cumul√©: 4,500
Page   4 | lignes: 1500 | total cumul√©: 6,000
Page   5 | lignes: 1500 | total cumul√©: 7,500
Page   6 | lignes: 1500 | total cumul√©: 9,000
Page   7 | lignes: 1500 | total cumul√©: 10,500
Page   8 | lignes: 1500 | total cumul√©: 12,000
Page   9 | lignes: 1500 | total cumul√©: 13,500
Page  10 | lignes: 1500 | total cumul√©: 15,000
Page  11 | lignes: 1500 | total cumul√©: 16,500
Page  12 | lignes: 1500 | total cumul√©: 18,000
Page  13 | lignes: 1500 | total cumul√©: 19,500
Page  14 | lignes:  804 | total cumul√©: 20,304

--- Ann√©e 2022 ---
Page   1 | lignes: 1500 | total cumul√©: 21,804
Page   2 | lignes: 1500 | total cumul√©: 23,304
Page   3 | lignes: 1500 | total cumul√©: 24,804
Page   4 | lignes: 1500 | total cumul√©: 26,304
Page   5 | lignes: 1500 | total cumul√©: 27,804
Page   6 | lignes: 150

ChunkedEncodingError: ("Connection broken: ConnectionResetError(10054, 'Une connexion existante a d√ª √™tre ferm√©e par l‚Äôh√¥te distant', None, 10054, None)", ConnectionResetError(10054, 'Une connexion existante a d√ª √™tre ferm√©e par l‚Äôh√¥te distant', None, 10054, None))

In [18]:
# V√©rif nb lignes + v√©rif ANNEE

df_exist = pd.read_csv(f"../data/donnees_dpe_existants_{DEPT_CODE}.csv")
df_neuf = pd.read_csv(f"../data/donnees_dpe_neufs_{DEPT_CODE}.csv")

print("Existants :", df_exist.shape)
print("Neufs :", df_neuf.shape)

# V√©rifier les ann√©es couvertes
print("\nAnn√©es existants :", df_exist["date_reception_dpe"].str[:4].value_counts().sort_index())
print("\nAnn√©es neufs :", df_neuf["date_reception_dpe"].str[:4].value_counts().sort_index())


  df_exist = pd.read_csv(f"../data/donnees_dpe_existants_{DEPT_CODE}.csv")


Existants : (81389, 214)
Neufs : (40277, 135)

Ann√©es existants : date_reception_dpe
2025    81389
Name: count, dtype: int64

Ann√©es neufs : date_reception_dpe
2021     4262
2022    12547
2023    10083
2024     9126
2025     4259
Name: count, dtype: int64


  df_neuf = pd.read_csv(f"../data/donnees_dpe_neufs_{DEPT_CODE}.csv")


## Relance sur ANNEE, en cas de plantage pour dpe existant

In [None]:
# Ann√©es √† relancer
YEARS = [2025]  # ou plusieurs : [2023, 2025]

# Nettoyer les lignes d√©j√† pr√©sentes dans le CSV, pour ces ann√©es
path = f"../data/donnees_dpe_existants_{DEPT_CODE}.csv"
df = pd.read_csv(path)

print("Avant :", len(df))

# Convertir les ann√©es en cha√Ænes et filtrer dynamiquement
years_str = [str(y) for y in YEARS]
df = df[~df["date_reception_dpe"].astype(str).str[:4].isin(years_str)]

print("Apr√®s suppression ann√©es", YEARS, ":", len(df))

# Sauvegarde du fichier nettoy√©
df.to_csv(path, index=False)
print("‚úÖ Fichier nettoy√©, pr√™t pour re-collecte", YEARS)

# Ex√©cution relance
collect_dpe("existants", DATASETS["existants"], OUT["existants"])


  df = pd.read_csv(path)


Avant : 383779
Apr√®s suppression 2025 : 349279
‚úÖ Fichier nettoy√©, pr√™t pour re-collecte 2025


In [None]:
path = f"../data/donnees_dpe_existants_{DEPT_CODE}.csv"
df = pd.read_csv(path)
print("Total lignes finale : ",len(df))

  df = pd.read_csv(path)


Total lignes finale 81389


## Test Size Requ√™tes

In [15]:
#test size requetes

import time
import requests

BASE_URL = "https://data.ademe.fr/data-fair/api/v1/datasets/dpe03existant/lines"
params_template = {
    "q": "69*",
    "q_fields": "code_postal_ban",
    "qs": "date_reception_dpe:[2022-01-01 TO 2022-12-31]"
}

sizes = [500, 1000, 2000, 5000, 10000, 11000]  # tailles √† tester

for size in sizes:
    params = dict(params_template)
    params["size"] = size

    print(f"\n--- Test size={size} ---")
    start = time.time()
    r = requests.get(BASE_URL, params=params, timeout=120)
    duration = time.time() - start

    if r.status_code != 200:
        print(f"Erreur {r.status_code} : {r.text[:300]}")
        continue

    js = r.json()
    n_results = len(js.get("results", []))
    total = js.get("total", "N/A")

    print(f"Dur√©e : {duration:.2f}s | Lignes retourn√©es : {n_results} | Total annonc√© : {total}")


--- Test size=500 ---


KeyboardInterrupt: 

In [None]:
#test size requetes

import time
import requests

BASE_URL = "https://data.ademe.fr/data-fair/api/v1/datasets/dpe03existant/lines"
params_template = {
    "q": "69*",
    "q_fields": "code_postal_ban",
    "qs": "date_reception_dpe:[2022-01-01 TO 2022-12-31]"
}

sizes = [1000, 1100, 1200, 1300, 1400,1500, 1600, 1700, 1800, 1900, 2000]  # tailles √† tester

for size in sizes:
    params = dict(params_template)
    params["size"] = size

    print(f"\n--- Test size={size} ---")
    start = time.time()
    r = requests.get(BASE_URL, params=params, timeout=120)
    duration = time.time() - start

    if r.status_code != 200:
        print(f"Erreur {r.status_code} : {r.text[:300]}")
        continue

    js = r.json()
    n_results = len(js.get("results", []))
    total = js.get("total", "N/A")

    print(f"Dur√©e : {duration:.2f}s | Lignes retourn√©es : {n_results} | Total annonc√© : {total}")
