Imports et réglages

In [1]:
import pandas as pd
import re
from pathlib import Path

Chargement des données

In [3]:
# Project root: the parent of the current working directory. This only holds
# when the notebook is launched from the "Notebooks" folder.
BASE_DIR = Path.cwd().parent

# Cleaned listings file: <root>/DATA/annonceclean.csv
csv_path = BASE_DIR.joinpath("DATA", "annonceclean.csv")

df = pd.read_csv(csv_path)
print("Shape initiale :", df.shape)
df.head()

Shape initiale : (8284, 9)


Unnamed: 0,prix,surface,pieces,adresse,type_bien,sous_type,departement_nom,departement_code,prix_m2
0,983500,105.0,4.0,Aix-en-Provence (13100),Appartement,Appartement,Bouches-du-Rhône,13,9366.666667
1,950500,128.0,5.0,Aix-en-Provence (13100),Appartement,Appartement,Bouches-du-Rhône,13,7425.78125
2,942100,108.0,4.0,Marseille 8ème (13008),Appartement,Appartement,Bouches-du-Rhône,13,8723.148148
3,890000,102.0,4.0,"Mazenod-Bellevue-Saint Donat-Banon, Aix-en-Pro...",Appartement,Appartement,Bouches-du-Rhône,13,8725.490196
4,882000,110.4,4.0,"Mazarin-Opéra-Bellegarde-Mairie, Aix-en-Proven...",Appartement,Appartement,Bouches-du-Rhône,13,7989.130435


Vérification des valeurs manquantes

In [4]:
# Number of missing values in each column.
df.isnull().sum()

prix                 0
surface              0
pieces              10
adresse              0
type_bien            0
sous_type            0
departement_nom      0
departement_code     0
prix_m2              0
dtype: int64

Sécurisation numérique + recalcul prix/m²

In [200]:
# Coerce price and surface to numbers (unparseable values become NaN), then
# keep only the rows where both could be parsed.
df = (
    df.assign(
        prix_num=pd.to_numeric(df["prix"], errors="coerce"),
        surface_num=pd.to_numeric(df["surface"], errors="coerce"),
    )
    .dropna(subset=["prix_num", "surface_num"])
    .copy()
)

# Recompute the price per square metre from the coerced values.
df["prix_m2"] = df["prix_num"] / df["surface_num"]

df[["prix_num", "surface_num", "prix_m2"]].describe()

Unnamed: 0,prix_num,surface_num,prix_m2
count,8284.0,8284.0,8284.0
mean,306692.3,95.634392,3688.158124
std,272473.5,61.243011,2444.459627
min,21000.0,10.5,110.701107
25%,169999.8,60.0,1986.175017
50%,245000.0,80.0,3459.570495
75%,360000.0,114.0,4677.660809
max,5400000.0,495.0,40900.0


Filtrage métier sur le prix, la surface, le prix/m² et le nombre de pièces

In [201]:
# Plausibility bounds used to discard outlier listings.
PRIX_MIN, PRIX_MAX = 1000, 10_000_000   # exclusive bounds
SURFACE_MIN, SURFACE_MAX = 10, 500      # inclusive bounds
PRIX_M2_MAX = 28000                     # inclusive upper bound
PIECES_MAX = 20                         # inclusive upper bound

# The bounds are checked on the coerced numeric columns (prix_num / surface_num)
# rather than on the raw ones: these are the columns kept downstream, and
# comparing a non-numeric raw column with a number would raise.
plausible = (
    (df["prix_num"] > PRIX_MIN) & (df["prix_num"] < PRIX_MAX)
    & (df["surface_num"] >= SURFACE_MIN) & (df["surface_num"] <= SURFACE_MAX)
    & (df["prix_m2"] <= PRIX_M2_MAX)
)

# Rows with an unknown room count are kept; only implausibly large counts go.
if "pieces" in df.columns:
    plausible &= df["pieces"].isna() | (df["pieces"] <= PIECES_MAX)

# Explicit copy so that later column assignments act on an independent frame
# (no SettingWithCopyWarning).
df = df.loc[plausible].copy()

print("Shape finale :", df.shape)

Shape finale : (8275, 11)


In [202]:
# Property types that appear in fewer than 10 listings.
type_counts = df["type_bien"].value_counts()
type_counts[type_counts < 10]

type_bien
Terrain    1
Villa      1
Name: count, dtype: int64

In [203]:
# Drop land plots. The explicit copy makes the result an independent frame, so
# the column assignment below does not trigger SettingWithCopyWarning.
df = df.loc[df["type_bien"] != "Terrain"].copy()

# Merge the "Villa" category into "Maison".
df["type_bien"] = df["type_bien"].replace({"Villa": "Maison"})


Extraction ville et CP depuis adresse

In [204]:
# Rough city label: the address with any "(NNNNN)" postal-code group removed.
df["Ville"] = df["adresse"].str.replace(r"\s*\(\d{5}\)", "", regex=True).str.strip()

# Postal code: the five digits found between parentheses (missing when absent),
# stored with the nullable string dtype.
code_postal = df["adresse"].str.extract(r"\((\d{5})\)", expand=False)
df["Code_postal"] = code_postal.astype("string")

df[["adresse", "Ville", "Code_postal"]].head(10)

Unnamed: 0,adresse,Ville,Code_postal
0,Aix-en-Provence (13100),Aix-en-Provence,13100
1,Aix-en-Provence (13100),Aix-en-Provence,13100
2,Marseille 8ème (13008),Marseille 8ème,13008
3,"Mazenod-Bellevue-Saint Donat-Banon, Aix-en-Pro...","Mazenod-Bellevue-Saint Donat-Banon, Aix-en-Pro...",13090
4,"Mazarin-Opéra-Bellegarde-Mairie, Aix-en-Proven...","Mazarin-Opéra-Bellegarde-Mairie, Aix-en-Provence",13080
5,"Vieille Chapelle, Marseille 8ème (13008)","Vieille Chapelle, Marseille 8ème",13008
6,"Endoume, Marseille 7ème (13007)","Endoume, Marseille 7ème",13007
7,"Opéra, Marseille 1er (13001)","Opéra, Marseille 1er",13001
8,"Mazenod-Bellevue-Saint Donat-Banon, Aix-en-Pro...","Mazenod-Bellevue-Saint Donat-Banon, Aix-en-Pro...",13100
9,"Sud-Universités, Aix-en-Provence (13090)","Sud-Universités, Aix-en-Provence",13090


Extraction de la ville et de l’arrondissement depuis l’adresse

In [205]:
# Noise words (district / direction / zone qualifiers) removed from an address
# before the city name is read from it.
PARASITES = [
    "RDC", "Gare", "Zone", "Quartier", "Secteur",
    "Ouest", "Est", "Nord", "Sud",
    "Urbaine", "Industrielle", "Forêt",
    "Proche", "Centre", "Ville",
    "Centre-Ville", "Centre ville"
]

# Regex matching any noise word as a standalone token.
# - Alternatives are ordered longest first, so "Centre-Ville" is consumed as one
#   token instead of "Centre" then "Ville" (which left a dangling "-").
# - The lookarounds replace \b and treat "-" as part of a word, so components of
#   hyphenated commune names (e.g. "Ville-d'Avray") are left intact.
# Callers are expected to pass flags=re.IGNORECASE, as before.
parasites_pattern = (
    r"(?<![\w-])("
    + "|".join(re.escape(p) for p in sorted(PARASITES, key=len, reverse=True))
    + r")(?![\w-])"
)

def _ville_from_text(text):
    # Ville = après la dernière virgule si possible, sinon tout le texte
    if "," in text:
        return text.split(",")[-1].strip()
    return text.strip()

# City name -> (postal-code base, number of arrondissements). The postal code
# base + k (1 <= k <= count) designates the k-th arrondissement of that city.
_VILLES_ARRONDISSEMENTS = {
    "Paris": (75000, 20),
    "Lyon": (69000, 9),
    "Marseille": (13000, 16),
}

# Ordinal such as "1er", "8e", "8ème" or "8 eme" (case-insensitive).
_ORDINAL_RE = re.compile(r"\b(\d+)\s*(?:er|ère|ème|eme|e)\b", re.IGNORECASE)

# City label that still carries its ordinal, e.g. "Marseille 8ème".
_VILLE_ORDINAL_RE = re.compile(
    r"(Paris|Lyon|Marseille)\s+\d+\s*(?:er|ère|ème|eme|e)", re.IGNORECASE
)


def extract_ville_arrondissement(adresse):
    """Split a listing address into a city name and an arrondissement number.

    Parameters
    ----------
    adresse : str or missing
        Free-text address, optionally containing a postal code written as
        "(NNNNN)", e.g. "Endoume, Marseille 7ème (13007)".

    Returns
    -------
    pandas.Series
        Two values ``[ville, arrondissement]``. ``ville`` is the text after the
        last comma once the postal code and noise words are removed, or
        "Paris" / "Lyon" / "Marseille" when the postal code or the label
        identifies one of their arrondissements. ``arrondissement`` is an int
        for those three cities when it can be determined, otherwise ``None``.
        Both values are ``None`` when ``adresse`` is missing.

    Notes
    -----
    Relies on the module-level ``parasites_pattern`` and ``_ville_from_text``.
    The arrondissement is read from an ordinal in the address first; when no
    valid ordinal is present it falls back to the one encoded in the postal
    code.
    """
    if pd.isna(adresse):
        return pd.Series([None, None])

    adresse = str(adresse)

    # Postal code: five digits between parentheses.
    cp_match = re.search(r"\((\d{5})\)", adresse)
    cp = cp_match.group(1) if cp_match else None

    # Text without the postal code and noise words. The postal code is replaced
    # by a space (not removed) so adjacent words are never glued together; the
    # whitespace is collapsed right after.
    base = re.sub(r"\s*\(\d{5}\)\s*", " ", adresse)
    base = re.sub(parasites_pattern, "", base, flags=re.IGNORECASE)
    base = re.sub(r"\s+", " ", base).strip()

    ville = _ville_from_text(base)
    arrondissement_cp = None

    # Arrondissement postal codes force the city name and also give the
    # arrondissement number.
    if cp:
        cp_int = int(cp)
        if cp_int == 75116:
            # Second postal code of the 16th arrondissement of Paris.
            ville, arrondissement_cp = "Paris", 16
        else:
            for nom, (cp_base, nb) in _VILLES_ARRONDISSEMENTS.items():
                if cp_base + 1 <= cp_int <= cp_base + nb:
                    ville, arrondissement_cp = nom, cp_int - cp_base
                    break

    # Without a usable postal code, a label such as "Marseille 8ème" still
    # identifies the city; drop the ordinal from the name.
    ville_ordinale = _VILLE_ORDINAL_RE.fullmatch(ville)
    if ville_ordinale:
        ville = ville_ordinale.group(1).capitalize()

    # Arrondissements exist only for the three cities above.
    arrondissement = None
    if ville in _VILLES_ARRONDISSEMENTS:
        nb = _VILLES_ARRONDISSEMENTS[ville][1]
        # First ordinal in the address that is a valid arrondissement number.
        for m in _ORDINAL_RE.finditer(adresse):
            numero = int(m.group(1))
            if 1 <= numero <= nb:
                arrondissement = numero
                break
        if arrondissement is None:
            arrondissement = arrondissement_cp

    return pd.Series([ville, arrondissement])

# Row-wise extraction: each address yields a (ville, arrondissement) pair that is
# written to the "Ville" and "Arrondissement" columns (any existing "Ville" is overwritten).
df[["Ville", "Arrondissement"]] = df["adresse"].apply(extract_ville_arrondissement)

In [206]:
def clean_commune(v):
    """Reduce a free-text city label to a commune name.

    Parameters
    ----------
    v : str or missing
        City label, possibly prefixed by leftover dashes or district words.

    Returns
    -------
    str, None or missing
        - ``v`` unchanged when it is missing;
        - ``None`` when nothing is left after removing leading dashes
          (previously this case raised ``IndexError``);
        - the label itself when it is a single word;
        - the last two words when the second-to-last is a name prefix
          ("Saint", "La", "Les", ...), e.g. "Quartier La Ciotat" -> "La Ciotat";
        - otherwise the last word.

    Notes
    -----
    This is a heuristic: multi-word commune names whose second-to-last word is
    not in the prefix set are reduced to their last word.
    """
    if pd.isna(v):
        return v

    # Remove stray leading dashes (hyphen, en dash, em dash).
    v = re.sub(r"^[\-\–\—]+\s*", "", str(v).strip())

    mots = v.split()
    if not mots:
        # Nothing usable left (e.g. the label was only "-").
        return None
    if len(mots) == 1:
        return v

    prefixes = {"saint", "sainte", "st", "ste", "le", "la", "les", "l'", "l’"}

    # Compound names: keep the prefix together with the last word.
    if mots[-2].lower() in prefixes:
        return " ".join(mots[-2:])

    # Otherwise the last word is taken as the commune (hyphenated names such as
    # "Saint-Dizier" are a single word and are returned whole).
    return mots[-1]

# Reduce each extracted label to its commune name; missing values pass through unchanged.
df["Ville"] = df["Ville"].apply(clean_commune)

In [207]:
# Département code as text; the Corsican codes 2A / 2B are mapped to the "20"
# prefix for the purpose of building a postal code.
dep_str = df["departement_code"].astype("string")
dep_for_cp = dep_str.replace({"2A": "20", "2B": "20"})

# Fallback postal code: the département prefix right-padded with zeros to five
# characters ("13" -> "13000", "1" -> "01000"). Padding to a fixed width,
# instead of appending "000", keeps three-character codes at five characters
# ("971" -> "97100" rather than "971000").
fallback_cp = dep_for_cp.str.zfill(2).str.ljust(5, "0")

# Only rows whose address carried no postal code receive the fallback.
df["Code_postal"] = df["Code_postal"].fillna(fallback_cp)

df[["Ville", "Code_postal"]].isna().sum()

Ville          0
Code_postal    0
dtype: int64

In [208]:
# Drop the raw price / surface columns.
# NOTE(review): this cell is not idempotent together with the rename cell that
# follows — re-running it after the rename would drop the renamed columns.
df = df.drop(columns=["prix", "surface"])

# Left-pad postal codes to five characters. The nullable string dtype is kept
# (instead of astype(str)) so a missing value stays missing rather than
# becoming the literal text "<NA>" padded to "0<NA>".
df["Code_postal"] = df["Code_postal"].astype("string").str.zfill(5)

Correction par code postal

In [209]:
# Give the coerced numeric columns their final names.
renames = {"prix_num": "prix", "surface_num": "surface"}
df = df.rename(columns=renames)

In [210]:
# Columns kept for the analysis dataset, in output order.
colonnes_utiles = [
    "type_bien", "sous_type",
    "Ville", "Arrondissement", "Code_postal",
    "departement_nom", "departement_code",
    "prix", "surface", "prix_m2", "pieces",
]

# Independent copy restricted to those columns.
df_analyse = df.loc[:, colonnes_utiles].copy()
df_analyse.head()

Unnamed: 0,type_bien,sous_type,Ville,Arrondissement,Code_postal,departement_nom,departement_code,prix,surface,prix_m2,pieces
0,Appartement,Appartement,Aix-en-Provence,,13100,Bouches-du-Rhône,13,983500,105.0,9366.666667,4.0
1,Appartement,Appartement,Aix-en-Provence,,13100,Bouches-du-Rhône,13,950500,128.0,7425.78125,5.0
2,Appartement,Appartement,Marseille,8.0,13008,Bouches-du-Rhône,13,942100,108.0,8723.148148,4.0
3,Appartement,Appartement,Aix-en-Provence,,13090,Bouches-du-Rhône,13,890000,102.0,8725.490196,4.0
4,Appartement,Appartement,Aix-en-Provence,,13080,Bouches-du-Rhône,13,882000,110.4,7989.130435,4.0


In [211]:
# Export the analysis dataset to the current working directory. The confirmation
# message is built from the actual file name so it cannot drift from the file
# that is written (it previously announced "df_analyse.csv").
output_path = Path("df_analyseVF4.csv")
df_analyse.to_csv(output_path, index=False, encoding="utf-8")
print(f"✅ {output_path.name} créé")

✅ df_analyse.csv créé
