In [11]:
from pathlib import Path

# Show where the kernel is running and list (at most) the first 20 entries of that folder.
print(f"CWD = {Path.cwd()}")
print("Contenu CWD :", [entry.name for entry in list(Path.cwd().iterdir())[:20]])

CWD = /home/debian/Documents/data ops/main
Contenu CWD : ['01.ipynb']


In [12]:
from pathlib import Path

# Walk from the working directory up through its ancestors and stop at the
# first folder that contains "data/raw". If none does, `project` ends up on
# the last ancestor visited (the filesystem root).
for project in (Path.cwd(), *Path.cwd().parents):
    if (project / "data" / "raw").exists():
        break

print(f"PROJECT = {project}")
print(f"data/raw existe ? {(project / 'data' / 'raw').exists()}")

PROJECT = /home/debian/Documents/data ops
data/raw existe ? True


In [13]:
import pandas as pd
from pathlib import Path

# Path of the raw customers file under the project root found earlier.
file_path = project.joinpath("data", "raw", "customers_dirty.csv")
print(f"Fichier = {file_path}")
print(f"Existe ? {file_path.exists()}")

# Load the file; the trailing expression renders a preview of the first rows.
df = pd.read_csv(file_path)
df.head()

Fichier = /home/debian/Documents/data ops/data/raw/customers_dirty.csv
Existe ? True


Unnamed: 0,customer_id,full_name,email,signup_date,country,age,last_purchase_amount,loyalty_tier
0,3001,Jean Morel,jean.morel@example.com,2025-01-10,FR,42.0,120.5,GOLD
1,3002,Alice Petit,alice.petitexample.com,2025-01-12,FR,35.0,80.0,SILVER
2,3003,Carlos Diaz,carlos.diaz@example.com,not_a_date,ES,29.0,60.0,GOLD
3,3004,Lucie Bernard,lucie.bernard@example.com,2025-02-01,fr,17.0,25.0,BRONZE
4,3005,Mohamed Ali,mohamed.ali@example.com,2025-02-10,DE,31.0,-10.0,SILVER


In [None]:
# Quick profile of the loaded frame: size, dtypes, missing values, duplicates.
print(f"Lignes / colonnes : {df.shape}")

print("\nTypes :", df.dtypes, sep="\n")

# Missing values per column, worst column first.
print("\nManquants par colonne :", df.isnull().sum().sort_values(ascending=False), sep="\n")

print(f"\nDoublons (lignes identiques) : {df.duplicated().sum()}")

# Only report duplicated emails when the frame actually has an "email" column.
if "email" in df:
    print(f"Doublons email : {df['email'].duplicated().sum()}")

Lignes / colonnes : (10, 8)

Types :
customer_id               int64
full_name                   str
email                       str
signup_date                 str
country                     str
age                     float64
last_purchase_amount    float64
loyalty_tier                str
dtype: object

Manquants par colonne :
age                     1
customer_id             0
full_name               0
email                   0
signup_date             0
country                 0
last_purchase_amount    0
loyalty_tier            0
dtype: int64

Doublons (lignes identiques) : 0
Doublons email : 1


In [15]:
import pandas as pd

# Known mail domains used to repair addresses whose "@" is missing.
DOMAINS = ["example.com", "gmail.com", "yahoo.com", "outlook.com", "hotmail.com"]


def clean_customers_df(df: pd.DataFrame, signup_year: int = 2025) -> pd.DataFrame:
    """Return a cleaned copy of a raw customers table.

    The input frame is never modified. Each step below is skipped when the
    column it works on is absent:

    * column labels are stripped of surrounding whitespace;
    * ``email`` is lower-cased and stripped; an address without "@" that
      contains one of ``DOMAINS`` gets the "@" inserted before that domain;
      anything else without "@" becomes missing (the row is kept);
    * rows whose ``full_name`` is not exactly two whitespace-separated words
      are dropped;
    * rows whose ``signup_date`` does not start with ``signup_year`` or is not
      a parseable date are dropped; the column is converted to datetime;
    * rows whose ``age`` is not a whole number in [16, 100] are dropped; the
      column is converted to nullable ``Int64``;
    * rows with a negative ``last_purchase_amount`` are dropped (missing or
      unparseable amounts are kept as NaN);
    * ``country`` is stripped, upper-cased and cut to its first two characters;
    * rows repeating an already-seen, non-missing ``email`` are dropped
      (first occurrence wins).

    Args:
        df: Raw customers data.
        signup_year: Year that the text of ``signup_date`` must start with.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    df_clean = df.copy()

    # Normalise column labels so that e.g. "full_name " still matches below.
    df_clean.columns = df_clean.columns.astype(str).str.strip()

    def fix_email_simple(x):
        """Normalise one email value; return pd.NA when it cannot be repaired."""
        s = str(x).strip().lower()
        if s in ("", "nan", "none"):
            return pd.NA
        if "@" in s:
            return s
        for dom in DOMAINS:
            if dom in s:
                # Insert the missing "@" in front of the first known domain.
                return s.replace(dom, "@" + dom, 1)
        return pd.NA

    def to_number(col: pd.Series) -> pd.Series:
        """Parse a column as numbers, accepting "," as decimal separator."""
        text = col.fillna("").astype(str).str.replace(",", ".", regex=False)
        return pd.to_numeric(text, errors="coerce")

    # EMAIL
    if "email" in df_clean.columns:
        df_clean["email"] = df_clean["email"].apply(fix_email_simple)

    # FULL NAME: must be exactly two words ("xxx xxx")
    if "full_name" in df_clean.columns:
        name = df_clean["full_name"].fillna("").astype(str).str.strip()
        mask_fullname_ok = name.str.match(r"^\S+\s+\S+$")
        df_clean = df_clean[mask_fullname_ok].copy()

    # SIGNUP_DATE: text must start with the expected year AND be a valid date
    if "signup_date" in df_clean.columns:
        signup_str = df_clean["signup_date"].fillna("").astype(str).str.strip()
        df_clean = df_clean[signup_str.str.startswith(str(signup_year))].copy()

        df_clean["signup_date"] = pd.to_datetime(df_clean["signup_date"], errors="coerce")
        df_clean = df_clean[df_clean["signup_date"].notna()].copy()

    # AGE: whole number between 16 and 100, otherwise the row is dropped
    if "age" in df_clean.columns:
        age_num = to_number(df_clean["age"])
        mask_age_ok = age_num.notna() & (age_num % 1 == 0) & (age_num >= 16) & (age_num <= 100)
        df_clean = df_clean[mask_age_ok].copy()
        df_clean["age"] = age_num[mask_age_ok].astype("Int64")

    # LAST_PURCHASE_AMOUNT: negative amounts drop the row
    if "last_purchase_amount" in df_clean.columns:
        amt = to_number(df_clean["last_purchase_amount"])
        # `~(amt < 0)` rather than `amt >= 0` so that NaN amounts are kept.
        mask_amount_ok = ~(amt < 0)
        df_clean = df_clean[mask_amount_ok].copy()
        df_clean["last_purchase_amount"] = amt[mask_amount_ok]

    # COUNTRY: first two characters, upper-cased
    if "country" in df_clean.columns:
        df_clean["country"] = df_clean["country"].fillna("").astype(str).str.strip().str.upper().str[:2]

    # DE-DUPLICATION BY EMAIL
    # Only real addresses are compared: pandas treats missing values as equal
    # to each other, so a plain drop_duplicates would collapse every row with
    # an unrepairable email into a single row.
    if "email" in df_clean.columns:
        email = df_clean["email"]
        is_repeat = email.notna() & email.duplicated(keep="first")
        df_clean = df_clean[~is_repeat]

    return df_clean.reset_index(drop=True)

In [16]:
import pandas as pd
from pathlib import Path

# 1) Locate the project root: the nearest folder, starting at the working
#    directory and walking upwards, that contains "data/raw".
project = Path.cwd()
while project != project.parent and not (project / "data" / "raw").exists():
    project = project.parent

print("PROJECT =", project)

RAW_DIR = project / "data" / "raw"
CLEAN_DIR = project / "data" / "clean"

# Fail fast when the search above ran out of ancestors: otherwise the mkdir
# below would try to create "data/clean" under the filesystem root before the
# first read_csv fails.
if not RAW_DIR.is_dir():
    raise FileNotFoundError(
        f"No 'data/raw' directory found in {Path.cwd()} or any of its parent folders"
    )

CLEAN_DIR.mkdir(parents=True, exist_ok=True)

# 2) Raw files to clean; the i-th file is written to customers_clean_<i>.csv.
files = [
    "customers_dirty.csv",
    "customers_dirty2.csv",
    "customers_dirty3.csv",
]

# NOTE: `df` and `df_clean` are rebound on every iteration, so after the loop
# they hold the data of the last file only.
for i, fname in enumerate(files, start=1):
    in_path = RAW_DIR / fname
    out_path = CLEAN_DIR / f"customers_clean_{i}.csv"

    df = pd.read_csv(in_path)
    df_clean = clean_customers_df(df)   # cleaning function defined in an earlier cell
    df_clean.to_csv(out_path, index=False)

    print(f"OK {fname} -> {out_path.name} | {len(df)} -> {len(df_clean)} lignes")

PROJECT = /home/debian/Documents/data ops
OK customers_dirty.csv -> customers_clean_1.csv | 10 -> 5 lignes
OK customers_dirty2.csv -> customers_clean_2.csv | 10 -> 5 lignes
OK customers_dirty3.csv -> customers_clean_3.csv | 20 -> 10 lignes
