# Cadre de validation de la qualit√© des donn√©es (6 piliers)

Ce notebook met en ≈ìuvre une suite compl√®te de validations avec **Great Expectations v0.18+** sur les fichiers CSV de sant√©.
Toutes les cellules, commentaires et messages sont en fran√ßais.

**Piliers couverts :** compl√©tude, exactitude, validit√©, coh√©rence, unicit√©, actualit√©.

**Sorties :** r√©sultats dans `./gx/` et rapport HTML dans `./reports/`.

In [174]:
# -*- coding: utf-8 -*-
"""
Validation Qualit√© des Donn√©es ‚Äì Great Expectations v1.11.3
Compatible avec l'API moderne (v1.x)
"""

import os
import pandas as pd
import great_expectations as gx
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

# Chemins
DATA_DIR = "./data"
GX_DIR = "./gx"
REPORTS_DIR = "./reports"

os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(GX_DIR, exist_ok=True)
os.makedirs(REPORTS_DIR, exist_ok=True)

print("‚úÖ Dossiers cr√©√©s.")

‚úÖ Dossiers cr√©√©s.


In [175]:
# Cr√©er un contexte persistant dans ./gx
context = gx.get_context()

# Configurer un site Data Docs avec un chemin ABSOLU
context.update_data_docs_site(
    site_name="local_site",
    site_config={
        "class_name": "SiteBuilder",
        "store_backend": {
            "class_name": "TupleFilesystemStoreBackend",
            "base_directory": os.path.abspath(os.path.join(REPORTS_DIR, "gx_data_docs")),
        },
        "site_index_builder": {"class_name": "DefaultSiteIndexBuilder"},
    }
)

print("‚úÖ Contexte GX initialis√© (v1.11.3).")

‚úÖ Contexte GX initialis√© (v1.11.3).


In [176]:
def load_csv(filename):
    path = os.path.join(DATA_DIR, filename)
    if not os.path.exists(path):
        print(f"‚ö†Ô∏è  {filename} manquant")
        return pd.DataFrame()
    try:
        df = pd.read_csv(path, dtype=str, na_filter=False)
        
        # üîπ Conversion des dates pour patients.csv
        if filename == "patients.csv":
            # Convertir les colonnes de date en datetime
            for col in ["arrival_date", "departure_date", "date_naissance"]:
                if col in df.columns:
                    df[col] = pd.to_datetime(df[col], errors="coerce")
        
        # üîπ Conversion pour consultations.csv (si besoin)
        elif filename == "consultations.csv":
            if "consultation_date" in df.columns:
                df["consultation_date"] = pd.to_datetime(df["consultation_date"], errors="coerce")
        
        # üîπ (Optionnel) Conversion pour staff.csv
        elif filename == "staff.csv":
            if "date_naissance" in df.columns:
                df["date_naissance"] = pd.to_datetime(df["date_naissance"], errors="coerce")

        return df
    except Exception as e:
        print(f"‚ùå Erreur chargement {filename}: {e}")
        return pd.DataFrame()

# Charger les fichiers
staff_df = load_csv("staff.csv")
patients_df = load_csv("patients.csv")
consultations_df = load_csv("consultations.csv")
schedule_df = load_csv("staff_schedule.csv")
services_df = load_csv("services_weekly.csv")

In [177]:
# Sauvegarder les DataFrames localement pour GX
staff_df.to_csv(os.path.join(GX_DIR, "staff.csv"), index=False)
patients_df.to_csv(os.path.join(GX_DIR, "patients.csv"), index=False)
consultations_df.to_csv(os.path.join(GX_DIR, "consultations.csv"), index=False)
schedule_df.to_csv(os.path.join(GX_DIR, "schedule.csv"), index=False)
services_df.to_csv(os.path.join(GX_DIR, "services.csv"), index=False)

# Ajouter une source de donn√©es locale (API v3)
datasource = context.data_sources.add_pandas(name="local_files")

# Cr√©er des Data Assets
staff_asset = datasource.add_csv_asset(
    name="staff",
    filepath_or_buffer=os.path.join(GX_DIR, "staff.csv")
)

patients_asset = datasource.add_csv_asset(
    name="patients",
    filepath_or_buffer=os.path.join(GX_DIR, "patients.csv")
)

consultations_asset = datasource.add_csv_asset(
    name="consultations",
    filepath_or_buffer=os.path.join(GX_DIR, "consultations.csv")
)

schedule_asset = datasource.add_csv_asset(
    name="schedule",
    filepath_or_buffer=os.path.join(GX_DIR, "schedule.csv")
)

services_asset = datasource.add_csv_asset(
    name="services",
    filepath_or_buffer=os.path.join(GX_DIR, "services.csv")
)

print("‚úÖ Data Assets cr√©√©s.")


‚úÖ Data Assets cr√©√©s.


In [203]:
# R√©initialiser results
results = []

# Liste des combinaisons (asset, suite) √† valider
validations_to_run = [
    ("staff", "completude_staff"),
    ("patients", "completude_patients"),
    ("consultations", "completude_consultations"),
    ("staff", "exactitude_staff"),
    ("patients", "exactitude_patients"),
    ("schedule", "exactitude_schedule"),
    ("staff", "validite_staff"),
    ("patients", "validite_patients"),
    ("services", "validite_services"),
    ("staff", "unicite_staff"),
    ("patients", "unicite_patients"),
    ("schedule", "unicite_schedule"),
    ("patients", "actualite_patients"),  # ‚úÖ D√©comment√© !
]

for asset_name, suite_name in validations_to_run:
    df = assets[asset_name]
    if df.empty:
        continue
        
    try:
        # R√©cup√©rer la suite existante
        suite = context.suites.get(suite_name)
        
        # Cr√©er le batch request
        batch_request = locals()[f"{asset_name}_asset"].build_batch_request()
        
        # Cr√©er le validator
        validator = context.get_validator(
            batch_request=batch_request,
            expectation_suite=suite
        )
        
        # Ex√©cuter la validation
        result = validator.validate()
        success = result.success
        
        # Extraire le pilier depuis le nom de la suite
        pilier = suite_name.split("_")[0]
        
        results.append({
            "Pilier": pilier,
            "Dataset": asset_name,
            "Succ√®s": success
        })
        
        print(f"‚úÖ Validation r√©ussie : {asset_name} / {pilier}")
        
    except Exception as e:
        print(f"‚ö†Ô∏è  Erreur validation {asset_name}/{suite_name}: {e}")

# G√©n√©rer le rapport HTML
context.build_data_docs()
html_path = os.path.abspath(os.path.join(REPORTS_DIR, "gx_data_docs/index.html"))
print(f"\nüìÑ Rapport HTML g√©n√©r√© : {html_path}")

# Ouvrir dans le navigateur
import webbrowser
webbrowser.open(f"file://{html_path}")

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

‚úÖ Validation r√©ussie : staff / completude


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

‚úÖ Validation r√©ussie : patients / completude


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

‚úÖ Validation r√©ussie : consultations / completude


Calculating Metrics:   0%|          | 0/18 [00:00<?, ?it/s]

‚úÖ Validation r√©ussie : staff / exactitude


Calculating Metrics:   0%|          | 0/18 [00:00<?, ?it/s]

‚úÖ Validation r√©ussie : patients / exactitude


Calculating Metrics:   0%|          | 0/16 [00:00<?, ?it/s]

‚úÖ Validation r√©ussie : schedule / exactitude


Calculating Metrics:   0%|          | 0/26 [00:00<?, ?it/s]

‚úÖ Validation r√©ussie : staff / validite


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

‚úÖ Validation r√©ussie : patients / validite


Calculating Metrics:   0%|          | 0/13 [00:00<?, ?it/s]

‚úÖ Validation r√©ussie : services / validite


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

‚úÖ Validation r√©ussie : staff / unicite


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

‚úÖ Validation r√©ussie : patients / unicite


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

‚úÖ Validation r√©ussie : schedule / unicite
‚ö†Ô∏è  Erreur validation patients/actualite_patients: ExpectationSuite with name actualite_patients was not found.

üìÑ Rapport HTML g√©n√©r√© : c:\Users\ababid\Desktop\data gouv\reports\gx_data_docs\index.html


True

In [204]:
print("\nüîç Premier √©l√©ment de results :", results[0] if results else "vide")
df_results = pd.DataFrame(results)
print("Colonnes du DataFrame :", df_results.columns.tolist())


üîç Premier √©l√©ment de results : {'Pilier': 'completude', 'Dataset': 'staff', 'Succ√®s': True}
Colonnes du DataFrame : ['Pilier', 'Dataset', 'Succ√®s']


In [205]:
if results:
    print("Cl√©s dans le premier r√©sultat :", list(results[0].keys()))
    df_results = pd.DataFrame(results)
    print("Colonnes du DataFrame :", df_results.columns.tolist())
else:
    print("‚ö†Ô∏è  Aucune validation ex√©cut√©e.")

if results:
    df_results = pd.DataFrame(results)
    
    # Calcul des scores moyens par pilier
    scores = df_results.groupby("Pilier")["Succ√®s"].mean().sort_values(ascending=False) * 100
    
    print("\nüìä SYNTH√àSE DES PILIERS DE QUALIT√â")
    print("=" * 50)
    for pilier, score in scores.items():
        # Barre proportionnelle (max 10 blocs)
        bar_length = min(int(score // 10), 10)
        bar = "‚ñà" * bar_length + "‚ñë" * (10 - bar_length)
        print(f"{pilier:12} | {score:5.1f}% | {bar}")
    
    # Score global
    score_global = df_results["Succ√®s"].mean() * 100
    print("-" * 50)
    print(f"{'GLOBAL':12} | {score_global:5.1f}% | {'='*10}")
    
else:
    print("‚ö†Ô∏è  Aucune validation ex√©cut√©e.")

print("\n‚úÖ Validation termin√©e. Rapport HTML disponible dans ./reports/")

Cl√©s dans le premier r√©sultat : ['Pilier', 'Dataset', 'Succ√®s']
Colonnes du DataFrame : ['Pilier', 'Dataset', 'Succ√®s']

üìä SYNTH√àSE DES PILIERS DE QUALIT√â
completude   |  66.7% | ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë
validite     |  66.7% | ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë
exactitude   |  33.3% | ‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë
unicite      |  33.3% | ‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë
--------------------------------------------------

‚úÖ Validation termin√©e. Rapport HTML disponible dans ./reports/
