# Exploration Gold

### 1. Exploration des données Gold en Notebook

Voici une cellule optimisée pour inspecter ton fichier Master et tes KPIs. On utilise DuckDB pour garder la même logique que ton pipeline.

In [4]:
from pathlib import Path

import duckdb
import pandas as pd

from sirene_pipeline.config import settings

# 1. Setup paths. When the kernel runs from the notebooks/ directory the
#    project root is one level up; otherwise the current directory is used.
root_path = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
gold_dir = root_path / settings.gold.output_dir

# Default DuckDB connection; the `master` view created below lives on it.
con = duckdb.connect()

# 2. Check the Master Table (the enriched one)
master_file = gold_dir / settings.gold.master_filename
if master_file.exists():
    print(f"✅ Loading Master Table: {master_file.name}")
    # The path is inlined into the SQL text, so single quotes are doubled to
    # keep the string literal valid even for paths such as ".../l'export/...".
    master_sql_path = master_file.as_posix().replace("'", "''")
    con.execute(
        f"CREATE OR REPLACE VIEW master AS SELECT * FROM read_parquet('{master_sql_path}')"
    )

    # Show a few rows to check the join result
    display(
        con.execute("""
        SELECT siret, denominationUniteLegale, categorieEntreprise, departement, secteur_activite
        FROM master
        LIMIT 5
    """).df()
    )
else:
    print("❌ Master file not found. Did you run the Gold job?")

# 3. Dynamic check of all Gold KPIs
print(f"\n--- Checking all KPIs in {gold_dir} ---")

# Iterate over the KPI mapping exposed by settings.gold.kpis
# (KPI name -> parquet filename inside the gold directory).
for kpi_name, filename in settings.gold.kpis.items():
    kpi_file = gold_dir / filename

    print(f"\n📊 KPI: {kpi_name} ({filename})")

    if kpi_file.exists():
        # Read the KPI file and display its first few rows
        df_kpi = pd.read_parquet(kpi_file)
        print(f"✅ Found {len(df_kpi)} rows.")
        display(df_kpi.head())
    else:
        print("❌ File not found. Make sure the Gold job has finished.")

✅ Loading Master Table: sirene_master_enriched.parquet


Unnamed: 0,siret,denominationUniteLegale,categorieEntreprise,departement,secteur_activite
0,30182971900030,,,75,92
1,30182971900048,,,75,93
2,30183015400011,,,75,64
3,30183023800020,,,75,64
4,30183120200017,,,75,64



--- Checking all KPIs in c:\Users\atexier\Documents\Projet_Sirene\data\gold ---

📊 KPI: dept_dist (kpi_establishments_by_dept.parquet)
✅ Found 8 rows.


Unnamed: 0,departement,total_establishments
0,75,3522
1,92,1140
2,93,904
3,94,830
4,78,696



📊 KPI: sectors (kpi_dominant_sectors.parquet)
✅ Found 8 rows.


Unnamed: 0,departement,secteur_activite,count
0,92,64,96
1,91,55,53
2,95,52,47
3,78,52,60
4,93,64,101



📊 KPI: size_dist (kpi_business_size.parquet)
✅ Found 0 rows.


Unnamed: 0,categorieEntreprise,total


Cette cellule sert à exporter un échantillon aléatoire du fichier Parquet Gold au format CSV.

In [None]:
from pathlib import Path

import pandas as pd
from loguru import logger


def export_sampled_master_data(input_path: str, output_path: str, n_rows: int = 500000) -> None:
    """Export a random sample of the master Parquet file as a CSV file.

    The whole Parquet file is loaded into memory, at most ``n_rows`` rows are
    drawn at random with a fixed seed, and the sample is written without the
    DataFrame index. Capping the row count is intended to keep the CSV small
    enough for Tableau Public; the resulting file size is not checked here.

    Args:
        input_path: Path to the enriched master Parquet file.
        output_path: Path where the sampled CSV will be saved. Missing parent
            directories are created.
        n_rows: Maximum number of rows to sample. When the file holds fewer
            rows, all of them are exported. Defaults to 500,000.

    Raises:
        Exception: Any error raised while reading, sampling or writing is
            logged and then re-raised unchanged.
    """
    try:
        logger.info(f"📂 Reading master data from {input_path}")
        df = pd.read_parquet(input_path)

        # Never request more rows than the frame holds: DataFrame.sample
        # raises when n exceeds the population and replace=False.
        sample_size = min(n_rows, len(df))

        logger.info(f"🎲 Extracting a random sample of {sample_size:,} rows")
        # A fixed random_state makes the sample reproducible between runs.
        df_sample = df.sample(n=sample_size, random_state=42)

        logger.info(f"💾 Saving sample to {output_path}")
        # Create the destination directory so the export does not fail after
        # the (potentially slow) read and sampling steps.
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        df_sample.to_csv(output_path, index=False)

        logger.success(f"✅ Successfully exported {sample_size:,} rows to CSV.")

    except Exception as e:
        logger.error(f"❌ Failed to export sample: {e}")
        raise


if __name__ == "__main__":
    # Resolve the project root the same way whether the kernel was started
    # from notebooks/ or from the project root, instead of relying on a
    # hard-coded "../" prefix that is only valid from notebooks/.
    _project_root = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
    _gold_dir = _project_root / "data" / "gold"

    # Define your paths
    INPUT_FILE = str(_gold_dir / "sirene_master_enriched.parquet")
    OUTPUT_FILE = str(_gold_dir / "master_gold_sample.csv")

    export_sampled_master_data(INPUT_FILE, OUTPUT_FILE)