In [1]:
import polars as pl
from pathlib import Path
import datetime

In [3]:
files_dir = Path("../files")  # Dossier au même niveau que le répertoire actuel

In [5]:
# Charger le CSV avec les bonnes colonnes
df = pl.read_csv(
    files_dir/ "nom_prenoms_lower_trim.csv",
    separator=",",  # Ajuster selon le séparateur réel
    new_columns=["nom", "prenoms", "sexe", "age"],
    dtypes={
        "nom": pl.Utf8,
        "prenoms": pl.Utf8,
        "sexe": pl.Categorical,  # Optimisation mémoire
        "age": pl.UInt8
    }
)

  df = pl.read_csv(


In [4]:
df

nom,prenoms,sexe,age
str,str,cat,u8
"""lago""","""serge""","""Masculin""",43
"""lago""","""renaud""","""Masculin""",40
"""lago""","""therese""","""Féminin""",68
"""bode""","""charleine""","""Féminin""",25
"""dina""","""dina""","""Féminin""",27
…,…,…,…
"""diallo""","""maiga""","""Masculin""",46
"""diallo""","""amina""","""Féminin""",28
"""diallo""","""ramata""","""Féminin""",21
"""ouedrago""","""salif""","""Masculin""",24


In [5]:
# 2. Calculer l'année de naissance (ex: 2023 - age)
current_year = 2021
df = df.with_columns(
    annee_naissance = current_year - pl.col("age")
)

In [6]:
df

nom,prenoms,sexe,age,annee_naissance
str,str,cat,u8,u16
"""lago""","""serge""","""Masculin""",43,1978
"""lago""","""renaud""","""Masculin""",40,1981
"""lago""","""therese""","""Féminin""",68,1953
"""bode""","""charleine""","""Féminin""",25,1996
"""dina""","""dina""","""Féminin""",27,1994
…,…,…,…,…
"""diallo""","""maiga""","""Masculin""",46,1975
"""diallo""","""amina""","""Féminin""",28,1993
"""diallo""","""ramata""","""Féminin""",21,2000
"""ouedrago""","""salif""","""Masculin""",24,1997


In [7]:
df_exploded = df.with_columns(
    pl.col("prenoms").str.split(" ").alias("prenoms_split")
).explode("prenoms_split").rename({"prenoms_split": "prenom_explode"})

In [8]:
df_exploded

nom,prenoms,sexe,age,annee_naissance,prenom_explode
str,str,cat,u8,u16,str
"""lago""","""serge""","""Masculin""",43,1978,"""serge"""
"""lago""","""renaud""","""Masculin""",40,1981,"""renaud"""
"""lago""","""therese""","""Féminin""",68,1953,"""therese"""
"""bode""","""charleine""","""Féminin""",25,1996,"""charleine"""
"""dina""","""dina""","""Féminin""",27,1994,"""dina"""
…,…,…,…,…,…
"""diallo""","""maiga""","""Masculin""",46,1975,"""maiga"""
"""diallo""","""amina""","""Féminin""",28,1993,"""amina"""
"""diallo""","""ramata""","""Féminin""",21,2000,"""ramata"""
"""ouedrago""","""salif""","""Masculin""",24,1997,"""salif"""


In [9]:
# Calcul des occurrences annuelles
yearly_stats = df_exploded.group_by(
    "prenom_explode", "annee_naissance"
).agg(
    pl.count().alias("count")
).pivot(
    values="count",
    index="prenom_explode",
    columns="annee_naissance",
    aggregate_function="sum"
).fill_null(0)

# Renommer les colonnes d'année
year_columns = [str(year) for year in range(1886, 2022)]
yearly_stats = yearly_stats.rename(
    {str(year): f"annee_{year}" for year in range(1886, 2022)}
)


  pl.count().alias("count")
  ).pivot(


In [10]:
yearly_stats_sorted = yearly_stats.select(sorted(yearly_stats.columns))

In [11]:
yearly_stats_sorted

annee_1886,annee_1887,annee_1888,annee_1889,annee_1890,annee_1891,annee_1892,annee_1893,annee_1894,annee_1895,annee_1896,annee_1897,annee_1898,annee_1899,annee_1900,annee_1901,annee_1902,annee_1903,annee_1904,annee_1905,annee_1906,annee_1907,annee_1908,annee_1909,annee_1910,annee_1911,annee_1912,annee_1913,annee_1914,annee_1915,annee_1916,annee_1917,annee_1918,annee_1919,annee_1920,annee_1921,annee_1922,…,annee_1986,annee_1987,annee_1988,annee_1989,annee_1990,annee_1991,annee_1992,annee_1993,annee_1994,annee_1995,annee_1996,annee_1997,annee_1998,annee_1999,annee_2000,annee_2001,annee_2002,annee_2003,annee_2004,annee_2005,annee_2006,annee_2007,annee_2008,annee_2009,annee_2010,annee_2011,annee_2012,annee_2013,annee_2014,annee_2015,annee_2016,annee_2017,annee_2018,annee_2019,annee_2020,annee_2021,prenom_explode
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,…,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,str
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,2,1,1,3,1,0,1,0,1,0,1,2,1,2,1,4,1,0,0,1,2,1,3,3,1,1,1,1,1,1,0,0,2,0,2,1,"""zirimba"""
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,1,0,1,1,2,2,1,0,0,0,0,0,2,0,1,1,1,0,4,0,0,0,0,0,0,0,0,0,2,1,1,2,1,0,0,1,"""n'gonan"""
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,"""djemsse"""
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,"""marie-daniele"""
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,1,0,0,0,2,1,0,…,198,173,156,198,188,261,191,219,170,216,237,211,224,247,249,269,258,286,314,315,311,274,354,372,314,347,288,351,336,340,365,331,369,296,268,250,"""karamoko"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""yataga"""
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,"""zeiza"""
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,"""illustte"""
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,"""tebaragua"""


In [12]:
stats = (
    pl.concat([
        df_exploded.select(
            pl.col("prenom_explode").alias("nom"), 
            "sexe"
        ).with_columns(type=pl.lit("prenoms")),
        
        df.select(
            pl.col("nom").alias("nom"),
            "sexe"
        ).with_columns(type=pl.lit("famille"))
    ])
    .group_by("nom", "sexe", "type")
    .agg(count=pl.len())
    .pivot(
        values="count",
        index="nom",
        columns=["sexe", "type"],
        aggregate_function="sum"
    )
    .fill_null(0)
    .rename({
        '{"Féminin","famille"}': "nombre_femme_nom_famille",
        '{"Féminin","prenoms"}': "nombre_femme_prenom",
        '{"Masculin","famille"}': "nombre_homme_nom_famille",
        '{"Masculin","prenoms"}': "nombre_homme_prenom"  # Correction orthographique
    })
    
)


  .pivot(


In [13]:
stats

nom,nombre_femme_prenom,nombre_homme_nom_famille,nombre_homme_prenom,nombre_femme_nom_famille
str,u32,u32,u32,u32
"""wamanisa""",1,0,0,0
"""ayouwoh""",0,1,0,0
"""niamekey""",4,2,2,1
"""koffitohma""",0,0,1,0
"""sanisssi""",0,1,0,0
…,…,…,…,…
"""obrel""",0,0,1,0
"""grodjie""",1,0,0,0
"""wedraho""",0,2,0,0
"""nonkoungou""",0,0,0,1


In [25]:
prenom_unique_stats = (
    df_exploded.filter(pl.col("prenoms") == pl.col("prenom_explode"))
    .group_by("prenom_explode")
    .agg(prenom_unique=pl.len())
)

In [26]:
prenom_unique_stats

prenom_explode,prenom_unique
str,u32
"""keuleya""",2
"""anaballe""",1
"""sinimin""",1
"""alsonce""",1
"""sorel""",47
…,…
"""koumannayougou""",1
"""peinelle""",1
"""tenenanko""",1
"""yoryar""",1


In [27]:
final_stats = (
    stats.join(
        prenom_unique_stats,
        left_on="nom",
        right_on="prenom_explode",
        how="left"
    )
    .with_columns(
        partie_prenom=pl.col("nombre_homme_prenom") + pl.col("nombre_femme_prenom"),
        nom_famille=pl.col("nombre_homme_nom_famille") + pl.col("nombre_femme_nom_famille"),
        prenom_unique=pl.coalesce("prenom_unique", 0)
    )
    .select([
        "nom",
        "partie_prenom",
        "nom_famille",
        "prenom_unique",
        "nombre_homme_prenom",
        "nombre_homme_nom_famille",
        "nombre_femme_prenom",
        "nombre_femme_nom_famille"
    ])
)

In [None]:
#Fusion avec les statistiques existantes
final_stats = stats.join(
    yearly_stats,
    left_on="nom",
    right_on="prenom_explode",
    how="left"
).drop("prenom_explode")

# Réordonnancement des colonnes
columns_order = (
    ["nom", "partie_prenom", "nom_famille", "prenom_unique"] + 
    [f"annee_{year}" for year in range(1960, 2022)] +
    ["nombre_homme_prenom", "nombre_homme_nom_famille", 
     "nombre_femme_prenom", "nombre_femme_nom_famille"]
)

final_stats = final_stats.select(columns_order)

In [28]:
final_stats

nom,partie_prenom,nom_famille,prenom_unique,nombre_homme_prenom,nombre_homme_nom_famille,nombre_femme_prenom,nombre_femme_nom_famille
str,u32,u32,u32,u32,u32,u32,u32
"""djeggrou""",1,0,1,0,0,1,0
"""zoumara""",12,32,12,11,17,1,15
"""faouziarouzi""",1,0,1,0,0,1,0
"""frevrin""",1,0,0,1,0,0,0
"""cadret""",3,1,2,3,1,0,0
…,…,…,…,…,…,…,…
"""yoranatere""",1,0,0,0,0,1,0
"""orlelle""",1,0,0,0,0,1,0
"""sorre awa""",0,1,0,0,0,0,1
"""garassouba""",1,0,1,0,0,1,0


In [67]:
final_stats.write_parquet(
     files_dir / "nom_prenoms_overlay_vf_lower_trim.parquet",
       compression="zstd"
)