## Process
- Load CNAF file
- Clean the data and apply a first column mapping for database (BDD) injection
- Drop duplicates
- Add default column values
- Output to CSV

## Encoding
The CNAF file is encoded in ISO-8859-1; re-check the encoding for the 2025 file.

In [None]:
import os
from dotenv import load_dotenv
import pandas as pd
import json
from datetime import datetime
import numpy as np
from data.utils.data_utils import unaccent_and_upper, format_insee_or_postal_code

# Load variables from a local .env file into the process environment.
load_dotenv()

# Path of the CNAF input file; raises KeyError when the variable is not set.
cnaf_filepath = os.environ['CNAF_PATHFILE']
# Export location read from the environment; raises KeyError when the variable is not set.
# NOTE(review): not referenced by the cells of this notebook (the CSV is written
# to a hard-coded file name) — confirm whether it should be used for the export.
base_output_filepath = os.environ['DB_CNAF_EXPORT']
# Value written to the `exercice_id` column of the export.
exercice_id = 5

In [None]:
# CNAF
# Declared dtype of each CNAF source column: every one of them is 'str'.
cnaf_column_type = dict.fromkeys(
    (
        'CODORG',
        'MATRICULE',
        'QUALDOS',
        'RESPDOS',
        'PRENOMDOS',
        'NOMCOMPLET',
        'ADRLIG1DESTDOS',
        'ADRLIG2DESTDOS',
        'ADRLIG3DESTDOS',
        'ADRLIG4DESTDOS',
        'ADRLIG5DESTDOS',
        'ADRLIG6DESTDOS',
        'NUMINSEE',
        'ADRMAIL',
        'NUMTEL',
        'NOMENF',
        'PRENOMENF',
        'DTNAIENF',
        'SEXENF',
    ),
    'str',
)

# Load the CNAF file as a DataFrame with the pyarrow engine.
# `cnaf_column_type` is not passed to this call.
cnaf_df = pd.read_parquet(cnaf_filepath, engine='pyarrow')

In [None]:
# Trim leading/trailing white space in every column
# (each column has to support the `.str` accessor)
for column_name in cnaf_df.columns.tolist():
    stripped_values = cnaf_df[column_name].str.strip()
    cnaf_df[column_name] = stripped_values

In [None]:
# ADRLIG5DESTDOS holds both the postal code and the commune:
# cut it at the first space only, then trim each resulting part
postal_and_commune = cnaf_df['ADRLIG5DESTDOS'].str.split(' ', n=1, expand=True)
cnaf_df[['CODE_POSTAL', 'COMMUNE']] = postal_and_commune
for address_part in ('CODE_POSTAL', 'COMMUNE'):
    cnaf_df[address_part] = cnaf_df[address_part].str.strip()

In [None]:
# Collapse every run of white space inside the full name into a single space.
# No `.astype(str)` here: the cast would turn missing values into the literal
# string 'nan', whereas the `.str` accessor leaves them missing.
cnaf_df['NOMCOMPLET'] = cnaf_df['NOMCOMPLET'].str.replace(r'\s+', ' ', regex=True)

In [None]:
# map CNAF
# Source column -> target column, grouped by the entity each column describes.

# infos about allocataire
# (the allocataire's birthdate is absent from the source file; that column
# is added later with missing values)
allocataire_columns = {
    'MATRICULE': 'allocataire-matricule',
    'CODORG': 'allocataire-code_organisme',
    'QUALDOS': 'allocataire-qualite',
    'RESPDOS': 'allocataire-nom',
    'PRENOMDOS': 'allocataire-prenom',
    'ADRMAIL': 'allocataire-courriel',
    'NUMTEL': 'allocataire-telephone',
}

# adresse allocataire
adresse_columns = {
    'CODE_POSTAL': 'adresse_allocataire-code_postal',
    'COMMUNE': 'adresse_allocataire-commune',
    'NUMINSEE': 'adresse_allocataire-code_insee',
}

# infos about beneficiary
beneficiary_columns = {
    'DTNAIENF': 'date_naissance',
    'SEXENF': 'genre',
    'NOMENF': 'nom',
    'PRENOMENF': 'prenom',
}

cnaf_column_mapping = {**allocataire_columns, **adresse_columns, **beneficiary_columns}

# Work on a renamed copy so that `cnaf_df` keeps its original column names
df_psp_mapped_cnaf = cnaf_df.copy().rename(columns=cnaf_column_mapping)

In [None]:
def _join_address_lines(first_line, second_line):
    """Join two address-line Series with a single space.

    A missing line is treated as empty so the other line is kept (a plain
    `a + ' ' + b` would yield a missing value as soon as one side is missing).
    The result is missing only where both lines are missing.
    """
    joined = (first_line.fillna('') + ' ' + second_line.fillna('')).str.strip()
    return joined.mask(first_line.isna() & second_line.isna())


# Allocataire's date of birth (missing from original file)
# `np.nan` is used throughout: the `np.NaN` alias no longer exists in NumPy 2.
df_psp_mapped_cnaf['allocataire-date_naissance'] = np.nan

# Allocataire missing phone number: these placeholder values are turned into missing values
phone_replacements = {
    '0000000000': np.nan,
    '0600000000': np.nan,
    '0700000001': np.nan,
    '0100000000': np.nan,
    '0400000000': np.nan,
    '0600000001': np.nan,
    '0700000000': np.nan,
}

df_psp_mapped_cnaf['allocataire-telephone'] = df_psp_mapped_cnaf['allocataire-telephone'].replace(phone_replacements)

# Allocataire's qualite
df_psp_mapped_cnaf['allocataire-qualite'] = df_psp_mapped_cnaf['allocataire-qualite'].str.strip().replace(
    {'MME': 'Mme', 'MR': 'M'})

# Additional address details (address lines 1 and 2)
df_psp_mapped_cnaf['adresse_allocataire-cplt_adresse'] = _join_address_lines(
    df_psp_mapped_cnaf['ADRLIG1DESTDOS'], df_psp_mapped_cnaf['ADRLIG2DESTDOS'])

# Allocataire's street address (address lines 3 and 4)
df_psp_mapped_cnaf['adresse_allocataire-voie'] = _join_address_lines(
    df_psp_mapped_cnaf['ADRLIG3DESTDOS'], df_psp_mapped_cnaf['ADRLIG4DESTDOS'])

# Organism
df_psp_mapped_cnaf['organisme'] = 'CAF'

# Situation: 'jeune' by default, 'AAH' when the beneficiary carries the same
# last name and first name as the allocataire
df_psp_mapped_cnaf['situation'] = 'jeune'

mask_nom_equal = df_psp_mapped_cnaf['allocataire-nom'] == df_psp_mapped_cnaf['nom']
mask_prenom_equal = df_psp_mapped_cnaf['allocataire-prenom'] == df_psp_mapped_cnaf['prenom']
df_psp_mapped_cnaf.loc[mask_nom_equal & mask_prenom_equal, 'situation'] = 'AAH'

In [None]:
# Parse date_naissance (day/month/year text) into a datetime64 column for processing
df_psp_mapped_cnaf['date_naissance'] = pd.to_datetime(df_psp_mapped_cnaf['date_naissance'], format='%d/%m/%Y')

# Beneficiaries born before 1 June 2005 are AAH.
# The comparison stays on the datetime64 column: going through `.dt.date`
# yields Python objects, and comparing a missing date (NaT) with a `date`
# raises TypeError on recent pandas. Here a missing birthdate is simply False.
aah_birthdate_cutoff = pd.Timestamp(year=2005, month=6, day=1)
mask_dn_before = df_psp_mapped_cnaf['date_naissance'] < aah_birthdate_cutoff
df_psp_mapped_cnaf.loc[mask_dn_before, 'situation'] = 'AAH'
# Same 'MR' -> 'M' normalisation as the one applied when the qualite column is cleaned
df_psp_mapped_cnaf['allocataire-qualite'] = df_psp_mapped_cnaf['allocataire-qualite'].replace('MR', 'M')

In [None]:
# Raw CNAF columns that are not needed any more
raw_columns_to_discard = [
    'NOMCOMPLET',
    'ADRLIG1DESTDOS',
    'ADRLIG2DESTDOS',
    'ADRLIG3DESTDOS',
    'ADRLIG4DESTDOS',
    'ADRLIG5DESTDOS',
    'ADRLIG6DESTDOS',
]
df_psp_mapped_cnaf = df_psp_mapped_cnaf.drop(labels=raw_columns_to_discard, axis='columns')

In [None]:
# remove rows with missing necessary values (if one of those values is missing we cannot generate a code)
necessary_column = ['nom', 'prenom', 'date_naissance', 'genre']
df_valid_row = df_psp_mapped_cnaf.dropna(subset=necessary_column)

# remove columns with all null value
df_valid = df_valid_row.dropna(axis=1, how='all')

# Sanity check on the filtered frame itself, over every necessary column.
# The mask is built from `df_valid` only, so it shares the index of the frame being checked.
rows_still_incomplete = df_valid[necessary_column].isnull().any(axis=1)
assert not rows_still_incomplete.any(), (
    f"{int(rows_still_incomplete.sum())} rows still miss one of {necessary_column}"
)

In [None]:
# Normalise the merge keys: cast to text, then strip accents and upper-case
for merge_key in ('prenom', 'nom', 'genre'):
    key_as_text = df_valid[merge_key].astype(str)
    df_valid[merge_key] = key_as_text.apply(unaccent_and_upper)

In [None]:
# Lower-case every allocataire e-mail address (missing values stay missing)
df_valid['allocataire-courriel'] = df_valid['allocataire-courriel'].str.lower()

In [None]:
# Keep only beneficiaries born strictly after 16 September 1994
# (rows without a birthdate do not pass the comparison and are removed too).
# `pd.Timestamp` keeps this cell independent of whatever the name `datetime`
# is currently bound to in the notebook.
birthdate_cutoff = pd.Timestamp(year=1994, month=9, day=16)
mask_born_after_cutoff = pd.to_datetime(df_valid['date_naissance']) > birthdate_cutoff
# Explicit copy: the following cells assign into this frame, which must not
# be a slice of `df_valid`.
df_valid_after = df_valid[mask_born_after_cutoff].copy()

print(f"{len(df_valid) - len(df_valid_after)} rows where removed because date_naissance was before 1994")

In [None]:
from datetime import timedelta

# add missing 0 to phone numbers
# Every mask is computed on the full column, so they all share the
# DataFrame's index; `na=False` gives a plain boolean mask for missing numbers.
phone_numbers = df_valid_after['allocataire-telephone']
mask_tel_not_null = phone_numbers.notna()
mask_no_zero_phone_number = ~phone_numbers.str.startswith('0', na=False)
mask_9_char_phone = phone_numbers.str.len() == 9
mask_missing_leading_zero = mask_tel_not_null & mask_no_zero_phone_number & mask_9_char_phone
df_valid_after.loc[mask_missing_leading_zero, 'allocataire-telephone'] = (
    '0' + df_valid_after['allocataire-telephone']
)

# set '0' phone values to missing (`np.nan`: the `np.NaN` alias no longer exists in NumPy 2)
mask_tel_eq_zero = df_valid_after['allocataire-telephone'] == '0'
df_valid_after.loc[mask_tel_eq_zero, 'allocataire-telephone'] = np.nan

# add 4h on all birthdates
# NOTE(review): presumably guards against a timezone shift moving the date to
# the previous day at import time — confirm. Re-running this cell adds another 4 hours.
df_valid_after['date_naissance'] = df_valid_after['date_naissance'] + timedelta(hours=4)

In [None]:
# Two rows are the same beneficiary when all of these columns are equal
duplicate_key_columns = [
    'date_naissance',
    'nom',
    'prenom',
    'genre',
    'organisme',
    'situation',
    'allocataire-qualite',
    'allocataire-matricule',
    'allocataire-code_organisme',
    'allocataire-telephone',
    'allocataire-nom',
    'allocataire-prenom',
    'allocataire-courriel',
]

# Default `keep` behaviour: the first occurrence of each duplicate group is retained
df_valid_no_duplicate = df_valid_after.drop_duplicates(subset=duplicate_key_columns)

print(f"{len(df_valid_after) - len(df_valid_no_duplicate)} duplicate rows were removed")

In [None]:
# map allocataire json
def to_json_allocataire_without_null(row):
    """Serialise the allocataire fields of `row` as a JSON object string.

    `row` is indexed by the `allocataire-*` labels. The last name and first
    name go through `unaccent_and_upper`; the other values are taken as they
    are. Fields whose final value is null are left out. Non-ASCII characters
    are written unescaped.
    """
    # (JSON key, row label, optional normaliser), in output order
    field_sources = (
        ('qualite', 'allocataire-qualite', None),
        ('matricule', 'allocataire-matricule', None),
        ('code_organisme', 'allocataire-code_organisme', None),
        ('telephone', 'allocataire-telephone', None),
        ('nom', 'allocataire-nom', unaccent_and_upper),
        ('prenom', 'allocataire-prenom', unaccent_and_upper),
        ('courriel', 'allocataire-courriel', None),
    )

    allocataire_payload = {}
    for json_key, row_label, normalise in field_sources:
        value = row[row_label]
        if normalise is not None:
            value = normalise(value)
        if pd.notnull(value):
            allocataire_payload[json_key] = value

    return json.dumps(allocataire_payload, ensure_ascii=False)


# Build the `allocataire` JSON column: axis=1 hands each row to the function
df_valid_no_duplicate['allocataire'] = df_valid_no_duplicate.apply(to_json_allocataire_without_null, axis=1)

In [None]:
# map adresse_allocataire json
def to_json_adresse_without_null(row):
    """Serialise the allocataire address fields of `row` as a JSON object string.

    `row` is indexed by the `adresse_allocataire-*` labels. The postal code
    and the INSEE code go through `format_insee_or_postal_code`; the other
    values are taken as they are. Fields whose final value is null are left
    out. Non-ASCII characters are written unescaped.
    """
    voie = row['adresse_allocataire-voie']
    code_postal = format_insee_or_postal_code(row['adresse_allocataire-code_postal'])
    commune = row['adresse_allocataire-commune']
    code_insee = format_insee_or_postal_code(row['adresse_allocataire-code_insee'])
    cplt_adresse = row['adresse_allocataire-cplt_adresse']

    # Pairs are listed in the order the keys must appear in the JSON output
    candidate_fields = (
        ('voie', voie),
        ('code_postal', code_postal),
        ('commune', commune),
        ('code_insee', code_insee),
        ('cplt_adresse', cplt_adresse),
    )
    adresse_payload = {key: value for key, value in candidate_fields if pd.notnull(value)}
    return json.dumps(adresse_payload, ensure_ascii=False)


# Build the `adresse_allocataire` JSON column: axis=1 hands each row to the function
df_valid_no_duplicate['adresse_allocataire'] = df_valid_no_duplicate.apply(to_json_adresse_without_null, axis=1)

In [None]:
# Drop the flat allocataire / address columns listed below
flattened_columns = [
    'allocataire-qualite',
    'allocataire-matricule',
    'allocataire-code_organisme',
    'allocataire-nom',
    'allocataire-prenom',
    'allocataire-telephone',
    'allocataire-courriel',
    'adresse_allocataire-voie',
    'adresse_allocataire-code_postal',
    'adresse_allocataire-commune',
    'adresse_allocataire-code_insee',
    'adresse_allocataire-cplt_adresse',
]
df_final = df_valid_no_duplicate.drop(labels=flattened_columns, axis='columns')

In [None]:
# Add missing default column needed for target DB model
# `datetime` is the class imported at the top of the notebook. A local
# `import datetime` would rebind that name to the module and make the earlier
# cells that call `datetime(...)` fail when they are run again.
df_final['updated_at'] = datetime.now()
df_final['exercice_id'] = exercice_id
# `np.nan`: the `np.NaN` alias no longer exists in NumPy 2
df_final['uuid_doc'] = np.nan

In [None]:
# output to CSV
# Both timestamp columns are cast to text before the export
for timestamp_column in ('updated_at', 'date_naissance'):
    df_final[timestamp_column] = df_final[timestamp_column].astype(str)

# df_final.to_parquet('cnaf-export.parquet', index=False)
# Semicolon-separated, UTF-8, without the index column
df_final.to_csv('cnaf-export.csv', sep=';', index=False, encoding='utf-8')

In [None]:
# Number of rows in the final frame
len(df_final)