## Process
- Load CNOUS file
- Clean data and apply a first column mapping for database injection
- Drop duplicates
- Add default column values
- Output to CSV

## Encoding
CNOUS file -> windows-1252 (to be re-checked for the 2025 file)

In [None]:
import csv
import os
from dotenv import load_dotenv
import pandas as pd
import json
from datetime import datetime
import numpy as np

from data.utils.data_utils import unaccent_and_upper, format_insee_or_postal_code

# Load variables from a local .env file into the process environment (python-dotenv).
load_dotenv()

# Both variables are mandatory: os.environ[...] raises KeyError when one is unset.
cnous_filepath = os.environ['CNOUS_PATHFILE']  # CNOUS input file read with pd.read_csv
base_output_filepath = os.environ['DB_EXPORT']  # destination given to DataFrame.to_csv
# Value written to the 'exercice_id' column of every exported row.
exercice_id = 4

In [None]:
# Read the birth-place code columns as text rather than letting pandas infer a dtype.
cnous_column_type = dict.fromkeys(
    ('lieuNaissCodeCommuneInsee', 'lieuNaissCodePays'),
    'str',
)

# Semicolon-separated, windows-1252 encoded export.
# on_bad_lines='skip' makes pandas drop malformed lines without raising.
cnous_df = pd.read_csv(
    cnous_filepath,
    sep=';',
    encoding='windows-1252',
    dtype=cnous_column_type,
    engine="c",
    on_bad_lines='skip',
)

In [None]:
# Maps raw CNOUS column names (keys) to the flattened target column names (values).
# Used with DataFrame.rename: source columns not listed here keep their original name.
cnous_column_mapping = {
    # allocataire information
    'ine': 'allocataire-matricule',

    # no organisation code ("code organisme") is mapped for CNOUS
    'civiliteLibelleCourt': 'allocataire-qualite',
    'nom': 'allocataire-nom',
    'prenom': 'allocataire-prenom',
    'dateNaissance': 'allocataire-date_naissance',
    'mail': 'allocataire-courriel',
    'lieuNaissCodeCommuneInsee': 'allocataire-code_insee_commune_naissance',
    'lieuNaissLibelleCommune': 'allocataire-commune_naissance',
    'lieuNaissCodePays': 'allocataire-code_iso_pays_naissance',
    'lieuNaissLibellePays': 'allocataire-pays_naissance',

    # allocataire address
    'adresseVoie': 'adresse_allocataire-voie',
    'adresseCodePostal': 'adresse_allocataire-code_postal',
    'adresseLocalite': 'adresse_allocataire-commune',
    'adresseCodeLocalite': 'adresse_allocataire-code_insee',
    'adresseComplement1': 'adresse_allocataire-cplt_adresse',

    # beneficiary information
    # NOTE(review): this entry renames a source column called 'genre' to
    # 'civiliteLibelleCourt', i.e. the reverse direction of every other entry;
    # it looks like a documentation-only line - confirm the intent.
    'genre': 'civiliteLibelleCourt',
}

# Build the working frame without touching cnous_df: discard the two address
# columns that are not exported, then switch to the target column names.
df_psp_mapped_cnous = (
    cnous_df
    .drop(columns=['adresseCodePays', 'adresseComplement2'])
    .rename(columns=cnous_column_mapping)
)

# Constant organisation / situation for every CNOUS row
df_psp_mapped_cnous['organisme'] = 'cnous'
df_psp_mapped_cnous['situation'] = 'boursier'

# Remove rows whose birth-date cell holds the literal text 'dateNaissance'
# (the header name), which cannot be parsed as a date.
# .copy() so the assignments below act on an independent frame, not on a slice.
df_psp_mapped_cnous = df_psp_mapped_cnous[
    df_psp_mapped_cnous['allocataire-date_naissance'] != 'dateNaissance'
].copy()

# Beneficiary information = allocataire information for CNOUS
birth_dates = pd.to_datetime(df_psp_mapped_cnous['allocataire-date_naissance'], format='%d/%m/%Y')
# The beneficiary birth date must stay a datetime column: later cells use the
# .dt accessor on it and add a timedelta to it.
df_psp_mapped_cnous['date_naissance'] = birth_dates
# The allocataire birth date ends up in a json.dumps payload, which cannot
# serialise Timestamps, so it is the one stored as a 'YYYY-MM-DD' string.
df_psp_mapped_cnous['allocataire-date_naissance'] = birth_dates.dt.strftime('%Y-%m-%d')
df_psp_mapped_cnous['nom'] = df_psp_mapped_cnous['allocataire-nom']
df_psp_mapped_cnous['prenom'] = df_psp_mapped_cnous['allocataire-prenom']
# 'genre' keeps the raw civility value; only 'allocataire-qualite' gets 'F' -> 'Mme'
df_psp_mapped_cnous['genre'] = df_psp_mapped_cnous['allocataire-qualite']
df_psp_mapped_cnous['allocataire-qualite'] = df_psp_mapped_cnous['allocataire-qualite'].replace('F', 'Mme')

In [None]:
# Apply the eligibility criteria on CNOUS data
from datetime import timedelta
from dateutil.relativedelta import relativedelta

# Cut off date for eligibility for year 2025
end_date = pd.to_datetime('2025-10-15').date()
# Earliest accepted birth date: 28 years before the cut-off
start_date = end_date - relativedelta(years=28)

# Parse explicitly before comparing: the .dt accessor raises on a column of
# formatted strings, and pd.to_datetime is a no-op on a datetime column.
# normalize() drops any time-of-day so the comparison is made on calendar days.
birth_days = pd.to_datetime(df_psp_mapped_cnous['date_naissance']).dt.normalize()
# Both bounds are inclusive; unparseable/missing dates (NaT) are rejected.
cnous_situation_mask = birth_days.between(pd.Timestamp(start_date), pd.Timestamp(end_date))
df_psp_mapped_cnous_filtered = df_psp_mapped_cnous[cnous_situation_mask]

print(f"{len(df_psp_mapped_cnous) - len(df_psp_mapped_cnous_filtered)} rows for CNOUS dataframe were removed based on criterias")

In [None]:
# Concatenate into a single dataframe (one source here); ignore_index rebuilds a 0..n-1 index
df_all = pd.concat([df_psp_mapped_cnous_filtered], axis=0, ignore_index=True)

# Remove rows with missing necessary values (if one of those values is missing we cannot generate a code)
necessary_column = ['nom', 'prenom', 'date_naissance', 'genre']
df_all_valid_row = df_all.dropna(subset=necessary_column)

# Remove columns that are null on every row.
# .copy() so later column assignments on df_all_valid act on an independent frame.
df_all_valid = df_all_valid_row.dropna(axis=1, how='all').copy()

# Sanity check on the frame that is actually kept (not on df_all, whose index
# no longer matches), covering every necessary column.
assert not df_all_valid[necessary_column].isnull().to_numpy().any(), \
    "null values remain in the necessary columns"

In [None]:
# Normalise the identity columns with unaccent_and_upper so they compare equal in the merge
for identity_column in ('prenom', 'nom', 'genre'):
    as_text = df_all_valid[identity_column].astype(str)
    df_all_valid[identity_column] = as_text.apply(unaccent_and_upper)

In [None]:
# Store every allocataire e-mail address in lower case
lowercased_emails = df_all_valid['allocataire-courriel'].str.lower()
df_all_valid.loc[:, 'allocataire-courriel'] = lowercased_emails

In [None]:
# Keep only the beneficiaries born strictly after 1994-09-16
birth_cutoff = datetime(1994, 9, 16)
mask_before = pd.to_datetime(df_all_valid['date_naissance']).gt(birth_cutoff)
df_all_valid_after = df_all_valid.loc[mask_before]

removed_row_count = len(df_all_valid) - len(df_all_valid_after)
print(f"{removed_row_count} rows where removed because date_naissance was before 1994")

In [None]:
# Add the missing leading 0 to phone numbers
phone_column = 'allocataire-telephone'

# The column may be absent: columns that are null on every row were dropped
# earlier, so skip the clean-up instead of failing with a KeyError.
if phone_column in df_all_valid_after.columns:
    phones = df_all_valid_after[phone_column]
    mask_tel_not_null = phones.notna()
    # na=False: a missing value is treated as "does not start with 0" and is
    # excluded anyway by mask_tel_not_null
    mask_no_zero_phone_number = ~phones.str.startswith('0', na=False)
    mask_9_char_phone = phones.str.len() == 9
    mask_needs_zero = mask_tel_not_null & mask_no_zero_phone_number & mask_9_char_phone
    df_all_valid_after.loc[mask_needs_zero, phone_column] = '0' + phones[mask_needs_zero]

    # Set '0' phone values to null (np.nan: the np.NaN alias was removed in NumPy 2.0)
    mask_tel_eq_zero = df_all_valid_after[phone_column] == '0'
    df_all_valid_after.loc[mask_tel_eq_zero, phone_column] = np.nan

# Add 4h on all birthdates
# Parse first: adding a timedelta to a column of formatted strings raises a
# TypeError, while pd.to_datetime leaves an already-datetime column unchanged.
# Plain column assignment (not .loc[:, ...]) so the column takes the datetime dtype.
df_all_valid_after['date_naissance'] = pd.to_datetime(df_all_valid_after['date_naissance']) + timedelta(hours=4)

In [None]:
# Remove duplicate beneficiaries
duplicate_key_columns = [
  'date_naissance',
  'nom',
  'prenom',
  'genre',
  'organisme',
  'situation',
  'allocataire-qualite',
  'allocataire-matricule',
  'allocataire-code_organisme',
  'allocataire-telephone',

  # 'allocataire-nom' is not part of the key
  'allocataire-prenom',
  'allocataire-date_naissance',
  'allocataire-courriel',

  # the birth-place columns below could additionally be removed from the key
  'allocataire-code_insee_commune_naissance',
  'allocataire-commune_naissance',
  'allocataire-code_iso_pays_naissance',
  'allocataire-pays_naissance',
]

# drop_duplicates raises KeyError on an unknown subset column. A column can be
# absent here because it was never produced by the mapping or because it was
# null on every row and dropped earlier; such a column cannot tell two rows
# apart, so leaving it out of the key does not change the result.
present_key_columns = [
    column for column in duplicate_key_columns if column in df_all_valid_after.columns
]
df_all_valid_no_duplicate = df_all_valid_after.drop_duplicates(subset=present_key_columns)

print(f"{len(df_all_valid_after) - len(df_all_valid_no_duplicate)} duplicate rows were removed")

In [None]:
# Map to JSON values for the target DB model
## allocataire JSON
def to_json_allocataire_without_null(row):
    """Serialise the allocataire fields of one row to a JSON string, omitting nulls.

    Args:
        row: one DataFrame row (pandas Series) holding the flattened
            ``allocataire-*`` columns. ``allocataire-nom`` and
            ``allocataire-prenom`` are required; every other column is
            optional and treated as null when absent.

    Returns:
        str: JSON object text (non-ASCII characters kept as-is) containing
        only the fields whose value is not null.
    """
    def optional(column):
        # A column missing from the frame behaves like a null cell instead of
        # raising KeyError.
        return row.get(column, np.nan)

    birth_date = optional('allocataire-date_naissance')
    if isinstance(birth_date, pd.Timestamp):
        # json.dumps cannot serialise a Timestamp; emit the same 'YYYY-MM-DD'
        # text format used for formatted dates elsewhere in this notebook.
        birth_date = birth_date.strftime('%Y-%m-%d')

    allocataire_mapping = {
        'qualite': optional('allocataire-qualite'),
        'matricule': optional('allocataire-matricule'),
        'code_organisme': optional('allocataire-code_organisme'),
        'telephone': optional('allocataire-telephone'),
        'nom': unaccent_and_upper(row['allocataire-nom']),
        'prenom': unaccent_and_upper(row['allocataire-prenom']),
        'date_naissance': birth_date,
        'courriel': optional('allocataire-courriel'),
        'code_insee_commune_naissance': optional('allocataire-code_insee_commune_naissance'),
        'commune_naissance': optional('allocataire-commune_naissance'),
        'code_iso_pays_naissance': optional('allocataire-code_iso_pays_naissance'),
        'pays_naissance': optional('allocataire-pays_naissance'),
    }
    filtered_NaN_allocataire = {k: v for k, v in allocataire_mapping.items() if pd.notnull(v)}
    return json.dumps(filtered_NaN_allocataire, ensure_ascii=False)

# One JSON document per row, stored in the 'allocataire' column
df_all_valid_no_duplicate['allocataire'] = df_all_valid_no_duplicate.apply(to_json_allocataire_without_null, axis=1)

In [None]:
## adresse_allocataire JSON
def to_json_adresse_without_null(row):
    """Serialise the allocataire address fields of one row to a JSON string, omitting nulls.

    Args:
        row: one DataFrame row (pandas Series) holding the flattened
            ``adresse_allocataire-*`` columns. Every column is optional and
            treated as null when absent.

    Returns:
        str: JSON object text (non-ASCII characters kept as-is) containing
        only the fields whose value is not null.
    """
    def optional(column):
        # A column missing from the frame behaves like a null cell (NaN)
        # instead of raising KeyError.
        return row.get(column, np.nan)

    adresse_mapping = {
        'voie': optional('adresse_allocataire-voie'),
        'code_postal': format_insee_or_postal_code(optional('adresse_allocataire-code_postal')),
        'nom_adresse_postale': optional('adresse_allocataire-nom_adresse_postale'),
        'commune': optional('adresse_allocataire-commune'),
        'code_insee': format_insee_or_postal_code(optional('adresse_allocataire-code_insee')),
        'cplt_adresse': optional('adresse_allocataire-cplt_adresse'),
    }

    filtered_address = {k: v for k, v in adresse_mapping.items() if pd.notnull(v)}
    return json.dumps(filtered_address, ensure_ascii=False)

# One JSON document per row, stored in the 'adresse_allocataire' column
df_all_valid_no_duplicate['adresse_allocataire'] = df_all_valid_no_duplicate.apply(to_json_adresse_without_null, axis=1)

In [None]:
## Drop the flattened columns now carried by the 'allocataire' and
## 'adresse_allocataire' JSON columns
flattened_columns = [
  'allocataire-qualite',
  'allocataire-matricule',
  'allocataire-code_organisme',
  'allocataire-nom',
  'allocataire-prenom',
  'allocataire-telephone',
  'allocataire-date_naissance',
  'allocataire-courriel',
  'allocataire-code_insee_commune_naissance',
  'allocataire-commune_naissance',
  'allocataire-code_iso_pays_naissance',
  'allocataire-pays_naissance',
  'adresse_allocataire-voie',
  'adresse_allocataire-nom_adresse_postale',
  'adresse_allocataire-code_postal',
  'adresse_allocataire-commune',
  'adresse_allocataire-code_insee',
  'adresse_allocataire-cplt_adresse',
]
# errors='ignore': some of these columns may not exist (never produced by the
# mapping, or dropped earlier because null on every row); a missing column
# must not abort the export with a KeyError.
df_final = df_all_valid_no_duplicate.drop(columns=flattened_columns, errors='ignore')


In [None]:
# Add the default columns needed by the target DB model
# Uses the `datetime` class already imported at the top of the notebook:
# an `import datetime` here would rebind the name to the module and break the
# `datetime(1994, 9, 16)` call of an earlier cell when that cell is re-run.
df_final.loc[:, 'updated_at'] = datetime.now()
df_final.loc[:, 'exercice_id'] = exercice_id
# np.nan: the np.NaN alias was removed in NumPy 2.0
df_final.loc[:, 'uuid_doc'] = np.nan

In [None]:
# Write the final frame to the export path as CSV (pandas defaults: index included)
df_final.to_csv(path_or_buf=base_output_filepath)