## Process
- Load CNOUS file
- Clean the data and apply a first mapping for database (BDD) injection
- Drop duplicates
- Add default column values
- Output to CSV

## Encoding
CNOUS -> windows-1252 (the encoding of the 2025 file still has to be checked; the loader below reads it as UTF-8)

In [None]:
from utils.data_utils import unaccent_and_upper, format_insee_or_postal_code, get_country_from_iso
import os
from dotenv import load_dotenv
import pandas as pd
import json
from datetime import datetime
import numpy as np

# Load environment variables through python-dotenv before reading them
load_dotenv()

# Value written to the 'exercice_id' column of every exported row
exercice_id = 4

# Input file and export destination; a missing variable raises KeyError
env = os.environ
cnous_filepath = env['CNOUS_PATHFILE_2025']
base_output_filepath = env['DB_CNOUS_EXPORT_2025']

In [None]:
# Read every column as text; lines the parser cannot handle are skipped
# silently (on_bad_lines='skip'), so the row count may be lower than the file's.
csv_read_options = {
    'encoding': 'utf-8',
    'on_bad_lines': 'skip',
    'sep': ',',
    'engine': "c",
    'dtype': str,
}
df_cnous = pd.read_csv(cnous_filepath, **csv_read_options)

In [None]:
# Display the column labels of the loaded CNOUS frame
df_cnous.columns

In [None]:
# Cleaning and formatting
# Trim whitespace, then apply unaccent_and_upper to the identity columns.
# Missing cells are deliberately left as NaN (na_action='ignore'): casting
# them with astype(str) would turn them into the literal text 'nan', which
# the later dropna() on nom / prenom / genre could no longer detect.
for source_column in ('allocataire-nom', 'allocataire-prenom', 'allocataire-qualite'):
    df_cnous[source_column] = (
        df_cnous[source_column]
        .str.strip()
        .map(unaccent_and_upper, na_action='ignore')
    )

# Copies of the cleaned values under the short column names used downstream
df_cnous.loc[:,'prenom'] = df_cnous['allocataire-prenom']
df_cnous.loc[:,'nom'] = df_cnous['allocataire-nom']
df_cnous.loc[:,'genre'] = df_cnous['allocataire-qualite']

In [None]:
# Constant columns set to the same value on every row
df_cnous['organisme'] = 'cnous'
df_cnous['situation'] = 'boursier'

# Lower-case the e-mail addresses
lowercased_emails = df_cnous['allocataire-courriel'].str.lower()
df_cnous.loc[:, 'allocataire-courriel'] = lowercased_emails

# Relabel the qualite value 'F' as 'Mme'
relabelled_qualite = df_cnous['allocataire-qualite'].replace({'F': 'Mme'})
df_cnous.loc[:, 'allocataire-qualite'] = relabelled_qualite

In [None]:
# Beneficiary information = allocataire
# Parse the day/month/year birth dates once, then keep both the datetime
# column and a re-formatted text version of the source column.
parsed_birth_dates = pd.to_datetime(df_cnous['allocataire-date_naissance'], format='%d/%m/%Y')
df_cnous.loc[:, 'date_naissance'] = parsed_birth_dates
df_cnous.loc[:, 'allocataire-date_naissance'] = parsed_birth_dates.dt.strftime('%d/%m/%Y')

In [None]:
# Upper-case the birth commune names
birth_communes = df_cnous['allocataire-commune_naissance']
df_cnous.loc[:, 'allocataire-commune_naissance'] = birth_communes.str.upper()

In [None]:
# Apply the eligibility criteria to the CNOUS data
from datetime import timedelta

# Keep birth dates from 1997-01-01 to 2025-12-31, both bounds included
birth_dates = df_cnous['date_naissance']
mask_dob_start = birth_dates.ge(datetime(1997, 1, 1))
mask_dob_end = birth_dates.le(datetime(2025, 12, 31))
mask_dob = mask_dob_start & mask_dob_end

df_cnous_filtered = df_cnous.loc[mask_dob]

# Report how many rows the filter removed, then how many remain
removed_row_count = len(df_cnous) - len(df_cnous_filtered)
print(f"{removed_row_count} rows for CNOUS dataframe were removed based on criterias")
print(f"{len(df_cnous_filtered)}")

In [None]:
# Remove rows with missing necessary values (if one of those values is
# missing we cannot generate a code)
necessary_column = ['nom', 'prenom', 'date_naissance', 'genre']
df_all_valid_row = df_cnous_filtered.dropna(subset=necessary_column)

# Remove columns whose values are all null. The explicit copy makes the result
# an independent frame, so later column assignments do not target a slice
# derived from df_cnous.
df_all_valid = df_all_valid_row.dropna(axis=1, how='all').copy()

print(f"{len(df_all_valid)}")

# Sanity check on the cleaned frame itself: no required value may remain null.
# All four required columns are checked against df_all_valid's own index.
rows_with_missing_required = df_all_valid[necessary_column].isnull().any(axis=1)
assert not rows_with_missing_required.any(), (
    f"{int(rows_with_missing_required.sum())} rows still miss a required value"
)

In [None]:
# Shift every birth date forward by four hours
# NOTE(review): the reason for the offset is not stated in this notebook - confirm
birth_date_offset = timedelta(hours=4)
df_all_valid.loc[:, 'date_naissance'] = df_all_valid['date_naissance'] + birth_date_offset

In [None]:
# Number of rows kept after cleaning
df_all_valid.shape[0]

In [None]:
# Count how many retained rows share each birth commune value
df_all_valid['allocataire-commune_naissance'].value_counts()

In [None]:
from utils.data_utils import get_country_from_iso

# map to json values for target DB model
## map allocataire json
def to_json_allocataire_without_null(row):
    """Serialise the allocataire fields of one row as a JSON string.

    Args:
        row: Mapping-like row (a DataFrame row when called through
            ``DataFrame.apply(axis=1)``) exposing the ``allocataire-*``
            columns read below.

    Returns:
        JSON text, with non-ASCII characters kept as-is, containing only the
        entries whose value is not null.
    """
    # The birth-country ISO code is optional. String methods and the country
    # lookup are only used when a code is present; calling .upper() on a
    # missing (NaN) cell would raise AttributeError.
    iso_code = row['allocataire-code_iso_pays_naissance']
    if pd.notnull(iso_code):
        iso_code = iso_code.upper()
        country = get_country_from_iso(iso_code)
        # If the lookup yields no country, leave the entry out instead of failing
        country = country.upper() if pd.notnull(country) else None
    else:
        iso_code = None
        country = None

    allocataire_mapping = {
        'qualite': row['allocataire-qualite'],
        'nom': unaccent_and_upper(row['allocataire-nom']),
        'prenom': unaccent_and_upper(row['allocataire-prenom']),
        'date_naissance': row['allocataire-date_naissance'],
        'courriel': row['allocataire-courriel'],
        'code_insee_commune_naissance': format_insee_or_postal_code(row['allocataire-code_insee_commune_naissance']),
        'commune_naissance': row['allocataire-commune_naissance'],
        'code_iso_pays_naissance': iso_code,
        'pays_naissance': country,
    }
    # Null entries (None / NaN) are omitted from the JSON document
    filtered_NaN_allocataire = {k: v for k, v in allocataire_mapping.items() if pd.notnull(v)}
    return json.dumps(filtered_NaN_allocataire, ensure_ascii=False)

# Build the 'allocataire' JSON column by applying the mapper to every row
df_all_valid['allocataire'] = df_all_valid.apply(to_json_allocataire_without_null, axis=1)

In [None]:
# Row count after adding the 'allocataire' column
df_all_valid.shape[0]

In [None]:
## map adresse_allocataire json
def to_json_adresse_without_null(row):
    """Serialise the address fields of one row as a JSON string.

    Entries whose value is null are left out; non-ASCII characters are
    written as-is.
    """
    # (output key, value) pairs, in the order they appear in the JSON document
    address_fields = (
        ('voie', row['adresse_allocataire-voie']),
        ('code_postal', format_insee_or_postal_code(row['adresse_allocataire-code_postal'])),
        ('nom_adresse_postale', row['adresse_allocataire-nom_adresse_postale']),
        ('commune', row['adresse_allocataire-commune']),
        ('code_insee', format_insee_or_postal_code(row['adresse_allocataire-code_insee'])),
        ('cplt_adresse', row['adresse_allocataire-cplt_adresse']),
    )

    address_payload = {}
    for field_name, field_value in address_fields:
        if pd.isnull(field_value):
            continue
        address_payload[field_name] = field_value
    return json.dumps(address_payload, ensure_ascii=False)

# Build the 'adresse_allocataire' JSON column by applying the mapper to every row
df_all_valid['adresse_allocataire'] = df_all_valid.apply(to_json_adresse_without_null, axis=1)

In [None]:
## Drop the raw source columns that are not kept in the export
# errors='ignore': the earlier removal of all-null columns may already have
# dropped some of these (for example a column that is empty in the whole
# file), and a missing label would otherwise raise KeyError here.
source_columns_to_drop = [
  'allocataire-qualite',
  'allocataire-matricule',
  'allocataire-code_organisme',
  'allocataire-nom',
  'allocataire-prenom',
  'allocataire-telephone',
  'allocataire-date_naissance',
  'allocataire-courriel',
  'allocataire-code_insee_commune_naissance',
  'allocataire-commune_naissance',
  'allocataire-code_iso_pays_naissance',
  'allocataire-pays_naissance',
  'adresse_allocataire-voie',
  'adresse_allocataire-nom_adresse_postale',
  'adresse_allocataire-code_postal',
  'adresse_allocataire-commune',
  'adresse_allocataire-code_insee',
  'adresse_allocataire-cplt_adresse',
]
df_final = df_all_valid.drop(columns=source_columns_to_drop, errors='ignore')

In [None]:
# Add missing default columns needed for the target DB model
# `datetime` is the class imported at the top of the notebook. It is not
# re-imported as a module here, because rebinding the name would break the
# earlier `datetime(1997, 1, 1)` calls when those cells are re-run.
df_final.loc[:,'updated_at'] = datetime.now()
df_final.loc[:,'exercice_id'] = exercice_id
# np.nan is used because the np.NaN alias was removed in NumPy 2.0
df_final.loc[:,'uuid_doc'] = np.nan

In [None]:
# Write the final frame as CSV to the export path read from the environment
# (default to_csv options, so the frame index is written as the first column)
df_final.to_csv(path_or_buf=base_output_filepath)