In [None]:
from utils.data_utils import unaccent_and_upper, format_insee_or_postal_code, get_country_from_iso
import os
from dotenv import load_dotenv
import pandas as pd
import json
from datetime import datetime
import numpy as np

load_dotenv()

# Input and output locations are read from the environment; a missing variable
# raises KeyError right here.
cnous_filepath = os.environ['CNOUS_PATHFILE_2025']
base_output_filepath = os.environ['DB_CNOUS_EXPORT_2025']
exercice_id = 4

# dtype=str loads every column as strings. Malformed lines are still skipped, but
# 'warn' reports each of them instead of dropping rows silently.
df_cnous = pd.read_csv(cnous_filepath, encoding='utf-8', on_bad_lines='warn', sep=',', engine="c", dtype=str)

In [None]:
# Cleaning and formatting
# Strip, unaccent and uppercase the identity columns. Missing values are kept as
# NaN: casting with astype(str) would turn them into the literal string 'nan',
# and a dropna() on nom/prenom/genre could then never remove incomplete rows.
for identity_column in ('allocataire-nom', 'allocataire-prenom', 'allocataire-qualite'):
    df_cnous[identity_column] = (
        df_cnous[identity_column]
        .str.strip()
        .map(unaccent_and_upper, na_action='ignore')
    )

# Short aliases of the cleaned identity columns
df_cnous.loc[:,'prenom'] = df_cnous['allocataire-prenom']
df_cnous.loc[:,'nom'] = df_cnous['allocataire-nom']
df_cnous.loc[:,'genre'] = df_cnous['allocataire-qualite']

In [None]:
# Constant columns, identical for every row of this import
for constant_column, constant_value in (('organisme', 'cnous'), ('situation', 'boursier')):
    df_cnous.loc[:, constant_column] = constant_value

# Lower-case the e-mail addresses
courriel_lower = df_cnous['allocataire-courriel'].str.lower()
df_cnous.loc[:, 'allocataire-courriel'] = courriel_lower

# Values equal to 'F' become 'Mme'; every other value is left untouched
qualite_relabelled = df_cnous['allocataire-qualite'].replace({'F': 'Mme'})
df_cnous.loc[:, 'allocataire-qualite'] = qualite_relabelled

In [None]:
# beneficiary information = allocataire
# Parse the dd/mm/YYYY text once, then reuse the parsed series twice:
# as a datetime column and re-rendered as dd/mm/YYYY text.
birth_dates = pd.to_datetime(df_cnous['allocataire-date_naissance'], format='%d/%m/%Y')
df_cnous.loc[:,'date_naissance'] = birth_dates
df_cnous.loc[:,'allocataire-date_naissance'] = birth_dates.dt.strftime('%d/%m/%Y')

In [None]:
# Upper-case the birth commune
commune_naissance_upper = df_cnous['allocataire-commune_naissance'].str.upper()
df_cnous.loc[:, 'allocataire-commune_naissance'] = commune_naissance_upper

In [None]:
# Keep only the rows whose birth date lies between 1997-01-01 and 2025-12-31 (both included)
from datetime import timedelta

dob_lower_bound = datetime(1997, 1, 1)
dob_upper_bound = datetime(2025, 12, 31)

mask_dob_start = df_cnous['date_naissance'].ge(dob_lower_bound)
mask_dob_end = df_cnous['date_naissance'].le(dob_upper_bound)
mask_dob = mask_dob_start & mask_dob_end

df_cnous_filtered = df_cnous.loc[mask_dob]

# Report how many rows were filtered out, then how many remain
removed_rows_count = len(df_cnous) - len(df_cnous_filtered)
print(f"{removed_rows_count} rows for CNOUS dataframe were removed based on criterias")
print(f"{len(df_cnous_filtered)}")

In [None]:
# remove rows with missing necessary values (if one of those values is missing we cannot generate a code)
necessary_column = ['nom', 'prenom', 'date_naissance', 'genre']
df_all_valid_row = df_cnous_filtered.dropna(subset=necessary_column)

# remove columns with all null value
df_all_valid = df_all_valid_row.dropna(axis=1, how='all')

print(f"{len(df_all_valid)}")
# The check is built from df_all_valid only, so the mask is aligned with the
# frame being verified, and it covers every necessary column (including 'genre').
assert not df_all_valid[necessary_column].isna().any().any(), "rows with missing necessary values remain"

In [None]:
# add 4h on all birthdates
# Normalising to midnight first keeps this cell idempotent: running it again
# still yields midnight + 4h instead of shifting the dates by another 4 hours.
df_all_valid.loc[:,'date_naissance'] = df_all_valid['date_naissance'].dt.normalize() + pd.Timedelta(hours=4)

In [None]:
# Number of rows kept so far
df_all_valid.shape[0]

In [None]:
# When iso pays naissance is na, we assume it is from FRANCE.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# the column selection: that chained in-place form is deprecated and no longer
# updates the DataFrame once pandas Copy-on-Write is enabled.
df_all_valid['allocataire-code_iso_pays_naissance'] = df_all_valid['allocataire-code_iso_pays_naissance'].fillna('FR')

# Sanity check: no row should be displayed here once the default is applied
df_all_valid[df_all_valid['allocataire-code_iso_pays_naissance'].isna()]

In [None]:
from utils.data_utils import get_country_from_iso

# map to json values for target DB model
## map allocataire json
def to_json_allocataire_without_null(row):
    """Serialise the allocataire fields of one row to a JSON string.

    Entries whose value is null or an empty string are left out. Non-ASCII
    characters are written as-is (ensure_ascii=False).
    """
    payload = {
        'qualite': row['allocataire-qualite'],
        'nom': unaccent_and_upper(row['allocataire-nom']),
        'prenom': unaccent_and_upper(row['allocataire-prenom']),
        'date_naissance': row['allocataire-date_naissance'],
        'courriel': row['allocataire-courriel'],
        'code_insee_commune_naissance': format_insee_or_postal_code(row['allocataire-code_insee_commune_naissance']),
        'commune_naissance': row['allocataire-commune_naissance'],
        'matricule': row['allocataire-matricule'],
    }

    # The upper-cased ISO code is emitted as-is and also used to look up the country name
    birth_iso_code = row['allocataire-code_iso_pays_naissance'].upper()
    payload['code_iso_pays_naissance'] = birth_iso_code
    payload['pays_naissance'] = get_country_from_iso(birth_iso_code).upper()

    # Keep only the entries that carry an actual value
    kept = {}
    for key, value in payload.items():
        if pd.isnull(value) or value == '':
            continue
        kept[key] = value

    return json.dumps(kept, ensure_ascii=False)

# Build the allocataire JSON string row by row, then store it as a column
allocataire_json = df_all_valid.apply(to_json_allocataire_without_null, axis=1)
df_all_valid['allocataire'] = allocataire_json

In [None]:
# Some rows don't have any adresse-allocataire_voie, these aren't included in the json object subsequently.
# The filled column is assigned back: fillna(inplace=True) on a column selection
# is deprecated and no longer updates the DataFrame under pandas Copy-on-Write.
df_all_valid['adresse-allocataire_voie'] = df_all_valid['adresse-allocataire_voie'].fillna('')

def _clean_address_text(value):
    """Return the stripped, unaccented, upper-cased text, or None when the value is missing."""
    if pd.isnull(value):
        return None
    return unaccent_and_upper(value.strip())


def to_json_adresse_without_null(row):
    """Serialise the address fields of one row to a JSON string.

    Entries whose value is null or an empty string are left out. A missing
    'voie' or 'commune' is skipped instead of raising AttributeError on
    `.strip()` (a missing cell is a float NaN, not a string).
    """
    voie = _clean_address_text(row['adresse-allocataire_voie'])
    adresse_mapping = {
        # Double quotes in the street line are replaced by single quotes
        'voie': voie.replace('"', '\'') if voie is not None else None,
        'commune': _clean_address_text(row['adresse-allocataire_commune']),
        'code_postal': format_insee_or_postal_code(row['adresse-allocataire_code_postal']),
        'code_insee': format_insee_or_postal_code(row['adresse-allocataire_code_insee']),
        'cplt_adresse': row['adresse-allocataire_cplt_adresse'],
    }

    filtered_address = {k: v for k, v in adresse_mapping.items() if pd.notnull(v) and v != ''}
    return json.dumps(filtered_address, ensure_ascii=False)

# Build the address JSON string row by row, then store it as a column
adresse_json = df_all_valid.apply(to_json_adresse_without_null, axis=1)
df_all_valid['adresse_allocataire'] = adresse_json

In [None]:
# Rows whose street line is the empty string
df_all_valid.loc[df_all_valid['adresse-allocataire_voie'].eq('')]

In [None]:
# Read the existing codes from the path configured in the environment, not from
# a developer-specific absolute path. Malformed lines are skipped but reported.
existing_codes_filepath = os.environ['EXISTING_CODES_PATHFILE_2025']
existing_codes = pd.read_csv(existing_codes_filepath, on_bad_lines='warn', sep=',', engine="c")

# The code generation further down sizes its pool with len(existing_codes);
# fail early with a clear message if the file holds repeated codes.
assert existing_codes['code'].is_unique, "existing codes file contains duplicated codes"

In [None]:
# Unique codes generation
import random
import string
# Import the class, not the module: a plain `import datetime` rebinds the name
# `datetime` to the module and breaks every cell that calls datetime(...) on re-run.
from datetime import datetime

current_date = datetime.now()
current_year = str(current_date.year)[-2:]

# Upper-case ASCII letters without 'O' and 'I', built once instead of on every call
_CODE_ALPHABET = [c for c in string.ascii_uppercase if c not in 'OI']

def get_characters_set(size = 4):
    """Return `size` random upper-case letters (never 'O' or 'I') as one string."""
    return ''.join(random.choices(_CODE_ALPHABET, k=size))

def generate_code():
    """Return a code shaped 'YY-XXXX-XXXX', YY being the last two digits of the current year."""
    return f"{current_year}-{get_characters_set(4)}-{get_characters_set(4)}"

# Seed the pool with the codes that already exist
unique_codes = set(existing_codes['code'])

# Size of the pool before any new code is generated
current_codes_count = len(unique_codes)

# The set absorbs collisions, so keep drawing until the pool holds one entry
# per existing code plus one per valid row
target_codes_count = len(df_all_valid) + len(existing_codes)
while len(unique_codes) < target_codes_count:
    unique_codes.add(generate_code())

In [None]:
# The pool must hold exactly one code per existing entry plus one per valid row
expected_codes_count = len(df_all_valid) + len(existing_codes)
assert len(unique_codes) == expected_codes_count

In [None]:
# Codes for production data: everything in the pool that is not an existing code
already_used_codes = set(existing_codes['code'])
new_codes = unique_codes - already_used_codes
assert len(df_all_valid) == len(new_codes)

In [None]:
# One new code per row; new_codes is a set, so the pairing follows its iteration order
df_all_valid['id_psp'] = [*new_codes]

In [None]:
# Add missing default columns needed for target DB model
timestamp_with_custom_tz = pd.Timestamp.now(tz='Europe/Paris')

df_all_valid.loc[:,'exercice_id'] = exercice_id
# np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
df_all_valid.loc[:,'uuid_doc'] = np.nan
df_all_valid[['zrr', 'qpv', 'a_valider', 'refuser']] = False
# Both timestamps share the same Europe/Paris "now" value
df_all_valid[['created_at', 'updated_at']] = timestamp_with_custom_tz
len(df_all_valid)

In [None]:
# Count the e-mail addresses carried by more than one row (value_counts ignores NaN)
counts = df_all_valid['allocataire-courriel'].value_counts()
duplicates = counts[counts > 1]
print(f"{len(duplicates)} total duplicates")

# Drop every row whose e-mail is shared with another row. Rows without an e-mail
# are kept: drop_duplicates(keep=False) treats NaN values as equal and would
# discard all of them, although they are not part of the count reported above.
courriel = df_all_valid['allocataire-courriel']
is_shared_courriel = courriel.notna() & courriel.duplicated(keep=False)
df_all_valid = df_all_valid[~is_shared_courriel]
print(f"{len(df_all_valid)} students to inject")

In [None]:
# Write the columns of the target DB model, in this order, to the export CSV
db_columns = [
    'nom', 'prenom', 'date_naissance', 'genre',
    'organisme', 'situation',
    'allocataire', 'adresse_allocataire',
    'created_at', 'updated_at',
    'exercice_id', 'uuid_doc',
    'zrr', 'qpv', 'a_valider', 'refuser',
    'id_psp',
]
export_frame = df_all_valid[db_columns]
export_frame.to_csv(base_output_filepath)