# Code generation
Notebook that generates unique `id_psp` codes for the rows of two files that were processed separately (DS and CNOUS).

In [None]:
import os
import pandas as pd
from dotenv import load_dotenv

from utils.data_utils import get_current_date_for_file_name

# Let python-dotenv populate the environment before the paths are read below.
load_dotenv()

# Input locations come from the environment; a missing variable raises KeyError here.
cnous_pathfile = os.environ['CNOUS_PATHFILE_2025']
ds_pathfile = os.environ['DS_PATHFILE_2025']
existing_codes_pathfile = os.environ['EXISTING_CODES_PATHFILE_2025']

# Both input files are ';'-separated; dtype=str keeps every column as text.
df_cnous = pd.read_csv(cnous_pathfile, sep=';', encoding='utf-8', dtype=str)
df_aeeh = pd.read_csv(ds_pathfile, sep=';', encoding='utf-8', dtype=str)

# Existing codes are ','-separated. Malformed lines are still skipped, but they
# are now reported ('warn') instead of being dropped silently ('skip'): a code
# that is lost here is no longer protected against being generated again.
df_existing_codes = pd.read_csv(
    existing_codes_pathfile,
    sep=',',
    dtype=str,
    engine="c",
    keep_default_na=False,
    encoding="utf-8",
    on_bad_lines='warn',
)

In [None]:
# Stack the rows of both sources (CNOUS first, then DS) into a single frame.
source_frames = [df_cnous, df_aeeh]
df_merged = pd.concat(source_frames, axis=0)

# Discard the previously generated codes: they cannot be unique, so fresh
# ones are generated for the merged data further down.
df_merged = df_merged.drop(columns=['id_psp'])

In [None]:
# Unique codes generation
import random
import string
import datetime

# Two-digit year of the current date, used as the prefix of every code.
current_date = datetime.datetime.now()
current_year = current_date.strftime('%y')

# The concatenation kept each source's own index labels; replace them with a
# fresh 0..n-1 index.
df_merged = df_merged.reset_index(drop=True)

# Letters allowed in a code: uppercase ASCII without 'O' and 'I'
# (presumably excluded because they are easily confused with 0 and 1).
_CODE_ALPHABET = [c for c in string.ascii_uppercase if c not in 'OI']

# OS-backed random source: unlike the default Mersenne Twister generator, its
# output cannot be predicted from previously issued codes.
_CODE_RNG = random.SystemRandom()


def get_characters_set(size=4):
    """Return a random string of ``size`` letters from the code alphabet.

    Letters are drawn independently (repetition is possible) from the
    uppercase ASCII letters minus 'O' and 'I'.

    Args:
        size: Number of letters to draw. Defaults to 4.

    Returns:
        A string of length ``size``; empty when ``size`` is 0.
    """
    return ''.join(_CODE_RNG.choices(_CODE_ALPHABET, k=size))


def generate_code():
    """Build one candidate code shaped ``<yy>-<XXXX>-<XXXX>``.

    ``<yy>`` is the module-level ``current_year`` and each ``<XXXX>`` is a
    group of four random letters from ``get_characters_set``. Nothing here
    guarantees uniqueness; the caller is responsible for de-duplicating.
    """
    first_group = get_characters_set(4)
    second_group = get_characters_set(4)
    return "-".join((current_year, first_group, second_group))

# Start the pool from the codes that already exist so that a newly generated
# code can never collide with one of them.
# Duplicate rows are dropped first: the size checks in this notebook compare
# the pool against len(df_existing_codes), which only matches when every
# existing code appears once. Without this, a single duplicate makes the loop
# generate more codes than there are rows in df_merged.
df_existing_codes = df_existing_codes.drop_duplicates(subset='code')
unique_codes = set(df_existing_codes['code'])
current_codes_count = len(unique_codes)

# A set ignores values it already contains, so keep drawing until exactly one
# new code per row of df_merged has been added.
expected_codes_count = current_codes_count + len(df_merged)
while len(unique_codes) < expected_codes_count:
    unique_codes.add(generate_code())

In [None]:
# The pool must hold one code per existing row plus one per merged row.
expected_total = len(df_merged) + len(df_existing_codes)
assert len(unique_codes) == expected_total

# Keep only the codes that were not already present in the existing file.
new_codes = unique_codes - set(df_existing_codes['code'])

In [None]:
# Exactly one new code is needed per row of the merged data.
new_codes_count = len(new_codes)
assert new_codes_count == len(df_merged)
# Last expression of the cell: shows the number of new codes.
new_codes_count

In [None]:
# Give every row one of the new codes. A set has no defined order, so which
# row receives which code is arbitrary.
generated_codes = list(new_codes)
df_merged['id_psp'] = generated_codes

In [None]:
# Columns written to the exported CSV files, in this order
# (presumably the layout of the target database table - confirm).
db_columns = [
    'nom',
    'prenom',
    'date_naissance',
    'genre',
    'organisme',
    'situation',
    'allocataire',
    'adresse_allocataire',
    'created_at',
    'updated_at',
    'exercice_id',
    'uuid_doc',
    'zrr',
    'qpv',
    'a_valider',
    'refuser',
    'id_psp',
]

# Same columns, preceded by 'dossier_id'.
db_columns_with_dossier = ['dossier_id', *db_columns]

In [None]:
# Paths are read from the environment again; the exports below do not use them.
cnous_pathfile = os.environ['CNOUS_PATHFILE_2025']
ds_pathfile = os.environ['DS_PATHFILE_2025']
existing_codes_pathfile = os.environ['EXISTING_CODES_PATHFILE_2025']

# Every export shares the same CSV settings.
csv_export_options = {'sep': ';', 'index': False, 'encoding': 'utf-8'}

# Split the rows once: 'boursier' rows go to the CNOUS file, every other row
# (including rows without a situation) goes to the AEEH files.
is_boursier = df_merged['situation'] == 'boursier'
df_boursier = df_merged[is_boursier]
df_not_boursier = df_merged[~is_boursier]

# Output names are produced by get_current_date_for_file_name
# (presumably adds the current date to the given file name - confirm).
df_boursier[db_columns].to_csv(get_current_date_for_file_name('cnous.csv'), **csv_export_options)
df_not_boursier[db_columns].to_csv(get_current_date_for_file_name('aeeh.csv'), **csv_export_options)

# Keep a DS export that also carries the dossier ids
df_not_boursier[db_columns_with_dossier].to_csv(get_current_date_for_file_name('aeeh-with-dossier-ids.csv'), **csv_export_options)