# Process
- First run the "clean_cnous" script on all CNOUS files to get properly formatted files (ready for injection into the database)
- Then use this script to produce the final output, with beneficiaries deduplicated on their matricule (INE), and to generate their unique codes


In [None]:
import os
from dotenv import load_dotenv
import pandas as pd

# Pull the variables defined in the local .env file into os.environ.
load_dotenv()

# NOTE(review): not referenced again in the visible cells — confirm it is still needed.
exercice_id = 4

# All file locations come from environment variables.
cnous_output_filepath = os.environ['CNOUS_2_OUTPUT_PATHFILE_2025']
cnous_1_filepath = os.environ['CNOUS_1_PATHFILE_2025']
cnous_2_filepath = os.environ['CNOUS_2_PATHFILE_2025']

# Both waves are read with the same options: ';' separated UTF-8, every column
# kept as text (dtype=str), and malformed lines skipped rather than raising.
cnous_read_options = dict(encoding='utf-8', on_bad_lines='skip', sep=';', engine="c", dtype=str)
df_cnous_1, df_cnous_2 = (
    pd.read_csv(wave_filepath, **cnous_read_options)
    for wave_filepath in (cnous_1_filepath, cnous_2_filepath)
)

In [None]:
import json

def _flatten_allocataire(df):
    """Return the JSON held in df['allocataire'] flattened into columns.

    Each cell is parsed with ``json.loads`` and the result is flattened by
    ``pd.json_normalize``; every resulting column is prefixed with
    'allocataire-' (which is where 'allocataire-matricule' comes from).
    """
    parsed_allocataires = df['allocataire'].apply(json.loads)
    return pd.json_normalize(parsed_allocataires).add_prefix('allocataire-')


# Unwrap the allocataire JSON object of each wave to get the "matricule" data,
# then attach the flattened columns back to their rows through the index.
df_cnous_1_allocataire_json = _flatten_allocataire(df_cnous_1)
df_cnous_1_unwrapped = df_cnous_1.merge(df_cnous_1_allocataire_json, left_index=True, right_index=True)

df_cnous_2_allocataire_json = _flatten_allocataire(df_cnous_2)
df_cnous_2_unwrapped = df_cnous_2.merge(df_cnous_2_allocataire_json, left_index=True, right_index=True)

for wave_number, wave_frame in ((1, df_cnous_1_unwrapped), (2, df_cnous_2_unwrapped)):
    print(f"{len(wave_frame)} benefs from wave {wave_number}")

In [None]:
# Keep only the wave-2 beneficiaries whose matricule does not appear in wave 1.
# The right join keeps every wave-2 row; indicator=True adds a '_merge' column
# telling whether the matricule was also found in wave 1 ('both') or only in
# wave 2 ('right_only'). Wave-1 columns whose names clash with wave-2 ones get
# the "_old" suffix, wave-2 columns keep their name.
df_cnous_merged = pd.merge(
    df_cnous_1_unwrapped,
    df_cnous_2_unwrapped,
    on='allocataire-matricule',
    how='right',
    suffixes=("_old", ""),
    indicator=True,
)

# .copy() turns the filtered selection into an independent frame. Later cells
# modify it (in-place drop_duplicates, new 'id_psp' column); without the copy
# pandas (when copy-on-write is not enabled) flags those writes with
# SettingWithCopyWarning because they target a filtered selection.
df_cnous_merged = df_cnous_merged[df_cnous_merged['_merge'] == 'right_only'].copy()

print(f"{len(df_cnous_merged)} benef to be injected")

In [None]:
# Display the rows to inject that have no matricule.
rows_without_matricule = df_cnous_merged['allocataire-matricule'].isna()
df_cnous_merged.loc[rows_without_matricule]

In [None]:
# Sanity check: no retained matricule may already be present in wave 1.
already_in_wave_1 = df_cnous_merged['allocataire-matricule'].isin(df_cnous_1_unwrapped['allocataire-matricule'])
assert not already_in_wave_1.any()

# Displayed for reference: total number of wave-2 rows before filtering.
len(df_cnous_2_unwrapped)

In [None]:
# Second deduplication pass: rows sharing the same nom / prenom /
# date_naissance are collapsed, keeping the first occurrence.
identity_columns = ['nom', 'prenom', 'date_naissance']

print(f"{len(df_cnous_merged)} before deduplicating on nom, prenom, date_naissance")
df_cnous_merged = df_cnous_merged.drop_duplicates(subset=identity_columns, keep='first')
print(f"{len(df_cnous_merged)} after deduplicating on nom, prenom, date_naissance")

In [None]:
# Codes that already exist; read from a comma-separated file whose location is
# given by an environment variable. Malformed lines are skipped, not raised.
existing_codes_filepath = os.environ['EXISTING_CODES_PATHFILE_2025']
df_existing_codes = pd.read_csv(
    existing_codes_filepath,
    sep=',',
    engine="c",
    on_bad_lines='skip',
)

In [None]:
# Unique codes generation
import random
import string
import datetime

# Two-digit form of the current year (e.g. 2025 -> "25"), used as code prefix.
current_date = datetime.datetime.now()
current_year = f"{current_date.year % 100:02d}"

# Alphabet used for codes: uppercase ASCII letters without 'O' and 'I'
# (presumably excluded because they are easily confused with 0 and 1).
# Built once instead of on every call.
_CODE_ALPHABET = [c for c in string.ascii_uppercase if c not in 'OI']

# OS-backed random source. The module-level functions of `random` use a
# deterministic generator that the Python documentation says must not be used
# for security purposes; the codes generated here are handed out to
# beneficiaries, so they should not be predictable.
_SECURE_RANDOM = random.SystemRandom()


def get_characters_set(size=4):
    """Return a random string of `size` letters drawn from the code alphabet.

    Letters are picked independently (with replacement) among the uppercase
    ASCII letters except 'O' and 'I'. `size=0` yields an empty string.
    """
    return ''.join(_SECURE_RANDOM.choices(_CODE_ALPHABET, k=size))

def generate_code():
    """Return a new code shaped '<yy>-XXXX-XXXX'.

    The prefix is the module-level `current_year`; each of the two groups is
    an independent 4-letter draw from `get_characters_set`.
    """
    first_group = get_characters_set(4)
    second_group = get_characters_set(4)
    return "-".join((current_year, first_group, second_group))

# init set of codes with existing
unique_codes = set(df_existing_codes['code'])

# Number of distinct codes that already exist. This can be lower than
# len(df_existing_codes) if the existing file contains a repeated code.
current_codes_count = len(unique_codes)

# Generate until there is exactly one new code per row to inject. The target
# is based on the distinct count above: using the row count of
# df_existing_codes would produce surplus codes whenever that file holds a
# duplicate. The set silently absorbs collisions with existing or freshly
# generated codes, so the loop simply retries.
target_codes_count = current_codes_count + len(df_cnous_merged)
while len(unique_codes) < target_codes_count:
    unique_codes.add(generate_code())

In [None]:
# Ensure we have generated codes for all the rows: the set must hold every
# distinct existing code plus exactly one new code per row to inject.
# The existing codes are counted as a set, not as rows, so that a repeated
# code in the existing file does not distort the expected total.
assert len(unique_codes) == len(set(df_existing_codes['code'])) + len(df_cnous_merged)

In [None]:
# Codes to assign to production data: everything in unique_codes that was not
# already present in the existing codes file.
previously_issued_codes = set(df_existing_codes['code'])
new_codes = unique_codes - previously_issued_codes

# There must be exactly one fresh code per row to inject.
assert len(new_codes) == len(df_cnous_merged)

In [None]:
# Give each row one of the freshly generated codes (order follows the set's
# iteration order, i.e. it is arbitrary).
df_cnous_merged = df_cnous_merged.assign(id_psp=list(new_codes))

In [None]:
# output to CSV: only the columns listed below are written, in this order,
# as a ';' separated UTF-8 file without the DataFrame index.
db_columns = [
    'nom', 'prenom', 'date_naissance', 'genre',
    'organisme', 'situation',
    'allocataire', 'adresse_allocataire',
    'created_at', 'updated_at',
    'exercice_id', 'uuid_doc',
    'zrr', 'qpv',
    'a_valider', 'refuser',
    'id_psp',
]
df_cnous_output = df_cnous_merged[db_columns]
df_cnous_output.to_csv(cnous_output_filepath, sep=';', index=False, encoding='utf-8')

In [None]:
# Debugging purposes: display the final rows that have no matricule.
final_rows_without_matricule = df_cnous_merged['allocataire-matricule'].isna()
df_cnous_merged.loc[final_rows_without_matricule]