# Summary
Script to run against the MSA and CNAF files that have gone through the cleaning/formatting process.

This notebook merges the benef and backup benef data and sets the default column values, such as:
- exercice_id, uuid_doc, zrr, qpv, a_valider, refuser, created_at, updated_at

It also merges in the data from 2024 when it does not match anything in the 2025 data;
if the 2024 data matches the 2025 data, we keep the latter.

Unique codes are also generated for each row and assigned to the column "id_psp".

In [None]:
import os
import pandas as pd
import numpy as np
import csv

from dotenv import load_dotenv

In [None]:
# Populate the process environment through python-dotenv before reading it
load_dotenv()

env_vars = os.environ

# Input files (a missing variable raises KeyError)
cnaf_msa_2024_filepath = env_vars['DB_CNAF_MSA_2024_WITHOUT_RGPD']
cnaf_export_2025_filepath = env_vars['DB_CNAF_EXPORT_2025']
msa_export_2025_filepath = env_vars['DB_MSA_EXPORT_2025']

# Backup input files
backup_cnaf_export_filepath = env_vars['DB_BACKUP_CNAF_EXPORT_2025']
backup_msa_export_filepath = env_vars['DB_BACKUP_MSA_EXPORT_2025']

# Output files written at the end of the notebook
final_db_export_output_filepath = env_vars['FINAL_DB_EXPORT_2025']
final_db_backup_export_output_filepath = env_vars['FINAL_DB_BACKUP_EXPORT_2025']

In [None]:
# Options shared by the three input files.
# dtype=str loads every column as text; keep_default_na=False is required,
# otherwise a string such as "NA" would be interpreted as NaN.
read_options = dict(
    sep=';',
    encoding='utf-8',
    dtype=str,
    keep_default_na=False,
    quoting=csv.QUOTE_ALL,
)

df_cnaf_msa_2024 = pd.read_csv(cnaf_msa_2024_filepath, **read_options)
df_cnaf_2025 = pd.read_csv(cnaf_export_2025_filepath, **read_options)
df_msa_2025 = pd.read_csv(msa_export_2025_filepath, **read_options)

In [None]:
# Sanity check: no row may have a missing first name (prenom) or last name (nom),
# since both columns are used as keys when matching 2024 rows against 2025 rows.
# (The previous version tested `prenom` twice and never looked at `nom`.)
# NOTE(review): the input frames are read with dtype=str and keep_default_na=False,
# so empty cells are loaded as '' rather than NaN and isna() will not flag them —
# confirm whether blank names should be rejected as well.
assert not (df_cnaf_msa_2024['prenom'].isna() | df_cnaf_msa_2024['nom'].isna()).any(), "2024 data: missing prenom or nom"
assert not (df_cnaf_2025['prenom'].isna() | df_cnaf_2025['nom'].isna()).any(), "CNAF 2025 data: missing prenom or nom"
assert not (df_msa_2025['prenom'].isna() | df_msa_2025['nom'].isna()).any(), "MSA 2025 data: missing prenom or nom"

In [None]:
# Read the backup exports with the same parsing options as the main input files:
# dtype=str keeps every value as the text found in the file, and
# keep_default_na=False prevents strings such as "NA" from being turned into NaN.
backup_df_cnaf_2025 = pd.read_csv(backup_cnaf_export_filepath, sep=';', encoding='utf-8', dtype=str, keep_default_na=False)
backup_df_msa_2025 = pd.read_csv(backup_msa_export_filepath, sep=';', encoding='utf-8', dtype=str, keep_default_na=False)

In [None]:
# Stack the CNAF and MSA frames (main data, then backup data).
# ignore_index=True already produces a fresh 0..n-1 index, so no additional
# reset_index() call is needed.
merged_df_from_2025 = pd.concat([df_cnaf_2025, df_msa_2025], ignore_index=True)
merged_backup_from_2025_df = pd.concat([backup_df_cnaf_2025, backup_df_msa_2025], ignore_index=True)

In [None]:
# Build a comparable birth-date key (plain `date` objects) on both datasets
birth_dates_2025 = pd.to_datetime(merged_df_from_2025['date_naissance'])
merged_df_from_2025['date_naissance_to_compare'] = birth_dates_2025.dt.date

birth_dates_2024 = pd.to_datetime(df_cnaf_msa_2024['date_naissance'])
df_cnaf_msa_2024['date_naissance_to_compare'] = birth_dates_2024.dt.date

# Stamp the 2025 rows with a creation/update time right away. Per the original
# author's note, without this the merge does not work as intended: the dates
# from 2024 would replace the non-existing dates of the 2025 data.
timestamp_with_custom_tz = pd.Timestamp.now(tz='Europe/Paris')
merged_df_from_2025[['created_at', 'updated_at']] = timestamp_with_custom_tz

In [None]:
# Left-join the 2024 rows against the 2025 rows on first name, last name and
# birth date. indicator=True adds a `_merge` column recording where each row
# was found. Note that the '_old' suffix is applied to the overlapping columns
# coming from the right-hand (2025) frame.
merged_old_and_new_data = df_cnaf_msa_2024.merge(
    merged_df_from_2025,
    how='left',
    on=['prenom', 'nom', 'date_naissance_to_compare'],
    suffixes=('', '_old'),
    indicator=True,
)

# 'left_only' marks 2024 rows that have no counterpart in 2025: those are kept
is_only_in_2024 = merged_old_and_new_data['_merge'] == 'left_only'
data_from_2024_not_existing_in_2025 = merged_old_and_new_data[is_only_in_2024]

In [None]:
# Keep only the original 2024 columns: discard the '_old'-suffixed columns
# produced by the merge as well as the `_merge` indicator column
columns_to_keep = [
    col
    for col in data_from_2024_not_existing_in_2025.columns
    if not (col.endswith('_old') or col == '_merge')
]
data_from_2024_not_existing_in_2025 = data_from_2024_not_existing_in_2025[columns_to_keep]

In [None]:
# Combine the 2025 data with the 2024 rows that have no match in the 2025 data.
# ignore_index=True already produces a fresh 0..n-1 index, so no additional
# reset_index() call is needed.
final_df = pd.concat([merged_df_from_2025, data_from_2024_not_existing_in_2025], ignore_index=True)

In [None]:
# Add the default columns required by the production data
exercice_2025 = 4

final_df['exercice_id'] = exercice_2025
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
final_df['uuid_doc'] = np.nan
final_df[['zrr', 'qpv', 'a_valider', 'refuser']] = False

In [None]:
# Add the default columns required by the backup data
merged_backup_from_2025_df['exercice_id'] = exercice_2025
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
merged_backup_from_2025_df[['id_psp', 'uuid_doc']] = np.nan
merged_backup_from_2025_df[['zrr', 'qpv', 'a_valider', 'refuser']] = False
merged_backup_from_2025_df[['created_at', 'updated_at']] = timestamp_with_custom_tz

In [None]:
# Unique codes generation
import random
import string
import datetime

# Two-digit year (e.g. "25") used as the prefix of every generated code
current_date = datetime.datetime.now()
current_year = current_date.strftime('%y')

# Letters allowed in a generated code: A-Z without 'O' and 'I'
# (presumably excluded to avoid confusion with the digits 0 and 1 — confirm).
# Built once instead of on every call.
_CODE_ALPHABET = [c for c in string.ascii_uppercase if c not in 'OI']


def get_characters_set(size: int = 4) -> str:
    """Return a random string of uppercase letters, never containing 'O' or 'I'.

    Args:
        size: Number of letters to draw (with replacement).

    Returns:
        A string of ``size`` letters; the empty string when ``size`` is 0.
    """
    # NOTE(review): this uses the non-cryptographic `random` module; if the
    # codes must be unguessable, `secrets` would be the safer source — confirm.
    return ''.join(random.choices(_CODE_ALPHABET, k=size))

def generate_code() -> str:
    """Return a code shaped ``YY-XXXX-XXXX``.

    ``YY`` is the module-level ``current_year`` and each ``XXXX`` group is
    produced by ``get_characters_set(4)``.
    """
    first_group = get_characters_set(4)
    second_group = get_characters_set(4)
    return '-'.join((current_year, first_group, second_group))

# Pool of generated codes; it starts empty (no previously issued codes are loaded here)
unique_codes = set()

# Size of the pool before generation starts
current_codes_count = len(unique_codes)

# A set silently ignores duplicates, so keep drawing codes until there is
# exactly one distinct code per row of final_df
rows_to_cover = len(final_df)
while len(unique_codes) < rows_to_cover:
    new_code = generate_code()
    unique_codes.add(new_code)

# Make sure every row can receive its own code
assert len(unique_codes) == rows_to_cover

In [None]:
# Give each production row one of the generated codes
# (the set has exactly as many codes as final_df has rows; its iteration order is arbitrary)
final_df['id_psp'] = [*unique_codes]

In [None]:
# Row counts for the 2024 dataset: total, then per `genre` value (M / F)
genre_2024 = df_cnaf_msa_2024['genre']
rows_2024_m = df_cnaf_msa_2024[genre_2024 == 'M']
rows_2024_f = df_cnaf_msa_2024[genre_2024 == 'F']

print(f"{len(df_cnaf_msa_2024)} benefs from 2024 (msa + cnaf)")
print(f"{len(rows_2024_m)} M benefs from 2024")
print(f"{len(rows_2024_f)} F benefs from 2024")

In [None]:
# Row counts for the 2025 dataset: total, then per `genre` value (M / F)
genre_2025 = merged_df_from_2025['genre']
rows_2025_m = merged_df_from_2025[genre_2025 == 'M']
rows_2025_f = merged_df_from_2025[genre_2025 == 'F']

print(f"{len(merged_df_from_2025)} benefs from 2025 (msa + cnaf)")
print(f"{len(rows_2025_m)} M benefs from 2025")
print(f"{len(rows_2025_f)} F benefs from 2025")

In [None]:
# Row counts for the combined dataset, next to the number of generated codes
genre_final = final_df['genre']
rows_final_m = final_df[genre_final == 'M']
rows_final_f = final_df[genre_final == 'F']

print(f"{len(unique_codes)} unique codes generated for this year")
print(f"{len(final_df)} benefs from 2024 + 2025 (msa + cnaf)")
print(f"{len(rows_final_m)} M benefs from 2024 + 2025")
print(f"{len(rows_final_f)} F benefs from 2024 + 2025")

In [None]:
# Number of rows whose `situation` is 'AAH' in each dataset
aah_2024 = df_cnaf_msa_2024[df_cnaf_msa_2024['situation'] == 'AAH']
aah_2025 = merged_df_from_2025[merged_df_from_2025['situation'] == 'AAH']
aah_final = final_df[final_df['situation'] == 'AAH']

print(f"{len(aah_2024)} AAH benefs from 2024 (msa + cnaf)")
print(f"{len(aah_2025)} AAH benefs from 2025 (msa + cnaf)")
print(f"{len(aah_final)} AAH benefs from 2024 + 2025 (msa + cnaf)")

In [None]:
# Number of rows whose `situation` is 'jeune' in each dataset
jeune_2024 = df_cnaf_msa_2024[df_cnaf_msa_2024['situation'] == 'jeune']
jeune_2025 = merged_df_from_2025[merged_df_from_2025['situation'] == 'jeune']
jeune_final = final_df[final_df['situation'] == 'jeune']

print(f"{len(jeune_2024)} jeune benefs from 2024 (msa + cnaf)")
print(f"{len(jeune_2025)} jeune benefs from 2025 (msa + cnaf)")
print(f"{len(jeune_final)} jeune benefs from 2024 + 2025 (msa + cnaf)")

In [None]:
# Total number of rows in the combined backup dataset
backup_row_count = len(merged_backup_from_2025_df)
print(f"{backup_row_count} benefs from backup data 2025")

In [None]:
# The helper column was only needed to match 2024 rows against 2025 rows; remove it before export
final_df.drop('date_naissance_to_compare', axis='columns', inplace=True)

In [None]:
# Count the rows that are both `situation == 'jeune'` and `organisme == 'CAF'`
mask_jeune = final_df['situation'].eq('jeune')
mask_caf = final_df['organisme'].eq('CAF')
len(final_df.loc[mask_jeune & mask_caf])

In [None]:
# Flag every row of the backup dataset with is_backup=True.
# NOTE(review): the production frame (final_df) gets no `is_backup` column in
# this notebook — presumably the flag is only expected on the backup export; confirm.
merged_backup_from_2025_df['is_backup'] = True

In [None]:
# Write both datasets with the same CSV settings: ';' separator, no index column, UTF-8
export_options = dict(sep=';', index=False, encoding='utf-8')

final_df.to_csv(final_db_export_output_filepath, **export_options)
merged_backup_from_2025_df.to_csv(final_db_backup_export_output_filepath, **export_options)