In [None]:
from utils.data_utils import unaccent_and_upper, format_insee_or_postal_code
import os
from dotenv import load_dotenv
import pandas as pd
import json
from datetime import datetime
import numpy as np

# Make the variables declared in the dotenv file available through os.environ.
load_dotenv()

# Source CSV to import and destination file of the DB export.
filepath = os.environ["FSS_PATHFILE_2025"]
base_output_filepath = os.environ["DB_FSS_EXPORT_2025"]
# Value written to the `exercice_id` column of every exported row.
exercice_id = 4

# Column names applied to the source file, in file order.
# They replace the file's own header row (see `header=0` below).
columns = [
    "code_operateur",
    "nom",
    "prenom",
    "date_naissance",
    "lieu_naissance",
    "genre",
    "courriel",
    "statut_de_boursier",
    "niveau",
    "echelon",
    "date_debut_rentree",
    "duree_versement",
    "nom_etablissement",
    "ville_etudes",
    "statut_du_boursier",
    "numero_dossier_bourse",
    "matricule",
    "nom_usage",
    "uairne_etablissement",
    "date_notification_bourse",
    "date_effet_bourse",
    "date_radiation",
    "radiation",
    "commune_code_postal",
    "commune_code_insee",
    "commune_naissance_code_postal",
]

# Every field is read as text; lines the parser cannot handle are skipped.
df = pd.read_csv(
    filepath,
    sep=";",
    encoding="utf-8",
    engine="c",
    dtype=str,
    names=columns,
    header=0,
    on_bad_lines="skip",
)

In [None]:
# The former `df['date_radiation'].dropna(inplace=True)` only altered a
# temporary Series: `df` itself was never modified, and recent pandas versions
# warn about that chained in-place pattern. This cell is therefore written as
# what it effectively was, a read-only look at the column: the number of rows
# that carry a radiation date.
# NOTE(review): if rows with a radiation date are meant to be excluded from the
# import, that filter still has to be written explicitly — confirm the rule.
df['date_radiation'].notna().sum()

In [None]:
# Cleaning and formatting
def _clean_name(names):
    """Trim and normalise a column of names, keeping missing values missing.

    Values are stripped, strings that end up empty become NaN, and the rest go
    through `unaccent_and_upper`. No `astype(str)` is applied: it would turn
    NaN into the literal text 'nan', which the later `dropna` on the mandatory
    columns cannot detect.
    """
    stripped = names.str.strip()
    return stripped.mask(stripped.eq('')).map(unaccent_and_upper, na_action='ignore')


df['nom'] = _clean_name(df['nom'])
df['prenom'] = _clean_name(df['prenom'])
# Only 'F' is rewritten (to 'Mme'); every other value of `genre` is copied unchanged.
df['qualite'] = df['genre'].replace('F', 'Mme')
df['organisme'] = 'cnous'
df['situation'] = 'boursier'
df['courriel'] = df['courriel'].str.lower()
# Parse the birth date once; the dd/mm/YYYY text form used in the `allocataire`
# JSON is derived from the parsed value instead of parsing a second time.
df['date_naissance'] = pd.to_datetime(df['date_naissance'], format='%d/%m/%Y')
df['allocataire-date_naissance'] = df['date_naissance'].dt.strftime('%d/%m/%Y')
# NOTE(review): the helper receives a whole Series here but single values in the
# JSON builders further down — confirm it supports both call styles.
df['commune_naissance'] = format_insee_or_postal_code(df['lieu_naissance'])

In [None]:
# Apply the eligibility criteria to the CNOUS data
from datetime import timedelta

# The bounds are pandas Timestamps so that this cell does not depend on what
# the bare name `datetime` is bound to in the kernel (class or module) and can
# be re-run at any point of the session.
dob_min = pd.Timestamp(year=1997, month=1, day=1)
dob_max = pd.Timestamp(year=2025, month=12, day=31)

# Keep birth dates within [dob_min, dob_max], both bounds included.
# Rows whose birth date is missing compare as False and are filtered out.
mask_dob_start = df['date_naissance'] >= dob_min
mask_dob_end = df['date_naissance'] <= dob_max
mask_dob = mask_dob_start & mask_dob_end

df_filtered = df[mask_dob]

print(f"{len(df) - len(df_filtered)} rows for CNOUS dataframe were removed based on criterias")
print(f"{len(df_filtered)}")

In [None]:
# Remove rows missing a mandatory value (a code cannot be generated without all of them)
necessary_column = ['nom', 'prenom', 'date_naissance', 'genre']
df_all_valid_row = df_filtered.dropna(subset=necessary_column)

# Remove columns that contain no value at all. `.copy()` hands the following
# cells an independent frame to add columns to.
df_all_valid = df_all_valid_row.dropna(axis=1, how='all').copy()

print(f"{len(df_all_valid)}")
# Check every mandatory column on the frame that is actually kept. The previous
# assertion built part of its mask from `df_filtered`, whose index differs from
# `df_all_valid`, and did not cover `genre`.
assert not df_all_valid[necessary_column].isnull().any().any()

In [None]:
# Set every birth date to 04:00 on its calendar day.
# Resetting to midnight first makes the cell idempotent: the plain
# `date_naissance + 4h` form moved the dates 4 hours further on every re-run.
# NOTE(review): presumably the offset keeps the calendar day stable when the
# value is later converted between time zones — confirm.
df_all_valid.loc[:, 'date_naissance'] = df_all_valid['date_naissance'].dt.normalize() + pd.Timedelta(hours=4)

In [None]:
# Build the JSON values expected by the target DB model
## `allocataire` JSON
def to_json_allocataire_without_null(row):
    """Serialise the beneficiary fields of one row as a JSON object string.

    Entries whose value is null or an empty string are left out. Non-ASCII
    characters are written as-is (`ensure_ascii=False`).
    """
    candidates = (
        ('qualite', row['qualite']),
        ('nom', unaccent_and_upper(row['nom'])),
        ('prenom', unaccent_and_upper(row['prenom'])),
        ('date_naissance', row['allocataire-date_naissance']),
        ('courriel', row['courriel']),
        ('commune_naissance', format_insee_or_postal_code(row['commune_naissance_code_postal'])),
        ('matricule', row['matricule']),
        # These do not exist in the source files
        # ('code_insee_commune_naissance', format_insee_or_postal_code(row['commune_code_insee'])),
        # ('code_iso_pays_naissance', row['allocataire-code_iso_pays_naissance'].upper()),
        # ('pays_naissance', get_country_from_iso(row['allocataire-code_iso_pays_naissance'].upper()).upper()),
    )

    payload = {}
    for field, value in candidates:
        if pd.isnull(value) or value == '':
            continue
        payload[field] = value
    return json.dumps(payload, ensure_ascii=False)

# One JSON string per row
df_all_valid['allocataire'] = df_all_valid.apply(to_json_allocataire_without_null, axis='columns')

In [None]:
def to_json_adresse_without_null(row):
    """Serialise the address fields of one row as a JSON object string.

    Entries whose value is null or an empty string are left out. Non-ASCII
    characters are written as-is (`ensure_ascii=False`).
    """
    candidates = (
        ('code_postal', format_insee_or_postal_code(row['commune_code_postal'])),
        # These do not exist in the source files
        # ('voie', unaccent_and_upper(row['adresse-allocataire_voie'].strip()).replace('"', '\'')),
        # ('commune', unaccent_and_upper(row['adresse-allocataire_commune'].strip())),
        # ('code_insee', format_insee_or_postal_code(row['commune_code_insee'])),
    )

    payload = {}
    for field, value in candidates:
        if pd.isnull(value) or value == '':
            continue
        payload[field] = value
    return json.dumps(payload, ensure_ascii=False)

# One JSON string per row
df_all_valid['adresse_allocataire'] = df_all_valid.apply(to_json_adresse_without_null, axis='columns')

In [None]:
# Codes that already exist: the generation step reads the `code` column of this
# file so that no existing code is produced again.
existing_codes_filepath = os.environ["EXISTING_CODES_PATHFILE_2025"]
existing_codes = pd.read_csv(
    existing_codes_filepath,
    sep=",",
    engine="c",
    on_bad_lines="skip",
)

In [None]:
# Unique codes generation
import random
import string

# `datetime` is the class imported at the top of the notebook. This cell used
# to run `import datetime`, which rebound the name to the module and made every
# `datetime(...)` call fail when an earlier cell was re-run afterwards.
current_date = datetime.now()
# Last two digits of the current year, used as prefix of the generated codes.
current_year = str(current_date.year)[-2:]

def get_characters_set(size = 4):
    """Return `size` randomly drawn uppercase ASCII letters, never 'O' nor 'I'.

    Letters are drawn with replacement, so a letter may appear several times.
    """
    alphabet = string.ascii_uppercase.replace('O', '').replace('I', '')
    drawn = random.choices(alphabet, k=size)
    return ''.join(drawn)

def generate_code():
    """Return a candidate code made of `current_year` and two random 4-letter groups, joined by '-'.

    Uniqueness is not checked here; the caller is responsible for it.
    """
    first_group = get_characters_set(4)
    second_group = get_characters_set(4)
    return f"{current_year}-{first_group}-{second_group}"

# Start the pool with the codes that already exist. Blank and repeated entries
# are removed from `existing_codes` first so that len(existing_codes) equals the
# number of distinct existing codes: the loop target below and the count checks
# in the following cells all rely on that equality. Without it, a single
# duplicate made the loop generate more codes than there are rows to export.
existing_codes = existing_codes.dropna(subset=['code']).drop_duplicates(subset=['code'])
unique_codes = set(existing_codes['code'])

# number of codes in the pool before generation
current_codes_count = len(unique_codes)

# Add random codes until the pool holds exactly one new code per row to export.
# A generated code that is already in the set is absorbed and the loop retries.
target_codes_count = current_codes_count + len(df_all_valid)
while len(unique_codes) < target_codes_count:
    unique_codes.add(generate_code())

In [None]:
# Sanity check: beyond the existing entries, the pool holds one code per row to export
assert len(unique_codes) - len(existing_codes) == len(df_all_valid)

In [None]:
# Keep only the freshly generated codes (the pool minus the existing ones);
# there must be exactly one of them per row to export
new_codes = unique_codes - set(existing_codes['code'])
assert len(df_all_valid) == len(new_codes)

In [None]:
# Give each row one of the new codes, in the set's iteration order
df_all_valid['id_psp'] = [*new_codes]

In [None]:
# Add the default columns required by the target DB model
# Single timestamp (Europe/Paris) shared by `created_at` and `updated_at` on every row.
timestamp_with_custom_tz = pd.Timestamp.now(tz='Europe/Paris')

df_all_valid.loc[:, 'exercice_id'] = exercice_id
# `np.nan` (lowercase): the `np.NaN` alias was removed in NumPy 2.0 and raises AttributeError there.
df_all_valid.loc[:, 'uuid_doc'] = np.nan
df_all_valid[['zrr', 'qpv', 'a_valider', 'refuser']] = False
df_all_valid[['created_at', 'updated_at']] = timestamp_with_custom_tz
len(df_all_valid)

In [None]:
# Drop every row whose e-mail address is shared with another row (no occurrence is kept).
# `duplicates` holds the e-mail addresses that appear more than once.
counts = df_all_valid['courriel'].value_counts()
duplicates = counts[counts > 1]
print(f"{len(duplicates)} total duplicates")

rows_before = len(df_all_valid)
missing_before = int(df_all_valid['courriel'].isna().sum())

# `value_counts` ignores missing e-mails, whereas `drop_duplicates` treats them
# as equal to each other: as soon as two rows have no e-mail, all of them are
# removed as well, without being part of the figure printed above.
# NOTE(review): confirm that excluding rows without an e-mail is intended.
df_all_valid = df_all_valid.drop_duplicates(subset=['courriel'], keep=False)

missing_removed = missing_before - int(df_all_valid['courriel'].isna().sum())
print(f"{rows_before - len(df_all_valid)} rows removed, of which {missing_removed} had no e-mail")
print(f"{len(df_all_valid)} students to inject")

In [None]:
# Write the export file: only the columns of the target DB model, in this order
db_columns = [
    "nom",
    "prenom",
    "date_naissance",
    "genre",
    "organisme",
    "situation",
    "allocataire",
    "adresse_allocataire",
    "created_at",
    "updated_at",
    "exercice_id",
    "uuid_doc",
    "zrr",
    "qpv",
    "a_valider",
    "refuser",
    "id_psp",
]

# Semicolon-separated, UTF-8, without the index column
export_frame = df_all_valid[db_columns]
export_frame.to_csv(base_output_filepath, sep=";", index=False, encoding="utf-8")