# Traitements des fichiers des organismes


## Pipeline
1. Chargement des fichiers CNOUS
2. Nettoyage des données et premier mapping au bon format de données attendu dans la BDD
3. Application des critères sur les données du CNOUS
4. Cleanup (date de naissance + 4 heures)
5. Mapping des JSON
6. Ajout des valeurs pour les colonnes par défaut
7. Output to CSV

In [None]:
import os
from dotenv import load_dotenv
import pandas as pd
import json
from datetime import datetime
import numpy as np

# Load environment variables from a local .env file (python-dotenv), if there is one.
load_dotenv()

# Path of the CNOUS input CSV; a missing variable raises KeyError here.
cnous_filepath = os.environ['CNOUS_PATHFILE']

# Path of the CSV file written at the very end of the notebook.
base_output_filepath = os.environ['DB_EXPORT']

# When True, intermediate dataframes are deleted as soon as they are no longer needed.
MEMORY_OPTIMIZATION = False

In [None]:
# Columns that must be read as strings instead of letting pandas infer a numeric type
# (presumably to keep leading zeros in INSEE / postal codes — TODO confirm).
cnous_column_type = {
  'lieuNaissCodeCommuneInsee': 'str',
  'lieuNaissCodePays': 'str',
  'adresseCodePostal': 'str'
}

# NOTE(review): the file is decoded as Windows-1252 below, whereas this comment used to say
# "UTF-8 encoding by default since the csv has been filtered in step 0_cnous_dedupe.ipynb".
# Confirm which encoding the input file really uses.
# Lines that cannot be parsed are skipped (on_bad_lines='skip'); fields are separated by ';'.
cnous_df = pd.read_csv(cnous_filepath, encoding='Windows-1252', on_bad_lines='skip', sep=';', engine="c",
                       dtype=cnous_column_type)

In [None]:
# Map the raw CNOUS columns onto the flattened target model:
# "allocataire-*" for the allocataire and "adresse_allocataire-*" for the address.
cnous_column_mapping = {
  # allocataire information
  'ine': 'allocataire-matricule',
  # no organisme code in the CNOUS file
  'civiliteLibelleCourt': 'allocataire-qualite',
  'nom': 'allocataire-nom',
  'prenom': 'allocataire-prenom',
  'dateNaissance': 'allocataire-date_naissance',
  'mail': 'allocataire-courriel',
  'lieuNaissCodeCommuneInsee': 'allocataire-code_insee_commune_naissance',
  'lieuNaissLibelleCommune': 'allocataire-commune_naissance',
  'lieuNaissCodePays': 'allocataire-code_iso_pays_naissance',
  'lieuNaissLibellePays': 'allocataire-pays_naissance',

  # allocataire address
  'adresseVoie': 'adresse_allocataire-voie',
  'adresseCodePostal': 'adresse_allocataire-code_postal',
  'adresseLocalite': 'adresse_allocataire-commune',
  'adresseCodeLocalite': 'adresse_allocataire-code_insee',
  'adresseComplement1': 'adresse_allocataire-cplt_adresse',

  # beneficiary information
  # NOTE(review): this entry renames an input column called 'genre' (if there is one) to
  # 'civiliteLibelleCourt'; the 'genre' output column is rebuilt below from
  # 'allocataire-qualite'. The entry looks inverted — confirm before removing it.
  'genre': 'civiliteLibelleCourt',
}

# Drop the unused columns and rename the remaining ones. drop() and rename() return
# new frames, so cnous_df is left untouched without an extra up-front copy.
df_psp_mapped_cnous = cnous_df.drop(columns=['adresseCodePays', 'adresseComplement2']).rename(
  columns=cnous_column_mapping)

# organisme
df_psp_mapped_cnous['organisme'] = 'cnous'
df_psp_mapped_cnous['situation'] = 'boursier'

# Remove rows whose birthdate holds the column header text 'dateNaissance' (repeated header lines).
# The explicit copy makes the result an independent frame, so the column assignments
# below do not raise SettingWithCopyWarning.
df_psp_mapped_cnous = df_psp_mapped_cnous[
  df_psp_mapped_cnous['allocataire-date_naissance'] != 'dateNaissance'].copy()

# beneficiary information = allocataire information
# Parse the birthdate once (dd/mm/YYYY), then write it back to the allocataire column as text.
df_psp_mapped_cnous['date_naissance'] = pd.to_datetime(df_psp_mapped_cnous['allocataire-date_naissance'],
                                                       format='%d/%m/%Y')
df_psp_mapped_cnous['allocataire-date_naissance'] = df_psp_mapped_cnous['date_naissance'].dt.strftime('%d/%m/%Y')
df_psp_mapped_cnous['nom'] = df_psp_mapped_cnous['allocataire-nom']
df_psp_mapped_cnous['prenom'] = df_psp_mapped_cnous['allocataire-prenom']
# 'genre' keeps the raw civility value; only 'allocataire-qualite' gets 'F' rewritten to 'Mme'.
df_psp_mapped_cnous['genre'] = df_psp_mapped_cnous['allocataire-qualite']
df_psp_mapped_cnous['allocataire-qualite'] = df_psp_mapped_cnous['allocataire-qualite'].replace('F', 'Mme')

if MEMORY_OPTIMIZATION:
  del cnous_df

In [None]:
# Country code mapping used to fill allocataire-code_iso_pays_naissance.
# Keys are French country labels (matched against allocataire-pays_naissance),
# values are two-letter country codes (ISO 3166 according to the original comment).
# NOTE(review): some labels are territories mapped to another country's code
# (e.g. GOA -> IN, ACORES, MADERE -> PT, CANARIES (ILES) -> ES).
country_codes = {
    "FRANCE": "FR",
    "ITALIE": "IT",
    "ALGERIE": "DZ",
    "ESPAGNE": "ES",
    "MAROC": "MA",
    "UKRAINE": "UA",
    "PORTUGAL": "PT",
    "MADAGASCAR": "MG",
    "TUNISIE": "TN",
    "RUSSIE": "RU",
    "SENEGAL": "SN",
    "SYRIE": "SY",
    "COMORES": "KM",
    "COTE D'IVOIRE": "CI",
    "GUERNESEY": "GG",
    "CONGO (REPUBLIQUE DEMOCRATIQUE)": "CD",
    "CAMEROUN": "CM",
    "HAITI": "HT",
    "ROUMANIE": "RO",
    "TURQUIE": "TR",
    "BELGIQUE": "BE",
    "ARMENIE": "AM",
    "CONGO": "CG",
    "ALLEMAGNE": "DE",
    "GUINEE": "GN",
    "SRI LANKA": "LK",
    "CHINE": "CN",
    "ETATS-UNIS": "US",
    "ALBANIE": "AL",
    "POLOGNE": "PL",
    "LIBAN": "LB",
    "MOLDAVIE": "MD",
    "AFGHANISTAN": "AF",
    "VIET NAM": "VN",
    "MALI": "ML",
    "BRESIL": "BR",
    "GABON": "GA",
    "GEORGIE": "GE",
    "GOA": "IN",
    "EGYPTE": "EG",
    "IRAQ": "IQ",
    "KOSOVO": "XK",
    "BENIN": "BJ",
    "PAKISTAN": "PK",
    "ARUBA": "AW",
    "BANGLADESH": "BD",
    "MONACO": "MC",
    "TOGO": "TG",
    "SUISSE": "CH",
    "BULGARIE": "BG",
    "ANGOLA": "AO",
    "COLOMBIE": "CO",
    "CENTRAFRICAINE (REPUBLIQUE)": "CF",
    "GRECE": "GR",
    "CANADA": "CA",
    "ETHIOPIE": "ET",
    "NIGERIA": "NG",
    "TCHAD": "TD",
    "MAURICE": "MU",
    "IRAN": "IR",
    "SERBIE": "RS",
    "MEXIQUE": "MX",
    "BURKINA": "BF",
    "MAURITANIE": "MR",
    "VENEZUELA": "VE",
    "SOUDAN": "SD",
    "THAILANDE": "TH",
    "RWANDA": "RW",
    "DJIBOUTI": "DJ",
    "LUXEMBOURG": "LU",
    "PEROU": "PE",
    "ARABIE SAOUDITE": "SA",
    "HONGRIE": "HU",
    "AZERBAIDJAN": "AZ",
    "TCHEQUIE": "CZ",
    "IRLANDE, ou EIRE": "IE",
    "CHILI": "CL",
    "ANDORRE": "AD",
    "DOMINICAINE (REPUBLIQUE)": "DO",
    "JAPON": "JP",
    "CAMBODGE": "KH",
    "MONGOLIE": "MN",
    "KAZAKHSTAN": "KZ",
    "LIBYE": "LY",
    "PHILIPPINES": "PH",
    "ARGENTINE": "AR",
    "CAP-VERT": "CV",
    "BOSNIE-HERZEGOVINE": "BA",
    "ISRAEL": "IL",
    "EMIRATS ARABES UNIS": "AE",
    "SOMALIE": "SO",
    "AFRIQUE DU SUD": "ZA",
    "BIELORUSSIE": "BY",
    "EQUATEUR": "EC",
    "BURUNDI": "BI",
    "BOLIVIE": "BO",
    "COREE (REPUBLIQUE DE)": "KR",
    "INDONESIE": "ID",
    "SLOVAQUIE": "SK",
    "GUINEE-BISSAU": "GW",
    "YEMEN": "YE",
    "SURINAME": "SR",
    "AUTRICHE": "AT",
    "NIGER": "NE",
    "GUATEMALA": "GT",
    "ERYTHREE": "ER",
    "LETTONIE": "LV",
    "KENYA": "KE",
    "CHRISTMAS (ILE)": "CX",
    "SVALBARD et ILE JAN MAYEN": "SJ",
    "GHANA": "GH",
    "LITUANIE": "LT",
    "SUEDE": "SE",
    "MACEDOINE DU NORD": "MK",
    "PALESTINE (Etat de)": "PS",
    "NEPAL": "NP",
    "FINLANDE": "FI",
    "CUBA": "CU",
    "JORDANIE": "JO",
    "MALAISIE": "MY",
    "LAOS": "LA",
    "SAINTE-LUCIE": "LC",
    "TAIWAN": "TW",
    "CHYPRE": "CY",
    "OUZBEKISTAN": "UZ",
    "PARAGUAY": "PY",
    "ROYAUME-UNI": "GB",
    "NICARAGUA": "NI",
    "COOK (ILES)": "CK",
    "GUYANA": "GY",
    "FEROE (ILES)": "FO",
    "MONTENEGRO": "ME",
    "COSTA RICA": "CR",
    "SINGAPOUR": "SG",
    "TADJIKISTAN": "TJ",
    "KIRGHIZISTAN": "KG",
    "EL SALVADOR": "SV",
    "GAMBIE": "GM",
    "KOWEIT": "KW",
    "URUGUAY": "UY",
    "CROATIE": "HR",
    "HONDURAS": "HN",
    "SIERRA LEONE": "SL",
    "DOMINIQUE": "DM",
    "ESTONIE": "EE",
    "OUGANDA": "UG",
    "QATAR": "QA",
    "ACORES, MADERE": "PT",
    "GUINEE EQUATORIALE": "GQ",
    "SAHARA OCCIDENTAL": "EH",
    "MOZAMBIQUE": "MZ",
    "ZIMBABWE": "ZW",
    "BAHREIN": "BH",
    "OMAN": "OM",
    "TANZANIE": "TZ",
    "SEYCHELLES": "SC",
    "SLOVENIE": "SI",
    "TURKMENISTAN": "TM",
    "CANARIES (ILES)": "ES",
    "NAMIBIE": "NA",
    "LIBERIA": "LR",
    "ZAMBIE": "ZM",
    "JAMAIQUE": "JM",
    "MALTE": "MT",
    "BELIZE": "BZ",
    "ANTILLES NEERLANDAISES": "AN",
    "COREE": "KP",
    "PANAMA": "PA",
    "VANUATU": "VU",
    "BIRMANIE": "MM",
    "FIDJI": "FJ",
    "SAO TOME-ET-PRINCIPE": "ST",
    "TRINITE-ET-TOBAGO": "TT",
    "BARBADE": "BB",
    "SAO TOME": "ST",
    "SAINT-MARIN": "SM",
    "LIECHTENSTEIN": "LI",
    "BRUNEI": "BN",
    "INDE": "IN",
    "CAIMANES (ILES)": "KY",
    "TERR. DES ETATS-UNIS D'AMERIQUE EN OCEANIE": "UM",
    "HONG-KONG": "HK",
    "PAYS-BAS": "NL"
}

# Overwrite the birth-country code column with the code looked up from the French
# country label; labels that are not keys of country_codes are copied over unchanged.
birth_country_labels = df_psp_mapped_cnous['allocataire-pays_naissance']
df_psp_mapped_cnous['allocataire-code_iso_pays_naissance'] = birth_country_labels.replace(country_codes)

In [None]:
# apply criterias on CNOUS datas
from datetime import timedelta
from dateutil.relativedelta import relativedelta

# Cut off date for eligibility for year 2024
end_date = pd.to_datetime('2024-10-15').date()
# Oldest eligible birthdate: 28 years before the cut off date
start_date = end_date - relativedelta(years=28)

# Compare datetime64 values (time of day stripped) instead of per-row Python date objects:
# the check stays vectorised, both bounds are inclusive and missing birthdates (NaT) are False.
cnous_birthdates = df_psp_mapped_cnous['date_naissance'].dt.normalize()
cnous_situation_mask = cnous_birthdates.between(pd.Timestamp(start_date), pd.Timestamp(end_date))
df_psp_mapped_cnous_filtered = df_psp_mapped_cnous[cnous_situation_mask]

print(
  f"{len(df_psp_mapped_cnous) - len(df_psp_mapped_cnous_filtered)} rows for CNOUS dataframe were removed based on criterias")

In [None]:
# Optionally drop the reference to the unfiltered frame; only the filtered one is used afterwards.
if MEMORY_OPTIMIZATION:
  del df_psp_mapped_cnous

# Merge dans un seul dataframe cible pour la BDD PostgreSQL

In [None]:
# concat into a single dataframe (CNOUS is the only source here)
df_all = pd.concat([df_psp_mapped_cnous_filtered], axis=0, ignore_index=True)

# remove rows with missing necessary values (if one of those value are missing we cannot generate a code)
necessary_column = ['nom', 'prenom', 'date_naissance', 'genre']
df_all_valid_row = df_all.dropna(subset=necessary_column)

# remove columns with all null value; the copy gives an independent frame so that
# later cells can assign columns without raising SettingWithCopyWarning
df_all_valid = df_all_valid_row.dropna(axis=1, how='all').copy()

# Sanity check on the cleaned frame itself: the mask must be built from df_all_valid
# (not df_all, whose index still contains the dropped rows) and cover every necessary column.
assert not df_all_valid[necessary_column].isnull().any().any()

In [None]:
# Normalise the identity columns to upper-case strings ahead of the merge
for identity_column in ('prenom', 'nom', 'genre'):
  df_all_valid[identity_column] = df_all_valid[identity_column].astype(str).str.upper()

In [None]:
# Normalise every allocataire e-mail address to lower case
email_column = 'allocataire-courriel'
df_all_valid[email_column] = df_all_valid[email_column].str.lower()

In [None]:
# remove rows when beneficiary was born on or before 16 September 1993
# NOTE: despite its name, mask_before_1993 is True for the rows that are KEPT
# (birthdate strictly after the cutoff).
mask_before_1993 = pd.to_datetime(df_all_valid['date_naissance']) > datetime(1993, 9, 16)
# Copy the selection so the frame can be modified in place afterwards
# without raising SettingWithCopyWarning.
df_all_valid_after93 = df_all_valid[mask_before_1993].copy()

print(f"{len(df_all_valid) - len(df_all_valid_after93)} rows where removed because date_naissance was before 1993")

In [None]:
# Optionally drop the reference to the filtered CNOUS frame once it is no longer needed.
if MEMORY_OPTIMIZATION:
  del df_psp_mapped_cnous_filtered

In [None]:
# Shift every birthdate forward by four hours
# NOTE(review): re-running this cell shifts the dates again; presumably a timezone safety margin — confirm.
four_hours = timedelta(hours=4)
shifted_birthdates = df_all_valid_after93['date_naissance'] + four_hours
df_all_valid_after93.loc[:, 'date_naissance'] = shifted_birthdates

In [None]:
# remove duplicate beneficiaries
# Two rows are duplicates when all the columns below are equal; the first occurrence is kept.
# The address columns and 'allocataire-nom' are not part of the comparison.
# The trailing copy yields an independent frame, so the JSON columns added afterwards
# do not raise SettingWithCopyWarning.
df_all_valid_no_duplicate = df_all_valid_after93.drop_duplicates(subset=[
  'date_naissance',
  'nom',
  'prenom',
  'genre',
  'organisme',
  'situation',
  'allocataire-qualite',
  'allocataire-matricule',
  'allocataire-prenom',
  'allocataire-date_naissance',
  'allocataire-courriel',
  'allocataire-code_insee_commune_naissance',
  'allocataire-commune_naissance',
  'allocataire-code_iso_pays_naissance',
  'allocataire-pays_naissance'
]).copy()

print(f"{len(df_all_valid_after93) - len(df_all_valid_no_duplicate)} duplicate rows were removed")

In [None]:
# map to json values for target DB model 
## map allocataire json
def to_json_allocataire_without_null(row):
  """Serialise the ``allocataire-*`` values of ``row`` into a JSON object string.

  ``row`` must be indexable by column name and contain every ``allocataire-*``
  column listed below (a missing one raises ``KeyError``). Keys of the JSON
  object are the column names without the ``allocataire-`` prefix, in the order
  listed; entries whose value is null according to ``pd.notnull`` are left out.
  Non-ASCII characters are written as-is (``ensure_ascii=False``).
  """
  field_names = (
    'qualite',
    'matricule',
    'nom',
    'prenom',
    'date_naissance',
    'courriel',
    'code_insee_commune_naissance',
    'commune_naissance',
    'code_iso_pays_naissance',
    'pays_naissance',
  )
  payload = {}
  for field_name in field_names:
    value = row['allocataire-' + field_name]
    if pd.notnull(value):
      payload[field_name] = value
  return json.dumps(payload, ensure_ascii=False)


# Build the 'allocataire' JSON column, one JSON string per row
allocataire_json = df_all_valid_no_duplicate.apply(to_json_allocataire_without_null, axis=1)
df_all_valid_no_duplicate['allocataire'] = allocataire_json

In [None]:
## map adresse_allocataire json
def to_json_adresse_without_null(row):
  """Serialise the ``adresse_allocataire-*`` values of ``row`` into a JSON object string.

  ``row`` must be indexable by column name and contain every
  ``adresse_allocataire-*`` column listed below (a missing one raises
  ``KeyError``). Keys of the JSON object are the column names without the
  prefix, in the order listed; entries whose value is null according to
  ``pd.notnull`` are left out. Non-ASCII characters are written as-is.
  """
  field_names = ('voie', 'code_postal', 'commune', 'code_insee', 'cplt_adresse')
  payload = {}
  for field_name in field_names:
    value = row['adresse_allocataire-' + field_name]
    if pd.notnull(value):
      payload[field_name] = value
  return json.dumps(payload, ensure_ascii=False)


# Build the 'adresse_allocataire' JSON column, one JSON string per row
adresse_json = df_all_valid_no_duplicate.apply(to_json_adresse_without_null, axis=1)
df_all_valid_no_duplicate['adresse_allocataire'] = adresse_json

In [None]:
## drop the flattened columns, whose values are now carried by the
## 'allocataire' and 'adresse_allocataire' JSON columns
allocataire_fields = [
  'qualite',
  'matricule',
  'nom',
  'prenom',
  'date_naissance',
  'courriel',
  'code_insee_commune_naissance',
  'commune_naissance',
  'code_iso_pays_naissance',
  'pays_naissance',
]
adresse_fields = ['voie', 'code_postal', 'commune', 'code_insee', 'cplt_adresse']

flattened_columns = [f'allocataire-{field}' for field in allocataire_fields]
flattened_columns += [f'adresse_allocataire-{field}' for field in adresse_fields]

df_final = df_all_valid_no_duplicate.drop(columns=flattened_columns)


In [None]:
# Add missing default column needed for target DB model
df_final['exercice_id'] = 3
# np.nan instead of np.NaN: the NaN alias was removed in NumPy 2.0 and raises AttributeError there
df_final['uuid_doc'] = np.nan
# Every boolean flag starts as False
df_final[['zrr', 'qpv', 'a_valider', 'refuser']] = False

In [None]:
# output to CSV
# Writes the final dataframe to the path read from the DB_EXPORT environment variable,
# without the dataframe index.
df_final.to_csv(base_output_filepath, index=False)