# Génération des codes et formatage des données avant insertion en base

## Traitements


1. Chargement de la bdd existante et de la nouvelle générée à l'étape 1 (au même format)
2. Formatage de certaines colonnes et ajout d'autres pour la base de données ('zrr', 'qpv', 'a_valider', 'refuser')
3. Génération des codes manquants pour les nouvelles lignes de bénéficiaires
4. Output to csv

In [None]:
import os
from dotenv import load_dotenv
import pandas as pd
import json
import numpy as np

# Load the .env file into the process environment, then read every file path
# used by this notebook. A missing variable raises KeyError right here.
load_dotenv()

# Input CSV files: new export, existing beneficiaries, existing codes.
(
    new_db_export_filepath,
    existing_db_filepath,
    existing_codes_db_filepath,
) = [
    os.environ[variable]
    for variable in ('DB_EXPORT', 'DB_EXISTING_BOURSIERS', 'DB_EXISTING_CODES')
]

# Output CSV files: rows that already have an id, and rows that do not.
final_merged_with_ids, final_merged_without_ids = [
    os.environ[variable]
    for variable in ('DB_MERGED_WITH_ID', 'DB_MERGED_WITHOUT_ID')
]

In [None]:
# Loading new data
# Explicit dtypes applied when reading the CSV files, grouped by target dtype.
_STRING_COLUMNS = (
    'id',
    'id_psp',
    'uuid_doc',
    'date_naissance',
    'updated_at',
    'created_at',
    'genre',
    'nom',
    'prenom',
    'situation',
)
_BOOLEAN_COLUMNS = ('qpv', 'a_valider', 'zrr', 'refuser')

column_type = {
    **dict.fromkeys(_STRING_COLUMNS, 'string'),
    **dict.fromkeys(_BOOLEAN_COLUMNS, 'boolean'),
    'exercice_id': 'int',
}

# Both CSV files are read with the same separator and the same dtype mapping.
csv_read_options = {'sep': ',', 'dtype': column_type}

df_new_db = pd.read_csv(new_db_export_filepath, **csv_read_options)
df_existing_db = pd.read_csv(existing_db_filepath, **csv_read_options)

In [None]:
# Remove manually created boursiers (rows flagged a_valider == True).
# NOTE(review): a_valider is read with the nullable 'boolean' dtype, so rows
# where it is missing produce <NA> in the mask and are dropped by the boolean
# indexing as well - confirm this is intended.
keep_row = df_existing_db['a_valider'].ne(True)

# The filtered frame gets a fresh 0..n-1 index because the JSON unwrapping
# done later merges on the index.
df_existing_db = df_existing_db[keep_row].reset_index(drop=True)

In [None]:
# Parse the birth date and keep only the calendar date for comparison;
# the time component is added back later in the notebook.
parsed_birth_dates = pd.to_datetime(df_new_db['date_naissance'])
df_new_db['date_naissance'] = parsed_birth_dates.dt.date

In [None]:
# Unwrap the 'allocataire' JSON column of the rows exported by the previous step:
# each JSON key becomes its own column, prefixed with 'allocataire-'.
parsed_allocataire_new = df_new_db['allocataire'].apply(json.loads)
df_json_allocataire_new = pd.json_normalize(parsed_allocataire_new).add_prefix('allocataire-')

# Attach the flattened columns row by row (index-to-index merge), then drop
# the raw JSON column.
df_new_db_unwrapped_alloc = (
    df_new_db
    .merge(df_json_allocataire_new, left_index=True, right_index=True)
    .drop(columns=['allocataire'])
)

In [None]:
# Unwrap the 'allocataire' JSON column of the existing rows: each JSON key
# becomes its own column, prefixed with 'allocataire-'.
parsed_allocataire_existing = df_existing_db['allocataire'].apply(json.loads)
df_json_allocataire_existing = pd.json_normalize(parsed_allocataire_existing).add_prefix('allocataire-')

# The merge below pairs rows by index, so df_existing_db must carry a fresh
# 0..n-1 index at this point (it is reset right after the a_valider filter).
df_existing_db_unwrapped_alloc = (
    df_existing_db
    .merge(df_json_allocataire_existing, left_index=True, right_index=True)
    .drop(columns=['allocataire'])
)

In [None]:
# Data casting and formatting
# Upper-case the beneficiary's last and first names.
df_new_db_unwrapped_alloc['nom'] = df_new_db_unwrapped_alloc['nom'].str.upper()
df_new_db_unwrapped_alloc['prenom'] = df_new_db_unwrapped_alloc['prenom'].str.upper()

# Cast only the non-null matricules to str. Rows outside the mask are absent
# from the right-hand side, so index alignment leaves them as NaN instead of
# turning them into the literal string 'nan'.
mask_matricule_not_null = df_new_db_unwrapped_alloc['allocataire-matricule'].notna()
df_new_db_unwrapped_alloc['allocataire-matricule'] = df_new_db_unwrapped_alloc.loc[mask_matricule_not_null, 'allocataire-matricule'].astype(str)

# Replace blank strings with NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_new_db_unwrapped_alloc = df_new_db_unwrapped_alloc.replace('', np.nan)

In [None]:
# Build 'allocataire-code_iso_pays_naissance' from the country label stored in
# 'allocataire-pays_naissance'.
# Country code mapping: upper-case French country label -> two-letter code (ISO 3166).
# Several labels may share one code (e.g. "GOA" and "INDE" both map to "IN").
country_codes = {
    "FRANCE": "FR",
    "ITALIE": "IT",
    "ALGERIE": "DZ",
    "ESPAGNE": "ES",
    "MAROC": "MA",
    "UKRAINE": "UA",
    "PORTUGAL": "PT",
    "MADAGASCAR": "MG",
    "TUNISIE": "TN",
    "RUSSIE": "RU",
    "SENEGAL": "SN",
    "SYRIE": "SY",
    "COMORES": "KM",
    "COTE D'IVOIRE": "CI",
    "GUERNESEY": "GG",
    "CONGO (REPUBLIQUE DEMOCRATIQUE)": "CD",
    "CAMEROUN": "CM",
    "HAITI": "HT",
    "ROUMANIE": "RO",
    "TURQUIE": "TR",
    "BELGIQUE": "BE",
    "ARMENIE": "AM",
    "CONGO": "CG",
    "ALLEMAGNE": "DE",
    "GUINEE": "GN",
    "SRI LANKA": "LK",
    "CHINE": "CN",
    "ETATS-UNIS": "US",
    "ALBANIE": "AL",
    "POLOGNE": "PL",
    "LIBAN": "LB",
    "MOLDAVIE": "MD",
    "AFGHANISTAN": "AF",
    "VIET NAM": "VN",
    "MALI": "ML",
    "BRESIL": "BR",
    "GABON": "GA",
    "GEORGIE": "GE",
    "GOA": "IN",
    "EGYPTE": "EG",
    "IRAQ": "IQ",
    "KOSOVO": "XK",
    "BENIN": "BJ",
    "PAKISTAN": "PK",
    "ARUBA": "AW",
    "BANGLADESH": "BD",
    "MONACO": "MC",
    "TOGO": "TG",
    "SUISSE": "CH",
    "BULGARIE": "BG",
    "ANGOLA": "AO",
    "COLOMBIE": "CO",
    "CENTRAFRICAINE (REPUBLIQUE)": "CF",
    "GRECE": "GR",
    "CANADA": "CA",
    "ETHIOPIE": "ET",
    "NIGERIA": "NG",
    "TCHAD": "TD",
    "MAURICE": "MU",
    "IRAN": "IR",
    "SERBIE": "RS",
    "MEXIQUE": "MX",
    "BURKINA": "BF",
    "MAURITANIE": "MR",
    "VENEZUELA": "VE",
    "SOUDAN": "SD",
    "THAILANDE": "TH",
    "RWANDA": "RW",
    "DJIBOUTI": "DJ",
    "LUXEMBOURG": "LU",
    "PEROU": "PE",
    "ARABIE SAOUDITE": "SA",
    "HONGRIE": "HU",
    "AZERBAIDJAN": "AZ",
    "TCHEQUIE": "CZ",
    "IRLANDE, ou EIRE": "IE",
    "CHILI": "CL",
    "ANDORRE": "AD",
    "DOMINICAINE (REPUBLIQUE)": "DO",
    "JAPON": "JP",
    "CAMBODGE": "KH",
    "MONGOLIE": "MN",
    "KAZAKHSTAN": "KZ",
    "LIBYE": "LY",
    "PHILIPPINES": "PH",
    "ARGENTINE": "AR",
    "CAP-VERT": "CV",
    "BOSNIE-HERZEGOVINE": "BA",
    "ISRAEL": "IL",
    "EMIRATS ARABES UNIS": "AE",
    "SOMALIE": "SO",
    "AFRIQUE DU SUD": "ZA",
    "BIELORUSSIE": "BY",
    "EQUATEUR": "EC",
    "BURUNDI": "BI",
    "BOLIVIE": "BO",
    "COREE (REPUBLIQUE DE)": "KR",
    "INDONESIE": "ID",
    "SLOVAQUIE": "SK",
    "GUINEE-BISSAU": "GW",
    "YEMEN": "YE",
    "SURINAME": "SR",
    "AUTRICHE": "AT",
    "NIGER": "NE",
    "GUATEMALA": "GT",
    "ERYTHREE": "ER",
    "LETTONIE": "LV",
    "KENYA": "KE",
    "CHRISTMAS (ILE)": "CX",
    "SVALBARD et ILE JAN MAYEN": "SJ",
    "GHANA": "GH",
    "LITUANIE": "LT",
    "SUEDE": "SE",
    "MACEDOINE DU NORD": "MK",
    "PALESTINE (Etat de)": "PS",
    "NEPAL": "NP",
    "FINLANDE": "FI",
    "CUBA": "CU",
    "JORDANIE": "JO",
    "MALAISIE": "MY",
    "LAOS": "LA",
    "SAINTE-LUCIE": "LC",
    "TAIWAN": "TW",
    "CHYPRE": "CY",
    "OUZBEKISTAN": "UZ",
    "PARAGUAY": "PY",
    "ROYAUME-UNI": "GB",
    "NICARAGUA": "NI",
    "COOK (ILES)": "CK",
    "GUYANA": "GY",
    "FEROE (ILES)": "FO",
    "MONTENEGRO": "ME",
    "COSTA RICA": "CR",
    "SINGAPOUR": "SG",
    "TADJIKISTAN": "TJ",
    "KIRGHIZISTAN": "KG",
    "EL SALVADOR": "SV",
    "GAMBIE": "GM",
    "KOWEIT": "KW",
    "URUGUAY": "UY",
    "CROATIE": "HR",
    "HONDURAS": "HN",
    "SIERRA LEONE": "SL",
    "DOMINIQUE": "DM",
    "ESTONIE": "EE",
    "OUGANDA": "UG",
    "QATAR": "QA",
    "ACORES, MADERE": "PT",
    "GUINEE EQUATORIALE": "GQ",
    "SAHARA OCCIDENTAL": "EH",
    "MOZAMBIQUE": "MZ",
    "ZIMBABWE": "ZW",
    "BAHREIN": "BH",
    "OMAN": "OM",
    "TANZANIE": "TZ",
    "SEYCHELLES": "SC",
    "SLOVENIE": "SI",
    "TURKMENISTAN": "TM",
    "CANARIES (ILES)": "ES",
    "NAMIBIE": "NA",
    "LIBERIA": "LR",
    "ZAMBIE": "ZM",
    "JAMAIQUE": "JM",
    "MALTE": "MT",
    "BELIZE": "BZ",
    "ANTILLES NEERLANDAISES": "AN",
    "COREE": "KP",
    "PANAMA": "PA",
    "VANUATU": "VU",
    "BIRMANIE": "MM",
    "FIDJI": "FJ",
    "SAO TOME-ET-PRINCIPE": "ST",
    "TRINITE-ET-TOBAGO": "TT",
    "BARBADE": "BB",
    "SAO TOME": "ST",
    "SAINT-MARIN": "SM",
    "LIECHTENSTEIN": "LI",
    "BRUNEI": "BN",
    "INDE": "IN",
    "CAIMANES (ILES)": "KY",
    "TERR. DES ETATS-UNIS D'AMERIQUE EN OCEANIE": "UM",
    "HONG-KONG": "HK",
    "PAYS-BAS": "NL"
}

# Series.replace with a dict only substitutes values that exactly match a key;
# any label missing from country_codes is copied through unchanged.
# NOTE(review): confirm that unmapped labels are acceptable in the code column,
# or extend the mapping.
df_new_db_unwrapped_alloc['allocataire-code_iso_pays_naissance'] = df_new_db_unwrapped_alloc['allocataire-pays_naissance'].replace(country_codes)

# Same derivation for the rows coming from the existing database.
df_existing_db_unwrapped_alloc['allocataire-code_iso_pays_naissance'] = df_existing_db_unwrapped_alloc['allocataire-pays_naissance'].replace(country_codes)

In [None]:
# map to json values for target DB model 
def to_json_allocataire_without_null(row):
    """Serialize the flattened ``allocataire-*`` values of a row to JSON.

    Each ``allocataire-<field>`` entry of ``row`` is emitted under the key
    ``<field>``; entries whose value is null (None / NaN) are left out.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the
            ``allocataire-*`` column names.

    Returns:
        A JSON object string, with non-ASCII characters kept as-is.

    Raises:
        KeyError: If one of the expected ``allocataire-*`` entries is missing.
    """
    json_fields = (
        'qualite',
        'matricule',
        'nom',
        'prenom',
        'date_naissance',
        'courriel',
        'code_insee_commune_naissance',
        'commune_naissance',
        'code_iso_pays_naissance',
        'pays_naissance',
    )
    allocataire = {}
    for field in json_fields:
        value = row[f'allocataire-{field}']
        if pd.notnull(value):
            allocataire[field] = value
    return json.dumps(allocataire, ensure_ascii=False)

# Rebuild the nested 'allocataire' JSON column, row by row, on both frames
# from their flattened 'allocataire-*' columns.
for frame in (df_new_db_unwrapped_alloc, df_existing_db_unwrapped_alloc):
    frame['allocataire'] = frame.apply(to_json_allocataire_without_null, axis=1)

In [None]:
# Set aside the new rows that have no allocataire matricule; they cannot take
# part in the matricule-based merge and are appended back afterwards.
new_row_has_matricule = df_new_db_unwrapped_alloc['allocataire-matricule'].notna()
df_boursiers_without_ine = df_new_db_unwrapped_alloc[~new_row_has_matricule]
df_new_db_unwrapped_alloc = df_new_db_unwrapped_alloc[new_row_has_matricule]

In [None]:
# Existing rows without a matricule cannot be matched, so they are left out.
existing_row_has_matricule = df_existing_db_unwrapped_alloc['allocataire-matricule'].notna()
df_existing_db_unwrapped_alloc = df_existing_db_unwrapped_alloc[existing_row_has_matricule]

# Left-join the new rows (those with a matricule) onto the identifiers already
# stored for the same matricule. Columns present on both sides keep their name
# for the new rows and get a '_right' suffix for the existing ones.
# NOTE(review): a matricule appearing several times in the existing data would
# duplicate the matching new row - confirm matricules are unique there.
existing_identifiers = df_existing_db_unwrapped_alloc[['allocataire-matricule', 'id', 'id_psp', 'created_at']]
df_final = df_new_db_unwrapped_alloc.merge(
    existing_identifiers,
    how='left',
    on=['allocataire-matricule'],
    suffixes=(None, '_right'),
)

In [None]:
# Append the rows that were set aside before the merge (no matricule) and
# renumber the index from 0.
frames_to_stack = [df_final, df_boursiers_without_ine]
df_final = pd.concat(frames_to_stack, ignore_index=True)

In [None]:
# The flattened 'allocataire-*' helper columns are no longer needed once the
# 'allocataire' JSON column has been rebuilt.
flattened_allocataire_columns = [
    'allocataire-qualite',
    'allocataire-matricule',
    'allocataire-nom',
    'allocataire-prenom',
    'allocataire-date_naissance',
    'allocataire-courriel',
    'allocataire-code_insee_commune_naissance',
    'allocataire-commune_naissance',
    'allocataire-code_iso_pays_naissance',
    'allocataire-pays_naissance',
]
df_final = df_final.drop(flattened_allocataire_columns, axis='columns')

In [None]:
# Turn the birth date back into a timestamp: midnight of that day plus
# 4 hours, to be consistent with the existing database.
birth_day_start = pd.to_datetime(df_final['date_naissance']).dt.floor('D')
df_final['date_naissance'] = birth_day_start + pd.DateOffset(hours=4)

In [None]:
# Load all existing codes (every column read as str).
df_existing_codes = pd.read_csv(existing_codes_db_filepath, encoding='utf-8', dtype=str)

# Split the rows on whether they already carry an id_psp code.
row_has_code = df_final['id_psp'].notna()
df_no_code = df_final[~row_has_code]
df_with_code = df_final[row_has_code]


In [None]:
# Number of rows in the existing codes table (shown as the cell output).
df_existing_codes.shape[0]

In [None]:
# generate new code ensuring no duplicates with existings
import random
import string
import datetime

# Two-digit year of the current local date; used as the prefix of generated codes.
current_date = datetime.datetime.now()
current_year = f"{current_date.year % 100:02d}"

def get_characters_set(size = 4):
    """Return ``size`` random upper-case ASCII letters, never 'O' or 'I'.

    Letters are drawn independently with replacement, so repeats are possible.
    NOTE(review): uses the ``random`` module, which is not meant for
    security-sensitive values - confirm that is acceptable for these codes.

    Args:
        size: Number of letters to draw (defaults to 4).

    Returns:
        A string of length ``size``.
    """
    alphabet = [letter for letter in string.ascii_uppercase if letter not in 'OI']
    drawn_letters = random.choices(alphabet, k=size)
    return ''.join(drawn_letters)
    
def generate_code():
    """Return a random code shaped ``<current_year>-XXXX-XXXX``.

    Each ``XXXX`` group is four letters from ``get_characters_set``.
    Uniqueness is not checked here; the caller handles it.
    """
    first_group = get_characters_set(4)
    second_group = get_characters_set(4)
    return "-".join((current_year, first_group, second_group))

# Start from the set of codes already in use.
existing_id_psp = df_existing_codes['id_psp']
unique_codes = set(existing_id_psp)

# Size of the set before any new code is generated.
current_codes_count = len(unique_codes)

# Keep drawing until the set has grown by one code per row that lacks one;
# a draw that collides with a known code leaves the set size unchanged and
# is therefore simply retried.
target_codes_count = current_codes_count + len(df_no_code)
while len(unique_codes) < target_codes_count:
    unique_codes.add(generate_code())

# Only keep the codes created above.
new_codes = list(unique_codes - set(existing_id_psp))
df_new_codes = pd.DataFrame({'id_psp': new_codes})

print(f"{len(df_new_codes)} generated codes")

In [None]:
# Align both frames on a fresh 0..n-1 index, then fill the null cells of the
# rows without a code with the value found at the same position in the
# generated codes (here: id_psp).
rows_missing_code = df_no_code.reset_index(drop=True)
generated_codes = df_new_codes.reset_index(drop=True)
df_no_code = rows_missing_code.combine_first(generated_codes)

In [None]:
# Cast the id of the rows that already have a code to int.
# df_with_code is a filtered slice of df_final, so writing to it through
# `.loc[:, 'id'] = ...` triggers SettingWithCopyWarning, and that form first
# tries to set the values in place, keeping the column's previous dtype.
# assign() returns a new frame whose 'id' column is replaced outright.
df_with_code = df_with_code.assign(id=df_with_code['id'].astype(int))

In [None]:
import pytz
import datetime

# Timestamp written to created_at / updated_at of the exported rows,
# expressed in Europe/Paris to be consistent with the existing records.
tz = pytz.timezone('Europe/Paris')

# Ask for the current time directly in the target zone. Localizing a naive
# datetime.now() would stamp the host's local wall-clock time as Paris time,
# which is wrong on any machine not running in Europe/Paris (e.g. a UTC host).
now_tz = datetime.datetime.now(tz)

# Naive Paris wall-clock equivalent, kept so the `now` name stays available.
now = now_tz.replace(tzinfo=None)

In [None]:
# Stamp the update time on the rows that already have a code, then export
# them with their ids.
# assign() replaces the column on a new frame: the previous
# `.loc[:, 'updated_at'] = now_tz` wrote a datetime through a filtered slice
# into a column loaded with the 'string' dtype, which raises
# SettingWithCopyWarning and relies on the deprecated implicit dtype change
# of in-place setitem.
df_with_code = df_with_code.assign(updated_at=now_tz)
df_with_code.to_csv(final_merged_with_ids, index=False)

In [None]:
# New rows get the same timestamp for both creation and update.
timestamp_columns = ['created_at', 'updated_at']
df_no_code[timestamp_columns] = now_tz

# Export the new rows without the 'id' column.
rows_without_id_column = df_no_code.drop(columns=['id'])
rows_without_id_column.to_csv(final_merged_without_ids, index=False)