In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from tqdm import tqdm
import random

In [2]:
ENTREPOT_PATH = '/home/administrateur/Bureau/Datagrosyst/data_entrepot_outils/'
donnees = {}

def import_df(df_name, path_data, sep, index_col=None):
    donnees[df_name] = pd.read_csv(path_data+df_name+'.csv', sep = sep, index_col=index_col, low_memory=False).replace({'\r\n': '\n'}, regex=True)

def import_dfs(df_names, path_data, sep = ',', index_col=None, verbose=False):
    for df_name in tqdm(df_names) : 
        if(verbose) :
            print(" - ", df_name)
        import_df(df_name, path_data, sep, index_col=index_col)

tables = ['synthetise', 'synthetise_synthetise_performance', 'parcelle', 'sdc']

# import des données du magasin
import_dfs(tables, ENTREPOT_PATH, sep = ',', verbose=False)

100%|██████████| 4/4 [00:07<00:00,  1.87s/it]


In [3]:
list_sdc_id = ['fr.inra.agrosyst.api.entities.GrowingSystem_1ba3ab09-f3c4-4989-96f4-17c5f59d0463', # fr.inra.agrosyst.api.entities.practiced.PracticedSystem_01c64002-fb89-4035-8e6d-98d40a6bf689
               'fr.inra.agrosyst.api.entities.GrowingSystem_c57a89fe-9d1a-4a53-ae1a-613c822d7fa0', # realise_retenu
               'fr.inra.agrosyst.api.entities.GrowingSystem_bfd3ba79-d77c-4708-b7a4-021a9d354137', # fr.inra.agrosyst.api.entities.practiced.PracticedSystem_82a090cf-ddee-4443-86ed-6bd4643806a7
               'fr.inra.agrosyst.api.entities.GrowingSystem_9ba92d21-f43b-4d8e-9f44-2dad120f7f5d' # fr.inra.agrosyst.api.entities.practiced.PracticedSystem_6012145d-74bb-40a9-b37d-d33c817773a7
               ]

list_synth_id = [
    'fr.inra.agrosyst.api.entities.practiced.PracticedSystem_01c64002-fb89-4035-8e6d-98d40a6bf689',
    'fr.inra.agrosyst.api.entities.practiced.PracticedSystem_82a090cf-ddee-4443-86ed-6bd4643806a7',
    'fr.inra.agrosyst.api.entities.practiced.PracticedSystem_6012145d-74bb-40a9-b37d-d33c817773a7'
]

In [4]:
list_tx_comp = ['ift_cible_non_mil_tx_comp','co_tot_std_mil_tx_comp','co_decomposees_std_mil_tx_comp','cm_std_mil_tx_comp','pb_std_mil_tx_comp']

synthetise = donnees['synthetise'][['id','valide','derniere_maj','sdc_id','campagnes']]
perf_synth = donnees['synthetise_synthetise_performance'][['synthetise_id']+list_tx_comp]
parcelle = donnees['parcelle'][['id','sdc_id']] # attention ne prends pas en compte les parcelles non rattachées

synthetise = synthetise.loc[synthetise['sdc_id'].isin(list_sdc_id)]
parcelle = parcelle.loc[parcelle['sdc_id'].isin(list_sdc_id)]

perf_synth1 = perf_synth.loc[(perf_synth['synthetise_id'].isin(list_synth_id))]
perf_synth2 = perf_synth.loc[~(perf_synth['synthetise_id'].isin(list_synth_id))].sample(n=12, random_state=42)
perf_synth = pd.concat([perf_synth1,perf_synth2])

path = '/home/administrateur/Bureau/Datagrosyst/catalogue_script_agrosyst/02_outils/tests/data/test_entite_unique_par_sdc_nettoyage/'
synthetise.to_csv(path + 'synthetise.csv', index=False)
perf_synth.to_csv(path + 'synthetise_synthetise_performance.csv', index=False)
parcelle.to_csv(path + 'parcelle.csv', index=False)

del(synthetise, perf_synth, parcelle)

In [5]:
synthetise = pd.read_csv(path + 'synthetise.csv')
perf_synth = pd.read_csv(path + 'synthetise_synthetise_performance.csv')
parcelle = pd.read_csv(path + 'parcelle.csv')

parcelle = parcelle[['sdc_id']]

In [6]:
perf_synth = perf_synth.copy()
perf_synth.loc[:, 'tx_compl'] = perf_synth[list_tx_comp].sum(axis=1)
perf_synth.drop(columns=list_tx_comp, inplace=True)

perf_synth = perf_synth.merge(synthetise.rename(columns={'id':'synthetise_id'}), on='synthetise_id', how='left')
perf_synth['calcul'] = 'synth'

real = pd.DataFrame(data = {"sdc_id" : parcelle['sdc_id'].unique(), "calcul" : 'real'})

df = pd.concat([perf_synth, real])

In [7]:
df['calcul'] = df['calcul'].apply(lambda x: 0 if x == 'synth' else 1)
df['tx_compl'] = df['tx_compl'].fillna(-float('inf'))
df['valide'] = df['valide'].fillna('f').apply(lambda x: 0 if x == 't' else 1)
df['derniere_maj'] = df['derniere_maj'].fillna('0001-01-01 00:00:00.000')
df['campagnes'] = df['campagnes'].astype(str).fillna('0000').apply(lambda x: abs(len(x) - 16))

In [8]:
# On priorise les valeurs selon plusieurs méthodes
df1=df.sort_values(
    by=[
        'calcul',       # On priorise les synthétisé
        'tx_compl',     # On priorise la somme des taux de complétion les plus hautes
        'campagnes',    # On priorise les triannuels
        'valide',       # On priorise les entités validées
        'derniere_maj'  # On fini par prioriser les entités dont la denrière maj est la plus récente
    ],
    ascending=[True, False, True, True, False]
)

# On groupe par sdc_id et on tague l'entité prioritaire
df1['est_prioritaire'] = df1.groupby('sdc_id').cumcount() == 0

df1 = df1.loc[df1['est_prioritaire'], ['sdc_id','synthetise_id']]
df1['synthetise_id'] = np.where(df1['synthetise_id'].isnull(), "realise_retenu", df1['synthetise_id'])
df1.rename(columns={'synthetise_id':'entite_retenue'}, inplace=True)

In [9]:
# # On priorise les valeurs selon la méthode de la CAN
# df.sort_values(
#     by=[
#         'calcul',       # On priorise les synthétisé
#         'derniere_maj'  # On fini par prioriser les entités dont la denrière maj est la plus récente
#     ],
#     ascending=[True, False],
#     inplace=True
# )

# df['est_prioritaire_CAN'] = df.groupby('sdc_id').cumcount() == 0

# df = df.loc[df['est_prioritaire_CAN'], ['sdc_id','synthetise_id']]
# df['synthetise_id'] = np.where(df['synthetise_id'].isnull(), "realise_retenu", df['synthetise_id'])
# df.rename(columns={'synthetise_id':'entite_retenue'}, inplace=True)

In [10]:
# test = df1.merge(df, on ='sdc_id', how='outer')

In [12]:
df1

Unnamed: 0,sdc_id,entite_retenue
1,fr.inra.agrosyst.api.entities.GrowingSystem_1b...,fr.inra.agrosyst.api.entities.practiced.Practi...
0,fr.inra.agrosyst.api.entities.GrowingSystem_bf...,fr.inra.agrosyst.api.entities.practiced.Practi...
2,fr.inra.agrosyst.api.entities.GrowingSystem_9b...,fr.inra.agrosyst.api.entities.practiced.Practi...
2,fr.inra.agrosyst.api.entities.GrowingSystem_c5...,realise_retenu
