In [1]:
import pandas as pd
import os

In [2]:
def load_data(noms: list, path: str, colstokeep: dict = None) -> None:
    """Read one CSV per table name and publish each frame as a module-level global.

    For every name in ``noms`` the file ``<path>/<name>.csv`` is looked up.
    When it exists it is read with ``pandas.read_csv`` and bound to a global
    variable carrying the table name; when it does not, a message is printed
    and the name is skipped (no exception is raised).

    Args:
        noms: Table names, i.e. CSV file names without the ``.csv`` extension.
        path: Directory containing the CSV files.
        colstokeep: Optional mapping ``table name -> list of columns``. Tables
            present in the mapping are read with ``usecols`` restricted to that
            list; all other tables are read in full.

    Returns:
        None. The loaded frames are only reachable through ``globals()``.
    """
    for nom in noms:
        csv_path = os.path.join(path, f"{nom}.csv")

        # Missing files are reported and skipped rather than treated as errors.
        if not os.path.exists(csv_path):
            print(f"Le fichier {nom+'.csv'} n'existe pas.")
            continue

        read_options = {"low_memory": False}
        if colstokeep is not None and nom in colstokeep:
            read_options["usecols"] = colstokeep[nom]

        # The table name becomes the variable name in this module's namespace.
        globals()[nom] = pd.read_csv(csv_path, **read_options)

In [3]:
# Names of the CSV tables to load (file name without the ".csv" extension).
# Note: "connection_synthetise" is loaded but not referenced by any of the
# cells visible in this notebook.
files = [
    # cropping systems and their synthesised descriptions
    "sdc",
    "synthetise",
    # crop nodes / perennial plantations and their spatial context
    "plantation_perenne_synthetise",
    "plantation_perenne_realise",
    "noeuds_realise",
    "noeuds_synthetise",
    "connection_synthetise",
    "parcelle",
    "zone",
    # crop composition and botanical references
    "composant_culture",
    "variete",
    "espece",
    # restructured companions merged onto the synthesised tables by 'id'
    "noeuds_synthetise_restructure",
    "plantation_perenne_synthetise_restructure",
]

# Columns to read per table. A table that has no entry here is read with all
# of its columns.
colstokeep_dict = {
    "sdc": [
        "id",
        "code",
        "nom",
        "campagne",
        "code_dephy",
        "filiere",
        "type_production",
        "type_agriculture",
        "part_sau_domaine",
        "reseaux_ir",
        "reseaux_it",
    ],
    "synthetise": ["id", "nom", "campagnes", "sdc_id"],
    "plantation_perenne_synthetise": ["id", "synthetise_id"],
    "plantation_perenne_realise": ["id", "culture_id", "zone_id"],
    "noeuds_synthetise": ["id", "synthetise_id"],
    "noeuds_realise": ["id", "culture_id", "zone_id"],
    "parcelle": ["id", "sdc_id"],
    "zone": ["id", "parcelle_id"],
    "espece": [
        "id",
        "code_espece_botanique",
        "libelle_espece_botanique",
        "typocan_espece",
        "typocan_espece_maraich",
    ],
    "variete": ["id", "denomination"],
}

In [4]:
# Directory holding the exported CSV tables. It can be overridden without
# editing the notebook by setting the DATAGROSYST_DATA_DIR environment
# variable; otherwise the original workstation location is used.
path = os.environ.get(
    'DATAGROSYST_DATA_DIR',
    '/home/administrateur/Bureau/Datagrosyst/data_entrepot_outils',
)

# Each table listed in `files` that exists under `path` becomes a global
# DataFrame named after the table.
load_data(files, path, colstokeep_dict)

# `files` and `colstokeep_dict` are deliberately left in the namespace: they
# are small, and keeping them lets this cell be re-run on its own.

In [5]:
# Enrich the synthesised tables with the columns of their "restructure"
# companions, matched on 'id'. Later cells select 'culture_id' from these
# frames although it is not among the columns read for them, so it is
# presumably supplied by the restructure tables -- TODO confirm.
plantation_perenne_synthetise = plantation_perenne_synthetise.merge(
    plantation_perenne_synthetise_restructure,
    how='left',
    on='id',
)
del plantation_perenne_synthetise_restructure

noeuds_synthetise = noeuds_synthetise.merge(
    noeuds_synthetise_restructure,
    how='left',
    on='id',
)
del noeuds_synthetise_restructure

# Attach species and variety attributes to every crop component. The 'id'
# column of each reference table is renamed to the matching foreign key
# before the left join.
composant_culture = (
    composant_culture
    .merge(
        espece.rename(columns={'id': 'espece_id'}),
        how='left',
        on='espece_id',
    )
    .merge(
        variete.rename(columns={'id': 'variete_id'}),
        how='left',
        on='variete_id',
    )
)
del espece, variete

In [6]:
# Lookup tables keyed by the foreign-key names used in the "realise" tables.
# They are built once and shared by both joins below.
zone_lookup = zone.rename(columns={'id': 'zone_id'})
parcelle_lookup = parcelle.rename(columns={'id': 'parcelle_id'})

# Resolve each "realise" row to its sdc_id by following zone -> parcelle.
noeuds_realise = (
    noeuds_realise
    .merge(zone_lookup, on='zone_id', how='left')
    .merge(parcelle_lookup, on='parcelle_id', how='left')
)
plantation_perenne_realise = (
    plantation_perenne_realise
    .merge(zone_lookup, on='zone_id', how='left')
    .merge(parcelle_lookup, on='parcelle_id', how='left')
)
del zone, parcelle, zone_lookup, parcelle_lookup

# Stack the four sources into one table. "realise" rows carry sdc_id,
# "synthetise" rows carry synthetise_id; the column a row does not have is
# left missing by the concatenation. `type` records whether the row comes
# from a node table ('assole') or a perennial plantation table ('peren').
# ignore_index=True gives the result a fresh unique index: without it each
# source keeps its own labels, so the stacked frame has duplicate index
# labels, which breaks any later label-aligned operation that has to reindex.
all_nodes = pd.concat(
    [
        noeuds_realise[['id', 'culture_id', 'sdc_id']].assign(type='assole'),
        noeuds_synthetise[['id', 'culture_id', 'synthetise_id']].assign(type='assole'),
        plantation_perenne_realise[['id', 'culture_id', 'sdc_id']].assign(type='peren'),
        plantation_perenne_synthetise[['id', 'culture_id', 'synthetise_id']].assign(type='peren'),
    ],
    ignore_index=True,
)
del noeuds_realise, noeuds_synthetise, plantation_perenne_realise, plantation_perenne_synthetise

In [7]:
# Helper that strips 'nan' entries from a list of values
def clean_list(lst):
    """Drop NaN-like entries from ``lst`` and collapse the result.

    An entry is discarded when its string form is exactly ``'nan'`` (which is
    how a float NaN prints).

    Args:
        lst: Iterable of values, e.g. the output of ``Series.unique().tolist()``.

    Returns:
        ``None`` when nothing is left after filtering, the single remaining
        value itself when exactly one is left, otherwise the list of remaining
        values in their original order.
    """
    cleaned = [x for x in lst if str(x) != 'nan']

    # Emptiness is tested on the list, before any unwrapping, so that a single
    # falsy survivor (0, '', False) is returned instead of being turned into None.
    if not cleaned:
        return None
    if len(cleaned) == 1:
        return cleaned[0]
    return cleaned

def get_unique_list(row, column, filiere_col='filiere'):
    """Return the cleaned distinct values of ``column`` for one result row.

    Rows whose ``filiere_col`` value equals ``'ARBORICULTURE'`` read from the
    frame stored under ``'filtered_culture_util'``; every other row reads from
    the frame stored under ``'filtered_culture'``.

    Args:
        row: Mapping-like row holding the two nested frames and the filiere value.
        column: Column of the nested frame whose distinct values are wanted.
        filiere_col: Key of ``row`` that holds the filiere label.

    Returns:
        Whatever ``clean_list`` returns for the distinct values.
    """
    is_arbo = row[filiere_col] == 'ARBORICULTURE'
    source_key = 'filtered_culture_util' if is_arbo else 'filtered_culture'
    distinct_values = row[source_key][column].unique().tolist()
    return clean_list(distinct_values)

def analyse_groupby(df, groupby_col, lib_sp_arbo):
    """Summarise the species and varieties found in each group of ``df``.

    Rows with a missing ``groupby_col`` are ignored. For every remaining group
    the crop components (rows of the module-level ``composant_culture`` frame)
    whose ``culture_id`` occurs in the group are collected, once in full and
    once restricted to the species labels in ``lib_sp_arbo``.

    Args:
        df: Frame with at least ``groupby_col``, ``filiere``, ``code_dephy``
            and ``culture_id`` columns.
        groupby_col: Name of the column to group on.
        lib_sp_arbo: Species labels (``libelle_espece_botanique`` values) kept
            in the restricted selection.

    Returns:
        A frame indexed by ``groupby_col`` with columns ``filiere``,
        ``code_dephy``, ``unique_species``, ``unique_variete``,
        ``unique_species_util``, ``unique_variete_util``, ``size_unique_sp``
        and ``size_unique_var``.
    """

    def _collect_group(cgrp):
        # One output row per group; the two nested frames are temporary and
        # dropped before returning.
        in_group = composant_culture['culture_id'].isin(cgrp['culture_id'])
        listed_species = composant_culture['libelle_espece_botanique'].isin(lib_sp_arbo)
        return pd.Series({
            'filiere': cgrp['filiere'].iloc[0],
            'code_dephy': cgrp['code_dephy'].iloc[0],
            'filtered_culture': composant_culture[in_group],
            'filtered_culture_util': composant_culture[in_group & listed_species],
        })

    def _count_values(value):
        # clean_list yields a list (several values), a bare value (one) or None.
        if isinstance(value, list):
            return len(value)
        return 0 if value is None else 1

    rows_with_key = df[df[groupby_col].notna()]
    result = rows_with_key.groupby(groupby_col).apply(_collect_group, include_groups=False)

    # Distinct species / varieties over all components of the group.
    for target_col, source_col in (
        ('unique_species', 'libelle_espece_botanique'),
        ('unique_variete', 'denomination'),
    ):
        result[target_col] = result['filtered_culture'].apply(
            lambda components, col=source_col: clean_list(components[col].unique().tolist())
        )

    # Same, but ARBORICULTURE groups only look at the listed species
    # (see get_unique_list).
    for target_col, source_col in (
        ('unique_species_util', 'libelle_espece_botanique'),
        ('unique_variete_util', 'denomination'),
    ):
        result[target_col] = result.apply(
            lambda row, col=source_col: get_unique_list(row, col), axis=1
        )

    result['size_unique_sp'] = result['unique_species_util'].apply(_count_values)
    result['size_unique_var'] = result['unique_variete_util'].apply(_count_values)

    return result.drop(columns=['filtered_culture', 'filtered_culture_util'])

# Run the analysis
# Species labels kept for ARBORICULTURE groups.
# Matching species codes, kept for reference:
# code_sp_arbo = ["G21", "G20", "G07", "E01", "G28", "E85", "F84", "F86", "E67"]
lib_sp_arbo = [
    "Pommier",
    "Poirier",
    "Pêcher",
    "Abricotier",
    "Prunier",
    "Clémentinier",
    "Noyer",
    "Olivier",
    "Cerisier",
]  # , "Ananas", "Bananier plantain", "Manguier"]

all_nodes = (
    all_nodes
    # Grouping key: the row's own sdc_id when present, else its synthetise_id.
    .assign(group_id=lambda nodes: nodes['sdc_id'].fillna(nodes['synthetise_id']))
    # Bring in the sdc_id recorded on the synthetise table ...
    .merge(
        synthetise[['id', 'sdc_id']].rename(
            columns={'id': 'synthetise_id', 'sdc_id': 'sdc_id_fromsynth'}
        ),
        on='synthetise_id',
        how='left',
    )
    # ... and use it wherever the row has no sdc_id of its own.
    .assign(sdc_id=lambda nodes: nodes['sdc_id'].fillna(nodes['sdc_id_fromsynth']))
    # Attach the DEPHY code and the filiere of the cropping system.
    .merge(
        sdc[['id', 'code_dephy', 'filiere']].rename(columns={'id': 'sdc_id'}),
        on='sdc_id',
        how='left',
    )
    .drop(columns=['sdc_id_fromsynth'], errors='ignore')
)

# Only the fruit-tree and vine filieres are analysed.
is_perennial_filiere = all_nodes['filiere'].isin(['ARBORICULTURE', 'VITICULTURE'])
test = analyse_groupby(all_nodes.loc[is_perennial_filiere], 'group_id', lib_sp_arbo)
del is_perennial_filiere

In [8]:
# Write the per-group summary next to the input tables.
test.to_csv(f"{path}/TEST_species_variety.csv")