In [1]:
import pandas as pd
import hvplot.pandas
import holoviews as hv
import panel as pn
import plotly.express as px
import numpy as np
from hvplot import hvPlot
import param
from src.data.make_dataset import DATA_PATH
from src.visualization.visualize import _get_upper_bar, LABELS
from src.visualization.panel_figures import benchmark
import random

pd.options.plotting.backend = "holoviews"

%load_ext autoreload
%autoreload 2

In [2]:
df = benchmark.get_df()

In [3]:
df.columns

Index(['Id', 'SIREN principal', 'Méthode BEGES (V4,V5)', 'Type de structure',
       'Type de collectivité', 'Mode de consolidation', 'Recalcul',
       'Comparaison avec le précédent bilan', 'nb_salaries_range',
       'nb_salaries_min', 'nb_salaries_max', 'nb_salaries_mean', 'naf5',
       'Secteur d'activité (NAF1)', 'naf2', 'naf3', 'naf4',
       'month_publication', 'Année de reporting', 'poste_emissions',
       'Émissions_totales', 'Émission_par_salarié', 'emissions_clipped',
       'Catégorie d'émissions', 'Poste d'émissions'],
      dtype='object')

## emissions_per_salarie seems too high for the entire company

Y-axis can split entries:
- inside a bilan (e.g. poste_emission, categorie_emission). Every single bilan (Id) can have data in multiple y-axis categories. We can simply groupby the y-axis category and aggregate
- between different bilans (e.g. NAF, type_structure, ...). The emissions for every single bilan (Id) should be summed together before being split into the category and analyzed (mean, median, std)

In [23]:
x = benchmark.filter_options(df, secteur_activite='all', type_structure_all=False, type_structure_options=['Entreprise', 'Établissement public'])

In [24]:
x.plot(kind='box', by=LABELS.type_structure, y=LABELS.emissions_par_salarie).opts(invert_axes=True, ylim=(0, 10))

For each bilan (Id), there can be between 0 and 22 rows in the base dataframe -> we need to sum them if we want a y-axis between different bilans

In [82]:
# it makes sense that the max is 22
df[LABELS.poste_emissions].nunique()

22

In [90]:
y = x.groupby('Id')[LABELS.emissions_par_salarie].count()
y.describe()

count    4788.000000
mean        6.174603
std         3.104813
min         0.000000
25%         4.000000
50%         5.000000
75%         8.000000
max        22.000000
Name: Émission_par_salarié, dtype: float64

Nombre de postes d'émissions remplis par Bilan

In [91]:
y.reset_index().groupby('Émission_par_salarié').nunique().plot(kind='bar')

These outliers with only empty data could be removed...

In [98]:
bilans_with_0_emissions = y[y==0].index

In [105]:
df[df.Id.isin(bilans_with_0_emissions)][['Id', 'SIREN principal', "Secteur d'activité (NAF1)", 'month_publication']].drop_duplicates()

Unnamed: 0,Id,SIREN principal,Secteur d'activité (NAF1),month_publication
52,9370dddf-b1cd-11ed-8fce-005056b7acd1,810299149,Autres activités de services,2015-04
147,93721ead-b1cd-11ed-8fce-005056b7acd1,264900390,Santé humaine et action sociale,2016-01
307,93756ce0-b1cd-11ed-8fce-005056b7acd1,188300032,Autres activités de services,2016-04
333,93754e9c-b1cd-11ed-8fce-005056b7acd1,489243881,Industrie manufacturière,2016-05
5091,93c6de77-b1cd-11ed-8fce-005056b7acd1,807570148,Information et communication,2023-01
5369,a02088c4-1688-45f5-bf1e-77fe1c1b337d,849252739,"Activités spécialisées, scientifiques et techn...",2023-06


### v1 - single groupby yaxis

Here, for each type of structure we do the average over all entries, even among a single bilan -> WRONG

In [107]:
x.groupby(LABELS.type_structure)[LABELS.emissions_par_salarie].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Type de structure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Entreprise,22645.0,28.434484,521.164464,-92.152101,0.073382,0.362416,1.567653,38076.045364
Établissement public,6919.0,2.198346,30.673948,3e-06,0.065862,0.217478,0.811579,1735.435624


### v2 - groupby yaxis + Id, then sum, then groupby yaxis

Why is the min now 0 ? Because some bilans have 0 entry

The max is always > than the previous max though.

In [167]:
y = x.groupby([LABELS.type_structure, 'Id'])[LABELS.emissions_par_salarie].sum()
y1 = y.groupby(LABELS.type_structure).describe()
y1

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Type de structure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Entreprise,3642.0,176.798158,1575.133282,0.0,1.843979,5.337794,19.239493,49937.0
Établissement public,1146.0,13.27256,109.167332,0.0,1.373341,2.283544,5.968898,3467.667245


In [106]:
x.Id.nunique()

4788

The total number of entries in y (before grouping again) seems right

In [88]:
y

Type de structure     Id                                  
Entreprise            00609c55-d063-4661-9c22-b161bb72bc99      4.593502
                      0165ace2-634c-458d-9b66-52578a59ab86    220.923077
                      029e8c5d-df47-4eba-a5c9-71c1c4e7cbdc      4.664443
                      0369acaf-e747-4923-9dbd-26ab3bcdf10b      8.587826
                      041106ad-34f9-4b2a-a553-c2a2593fcea0      0.710981
                                                                 ...    
Établissement public  b9d63e10-3d9b-4e4e-a7fe-d2382115327b      2.882943
                      bee0629e-eefd-4be8-a00b-95b139e230b1     19.452969
                      c27e6eca-8eb9-4aab-b800-7bc7624c2043     12.017726
                      dcd0aeaa-648e-4cba-81d0-4be7504850b3    916.000000
                      dd2b609a-2c1f-4ab5-9b73-1289fda6f503     10.156052
Name: Émission_par_salarié, Length: 4788, dtype: float64

In [24]:
df.head()

Unnamed: 0,Id,SIREN principal,"Méthode BEGES (V4,V5)",Type de structure,Type de collectivité,Mode de consolidation,Recalcul,Comparaison avec le précédent bilan,nb_salaries_range,nb_salaries_min,...,naf3,naf4,month_publication,Année de reporting,poste_emissions,Émissions_totales,Émission_par_salarié,emissions_clipped,Catégorie d'émissions,Poste d'émissions
0,93708e2c-b1cd-11ed-8fce-005056b7acd1,775709702,v4,Entreprise,,Opérationnel,Aucun recalcul pour le premier bilan.\n,,5000-9999,5000.0,...,Assurance,Autres assurances,2015-03,2011,1.1,628.0,0.083739,628.0,1 - Émissions directes,1.1 - Émissions directes des sources fixes de ...
1,9370955a-b1cd-11ed-8fce-005056b7acd1,85480069,v4,Entreprise,,Opérationnel,,,500-999,500.0,...,Commerce de gros de biens domestiques,Commerce de gros d'habillement et de chaussures,2015-03,2011,1.1,786.19,1.048953,786.19,1 - Émissions directes,1.1 - Émissions directes des sources fixes de ...
2,9370988a-b1cd-11ed-8fce-005056b7acd1,180600041,v4,Établissement public,,Opérationnel,,,250-499,250.0,...,Autre action sociale sans hébergement,Autre action sociale sans hébergement n.c.a.,2015-03,2011,1.1,799.0,2.133511,799.0,1 - Émissions directes,1.1 - Émissions directes des sources fixes de ...
3,93709cb8-b1cd-11ed-8fce-005056b7acd1,242900769,v4,Collectivité territoriale (dont EPCI),Communes,Opérationnel,,,100-199,100.0,...,"Administration générale, économique et sociale",Administration publique générale,2015-03,2011,1.1,494.37,3.306823,494.37,1 - Émissions directes,1.1 - Émissions directes des sources fixes de ...
4,93709e81-b1cd-11ed-8fce-005056b7acd1,246100663,v4,Collectivité territoriale (dont EPCI),Communes,Opérationnel,,,500-999,500.0,...,"Administration générale, économique et sociale",Administration publique générale,2015-03,2012,1.1,1746.44,2.33014,1746.44,1 - Émissions directes,1.1 - Émissions directes des sources fixes de ...


#### very strange

For some reason, grouping by an yaxis between bilans (i.e. with each Id belonging to only 1 yaxis category), things seem to go fine: the Ids are not duplicated with empty values, the total number stays the same

In [137]:
x.groupby([LABELS.secteur_activite, 'Id'], dropna=False)[LABELS.emissions_par_salarie].count()

Secteur d'activité (NAF1)                           Id                                  
Activités de services administratifs et de soutien  09a6be9a-6bc7-4c55-80d1-a446fdd276de    13
                                                    0f76d762-7c24-4cdd-b95e-aa1d81190f9e     8
                                                    13a568f6-80bc-4885-be55-c58943fe3f37    12
                                                    284bb234-abf1-47d0-a33e-b04b6d463513    11
                                                    286a9a61-8cfc-4994-8e85-8941c666155b     9
                                                                                            ..
Transports et entreposage                           b502e68d-1970-4078-837f-a2138656b557     2
                                                    ce3f6e6d-9a66-4807-b2fa-c67291bf107a     6
                                                    e8eed765-808c-4714-82ba-2c7f224f8020     3
NaN                                                 9374

In [133]:
x.groupby([LABELS.type_structure, 'Id'], dropna=False)[LABELS.emissions_par_salarie].count()

Type de structure     Id                                  
Entreprise            00609c55-d063-4661-9c22-b161bb72bc99    10
                      0165ace2-634c-458d-9b66-52578a59ab86    15
                      029e8c5d-df47-4eba-a5c9-71c1c4e7cbdc    11
                      0369acaf-e747-4923-9dbd-26ab3bcdf10b    12
                      041106ad-34f9-4b2a-a553-c2a2593fcea0     6
                                                              ..
Établissement public  b9d63e10-3d9b-4e4e-a7fe-d2382115327b     5
                      bee0629e-eefd-4be8-a00b-95b139e230b1     5
                      c27e6eca-8eb9-4aab-b800-7bc7624c2043    13
                      dcd0aeaa-648e-4cba-81d0-4be7504850b3    11
                      dd2b609a-2c1f-4ab5-9b73-1289fda6f503    10
Name: Émission_par_salarié, Length: 4788, dtype: int64

But when grouping by an yaxis independent of Id (i.e. with Ids belonging to different yaxis categories), then each Id seem to get an entry in ALL yaxis categories. Maybe this is what we want?

In [136]:
x.groupby([LABELS.category_emissions, 'Id'], dropna=False)[LABELS.emissions_par_salarie].count()

Catégorie d'émissions            Id                                  
1 - Émissions directes           00609c55-d063-4661-9c22-b161bb72bc99    2
                                 015b4696-b05d-45ce-bfdd-2564326412c5    3
                                 0165ace2-634c-458d-9b66-52578a59ab86    3
                                 029e8c5d-df47-4eba-a5c9-71c1c4e7cbdc    3
                                 0369acaf-e747-4923-9dbd-26ab3bcdf10b    2
                                                                        ..
6 - Autres émissions indirectes  fc81a091-02e8-4bb5-9e1c-ef9c8b6927db    0
                                 fd0efaba-5989-46c1-a8f4-a7d0b5a980ed    1
                                 fd625530-0a76-4e91-b89f-4560a427ff10    1
                                 fea5d04b-f8ef-47ed-8d9b-f5e169b405e5    1
                                 ff3c2213-35b7-49d9-afcb-11231c808327    0
Name: Émission_par_salarié, Length: 28728, dtype: int64

In [139]:
x[LABELS.category_emissions].nunique() * x['Id'].nunique()

28728

In [131]:
x.groupby([LABELS.poste_emissions, 'Id'])[LABELS.emissions_par_salarie].count()

Poste d'émissions                                         Id                                  
1.1 - Émissions directes des sources fixes de combustion  00609c55-d063-4661-9c22-b161bb72bc99    1
                                                          015b4696-b05d-45ce-bfdd-2564326412c5    1
                                                          0165ace2-634c-458d-9b66-52578a59ab86    1
                                                          029e8c5d-df47-4eba-a5c9-71c1c4e7cbdc    1
                                                          0369acaf-e747-4923-9dbd-26ab3bcdf10b    1
                                                                                                 ..
6.1 - Autres émissions directes                           fc81a091-02e8-4bb5-9e1c-ef9c8b6927db    0
                                                          fd0efaba-5989-46c1-a8f4-a7d0b5a980ed    1
                                                          fd625530-0a76-4e91-b89f-4560a427ff10    1
     

In [110]:
x[LABELS.poste_emissions].nunique() * x['Id'].nunique()

105336

### v3 - groupby Id then sum, then groupby yaxis

Seems to be the same than v2, which is reassuring

In [169]:
y = x.groupby('Id')[[LABELS.type_structure, LABELS.emissions_par_salarie]].agg(
    emissions=pd.NamedAgg(column=LABELS.emissions_par_salarie, aggfunc='sum'),
    type_structure_nunique=pd.NamedAgg(column=LABELS.type_structure, aggfunc='nunique'),
    type_structure=pd.NamedAgg(column=LABELS.type_structure, aggfunc=lambda x: '-'.join(x.unique())),
)
y

Unnamed: 0_level_0,emissions,type_structure_nunique,type_structure
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00609c55-d063-4661-9c22-b161bb72bc99,4.593502,1,Entreprise
015b4696-b05d-45ce-bfdd-2564326412c5,9.967198,1,Établissement public
0165ace2-634c-458d-9b66-52578a59ab86,220.923077,1,Entreprise
029e8c5d-df47-4eba-a5c9-71c1c4e7cbdc,4.664443,1,Entreprise
0369acaf-e747-4923-9dbd-26ab3bcdf10b,8.587826,1,Entreprise
...,...,...,...
fc81a091-02e8-4bb5-9e1c-ef9c8b6927db,102.425415,1,Entreprise
fd0efaba-5989-46c1-a8f4-a7d0b5a980ed,3.974650,1,Entreprise
fd625530-0a76-4e91-b89f-4560a427ff10,9.413059,1,Entreprise
fea5d04b-f8ef-47ed-8d9b-f5e169b405e5,2.577181,1,Entreprise


In [170]:
y2 = y.groupby('type_structure')['emissions'].describe()
y2

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
type_structure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Entreprise,3642.0,176.798158,1575.133282,0.0,1.843979,5.337794,19.239493,49937.0
Établissement public,1146.0,13.27256,109.167332,0.0,1.373341,2.283544,5.968898,3467.667245


In [171]:
y2 == y1

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
type_structure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Entreprise,True,True,True,True,True,True,True,True
Établissement public,True,True,True,True,True,True,True,True


## test it works

In [21]:
benchmark.get_benchmark_dashboard()

In [7]:
%debug

> [0;32m/Users/alexandre/dev/carbon-trackr/benchmark-footprints/venv/lib/python3.11/site-packages/holoviews/core/element.py[0m(95)[0;36m__iter__[0;34m()[0m
[0;32m     93 [0;31m    [0;32mdef[0m [0m__iter__[0m[0;34m([0m[0mself[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     94 [0;31m        [0;34m"Disable iterator interface."[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 95 [0;31m        [0;32mraise[0m [0mNotImplementedError[0m[0;34m([0m[0;34m'Iteration on Elements is not supported.'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     96 [0;31m[0;34m[0m[0m
[0m[0;32m     97 [0;31m    [0;32mdef[0m [0mclosest[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mcoords[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  u


> [0;32m/Users/alexandre/dev/carbon-trackr/benchmark-footprints/venv/lib/python3.11/site-packages/pandas/core/roperator.py[0m(19)[0;36mrmul[0;34m()[0m
[0;32m     17 [0;31m[0;34m[0m[0m
[0m[0;32m     18 [0;31m[0;32mdef[0m [0mrmul[0m[0;34m([0m[0mleft[0m[0;34m,[0m [0mright[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 19 [0;31m    [0;32mreturn[0m [0mright[0m [0;34m*[0m [0mleft[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     20 [0;31m[0;34m[0m[0m
[0m[0;32m     21 [0;31m[0;34m[0m[0m
[0m


ipdb>  left


array([:Curve   [index]   (Émission_par_salarié),
       :Curve   [index]   (Émission_par_salarié),
       :Curve   [index]   (Émission_par_salarié),
       :Curve   [index]   (Émission_par_salarié),
       :Curve   [index]   (Émission_par_salarié),
       :Curve   [index]   (Émission_par_salarié),
       :Curve   [index]   (Émission_par_salarié),
       :Curve   [index]   (Émission_par_salarié),
       :Curve   [index]   (Émission_par_salarié),
       :Curve   [index]   (Émission_par_salarié),
       :Curve   [index]   (Émission_par_salarié),
       :Curve   [index]   (Émission_par_salarié),
       :Curve   [index]   (Émission_par_salarié),
       :Curve   [index]   (Émission_par_salarié),
       :Curve   [index]   (Émission_par_salarié),
       :Curve   [index]   (Émission_par_salarié),
       :Curve   [index]   (Émission_par_salarié),
       :Curve   [index]   (Émission_par_salarié),
       :Curve   [index]   (Émission_par_salarié),
       :Curve   [index]   (Émission_par_salarié),


ipdb>  right


:BoxWhisker   [Poste d'émissions]   (Émission_par_salarié)


ipdb>  c


In [176]:
len(df)

124058

## Find filter test fixtures values

In [1]:
import sys
sys.path.append('../')
from tests.test_benchmark import test_filter_options, _TEST_FILTER_OPTIONS

In [2]:
import itertools

In [3]:
l = list(itertools.product(*_TEST_FILTER_OPTIONS.values()))

In [4]:
len(l)

72

In [5]:
l

[('all', 'all', 'all', 'all', 'all', 'all'),
 ('all',
  'all',
  'all',
  'all',
  [2009, 2011, 2012, 2014, 2021, 2015, 2022],
  'all'),
 ('all', 'all', 'all', ['3.1 - Transport de marchandise amont'], 'all', 'all'),
 ('all',
  'all',
  'all',
  ['3.1 - Transport de marchandise amont'],
  [2009, 2011, 2012, 2014, 2021, 2015, 2022],
  'all'),
 ('all',
  'all',
  'all',
  ['6.1 - Autres émissions directes',
   '5.4 - Investissements',
   '1.1 - Émissions directes des sources fixes de combustion',
   '1.2 - Émissions directes des sources mobiles de combustion',
   '3.2 - Transport de marchandise aval',
   '4.1 - Achat de biens',
   '4.5 - Achat de services',
   '4.2 - Immobilisation de biens',
   '3.4 - Déplacements des visiteurs et des clients',
   '5.1 - Utilisation des produits vendus',
   '4.3 - Gestion des déchets',
   '1.3 - Émissions directes des procédés hors énergie'],
  'all',
  'all'),
 ('all',
  'all',
  'all',
  ['6.1 - Autres émissions directes',
   '5.4 - Investissements',


In [16]:
list(itertools.product([(k,v) for (k,l) in o.items() for v in l]))

[(('type_structure', 'all'),),
 (('type_structure', ['Collectivité territoriale (dont EPCI)']),),
 (('type_structure',
   ['Association', 'Collectivité territoriale (dont EPCI)']),),
 (('secteur_activite', 'all'),),
 (('secteur_activite',
   ["Commerce ; réparation d'automobiles et de motocycles"]),),
 (('secteur_activite',
   ['Agriculture, sylviculture et pêche',
    'Industrie manufacturière',
    'Activités extra-territoriales',
    "Production et distribution d'eau ; assainissement, gestion des déchets et dépollution",
    'Construction',
    "Commerce ; réparation d'automobiles et de motocycles",
    'Activités de services administratifs et de soutien',
    'Arts, spectacles et activités récréatives',
    'Administration publique',
    'Transports et entreposage']),),
 (('category_emissions', 'all'),),
 (('category_emissions', ['2 - Énergie']),),
 (('category_emissions',
   ['6 - Autres émissions indirectes',
    '2 - Énergie',
    '1 - Émissions directes']),),
 (('poste_emission