## Category

In [1]:
import pandas as pd
from IPython.display import Markdown as md

# Project helpers: `wrapper_engine` is called below to build `engine`, and
# `get_value_from_json_serialization` is used by later cells on the "value" column.
from utils.utils import wrapper_engine, get_value_from_json_serialization
# NOTE(review): 'config.ini' is presumably resolved relative to the working directory
# and holds the database connection settings - confirm against utils.utils.
engine = wrapper_engine('config.ini')

from utils import custom_settings
from itables import show
# Apply the project's itables settings before any table is rendered with `show`.
custom_settings.apply_itable_custom_settings()

# Load data: read the whole public.package_extra table, then keep only the
# rows whose key is "category" (as an independent copy of the selection).
with engine.connect() as connection:
    package_extra = pd.read_sql_table("package_extra", connection, schema="public")

is_category_row = package_extra["key"] == "category"
category = package_extra.loc[is_category_row].copy()

# Summary statistics of the (package_id, category) pairs, rendered with itables.
category_summary = (
    category[["package_id", "value"]]
    .rename(columns={"value": "category"})
    .describe()
)
show(category_summary)

Unnamed: 0,package_id,category
Loading... (need help?),,


In [2]:
# For every row, extract the values stored under "uri" in the serialized
# category field, then count how many were found.
category["cateogries_uri_list"] = category["value"].apply(
    lambda serialized: get_value_from_json_serialization(serialized, key="uri")
)
category["nb_categories"] = category["cateogries_uri_list"].apply(len)

# Number of categories per dataset
import plotly
import plotly.express as px
plotly.offline.init_notebook_mode(connected=True)

fig = px.histogram(
    category,
    x="nb_categories",
    template="seaborn",
    labels={"nb_categories": "Nombre de categories par dataset"},
    color_discrete_sequence=['#000091'],
)
fig.update_layout(yaxis_title="Nombre de datasets", bargap=0.2)
fig.update_traces(opacity=0.8)
plotly.offline.iplot(fig)

In [3]:
# Spread each row's list of category URIs over one column per list position.
# The number of columns comes from the widest list (the width of the expanded
# frame), not from the number of distinct list lengths: those two only agree
# when the lengths are exactly 1..n without gaps (lengths {1, 3} need three
# columns, and lengths {0, 1, 2} need two).
expanded_categories = pd.DataFrame(
    category["cateogries_uri_list"].tolist(), index=category.index
)
columns = ["category_" + str(i) for i in range(expanded_categories.shape[1])]
expanded_categories.columns = columns

# Assign column by column so this works whether or not the columns already
# exist (e.g. when the cell is re-run).
for column in columns:
    category[column] = expanded_categories[column]

# Long format: one row per (package_id, category URI). Shorter lists are padded
# with missing values by the DataFrame constructor; dropna() removes them.
melted_categories = pd.melt(
    category,
    id_vars="package_id",
    value_vars=columns,
    value_name="category"
).dropna().drop(columns="variable")

In [6]:
# Theme vocabulary reference:
# https://github.com/ecolabdata/ckanext-dsfr/blob/main/ckanext/theme_design_system_fr/public/js/ecosphere/vocabularies/themes_jsonld.json
# Reading table public.ecospheres_theme_hierarchy fails with a user-rights error.
# Instead, ckanext-ecospheres is imported from the external folder and its
# VocabularyIndex class is called to read the values directly from the register.

import sys
sys.path.append('external/ckanext-ecospheres')
from ckanext.ecospheres.vocabulary.index import VocabularyIndex

ecospheres_themes = VocabularyIndex.load("ecospheres_theme")

# Lookup: "child" value -> "parent" value of the theme hierarchy.
hierarchy_themes = pd.DataFrame(ecospheres_themes.data.hierarchy)
hierachy_themes_dict = {
    child: parent
    for child, parent in zip(hierarchy_themes["child"], hierarchy_themes["parent"])
}

# Lookup: "uri" value -> "label" value.
label_themes = pd.DataFrame(ecospheres_themes.data.label)
label_themes_dict = {
    uri: label
    for uri, label in zip(label_themes["uri"], label_themes["label"])
}


In [7]:
def find_parent(uri:str) -> "str | None":
    """Look up ``uri`` in ``hierachy_themes_dict`` and return the mapped parent.

    Parameters
    ----------
    uri : str
        Key to look up in the module-level ``hierachy_themes_dict`` mapping.

    Returns
    -------
    The value stored for ``uri``, or ``None`` when ``uri`` is not a key of the
    mapping. In that case a "No parent for uri" message is printed.

    Notes
    -----
    Only a failed lookup is reported as "no parent": ``KeyError`` for a missing
    key, ``TypeError`` for an unhashable value. Any other error propagates.
    In particular a ``NameError`` (the mapping has not been built because its
    cell was not run) is no longer swallowed and turned into ``None`` for
    every row.
    """
    try:
        return hierachy_themes_dict[uri]
    except (KeyError, TypeError):
        print("No parent for uri: ", uri)
        return None
    
# Attach the parent to every category, discard the rows for which no parent
# was found, then count the rows per (parent, category) pair.
melted_categories["parent"] = melted_categories["category"].apply(find_parent)
melted_categories = (
    melted_categories
    .dropna(subset=["parent"])
    .groupby(by=["parent", "category"], as_index=False)
    .count()
)

No parent for uri:  http://registre.data.developpement-durable.gouv.fr/ecospheres/themes-ecospheres/mer-et-littoral
No parent for uri:  http://registre.data.developpement-durable.gouv.fr/ecospheres/themes-ecospheres/mer-et-littoral
No parent for uri:  http://registre.data.developpement-durable.gouv.fr/ecospheres/themes-ecospheres/mer-et-littoral
No parent for uri:  http://registre.data.developpement-durable.gouv.fr/ecospheres/themes-ecospheres/reseaux-d-energie-et-de-communication
No parent for uri:  http://registre.data.developpement-durable.gouv.fr/ecospheres/themes-ecospheres/eau
No parent for uri:  http://registre.data.developpement-durable.gouv.fr/ecospheres/themes-ecospheres/reseaux-d-energie-et-de-communication
No parent for uri:  http://registre.data.developpement-durable.gouv.fr/ecospheres/themes-ecospheres/reseaux-d-energie-et-de-communication
No parent for uri:  http://registre.data.developpement-durable.gouv.fr/ecospheres/themes-ecospheres/reseaux-d-energie-et-de-communicat

In [8]:
def find_label(uri:str) -> "str | None":
    """Look up ``uri`` in ``label_themes_dict`` and return the mapped label.

    Parameters
    ----------
    uri : str
        Key to look up in the module-level ``label_themes_dict`` mapping.

    Returns
    -------
    The value stored for ``uri``, or ``None`` when ``uri`` is not a key of the
    mapping. In that case a "No label found for uri" message is printed.

    Notes
    -----
    Only a failed lookup is reported as "no label": ``KeyError`` for a missing
    key, ``TypeError`` for an unhashable value. Any other error propagates.
    In particular a ``NameError`` (the mapping has not been built because its
    cell was not run) is no longer swallowed and turned into ``None`` for
    every row.
    """
    try:
        return label_themes_dict[uri]
    except (KeyError, TypeError):
        print("No label found for uri: ", uri)
        return None
    
# Resolve the label of the parent first, then of the category itself.
for label_column, uri_column in (("label_parent", "parent"), ("label_child", "category")):
    melted_categories[label_column] = melted_categories[uri_column].apply(find_label)

In [9]:
# Rows appended by an earlier run of this cell carry no "parent" URI, whereas
# every counted row has one (rows without a parent were dropped before the
# group-by). Filtering on it makes the cell idempotent: re-running it no longer
# stacks a second set of totals on top of the first and double-counts them.
leaf_counts = melted_categories[melted_categories["parent"].notna()]

# Total count per parent label, in order of first appearance.
parents = leaf_counts["label_parent"].unique().tolist()
parent_sum = [
    leaf_counts.loc[leaf_counts["label_parent"] == parent_label, "package_id"].sum()
    for parent_label in parents
]

# Synthetic rows: a single root node ("Ecosphères", with an empty parent) that
# holds the grand total, plus one row per parent label attached to that root.
add_parent_counts = pd.DataFrame(
    {
        "label_parent":[""] + ["Ecosphères"]*len(parents),
        "label_child": ["Ecosphères"] + parents,
        "package_id": [leaf_counts["package_id"].sum()] + parent_sum
    })

melted_categories = pd.concat([leaf_counts, add_parent_counts])

In [10]:

plotly.offline.init_notebook_mode(connected=True)

# Treemap of the label hierarchy; with branchvalues="total" the value given
# for a parent node is taken as the total of its branch.
treemap_series = {
    "names": melted_categories["label_child"].tolist(),
    "parents": melted_categories["label_parent"].tolist(),
    "values": melted_categories["package_id"].tolist(),
}
fig = px.treemap(branchvalues="total", **treemap_series)
fig.update_layout(margin={"t": 0, "l": 0, "r": 0, "b": 0})
plotly.offline.iplot(fig)