# Introduction

This notebook explores the metadata of the datasets we are interested in using. The metadata will serve as the basis for the data queries and will support the data analysis.

# Involved datasets

We intend to analyse some facts about energy in the EU. For that we intend to use:


|Dataset_id|Name  |Url |
|--|--|-- |
| demo_pjan | Population on 1 January by age and sex | https://ec.europa.eu/eurostat/web/products-datasets/-/DEMO_PJAN|
| nrg_bal_s| Simplified Energy Balances | https://ec.europa.eu/eurostat/databrowser/view/nrg_bal_s|

In [None]:
from pysdmx.api.qb.service import RestService
from pysdmx.api.qb.structure import StructureQuery, StructureFormat, StructureType, StructureReference, StructureDetail
from pysdmx.api.qb.data import DataQuery, DataFormat
from pysdmx.api.qb.util import ApiVersion
from pysdmx.io import read_sdmx, get_datasets

# Root URL of the Eurostat SDMX 2.1 dissemination web service; the URL helpers
# below append the structure/data query path to it.
api_endpoint='https://ec.europa.eu/eurostat/api/dissemination/sdmx/2.1'

# Disabled alternative: configure a pysdmx RestService client for the same
# endpoint instead of building the URLs by hand.
# estat_webservice = RestService(
#     api_endpoint='https://ec.europa.eu/eurostat/api/dissemination/sdmx/2.1',
#     api_version= ApiVersion.V1_4_0,
#     structure_format= StructureFormat.SDMX_ML_2_1_STRUCTURE
#     )

def get_metadata_url(resource_id):
    """Build the URL of the structure query for a dataflow.

    The query asks for the dataflow identified by ``resource_id`` together
    with its descendants, at full detail, and is rendered for API version
    ``V1_4_0``. The resulting path is appended to the module-level
    ``api_endpoint``.

    Args:
        resource_id: Identifier of the dataflow, e.g. ``'demo_pjan'``.

    Returns:
        The complete request URL as a string.
    """
    dataflow_query = StructureQuery(
        artefact_type=StructureType.DATAFLOW,
        resource_id=resource_id,
        references=StructureReference.DESCENDANTS,
        detail=StructureDetail.FULL,
    )
    url_path = dataflow_query.get_url(ApiVersion.V1_4_0)
    return api_endpoint + url_path

def get_data_url(resource_id, data_selection=None, start_period=None, end_period=None, base_url=None):
    """Build the URL of an SDMX-CSV data request for a dataset.

    Args:
        resource_id: Identifier of the dataset, e.g. ``'demo_pjan'``.
        data_selection: Key placed after the dataset id in the URL path.
            Falsy values (``None``, ``''``) select ``'all'``.
        start_period: Value of the ``startPeriod`` query parameter; the
            parameter is omitted when the value is falsy.
        end_period: Value of the ``endPeriod`` query parameter; the
            parameter is omitted when the value is falsy.
        base_url: Root URL of the SDMX web service. When ``None`` (the
            default) the module-level ``api_endpoint`` is used, so data and
            metadata requests always target the same service.

    Returns:
        The complete request URL as a string.
    """
    # Resolve the service root lazily so a single constant drives every URL.
    root = api_endpoint if base_url is None else base_url
    selection = data_selection or 'all'

    query = 'format=SDMX-CSV'
    if start_period:
        query += f'&startPeriod={start_period}'
    if end_period:
        query += f'&endPeriod={end_period}'

    # Tolerate a trailing slash on the root so the path never contains '//'.
    return f"{root.rstrip('/')}/data/{resource_id}/{selection}?{query}"

def get_eurostat_dataset(resource_id, data_selection=None, start_period=None, end_period=None):
    """Download a dataset and attach its dataflow as the dataset structure.

    Two requests are read with ``read_sdmx``: the structure query built by
    ``get_metadata_url`` and the data query built by ``get_data_url``. The
    first dataflow of the structure message replaces the ``structure``
    attribute of the first dataset of the data message.

    Args:
        resource_id: Identifier of the dataset, e.g. ``'demo_pjan'``.
        data_selection: Forwarded to ``get_data_url``.
        start_period: Forwarded to ``get_data_url``.
        end_period: Forwarded to ``get_data_url``.

    Returns:
        The first dataset of the data message, with ``structure`` set to the
        first dataflow found in the structure message.
    """
    structure_message = read_sdmx(get_metadata_url(resource_id))
    data_message = read_sdmx(
        get_data_url(resource_id, data_selection, start_period, end_period)
    )

    # Only the first dataflow returned by the structure query is used.
    dataflows = structure_message.get_dataflows()
    first_dataflow_key = list(dataflows.keys())[0]
    dataflow = dataflows[first_dataflow_key]

    dataset = data_message.get_datasets()[0]
    dataset.structure = dataflow
    return dataset


In [105]:
def get_dimensions(metadata):
    """Return the dimension ids of the first data structure definition.

    Args:
        metadata: Object whose ``get_data_structure_definitions()`` returns a
            mapping of data structure definitions.

    Returns:
        A dict mapping a 1-based position to the concept id of each component
        whose role renders as ``'Role.DIMENSION'``, in component order.
    """
    all_dsds = metadata.get_data_structure_definitions()
    first_key = list(all_dsds.keys())[0]

    dimension_ids = [
        comp.concept.id
        for comp in all_dsds[first_key].components
        if str(comp.role) == 'Role.DIMENSION'
    ]
    # Positions are numbered from 1, counting dimensions only.
    return dict(enumerate(dimension_ids, start=1))


def get_summary_metadata(metadata):
    """Summarise the first dataflow and data structure definition of a message.

    Args:
        metadata: Object exposing ``get_data_structure_definitions()`` and
            ``get_dataflows()``, each returning a mapping.

    Returns:
        A dict with the keys ``dataflow_id``, ``dataflow_name``, ``dsd_id``,
        ``dsd_name`` and ``dimensions``. ``dimensions`` maps each dimension's
        concept id to its concept name and to an ``enumeration`` dict of
        ``{code id: code name}`` (empty when the component has no enumeration).
    """
    dsd_map = metadata.get_data_structure_definitions()
    first_dsd_key = list(dsd_map.keys())[0]
    dsd = dsd_map[first_dsd_key]

    flow_map = metadata.get_dataflows()
    first_flow_key = list(flow_map.keys())[0]
    dataflow = flow_map[first_flow_key]

    summary = {
        'dataflow_id': dataflow.id,
        'dataflow_name': dataflow.name,
        'dsd_id': dsd.id,
        'dsd_name': dsd.name,
    }

    dimensions = {}
    for comp in dsd.components:
        # Measures and attributes are not part of the summary.
        if str(comp.role) != 'Role.DIMENSION':
            continue
        codelist = comp.enumeration
        codes = {} if codelist is None else {code.id: code.name for code in codelist.codes}
        dimensions[comp.concept.id] = {
            'name': comp.concept.name,
            'enumeration': codes,
        }

    summary['dimensions'] = dimensions
    return summary

def get_summary_dataset(dataset):
    """Summarise a dataset's declared dimensions and the values it really uses.

    The dataset's ``structure`` is inspected by class name: for a ``Dataflow``
    the data structure definition is taken from its ``structure`` attribute,
    for a ``DataStructureDefinition`` the structure itself is used.

    Args:
        dataset: Object with a ``structure`` attribute (exposing ``id`` and
            ``name``) and a ``data`` attribute offering ``columns`` and
            per-column ``unique()``.

    Returns:
        A dict with the keys ``structure_type``, ``structure_id``,
        ``structure_name``, ``dsd_dimensions`` and
        ``data_effective_structure_summary``. ``dsd_dimensions`` maps each
        dimension's concept id to its name and full ``{code id: code name}``
        enumeration. ``data_effective_structure_summary`` holds, for every
        data column that is a declared dimension, its code, name and the list
        of distinct values present in the data.

    Raises:
        TypeError: If ``dataset.structure`` is neither a ``Dataflow`` nor a
            ``DataStructureDefinition`` (for example a bare string reference),
            since no component list can be located for it.
    """
    structure = dataset.structure
    structure_type = structure.__class__.__name__

    # Resolve the DSD first so an unsupported structure fails with a clear
    # message instead of an UnboundLocalError further down.
    if structure_type == 'Dataflow':
        dsd = structure.structure
    elif structure_type == 'DataStructureDefinition':
        dsd = structure
    else:
        raise TypeError(
            f"Unsupported dataset structure type {structure_type!r}; "
            "expected a Dataflow or a DataStructureDefinition."
        )

    summary = {
        'structure_type': structure_type,
        'structure_id': structure.id,
        'structure_name': structure.name,
    }

    dsd_dimensions = {}
    for component in dsd.components:
        if str(component.role) != 'Role.DIMENSION':
            continue
        codelist = component.enumeration
        enumeration = {} if codelist is None else {code.id: code.name for code in codelist.codes}
        dsd_dimensions[component.concept.id] = {
            'name': component.concept.name,
            'enumeration': enumeration,
        }
    summary['dsd_dimensions'] = dsd_dimensions

    # Only columns that match a declared dimension are reported; the values
    # listed are those actually present in the data, not the whole codelist.
    effective_structure = {}
    for column in dataset.data.columns:
        if column in dsd_dimensions:
            effective_structure[column] = {
                'code': column,
                'name': dsd_dimensions[column]['name'],
                'enumeration': list(dataset.data[column].unique()),
            }
    summary['data_effective_structure_summary'] = effective_structure
    return summary


We need to understand the dataset before using it.
We should do this by seeing the data structure (i.e., the columns and the possible values for each of them).
In principle this is what the DSD provides, but the codelists may be too broad.

**TODO:** Check whether constraints are used in this case.

Anyhow, not all agencies follow the same practices, so an effective way to check the effective structure is to retrieve a subset of the data.

In [107]:
# Earlier approaches, kept for reference: read the metadata separately, or let
# pysdmx's get_datasets combine data and structure without validation.
# demo_pjan_metadata = read_sdmx(get_metadata_url('demo_pjan'))
# demo_pjan = get_datasets(data= get_data_url('demo_pjan', start_period='2023') , structure=get_metadata_url('demo_pjan'), validate=False)
# Fetch only a subset (startPeriod=2023) — enough to inspect which codes the
# data really uses; the dataflow metadata is attached as the dataset structure.
demo_pjan = get_eurostat_dataset('demo_pjan', start_period='2023')

In [108]:
# Display the declared dimensions next to the values present in the subset.
get_summary_dataset(demo_pjan)

{'structure_type': 'Dataflow',
 'structure_id': 'DEMO_PJAN',
 'structure_name': 'Population on 1 January by age and sex',
 'dsd_dimensions': {'freq': {'name': 'Time frequency',
   'enumeration': {'P': 'Pluri-annual',
    'A': 'Annual',
    'S': 'Half-yearly, semesterly',
    'Q': 'Quarterly',
    'M': 'Monthly',
    'W': 'Weekly',
    'B': 'Daily - business week',
    'D': 'Daily',
    'H': 'Hourly',
    'I': 'Irregular / A-periodic',
    'NAP': 'Not applicable'}},
  'unit': {'name': 'Unit of measure',
   'enumeration': {'TOTAL': 'Total',
    'NR': 'Number',
    'NR_HAB': 'Number per inhabitant',
    'THS': 'Thousand',
    'MIO': 'Million',
    'BN': 'Billion',
    'CT': 'Euro cent',
    'EUR': 'Euro',
    'THS_EUR': 'Thousand euro',
    'MIO_EUR': 'Million euro',
    'BN_EUR': 'Billion euro',
    'MEUR_KP21': 'Million euro (at constant 2021 prices)',
    'MEUR_KP15': 'Million euro (at constant 2015 prices)',
    'MEUR_KP11': 'Million euro (at constant 2011 prices)',
    'MEUR_KP10': '

With this summary we can see what we are interested in. We can create a dictionary that we will use later to query the right data.

We leave the countries open, since this is something we want to modify later during the data analysis:

In [None]:
# Fixed dimension values for the demo_pjan query; dimensions not listed here
# are left unconstrained.
demo_pjan_constraints={
    'age': 'TOTAL',
    'sex': 'T',
    'unit': 'NR',
    }

In [113]:
# Same approach as for demo_pjan: fetch a subset (startPeriod=2023) with the
# dataflow metadata attached as the dataset structure.
nrg_bal_s = get_eurostat_dataset('nrg_bal_s', start_period='2023')

In [116]:
# Display the declared dimensions next to the values present in the subset.
get_summary_dataset(nrg_bal_s)

{'structure_type': 'Dataflow',
 'structure_id': 'NRG_BAL_S',
 'structure_name': 'Simplified energy balances',
 'dsd_dimensions': {'freq': {'name': 'Time frequency',
   'enumeration': {'P': 'Pluri-annual',
    'A': 'Annual',
    'S': 'Half-yearly, semesterly',
    'Q': 'Quarterly',
    'M': 'Monthly',
    'W': 'Weekly',
    'B': 'Daily - business week',
    'D': 'Daily',
    'H': 'Hourly',
    'I': 'Irregular / A-periodic',
    'NAP': 'Not applicable'}},
  'nrg_bal': {'name': 'Energy balance',
   'enumeration': {'TOTAL': 'Total',
    'PRD': 'Production',
    'PPRD': 'Primary production',
    'PPRD_RED': 'Primary production - Renewable Energy Directive',
    'IPRD': 'Indigenous production',
    'IPRD_AG': 'Indigenous production - associated gas',
    'IPRD_NAG': 'Indigenous production - non-associated gas',
    'IPRD_CG': 'Indigenous production - colliery gas',
    'IPRD_SB': 'Indigenous production - solid biofuels',
    'IPRD_LB': 'Indigenous production - liquid biofuels',
    'IPRD_GB'

In [None]:
# Named groups of codes to use when querying nrg_bal_s. Bound to a variable so
# later cells can reference the groups by name instead of relying on `_`/Out[N].
# NOTE(review): the 'nrg_bal_siec_*' groups look like energy-product (siec)
# codes rather than nrg_bal codes — confirm against the dataset summary.
nrg_bal_s_code_groups = {
    'nrg_bal_total_consumption': ['FC_E'],
    'nrg_bal_consumption_basic': ['FC_IND_E','FC_TRA_E','FC_OTH_E'],
    'nrg_bal_consumption_industry_full': [
        'FC_IND_CON_E','FC_IND_CPC_E','FC_IND_FBT_E','FC_IND_IS_E','FC_IND_MAC_E','FC_IND_MQ_E',
        'FC_IND_NFM_E','FC_IND_NMM_E','FC_IND_NSP_E','FC_IND_PPP_E','FC_IND_TE_E','FC_IND_TL_E','FC_IND_WP_E'],
    'nrg_bal_consumption_other_full': ['FC_OTH_AF_E', 'FC_OTH_CP_E', 'FC_OTH_FISH_E', 'FC_OTH_HH_E', 'FC_OTH_NSP_E'],
    'nrg_bal_consumption_transport_full': ['FC_TRA_DAVI_E', 'FC_TRA_DNAVI_E', 'FC_TRA_NSP_E', 'FC_TRA_PIPE_E',
                                           'FC_TRA_RAIL_E', 'FC_TRA_ROAD_E',],
    'nrg_bal_siec_total': ['TOTAL'],
    'nrg_bal_siec_basic':[
    'P1000', 'S2000', 'G3000', 'E7000', 'H8000'
]}
# Keep the cell output: show the groups as the bare literal did before.
nrg_bal_s_code_groups