In [16]:
import io
import json
import os
import warnings
from urllib.parse import urlencode

import pandas as pd
import requests
from rich import print as rprint

# Credentials are read from the environment only. No fallback values are kept
# in the notebook, so a shared or committed copy cannot leak a real account.
USER_ID = os.getenv('DESTATIS_USER_ID', '')
PASSWORD = os.getenv('DESTATIS_PASSWORD', '')
if not (USER_ID and PASSWORD):
    # Warn instead of raising so the remaining cells can still be inspected;
    # an authenticated request is expected to fail until both are set.
    warnings.warn(
        "DESTATIS_USER_ID and/or DESTATIS_PASSWORD are not set; "
        "export both environment variables before querying GENESIS.",
        stacklevel=2,
    )
# Base endpoint; the trailing '?' lets the query string be appended directly.
GENESIS_URL = 'https://www-genesis.destatis.de/genesisWS/rest/2020/data/cube?'


In [6]:
# Request parameters for the cube query.
tab_code, content = '61261BV002', 'VST066'
area_type, lang = 'all', 'en'
start_year = ''  # blank, so the startyear query parameter is sent without a value

# Classifying variables and their keys, paired by position (1..3).
# NOTE(review): key1 repeats the name of classifier1 — confirm this is intended.
classifier1, key1 = 'STEMW1', 'STEMW1'
classifier2, key2 = 'BAUIN2', 'BPI4'
classifier3, key3 = 'BAUIN1', 'BPNB525'

In [9]:
# Build the request URL with up to three classifiers.
# urlencode percent-escapes every value, so reserved characters such as
# '&', '#', '+' or '/' in the password or in a key cannot corrupt the query.
query_params = {
    'username': USER_ID,
    'password': PASSWORD,
    'name': tab_code,
    'area': area_type,
    'compress': 'true',
    'contents': content,
    'startyear': start_year,
    'language': lang,
    'classifyingvariable1': classifier1,
    'classifyingkey1': key1,
}
# Classifiers 2 and 3 are optional. A pair is sent only when both names are
# defined AND non-empty, so blanking either one in the parameter cell disables it.
for slot in (2, 3):
    variable = globals().get(f'classifier{slot}')
    key = globals().get(f'key{slot}')
    if variable and key:
        query_params[f'classifyingvariable{slot}'] = variable
        query_params[f'classifyingkey{slot}'] = key

# GENESIS_URL already ends with '?', so the encoded query is appended as-is.
url = GENESIS_URL + urlencode(query_params)

In [12]:
# Fetch the cube and split its text payload into lines.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() reports HTTP errors here rather than as a confusing
# JSON decoding error or KeyError on the next line.
response = requests.get(url, timeout=60)
response.raise_for_status()
payload = response.json()
data = payload["Object"]['Content'].split('\n')

In [21]:
# Uncomment to inspect the raw response lines:
# rprint(data)

In [22]:
# Use one line of the response as a template for the expected column count.
# Index 15 is specific to the layout of this table's response. In the recorded
# run that line was the 'K;QEI;...' row.
# NOTE(review): the CSV-reading cell slices from index 14, not 15 — confirm
# which offset is intended.
SAMPLE_LINE_INDEX = 15
if len(data) <= SAMPLE_LINE_INDEX:
    # Same exception type as the bare index lookup, with an actionable message.
    raise IndexError(
        f"Response has only {len(data)} line(s); expected more than "
        f"{SAMPLE_LINE_INDEX}. Check the request parameters and credentials."
    )
sample_data = data[SAMPLE_LINE_INDEX]
num_columns = len(sample_data.split(';'))

In [23]:
# Derive the column names: the trailing columns are fixed, and however many
# columns come before them are labelled classifyingkey1..N.
base_header = [
    'field_D', 'field_DFDN', 'quarter', 'year',
    'value', 'field_e', 'field_', 'field__',
]
expected_fields = num_columns - len(base_header)  # number of leading classifyingkey columns
header = ';'.join(
    [f'classifyingkey{position}' for position in range(1, expected_fields + 1)]
    + base_header
)


In [24]:
# Debug aid: show the generated header next to the sampled line so that
# column misalignment is easy to spot.
print(f"Header: {header}")
print(f"Sample Data Line: {sample_data}")

Header: classifyingkey1;classifyingkey2;field_D;field_DFDN;quarter;year;value;field_e;field_;field__
Sample Data Line: K;QEI;FACH-SCHL;FACH-SCHL;FACH-SCHL;FACH-SCHL;FACH-SCHL;ZI-WERT;WERT;QUALITAET


In [28]:
# Parse the response lines (from index 14 onward) as ';'-separated CSV,
# with the generated header placed in front as the first line.
# NOTE(review): the sample line above is taken from index 15 while this slice
# starts at 14 — confirm both offsets are intended.
df = pd.read_csv(
    io.StringIO('\n'.join((header, '\n'.join(data[14:])))),
    sep=';',
)

In [34]:
def _to_native(value):
    """Return the built-in Python equivalent of a NumPy scalar.

    Values without an ``item()`` method (str, int, float, None, ...) are
    returned unchanged.
    """
    item = getattr(value, 'item', None)
    return item() if callable(item) else value


def transform_data(df):
    """Reshape the parsed rows into a nested, JSON-serialisable structure.

    Args:
        df: DataFrame with at least 'year', 'quarter' and 'value' columns.

    Returns:
        A dict holding the notebook-level ``tab_code``, ``content``,
        ``classifier1`` and ``key1`` values plus ``'data'``: a list, ascending
        by year, of ``{'year': ..., 'df': [{'quarter': ..., 'value': ...}]}``
        with one entry per distinct quarter (ascending). When a
        (year, quarter) pair occurs more than once, the first row in ``df``
        order is kept.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    result = {
        'tab_code': tab_code,
        'content': content,
        'classifier1': classifier1,
        'key1': key1,
        'data': []
    }

    for year, year_rows in df.groupby('year'):
        quarters = []
        # groupby already orders the quarter keys and keeps the original row
        # order inside each group, so no extra sort is needed and the "first"
        # duplicate is deterministic.
        for _, quarter_rows in year_rows.groupby('quarter'):
            # Read per column (not via a whole-row Series) so integers are not
            # upcast, then convert NumPy scalars so json.dumps accepts them.
            quarters.append({
                'quarter': _to_native(quarter_rows['quarter'].iloc[0]),
                'value': _to_native(quarter_rows['value'].iloc[0]),
            })

        result['data'].append({'year': _to_native(year), 'df': quarters})

    return result

# Reshape the parsed rows into the nested per-year / per-quarter structure
transformed_data = transform_data(df=df)



In [35]:
# Render the nested result as JSON text indented by four spaces
rprint(json.dumps(obj=transformed_data, indent=4))
