In [8]:
import io
import json
import os
import pprint
import re  # Import regular expressions
from decimal import Decimal
from urllib.parse import urlencode

import pandas as pd
import requests
from rich import print as rprint

# Endpoint of the GENESIS-Online REST service (2020 interface, data/cube method).
genesis_url = 'https://www-genesis.destatis.de/genesisWS/rest/2020/data/cube?'

# Credentials are taken from the environment so they never live in the notebook
# file or its history. Export GENESIS_USER and GENESIS_PASSWORD before starting
# the kernel.
# NOTE(review): a password was previously committed in this cell in plain text;
# treat it as compromised and rotate it.
user_id = os.environ.get('GENESIS_USER', '')
password = os.environ.get('GENESIS_PASSWORD', '')
if not (user_id and password):
    # Fail loudly but do not abort the import cell; the request cell will
    # presumably be rejected by the API without valid credentials.
    print('GENESIS_USER and/or GENESIS_PASSWORD are not set in the environment.')

In [2]:
# Parameters of the data/cube request. Empty strings are sent as empty
# query values.
cubecode = '61111BM006'
areatype = 'all'
category = 'all'
content = ''
start_year = ''
classifier1 = 'CC13B1'
key1 = 'CC13-77'
lang = 'en'
pagelength = "20"

# NOTE(review): `category`, `lang` and `pagelength` are not used by the request
# built here; the request language is fixed to 'de'. Confirm whether `lang`
# was meant to be sent instead.
query = {
    'username': user_id,
    'password': password,
    'language': 'de',
    'name': cubecode,
    'area': areatype,
    'compress': 'true',
    'contents': content,
    'startyear': start_year,
}

# Optional classifier filter: only sent when a value is configured.
if classifier1 is not None:
    query['classifyingvariable1'] = classifier1
if key1 is not None:
    query['classifyingkey1'] = key1

# urlencode percent-escapes every value, so credentials or keys containing
# characters such as '&', '#', '+' or '/' cannot corrupt the query string.
url = genesis_url + urlencode(query)

In [5]:
# Fetch the cube. A timeout is set because requests waits indefinitely by
# default, which would block the kernel if the server never answers.
response = requests.get(url, timeout=60)
# Surface HTTP-level failures (4xx/5xx) here instead of as a confusing
# JSON/KeyError further down.
response.raise_for_status()

# Parse the JSON body, mapping JSON floats to Decimal.
# Float types are not supported with dynamodb; use Decimal types instead.
# Note: this only affects numbers that are JSON floats; anything embedded in
# the 'Content' string below stays plain text.
parsed_json = json.loads(response.text, parse_float=Decimal)

# 'Content' is one newline-separated string; split it into a list of lines.
data = parsed_json["Object"]['Content'].split('\n')

In [6]:
# Inspect the raw response: `data` is the list of lines obtained by splitting
# the 'Content' string of the response on newlines.
rprint(data)

In [18]:
# Tokens of interest: a run of digits immediately followed by "=100"
info_pattern = re.compile(r'\d+=100')

# Collect every such token from every line, in order of appearance
# (repeats are kept as they occur).
info_matches = [token for raw in data for token in info_pattern.findall(raw)]

# One comma-separated string that gets attached to each kept line
info_matches_str = ', '.join(info_matches)


def _is_period_line(text):
    """True if the line contains 'D', 'e' and one of 'MONAT' / 'QUART'."""
    if 'D' not in text or 'e' not in text:
        return False
    return 'MONAT' in text or 'QUART' in text


# Keep only the matching lines; each gets a space plus the token string appended
filtered_response = [
    ' '.join((raw, info_matches_str))
    for raw in data
    if _is_period_line(raw)
]

# Show what survived the filter
rprint(filtered_response)


In [16]:
def _parse_cube_line(raw_line):
    """Split one ';'-separated line and map its first seven fields to a record.

    Fields 4 and 5 (zero-based) are converted with int() and float(); the
    remaining fields are kept as strings. Extra trailing fields are ignored.
    """
    parts = raw_line.split(';')
    return {
        'field_D': parts[0],
        'field_DG': parts[1],
        'classifyingkey1': parts[2],
        'period': parts[3],
        'year': int(parts[4]),
        'value': float(parts[5]),
        'field_e': parts[6],
    }


# One record per filtered line.
# NOTE: this rebinds `data`, which until now held the raw response lines.
data = list(map(_parse_cube_line, filtered_response))

rprint(data)

In [22]:
# Tabulate the parsed records: each dict becomes one row of the frame
df = pd.DataFrame(data=data)
rprint(df)

In [24]:
# Any 'period' value containing a "Q" marks the series as quarterly;
# otherwise it is treated as monthly.
has_quarters = df['period'].str.contains('Q').any()
period_key = 'quarter' if has_quarters else 'month'

# Skeleton of the output document; 'data' is filled in by a later cell
result = {
    'cubeCode': cubecode,
    'content': content,
    'classifyingVar1': classifier1,
    'classifyingKey1': key1,
    'data': [],
}

In [25]:
# Build one entry per year, each holding its period/value pairs.
# The list is rebuilt and assigned (not appended to) so that re-running this
# cell does not duplicate the year entries already stored in `result`.
result['data'] = [
    {
        'year': year,
        'df': [
            {period_key: row['period'], 'value': row['value']}
            for _, row in group.iterrows()
        ],
    }
    for year, group in df.groupby('year')
]

rprint(json.dumps(result, indent=4))