# Imports

In [1]:
import pandas as pd
from helpers import get_meta_info, read_specific_data, get_jahr_abrechnung, fill_down, clean_ebene_4, starts_with_roman_numeral, no_summe_after_roman, write_to_db
import logging.config
import warnings

# Silence pandas' SettingWithCopyWarning for this notebook.
# The class is referenced through the public `pandas.errors` namespace rather
# than the private `pandas.core.generic` module, which only exposes the name
# as a side effect of its own imports.
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

# Configure logging using the external configuration file
logging.config.fileConfig('logging_config.ini')

# Logger used by the cells below
logger = logging.getLogger("mainLogger")


# Definitions

In [2]:
# Worksheet and workbook processed by the cells below.
sheet_name = 'B_KIGA'
file_name = "JA 2021 MG_2.xls"

# get_meta_info returns three values; only the first (the workbook path) is
# kept. NOTE(review): the meaning of the two `1` arguments is defined in
# helpers.get_meta_info and is not visible here.
file_path, _, _ = get_meta_info(file_name, 1, 1)

# Code

In [19]:
# Build `all_df`: one cleaned frame per (level-2 section, year), concatenated.
#
# Steps:
#   1. Read column C of the sheet and tag every row with its level-1 section
#      ('A. AUFWAND' / 'B. ERTRÄGE').
#   2. Keep the rows whose label starts with a roman numeral; they mark the
#      start of a level-2 section. Derive each section's start/end row.
#   3. For every section and every year, read the detail rows, normalise the
#      column names, clean them and collect the result.

skip_to = 10
# end_row assigned to the last section, which has no following section to
# derive it from. The original note said the last row stops at 158.
# NOTE(review): presumably read_specific_data treats end_row as exclusive - confirm.
last_end_row = 159

file_path, _, _ = get_meta_info(file_name, 1, 1)
try:
    # Row 0 stays as the header; rows 1 .. skip_to-1 are skipped.
    df = pd.read_excel(
        file_path,
        sheet_name=sheet_name,
        usecols="C",
        skiprows=range(1, skip_to),
        nrows=1000,
    )
except ValueError:
    # logger.exception keeps the traceback in the log before re-raising.
    logger.exception("Could not read sheet %r from %s", sheet_name, file_path)
    raise


marker_a = 'A. AUFWAND'
marker_b = 'B. ERTRÄGE'

# Only one column was read, so address it by position instead of repeating
# its header text.
label_col = df.columns[0]

# Carry the most recently seen marker down the rows; rows above the first
# marker get an empty string.
current_ebene_1 = ''
ebene_1_labels = []
for value in df[label_col]:
    if value in (marker_a, marker_b):
        current_ebene_1 = value
    ebene_1_labels.append(current_ebene_1)
df['ebene_1'] = ebene_1_labels


# Filter rows based on their numbering: keep labels starting with a roman numeral.
is_section_start = df[label_col].apply(
    lambda x: bool(isinstance(x, str) and starts_with_roman_numeral(x))
).astype(bool)
# Explicit copy: the frame is modified below and must not be a view of `df`.
desired_rows = df.loc[is_section_start].copy()

# Convert the positional index of the read frame into the row number that is
# handed to read_specific_data.
row_numbers = desired_rows.index + skip_to + 1
desired_rows = desired_rows.reset_index(drop=True)
desired_rows['start_row'] = row_numbers
desired_rows = desired_rows.rename(columns={label_col: 'ebene_2'})

# A section ends one row before the next section starts; the last section
# has no successor and falls back to last_end_row.
desired_rows['end_row'] = (
    desired_rows['start_row'].shift(-1) - 1
).fillna(last_end_row)


# Excel columns to read per year: C, F and G are shared, D/E differ by year.
usecols_by_year = {2020: "C,D,F, G", 2021: "C,E,F, G"}

section_frames = []
ebene_3 = ''

for idx, row in desired_rows.iterrows():
    ebene_1 = row['ebene_1']
    ebene_2 = row['ebene_2']
    start_row = row['start_row']
    end_row = row['end_row']

    for year, usecols in usecols_by_year.items():
        df_ebene_2 = read_specific_data(
            file_name, start_row, end_row, sheet_name, usecols,
            ebene_1, ebene_2, ebene_3,
        )

        df_ebene_2 = df_ebene_2.rename(columns={df_ebene_2.columns[0]: 'ebene_4'})
        df_ebene_2['ebene_3'] = df_ebene_2['ebene_4']

        # Workaround for a quirk in the workbook (see rows 99 / 100 in the
        # Excel sheet): drop rows whose label is 0.
        df_ebene_2 = df_ebene_2[df_ebene_2['ebene_3'] != 0].reset_index(drop=True)

        df_ebene_2['year'] = year
        df_ebene_2['Typ'] = "TBD"

        # Rename columns 1-3 by position. The Index is replaced as a whole
        # because writing into `columns.values` mutates an immutable Index.
        new_columns = list(df_ebene_2.columns)
        new_columns[1] = "kennzahl"
        new_columns[2] = "Abweichung"
        new_columns[3] = "Kommentar"
        df_ebene_2.columns = new_columns

        # Clean and collect inside the year loop so that the data of every
        # year is kept, not just the last one.
        # NOTE: this rebinds `df`; the raw column read above is no longer
        # needed at this point.
        df = fill_down(df_ebene_2)
        df = clean_ebene_4(df)
        section_frames.append(df)

# Concatenate once at the end; fall back to an empty frame when no section
# was found.
if section_frames:
    all_df = pd.concat(section_frames, ignore_index=True)
else:
    all_df = pd.DataFrame()

Unnamed: 0,ebene_2,ebene_1,start_row,end_row
0,I. PERSONALAUFWAND 1),A. AUFWAND,12,53.0
1,I. SUMME - PERSONALAUFWAND,A. AUFWAND,54,54.0
2,II. SACHAUFWAND,A. AUFWAND,55,97.0
3,II. SUMME - SACHAUFWAND,A. AUFWAND,98,99.0
4,III. RÜCKFORDERUNGEN DER MA 10 8),A. AUFWAND,100,103.0
5,III. SUMME - RÜCKFORDERUNGEN DER MA 10,A. AUFWAND,104,106.0
6,I. SUMME - Personalaufwand,A. AUFWAND,107,107.0
7,II. SUMME - Sachaufwand,A. AUFWAND,108,108.0
8,III. SUMME - Rückforderungen der MA 10,A. AUFWAND,109,113.0
9,I. BETRIEBLICHE ERTRÄGE,B. ERTRÄGE,114,135.0
