In [1]:
import pandas as pd
import re
import numpy as np

In [2]:
# Load the source workbook; with no other arguments pandas reads the first sheet.
df = pd.read_excel(io='output-1.xlsx')

In [3]:
def find_matches(pattern, frame=None):
    """Locate every cell whose text contains a match for ``pattern``.

    Cells are scanned column by column, top to bottom, and each cell is
    converted with ``str()`` before matching. Positions are zero-based
    offsets into the frame's body (the header row is not searched).

    Args:
        pattern: Regular expression passed to ``re.findall``.
        frame: DataFrame to search. When omitted, the notebook-level ``df``
            is used, which keeps existing ``find_matches(pattern)`` calls working.

    Returns:
        DataFrame with columns ``value``, ``row`` and ``col`` - one line per
        distinct matched value. The columns are present even when nothing
        matches, so callers can index them without a ``KeyError``.

    Note:
        Results are keyed by the matched text: if the same text occurs in
        several cells, only the position of the last one scanned is kept.
    """
    if frame is None:
        frame = df  # fall back to the DataFrame loaded at notebook level

    regex = re.compile(pattern)  # compile once instead of once per cell
    result_dict = {}

    # Iterate by position so duplicate column labels cannot turn a column
    # lookup into a multi-column DataFrame.
    for col_index in range(frame.shape[1]):
        for row_index, value in enumerate(frame.iloc[:, col_index]):
            for match in regex.findall(str(value)):
                result_dict[match] = {'value': match, 'row': row_index, 'col': col_index}

    return pd.DataFrame(list(result_dict.values()), columns=['value', 'row', 'col'])


def convert_to_float(value):
    """Convert a cell value to ``float``; return ``None`` when it is not a number.

    Strings are parsed after removing all whitespace (ordinary spaces as well
    as non-breaking / narrow spaces used as thousands separators) and
    replacing a comma with a dot as the decimal separator.

    Args:
        value: Cell content of any type (string, number, ``None``, NaN, ...).

    Returns:
        The parsed ``float``, or ``None`` for NaN, ``None``, booleans,
        non-string/non-numeric objects, and strings that do not parse.
    """
    # Cells that are already numeric have no .replace(); pass them through
    # instead of discarding them. bool is excluded although it subclasses int.
    if isinstance(value, (int, float, np.integer, np.floating)) and not isinstance(value, bool):
        number = float(value)
        # NaN is the only float that is not equal to itself.
        return None if number != number else number

    if not isinstance(value, str):
        return None

    # str.split() with no separator splits on any Unicode whitespace,
    # so joining the pieces strips '\xa0' and similar separators too.
    compact = ''.join(value.split())
    try:
        return float(compact.replace(',', '.'))
    except ValueError:
        return None

In [4]:
def _locate_codes(frame, pattern):
    """Return a (value, row, col) DataFrame of regex matches found in ``frame``'s cells."""
    regex = re.compile(pattern)
    located = {}
    for col_index in range(frame.shape[1]):
        for row_index, cell in enumerate(frame.iloc[:, col_index]):
            for code in regex.findall(str(cell)):
                # Keyed by code: a repeated code keeps its last scanned position.
                located[code] = {'value': code, 'row': row_index, 'col': col_index}
    return pd.DataFrame(list(located.values()), columns=['value', 'row', 'col'])


def extract_table_values(df, c_pattern=r'C\d{4}', r_pattern=r'R\d{4}'):
    """Flatten a table labelled with column codes and row codes into long format.

    The column codes (``c_pattern``) must all sit in a single row of ``df``.
    For every row code (``r_pattern``) the cells immediately to the right of
    the code - as many as there are column codes - are read and converted
    with ``convert_to_float``.

    Args:
        df: DataFrame holding the raw sheet. The search runs on this frame,
            not on any notebook-level variable.
        c_pattern: Regular expression identifying column codes.
        r_pattern: Regular expression identifying row codes.

    Returns:
        DataFrame with columns ``WIERSZ`` (row code), ``KOLUMNA`` (column code)
        and ``WARTOŚĆ`` (converted value, ``None`` where conversion failed).
        Empty, with the same columns, when no row code yields a full row.

    Raises:
        ValueError: If no column code is found, or column codes are spread
            over more than one row.
    """
    output_columns = ['WIERSZ', 'KOLUMNA', 'WARTOŚĆ']

    c_matches = _locate_codes(df, c_pattern)
    if c_matches.empty:
        raise ValueError('No C pattern found - weird table format. Aborting...')
    if c_matches['row'].nunique() != 1:
        raise ValueError('Multiple rows found for C pattern - weird table format. Aborting...')
    c_codes = c_matches['value'].values
    c_count = len(c_codes)

    r_matches = _locate_codes(df, r_pattern)

    dfs_to_union = []
    for r_code, row_pos, col_pos in r_matches.itertuples(index=False):
        # NOTE(review): assumes the value cells start directly right of the
        # row-code cell and are contiguous - confirm against the sheet layout.
        raw_values = df.iloc[row_pos, col_pos + 1: col_pos + c_count + 1].values
        if len(raw_values) != c_count:
            # The slice is clipped at the sheet's right edge; skip rows that
            # cannot supply one value per column code instead of failing.
            print(f'Skipping {r_code}: expected {c_count} values, found {len(raw_values)}')
            continue
        result_vector = np.array([convert_to_float(val) for val in raw_values])
        dfs_to_union.append(pd.DataFrame({
            'WIERSZ': np.full(c_count, r_code),
            'KOLUMNA': c_codes,
            'WARTOŚĆ': result_vector,
        }))

    if not dfs_to_union:
        # pd.concat raises on an empty list; return an empty, correctly shaped frame.
        return pd.DataFrame(columns=output_columns)
    return pd.concat(dfs_to_union, ignore_index=True)

In [5]:
# Flatten the loaded sheet into long format and export it as CSV
# (the DataFrame index is written as the first, unnamed column).
result = extract_table_values(df)
result.to_csv(path_or_buf='test.csv')