In [None]:
from bs4 import BeautifulSoup
import json
import pandas as pd

def _cell_span(cell, attribute):
    """Return the integer value of a span attribute, falling back to 1.

    A missing, non-numeric or non-positive ``colspan``/``rowspan`` is treated
    as a span of one cell.
    """
    if not cell.has_attr(attribute):
        return 1
    try:
        return max(1, int(cell[attribute]))
    except (TypeError, ValueError):
        return 1


def parse_html_table_with_col_row_span(table):
    """Convert an HTML ``<table>`` element into a DataFrame, expanding spans.

    Every ``<td>`` is laid out on a (row, column) grid. A cell with
    ``colspan``/``rowspan`` has its text repeated in every grid position it
    covers, and cells in later rows skip positions already claimed by a
    ``rowspan`` from above.

    Parameters
    ----------
    table : object exposing ``find_all`` (e.g. a BeautifulSoup ``Tag``)
        The table element. Rows are looked up with ``find_all('tr')`` and
        cells with ``find_all('td')`` / ``find_all('th')``.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` that contains at least one ``<td>``. Column
        labels come from the first row that contains ``<th>`` cells, or are
        ``0..n-1`` when there is none. Positions no cell covers stay NaN.

    Raises
    ------
    Exception
        If header cells were found but their count differs from the number
        of data columns.
    """
    column_names = []
    data_rows = []
    for row in table.find_all('tr'):
        td_tags = row.find_all('td')
        if len(td_tags) > 0:
            data_rows.append(td_tags)

        # Only the first row that has <th> cells provides the column titles.
        th_tags = row.find_all('th')
        if len(th_tags) > 0 and len(column_names) == 0:
            column_names = [th.get_text() for th in th_tags]

    n_rows = len(data_rows)
    n_columns = 0
    grid = {}  # (row, column) -> cell text
    for row_idx, cells in enumerate(data_rows):
        col_idx = 0
        for cell in cells:
            # Skip positions already filled by a rowspan from an earlier row.
            while (row_idx, col_idx) in grid:
                col_idx += 1

            text = cell.get_text()
            colspan = _cell_span(cell, "colspan")
            # Clamp the rowspan so it never runs past the last data row.
            rowspan = min(_cell_span(cell, "rowspan"), n_rows - row_idx)
            for row_offset in range(rowspan):
                for col_offset in range(colspan):
                    grid[(row_idx + row_offset, col_idx + col_offset)] = text

            col_idx += colspan
            # The table is as wide as its widest row once spans are expanded.
            n_columns = max(n_columns, col_idx)

    if len(column_names) > 0 and len(column_names) != n_columns:
        raise Exception("Column titles do not match the number of columns")

    columns = column_names if len(column_names) > 0 else range(0, n_columns)
    df = pd.DataFrame(columns=columns, index=range(0, n_rows))
    for (row_idx, col_idx), text in grid.items():
        df.iat[row_idx, col_idx] = text
    return df


# Minimal fixture: cell "A" declares rowspan="2", so the second <tr> carries
# only one <td> ("C") even though the table is two columns wide.
html_content = """
<table>
    <tr><td rowspan="2">A</td><td>B</td></tr>
    <tr><td>C</td></tr>
</table>"""

# Parse the markup with the stdlib-backed 'html.parser' backend.
soup = BeautifulSoup(html_content, 'html.parser')
table = soup.find_all('table')[0]  # first <table> element in the document

# Expand row/column spans into a rectangular DataFrame.
df = parse_html_table_with_col_row_span(table)

# Serialize the DataFrame as a JSON array with one object per row
# (orient='records'), then show it.
json_str = df.to_json(orient='records')
print(json_str)


In [3]:
from bs4 import BeautifulSoup
import pandas as pd

from io import StringIO  # needed to hand markup to pd.read_html as a file-like object

# html_string could be the html content you get from webpage
html_string = """
<table>
    <tr class="table-header_blue">
        <td colspan="1" rowspan="2">№</td>
        <td colspan="1" rowspan="2">Услуга</td>
        <td colspan="2" rowspan="1">Стоимость независимой досудебной экспертизы, руб.</td>
        <td colspan="2" rowspan="1">Стоимость судебной экспертизы, руб</td>
    </tr>
    <tr class="table-header_blue">
        <td>Легковые а/м</td>
        <td>Грузовые а/м и автобусы</td>
        <td>Легковые а/м</td>
        <td>Грузовые а/м и автобусы</td>
    </tr>
</table>
"""

# Use BeautifulSoup to parse the html
soup = BeautifulSoup(html_string, 'html.parser')

# Find the table 
table = soup.find_all('table')[0]

# Use Pandas to transform table data into a dataframe.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated (FutureWarning since pandas 2.1); a file-like object
# is accepted by old and new versions alike.
# NOTE: header=0 treats only the first <tr> as the header, so the second header
# row of this two-level header comes back as a data record and the repeated
# top-level titles are de-duplicated with a ".1" suffix.
df = pd.read_html(StringIO(str(table)), header=0, flavor='bs4')[0]

# transform dataframe to json (one object per row)
json_str = df.to_json(orient='records')

print(json_str)

[{"\u2116":"\u2116","\u0423\u0441\u043b\u0443\u0433\u0430":"\u0423\u0441\u043b\u0443\u0433\u0430","\u0421\u0442\u043e\u0438\u043c\u043e\u0441\u0442\u044c \u043d\u0435\u0437\u0430\u0432\u0438\u0441\u0438\u043c\u043e\u0439 \u0434\u043e\u0441\u0443\u0434\u0435\u0431\u043d\u043e\u0439 \u044d\u043a\u0441\u043f\u0435\u0440\u0442\u0438\u0437\u044b, \u0440\u0443\u0431.":"\u041b\u0435\u0433\u043a\u043e\u0432\u044b\u0435 \u0430\/\u043c","\u0421\u0442\u043e\u0438\u043c\u043e\u0441\u0442\u044c \u043d\u0435\u0437\u0430\u0432\u0438\u0441\u0438\u043c\u043e\u0439 \u0434\u043e\u0441\u0443\u0434\u0435\u0431\u043d\u043e\u0439 \u044d\u043a\u0441\u043f\u0435\u0440\u0442\u0438\u0437\u044b, \u0440\u0443\u0431..1":"\u0413\u0440\u0443\u0437\u043e\u0432\u044b\u0435 \u0430\/\u043c \u0438 \u0430\u0432\u0442\u043e\u0431\u0443\u0441\u044b","\u0421\u0442\u043e\u0438\u043c\u043e\u0441\u0442\u044c \u0441\u0443\u0434\u0435\u0431\u043d\u043e\u0439 \u044d\u043a\u0441\u043f\u0435\u0440\u0442\u0438\u0437\u044b, \u0440\u044

In [18]:
from bs4 import BeautifulSoup
import json

def _header_span(cell, attribute):
    """Return a header cell's ``colspan``/``rowspan`` as an int >= 1.

    A missing, non-numeric or non-positive value counts as a span of 1.
    """
    try:
        return max(1, int(cell.get(attribute, 1)))
    except (TypeError, ValueError):
        return 1


def html_table_to_json(html_table, header_class='table-header_blue'):
    """Print the multi-level headers and the data rows of an HTML table.

    Header rows are the ``<tr>`` elements carrying ``header_class``. Their
    ``<td>`` cells are laid out on a grid so that ``colspan`` repeats a title
    across the columns it covers and ``rowspan`` leaves an empty string in the
    lower header levels it covers. Each column's header is the tuple of its
    titles, top level first. Every other ``<tr>`` becomes a dict mapping those
    header tuples to the text of its ``<td>`` cells; extra cells or extra
    headers are dropped by ``zip``.

    Parameters
    ----------
    html_table : str
        HTML markup containing the table.
    header_class : str, optional
        CSS class that marks header rows. Defaults to ``'table-header_blue'``.

    Returns
    -------
    None
        The header list and the list of row dicts are printed, not returned.

    Raises
    ------
    IndexError
        If the markup contains no row with ``header_class``.
    """
    soup = BeautifulSoup(html_table, 'html.parser')

    header_rows = soup.find_all('tr', {'class': header_class})
    if len(header_rows) == 0:
        # Same exception type the bare headers_trs[0] lookup used to raise,
        # but with a message that says what is missing.
        raise IndexError(
            "no <tr> with class %r found in the table" % (header_class,)
        )

    # Lay the header cells out on a (level, column) grid.
    n_levels = len(header_rows)
    n_columns = 0
    grid = {}
    for level, tr in enumerate(header_rows):
        col = 0
        for td in tr.find_all('td'):
            # Skip columns already claimed by a rowspan from a higher level.
            while (level, col) in grid:
                col += 1
            colspan = _header_span(td, "colspan")
            rowspan = min(_header_span(td, "rowspan"), n_levels - level)
            for col_offset in range(colspan):
                grid[(level, col + col_offset)] = td.text
                # Lower levels under a rowspan have no title of their own.
                for level_offset in range(1, rowspan):
                    grid[(level + level_offset, col + col_offset)] = ""
            col += colspan
            n_columns = max(n_columns, col)

    # Combine multi-level headers: one tuple per column, top level first.
    headers = [
        tuple(grid.get((level, col), "") for level in range(n_levels))
        for col in range(n_columns)
    ]
    print(headers)

    table_data = []
    for row in soup.find_all('tr'):
        # Skip the header rows themselves; data rows may carry other classes.
        if any(row is tr for tr in header_rows):
            continue
        cells = row.find_all('td')
        row_data = dict(zip(headers, [cell.text for cell in cells]))
        table_data.append(row_data)

    # NOTE: the row dicts are keyed by tuples, which json.dumps rejects as
    # object keys; the keys must be turned into strings before serializing.
    print(table_data)

# Reuses `html_string`, which is assigned in an earlier cell of this notebook;
# that cell has to be run before this one.
html_table = html_string
# The function prints its results itself and returns None.
html_table_to_json(html_table)
# Wrapping the call in print() would only append "None" to the output:
#print(html_table_to_json(html_table))

[('№', ''), ('Услуга', ''), ('Стоимость независимой досудебной экспертизы, руб.', 'Легковые а/м'), ('Стоимость независимой досудебной экспертизы, руб.', 'Грузовые а/м и автобусы'), ('Стоимость судебной экспертизы, руб', 'Легковые а/м'), ('Стоимость судебной экспертизы, руб', 'Грузовые а/м и автобусы')]
[]
