In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import random

## Creating some data based off the html 
#### Can be done N times in a loop to increase dataset size
#### Can grab other html from the pdf as well to increase variance

In [2]:
source_document = """
<html>
<table><thead><tr><th colspan="3"><strong>Noise Charges</strong></th></tr><tr><th>QC</th><th>Set fee per Tonne 2025<br>Day</th><th>Set fee per Tonne 2025<br>Night</th></tr></thead><tbody><tr><td>0</td><td><span style="color: green;">€0.00</span></td><td><span style="color: green;">€0.00</span></td></tr><tr><td>0.125</td><td><span style="color: green;">€0.00</span></td><td><span style="color: green;">€0.00</span></td></tr><tr><td>0.25</td><td><span style="color: green;">€0.00</span></td><td><span style="color: green;">€0.00</span></td></tr><tr><td>0.5</td><td><span style="color: green;">€0.00</span></td><td><span style="color: green;">€2.00</span></td></tr><tr><td>1</td><td><span style="color: green;">€1.00</span></td><td><span style="color: green;">€4.00</span></td></tr><tr><td>2</td><td><span style="color: green;">€2.00</span></td><td><span style="color: green;">€8.00</span></td></tr><tr><td>4</td><td><span style="color: green;">€4.00</span></td><td><span style="color: green;">€12.00</span></td></tr><tr><td>8</td><td><span style="color: green;">€6.00</span></td><td><span style="color: green;">€16.00</span></td></tr><tr><td>16</td><td><span style="color: green;">€8.00</span></td><td><span style="color: green;">€20.00</span></td></tr></tbody></table>
</html>
"""


soup = BeautifulSoup(source_document, "html.parser")

#Find all value cells (inside <span>)
for span in soup.find_all("span"):

    new_value = round(random.uniform(0, 25), 2)
    span.string = f"€{new_value:.2f}"


source_tables = str(soup) #extract tables from html - here only one
source_tables = pd.read_html(source_tables) # convert to DataFrames

  source_tables = pd.read_html(source_tables) # convert to DataFrames


The "source table" can be considered our "target" for prediction - for a given document we want to extract/create a table in a standard format that we can use to compute the values

In [3]:
def compute_noise_charge(noise_charge_datatable, qc, weight, day_night):
    """Computes the noise charge based on QC, weight, and time of day."""
    row = noise_charge_datatable[noise_charge_datatable[('Noise Charges', 'QC')] == qc]
    if row.empty:
        return None
    
    if day_night.lower() == 'day':
        fee = row[('Noise Charges', 'Set fee per Tonne 2025 Day')].values[0]
    elif day_night.lower() == 'night':
        fee = row[('Noise Charges', 'Set fee per Tonne 2025 Night')].values[0]
    else:
        raise ValueError("day_night must be 'day' or 'night'")
    
    total_fee = float(fee.replace('€', '').replace(',', '.')) * weight
    return total_fee

In [4]:
# our current dataset has a single item, the input doc and the target noise charge table.
# In general we will have many targets, and maybe as formula rather than tables
dataset = [{"doc": source_tables, "noise_charge_table": source_tables[0]}]

Now we want to create new "sythetic" examples for this dataset to test prediction

A simple example is just to shuffle the data in the table

In [5]:
noise_charge_table = source_tables[0]  # Assuming the first table is the one we want for now
num_samples = 2
noise_charge_tables = [noise_charge_table.sample(frac=1).reset_index(drop=True) for _ in range(num_samples)]

In [6]:
# the original in the correct format
noise_charge_table

Unnamed: 0_level_0,Noise Charges,Noise Charges,Noise Charges
Unnamed: 0_level_1,QC,Set fee per Tonne 2025 Day,Set fee per Tonne 2025 Night
0,0.0,€12.32,€16.78
1,0.125,€6.91,€7.36
2,0.25,€11.38,€0.44
3,0.5,€1.48,€7.08
4,1.0,€21.82,€11.37
5,2.0,€5.16,€10.44
6,4.0,€21.44,€6.69
7,8.0,€4.18,€6.99
8,16.0,€20.42,€10.61


In [7]:
# sythetic data in the "wrong format" (although here formula would still work)
noise_charge_tables[0]

Unnamed: 0_level_0,Noise Charges,Noise Charges,Noise Charges
Unnamed: 0_level_1,QC,Set fee per Tonne 2025 Day,Set fee per Tonne 2025 Night
0,0.125,€6.91,€7.36
1,0.5,€1.48,€7.08
2,2.0,€5.16,€10.44
3,0.0,€12.32,€16.78
4,0.25,€11.38,€0.44
5,4.0,€21.44,€6.69
6,16.0,€20.42,€10.61
7,1.0,€21.82,€11.37
8,8.0,€4.18,€6.99


In [8]:
noise_charge_tables[1]

Unnamed: 0_level_0,Noise Charges,Noise Charges,Noise Charges
Unnamed: 0_level_1,QC,Set fee per Tonne 2025 Day,Set fee per Tonne 2025 Night
0,0.125,€6.91,€7.36
1,16.0,€20.42,€10.61
2,1.0,€21.82,€11.37
3,0.5,€1.48,€7.08
4,4.0,€21.44,€6.69
5,0.0,€12.32,€16.78
6,0.25,€11.38,€0.44
7,8.0,€4.18,€6.99
8,2.0,€5.16,€10.44


In [9]:
# we could also peform some text augmentation here, e.g. by altering the table header
# there are standard packages for this, but here we do it manually for example

def augment_table_header(table):
    """Augments the table header by modifying the column names."""
    # lower case
    new_columns = [(col[0].lower(), col[1].lower()) for col in table.columns]
    # add a prefix
    new_columns = [(f"augmented_{col[0]}", col[1]) for col in new_columns]
    # set the new columns
    table.columns = pd.MultiIndex.from_tuples(new_columns)
    return table

# Augment the header of the first noise charge table
augmented_noise_charge_table = augment_table_header(noise_charge_tables[0])

In [10]:
augmented_noise_charge_table

Unnamed: 0_level_0,augmented_noise charges,augmented_noise charges,augmented_noise charges
Unnamed: 0_level_1,qc,set fee per tonne 2025 day,set fee per tonne 2025 night
0,0.125,€6.91,€7.36
1,0.5,€1.48,€7.08
2,2.0,€5.16,€10.44
3,0.0,€12.32,€16.78
4,0.25,€11.38,€0.44
5,4.0,€21.44,€6.69
6,16.0,€20.42,€10.61
7,1.0,€21.82,€11.37
8,8.0,€4.18,€6.99


In [12]:
#Now convert back to html

sythetic_tables = [table.to_html(index=False, escape=False) for table in noise_charge_tables + [augmented_noise_charge_table]]

# Re-embed in html structure to generate synthetic documents

synthetic_documents = []
for table in sythetic_tables:
    synthetic_document = f"""
    <html>
    <table>{table}</table>
    </html>
    """
    synthetic_documents.append(synthetic_document)

Now we can put it together into a dataset which includes the ground-truth

In [13]:
for i in range(len(synthetic_documents)):
    dataset.append({
        "doc": synthetic_documents[i],
        "noise_charge_table": noise_charge_table #note that here the target is the correctly formatted original table
    })

In [15]:
len(dataset) #now have 4 items in the dataset...

4

All items in the dataset currently have the same target, because we have only changed the formatting and not the parameters set by the airport.

To create new targets (for just this table), we can perturb the values

In [29]:
noise_charge_table

Unnamed: 0_level_0,Noise Charges,Noise Charges,Noise Charges
Unnamed: 0_level_1,QC,Set fee per Tonne 2025 Day,Set fee per Tonne 2025 Night
0,0.0,€12.32,€16.78
1,0.125,€6.91,€7.36
2,0.25,€11.38,€0.44
3,0.5,€1.48,€7.08
4,1.0,€21.82,€11.37
5,2.0,€5.16,€10.44
6,4.0,€21.44,€6.69
7,8.0,€4.18,€6.99
8,16.0,€20.42,€10.61


In [28]:
# change some values in the noise charge table to simulate different scenarios
tmp = noise_charge_table.copy()
tmp.loc[tmp[('Noise Charges', 'QC')] == 0.5, ('Noise Charges', 'Set fee per Tonne 2025 Day')] = "€3.00"
tmp.loc[tmp[('Noise Charges', 'QC')] == 0.5, ('Noise Charges', 'Set fee per Tonne 2025 Night')] = "€5.00"

In [30]:
tmp

Unnamed: 0_level_0,Noise Charges,Noise Charges,Noise Charges
Unnamed: 0_level_1,QC,Set fee per Tonne 2025 Day,Set fee per Tonne 2025 Night
0,0.0,€12.32,€16.78
1,0.125,€6.91,€7.36
2,0.25,€11.38,€0.44
3,0.5,€3.00,€5.00
4,1.0,€21.82,€11.37
5,2.0,€5.16,€10.44
6,4.0,€21.44,€6.69
7,8.0,€4.18,€6.99
8,16.0,€20.42,€10.61


In [36]:

# Re-embed in html structure to generate synthetic documents

synthetic_document_with_different_target = f"""
<html>
<table>{tmp}</table>
</html>
"""

In [37]:
print(synthetic_document_with_different_target)


<html>
<table>  Noise Charges                                                        
             QC Set fee per Tonne 2025 Day Set fee per Tonne 2025 Night
0         0.000                     €12.32                       €16.78
1         0.125                      €6.91                        €7.36
2         0.250                     €11.38                        €0.44
3         0.500                      €3.00                        €5.00
4         1.000                     €21.82                       €11.37
5         2.000                      €5.16                       €10.44
6         4.000                     €21.44                        €6.69
7         8.000                      €4.18                        €6.99
8        16.000                     €20.42                       €10.61</table>
</html>



In [38]:
dataset.append({
    "doc": synthetic_document_with_different_target,
    "noise_charge_table": tmp  #this one has a different target noise_charge_table
})

In [40]:
len(dataset)

5

## Task

We want to predict the noise_charge table from each html doc

This needs to be in the standardised format so that it can be used with our formulas.

We could also generate the formulas themselves as a next step...