In [2]:
import random
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup

## Creating some data based on the HTML
#### Can be done N times in a loop to increase dataset size
#### Can grab other HTML from the PDF as well to increase variance

In [3]:
# Sample "Noise Charges" pricing table used as the seed document: one row per
# QC value with a day and a night fee per tonne. Every fee is wrapped in its
# own <span>, which is what the randomisation step keys on.
html = """
<html>
<table><thead><tr><th colspan="3"><strong>Noise Charges</strong></th></tr><tr><th>QC</th><th>Set fee per Tonne 2025<br>Day</th><th>Set fee per Tonne 2025<br>Night</th></tr></thead><tbody><tr><td>0</td><td><span style="color: green;">€0.00</span></td><td><span style="color: green;">€0.00</span></td></tr><tr><td>0.125</td><td><span style="color: green;">€0.00</span></td><td><span style="color: green;">€0.00</span></td></tr><tr><td>0.25</td><td><span style="color: green;">€0.00</span></td><td><span style="color: green;">€0.00</span></td></tr><tr><td>0.5</td><td><span style="color: green;">€0.00</span></td><td><span style="color: green;">€2.00</span></td></tr><tr><td>1</td><td><span style="color: green;">€1.00</span></td><td><span style="color: green;">€4.00</span></td></tr><tr><td>2</td><td><span style="color: green;">€2.00</span></td><td><span style="color: green;">€8.00</span></td></tr><tr><td>4</td><td><span style="color: green;">€4.00</span></td><td><span style="color: green;">€12.00</span></td></tr><tr><td>8</td><td><span style="color: green;">€6.00</span></td><td><span style="color: green;">€16.00</span></td></tr><tr><td>16</td><td><span style="color: green;">€8.00</span></td><td><span style="color: green;">€20.00</span></td></tr></tbody></table>
</html>
"""


# Parse the sample table so the individual fee cells can be rewritten in place.
soup = BeautifulSoup(html, "html.parser")

# Every fee sits in its own <span>: overwrite each one with a fresh euro
# amount drawn uniformly from [0, 25] and rendered with two decimals.
for fee_cell in soup.find_all("span"):
    random_fee = round(random.uniform(0, 25), 2)
    fee_cell.string = f"€{random_fee:.2f}"

# Serialise the modified tree back into an HTML string and show it.
updated_html = str(soup)
print(updated_html)




<html>
<table><thead><tr><th colspan="3"><strong>Noise Charges</strong></th></tr><tr><th>QC</th><th>Set fee per Tonne 2025<br/>Day</th><th>Set fee per Tonne 2025<br/>Night</th></tr></thead><tbody><tr><td>0</td><td><span style="color: green;">€2.95</span></td><td><span style="color: green;">€11.11</span></td></tr><tr><td>0.125</td><td><span style="color: green;">€7.11</span></td><td><span style="color: green;">€4.50</span></td></tr><tr><td>0.25</td><td><span style="color: green;">€9.30</span></td><td><span style="color: green;">€17.82</span></td></tr><tr><td>0.5</td><td><span style="color: green;">€5.68</span></td><td><span style="color: green;">€21.63</span></td></tr><tr><td>1</td><td><span style="color: green;">€16.87</span></td><td><span style="color: green;">€16.88</span></td></tr><tr><td>2</td><td><span style="color: green;">€2.08</span></td><td><span style="color: green;">€1.61</span></td></tr><tr><td>4</td><td><span style="color: green;">€13.65</span></td><td><span style="color:

## Solution creation
#### Each instance can have a 'correct' reference function; to evaluate, call it and the AI-generated function with the same input parameters and compare the results with '=='

In [4]:
# Passing a literal HTML string to pd.read_html is deprecated and emits a
# FutureWarning; wrapping the markup in StringIO hands pandas a file-like
# object instead.
dfs = pd.read_html(StringIO(updated_html))
# read_html returns one DataFrame per <table>; the document holds a single
# table, so the first frame is the noise-charge table.
df = dfs[0]

print(df)
def extract_formula(df, qc, weight, day_night):
    """Return the total noise charge for a QC value, weight and time of day.

    Args:
        df: Table whose columns are the two-level labels
            ('Noise Charges', 'QC'),
            ('Noise Charges', 'Set fee per Tonne 2025 Day') and
            ('Noise Charges', 'Set fee per Tonne 2025 Night').
        qc: QC value to look up; matched with ``==`` against the QC column.
        weight: Weight in tonnes that the per-tonne fee is multiplied by.
        day_night: 'day' or 'night' (case-insensitive).

    Returns:
        The per-tonne fee multiplied by ``weight`` as a float, or ``None``
        when no row has the requested ``qc``.

    Raises:
        ValueError: If a row matches but ``day_night`` is neither 'day' nor
            'night'.
    """
    row = df[df[('Noise Charges', 'QC')] == qc]
    if row.empty:
        return None

    period = day_night.lower()
    if period == 'day':
        fee = row[('Noise Charges', 'Set fee per Tonne 2025 Day')].values[0]
    elif period == 'night':
        fee = row[('Noise Charges', 'Set fee per Tonne 2025 Night')].values[0]
    else:
        raise ValueError("day_night must be 'day' or 'night'")

    # Fees arrive as text such as '€4.00' (or with a decimal comma) when the
    # cell carries a currency symbol, but pandas parses plain numeric cells as
    # numbers, which have no .replace(); only clean up the textual form.
    if isinstance(fee, str):
        fee = fee.replace('€', '').replace(',', '.')

    return float(fee) * weight

    
# Example usage: each tuple is (qc, weight, day_night) looked up in the parsed table.
for example_args in [(1, 10, 'day'), (2, 5, 'night')]:
    print(extract_formula(df, *example_args))

  Noise Charges                                                        
             QC Set fee per Tonne 2025 Day Set fee per Tonne 2025 Night
0         0.000                      €2.95                       €11.11
1         0.125                      €7.11                        €4.50
2         0.250                      €9.30                       €17.82
3         0.500                      €5.68                       €21.63
4         1.000                     €16.87                       €16.88
5         2.000                      €2.08                        €1.61
6         4.000                     €13.65                       €12.07
7         8.000                      €8.77                        €3.08
8        16.000                      €8.15                       €19.71
168.70000000000002
8.05


  dfs = pd.read_html(updated_html)


## Code below won't work in the notebook - need to connect to a compute node

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# Hugging Face checkpoint identifier shared by the tokenizer and the model.
model_name = "codellama/CodeLlama-7b-Python-hf"  # Replace with your model name
# Load the tokenizer and the causal-LM weights for that checkpoint. The
# recorded run of this cell failed with "Cannot allocate memory" while
# mapping the first checkpoint shard (about 10 GB).
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# System instructions: describe the task (turn an HTML pricing table into a
# fee-computing Python function) and the markup quirks the model may meet.
sys_prompt = '''You are a helpful Python programming assistant. You are given an HTML document that contains a pricing table. Your job is to write clean, readable Python code that defines a function to compute a total fee based on inputs like 'QC', weight in tonnes, and whether it's 'day' or 'night'.

The HTML may contain <th colspan> or <br> tags and style attributes. You should only provide the formula. Read the HTML and use your HTML reading abilities to understand the structure and values of the HTML and use them to make a function'''
# TODO: extend the few-shot prompting with more worked examples (they still
# need to be produced).
# Single worked example: the randomised table (`updated_html`) paired with the
# text of the reference `extract_formula` implementation.
# NOTE(review): the example function takes `df`, whereas `user_prompt` asks
# for `extract_formula(html_text, ...)` - confirm which signature is intended.
few_shot = f'''Example: For the following HTML table **{updated_html}** you would be expected to provide the following output **def extract_formula(df, qc, weight, day_night):
    row = df[df[('Noise Charges', 'QC')] == qc]
    if row.empty:
        return None
    
    if day_night.lower() == 'day':
        fee = row[('Noise Charges', 'Set fee per Tonne 2025 Day')].values[0]
    elif day_night.lower() == 'night':
        fee = row[('Noise Charges', 'Set fee per Tonne 2025 Night')].values[0]
    else:
        raise ValueError("day_night must be 'day' or 'night'")
    
    total_fee = float(fee.replace('€', '').replace(',', '.')) * weight
    return total_fee**'''
# Task statement for the actual query, written as Python-style comment lines.
user_prompt = '''
# Below is an HTML table containing noise charge data.
# This table is presented to you as a string for easy reading
# Your task is to write a function `extract_formula(html_text, qc, weight, day_night)` that:
# - Extracts the relevant fee per tonne for a given `qc` (float) and `day_night` ("day" or "night")
# - Multiplies the fee by the given `weight` in tonnes
# - Returns the total fee as a float
# Output only the function definition. Do not include explanatory comments or examples.
'''
# Assemble the final prompt: system text and few-shot example, then the task
# statement followed by the original (non-randomised) `html` table, ending on
# the assistant tag so the model's continuation is the answer.
full_prompt = f'''
<|system|>
{sys_prompt}
{few_shot}
<|user|>
{user_prompt}
{html}
<|assistant|>'''
# Tokenise the assembled prompt into PyTorch tensors for generation.
inputs = tokenizer(full_prompt, return_tensors="pt")

# The expected answer is a whole function roughly the size of the few-shot
# example, which does not fit in 50 new tokens; allow a larger budget so the
# definition is not cut off mid-body.
outputs = model.generate(**inputs, max_new_tokens=256)

# `outputs` holds the prompt tokens followed by the continuation. Decode only
# the tokens after the prompt so `formula` contains just the generated code.
# A second decode of the full `outputs` previously overwrote this result and
# put the entire prompt back into `formula`.
prompt_length = inputs['input_ids'].size(1)
formula = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)
formula
 


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


RuntimeError: unable to mmap 9976570520 bytes from file </gpfs01/home/ppytr13/.cache/huggingface/hub/models--codellama--CodeLlama-7b-Python-hf/snapshots/d4178f5d2eead875e627ec487b23679266319b7f/model-00001-of-00002.safetensors>: Cannot allocate memory (12)