In [6]:
import re
import pandas as pd
import numpy as np

# Model label -> path of the metrics.txt file that is parsed for that model.
# Insertion order is preserved, so models are parsed in the order listed here.
FILE_MAP = dict([
    ('FNO', '/srv/scratch/z5370003/projects/results/04_groundwater/variable_density/FNO_predictions/forcing/fno_predictions_20260117_134440/metrics.txt'),
    ('GINO', '/srv/scratch/z5370003/projects/results/04_groundwater/variable_density/GINO_predictions/forcing_standard_loss/gino_predictions_20260117_131908/metrics.txt'),
    ('GINO-VL', '/srv/scratch/z5370003/projects/results/04_groundwater/variable_density/GINO_predictions/forcing/gino_predictions_20260117_131833/metrics.txt'),
])

# Variable name as written in the metrics text file -> label used in the table.
VAR_MAP = dict(
    head='Hydraulic Head',
    mass_concentration='Mass Concentration',
)

In [7]:
def extract_section(content, start_marker, end_marker=None):
    """Return the part of ``content`` that begins at ``start_marker``.

    Args:
        content: Full text to search.
        start_marker: Text that opens the section; it is included in the result.
        end_marker: Optional text that closes the section; it is NOT included
            in the result. Only occurrences after the start marker count.

    Returns:
        The text from ``start_marker`` up to (but excluding) ``end_marker``.
        If ``end_marker`` is omitted, or does not occur after the start
        marker, the section runs to the end of ``content``. If
        ``start_marker`` is not found, an empty string is returned.
    """
    start_idx = content.find(start_marker)
    if start_idx == -1:
        return ""

    if end_marker:
        # Search past the start marker so an end marker that overlaps it
        # cannot produce an empty section.
        end_idx = content.find(end_marker, start_idx + len(start_marker))
        if end_idx != -1:
            return content[start_idx:end_idx]

    # No end marker given or found: keep what we have instead of discarding
    # a section whose start was located.
    return content[start_idx:]

def get_metric_value(text_block, metric_name):
    """Return the first number that follows ``"<metric_name>:"`` in ``text_block``.

    Args:
        text_block: Text to search.
        metric_name: Literal label of the metric (it is regex-escaped, so
            characters such as ``(``, ``.`` or ``+`` are matched verbatim).

    Returns:
        The value as a ``float``, or ``None`` if the label is not followed by
        a number. Signed values (``-0.25``) and exponent notation (``1.5e-3``)
        are accepted; a trailing sentence period after the number is ignored.
    """
    # sign, then digits with optional fraction (or a bare fraction), then an
    # optional exponent. Unlike a plain "[0-9.]+", this cannot capture strings
    # such as "0.98." or "..." that float() would reject.
    number = r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?"
    pattern = rf"{re.escape(metric_name)}:\s*({number})"
    match = re.search(pattern, text_block)
    if match:
        return float(match.group(1))
    return None

def _variable_block(split_text, var, all_vars, max_chars=1000):
    """Return the text of ``var``'s block inside ``split_text`` ("" if absent).

    The block starts at ``"<var>:"`` and ends at the header of the next other
    variable from ``all_vars``, or after ``max_chars`` characters, whichever
    comes first.
    """
    start = split_text.find(f"{var}:")
    if start == -1:
        return ""

    end = start + max_chars
    after_header = start + len(var) + 1
    for other in all_vars:
        if other == var:
            continue
        other_start = split_text.find(f"{other}:", after_header)
        if other_start != -1:
            end = min(end, other_start)
    return split_text[start:end]


def parse_model_file(filepath):
    """Read one metrics file and extract L2, R2 and KGE for the train and val splits.

    Args:
        filepath: Path of the metrics text file.

    Returns:
        A nested dict ``data[table_label][split][metric]`` where
        ``table_label`` is a value of ``VAR_MAP``, ``split`` is ``'train'`` or
        ``'test'`` and ``metric`` is ``'l2'``, ``'r2'`` or ``'kge'``. A metric
        that cannot be found is stored as ``None``.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist (raised by ``open``).
    """
    with open(filepath, 'r') as f:
        content = f.read()

    # Note: 'VAL SET' is used in the text file, but maps to 'test' in the table
    sections = [
        ('train', extract_section(content, "TRAIN SET", "VAL SET")),
        ('test', extract_section(content, "VAL SET", "Metric Interpretation")),
    ]

    # NOTE(review): 'RÂ²' looks like 'R²' (UTF-8) decoded as Latin-1. The
    # original spelling is tried first and the clean one second, so parsing
    # does not depend on how the file happened to be decoded — confirm which
    # spelling the metrics files really contain.
    r2_labels = ("RÂ² Score", "R² Score")

    data = {}
    for var, var_clean_name in VAR_MAP.items():
        data[var_clean_name] = {}

        for split_name, split_text in sections:
            # Restrict the search to this variable's own block so that a
            # metric missing here is reported as None instead of being
            # picked up from the next variable's block.
            relevant_text = _variable_block(split_text, var, VAR_MAP)

            r2 = None
            for label in r2_labels:
                r2 = get_metric_value(relevant_text, label)
                if r2 is not None:
                    break

            data[var_clean_name][split_name] = {
                'l2': get_metric_value(relevant_text, "Relative L2 Error"),
                'r2': r2,
                'kge': get_metric_value(relevant_text, "KGE"),
            }

    return data

In [11]:
def aggregate_data(file_map):
    """Parse each model's metrics file and collect the results by model name.

    Args:
        file_map: Mapping of model name -> path of its metrics file.

    Returns:
        Dict of model name -> result of ``parse_model_file``. A model whose
        file does not exist is reported with a printed warning and left out.
    """
    parsed_by_model = {}
    for name in file_map:
        path = file_map[name]
        try:
            model_metrics = parse_model_file(path)
        except FileNotFoundError:
            print(f"Warning: File {path} not found.")
        else:
            parsed_by_model[name] = model_metrics

    return parsed_by_model

def get_best_values(full_results):
    """Find the best value of every metric for each variable and split.

    Args:
        full_results: ``full_results[model][variable][split][metric]`` holding
            a number or ``None``.

    Returns:
        ``best[variable][split][metric]`` with the winning value across all
        models. A metric for which every model has ``None`` gets no entry.
    """
    var_names = ('Hydraulic Head', 'Mass Concentration')
    split_names = ('train', 'test')
    # l2 is an error (smallest wins); r2 and kge are scores (largest wins)
    pick_best = {'l2': min, 'r2': max, 'kge': max}

    best = {}
    for var_name in var_names:
        per_split = best.setdefault(var_name, {})
        for split_name in split_names:
            per_metric = per_split.setdefault(split_name, {})
            for metric_name, chooser in pick_best.items():
                observed = [
                    full_results[model][var_name][split_name][metric_name]
                    for model in full_results
                ]
                observed = [value for value in observed if value is not None]
                if observed:
                    per_metric[metric_name] = chooser(observed)
    return best

In [16]:
def generate_latex_table(full_results, best_values):
    """Print a LaTeX table comparing the models' metrics to stdout.

    Each variable forms a group of rows (one row per model) whose first
    column is a ``\\multirow`` spanning the group. The best value of each
    metric/split is wrapped in ``\\textbf``. The R2 columns are emitted but
    hidden behind a LaTeX ``%`` comment, and the trailing "Time (hr)" column
    is left empty for manual completion.

    Args:
        full_results: ``full_results[model][variable][split][metric]`` with a
            number or ``None`` (rendered as ``-``). A model or variable that
            is missing produces a row of dashes.
        best_values: ``best_values[variable][split][metric]`` giving the value
            to highlight.

    Returns:
        None. The table is written with ``print``.
    """
    # Order: Head first, then Concentration
    variables = ['Hydraulic Head', 'Mass Concentration']
    models = ['FNO', 'GINO', 'GINO-VL']

    # Lines are listed one by one so the trailing spaces that end the two
    # "%"-commented lines stay visible.
    header = "\n".join([
        r"\begin{table}[ht]",
        r"    \centering",
        r"    \caption{Performance metrics comparison.}",
        r"    \label{tab:performance_auto}",
        r"    \begin{tabular}{llcccc%cc",
        r"    r}",
        r"        \hline",
        r"        \textbf{Variable} & \textbf{Model} & \multicolumn{2}{c}{$\ell_2$ Error} & % \multicolumn{2}{c}{$R^2$ Score} & ",
        r"        \multicolumn{2}{c}{KGE} & \multirow{2}{*}{Time (hr)}\\",
        r"        & & train & test & % train & test & ",
        r"        train & test\\",
        r"        \hline",
    ])

    # The footer supplies the closing rule. It must follow the last row
    # directly: a blank line or a second \hline here would put an empty
    # paragraph and a doubled rule inside the tabular.
    footer = "\n".join([
        r"        \hline",
        r"    \end{tabular}",
        r"\end{table}",
    ])

    group_separator = "\n        \\hline\n"
    continuation_indent = "         "
    missing_row = "& - & - & % - & - & \n         - & - & \\\\"

    def fmt(var, split, metric_type, val):
        """Format one cell: '-' for None, 4 decimals, bold when it is the best."""
        if val is None:
            return "-"
        str_val = f"{val:.4f}"
        # Compare with slight tolerance for float equality
        if abs(val - best_values[var][split][metric_type]) < 1e-5:
            return f"\\textbf{{{str_val}}}"
        return str_val

    groups = []
    for var in variables:
        rows = []
        for i, model in enumerate(models):
            # Only the first row of a group carries the variable name; the
            # multirow spans as many rows as there are models.
            if i == 0:
                lead = f"        \\multirow{{{len(models)}}}{{*}}{{{var}}} "
            else:
                lead = continuation_indent

            try:
                metrics = full_results[model][var]

                l2_train = fmt(var, 'train', 'l2', metrics['train']['l2'])
                l2_test = fmt(var, 'test', 'l2', metrics['test']['l2'])
                r2_train = fmt(var, 'train', 'r2', metrics['train']['r2'])
                r2_test = fmt(var, 'test', 'r2', metrics['test']['r2'])
                kge_train = fmt(var, 'train', 'kge', metrics['train']['kge'])
                kge_test = fmt(var, 'test', 'kge', metrics['test']['kge'])

                # Note: R2 columns are commented out with %; the line break
                # after them ends the LaTeX comment before the KGE cells.
                cells = (
                    f"& {l2_train} & {l2_test} & % {r2_train} & {r2_test} & \n"
                    f"         {kge_train} & {kge_test} & \\\\"
                )
            except KeyError:
                cells = missing_row

            rows.append(f"{lead}& {model} {cells}")

        groups.append("\n".join(rows))

    print(header)
    print(group_separator.join(groups))
    print(footer)

In [17]:
# --- EXECUTION ---
# 1. Parse every metrics file listed in FILE_MAP into
#    results[model][variable][split][metric]; a file that does not exist is
#    skipped with a printed warning.
results = aggregate_data(FILE_MAP)

# 2. Calculate the best value per variable/split/metric across the parsed
#    models (lowest for l2, highest for r2 and kge).
bests = get_best_values(results)

# 3. Print the LaTeX table to stdout; the function prints and returns nothing.
generate_latex_table(results, bests)

\begin{table}[ht]
    \centering
    \caption{Performance metrics comparison.}
    \label{tab:performance_auto}
    \begin{tabular}{llcccc%cc
    r}
        \hline
        \textbf{Variable} & \textbf{Model} & \multicolumn{2}{c}{$\ell_2$ Error} & % \multicolumn{2}{c}{$R^2$ Score} & 
        \multicolumn{2}{c}{KGE} & \multirow{2}{*}{Time (hr)}\\
        & & train & test & % train & test & 
        train & test\\
        \hline
        \multirow{3}{*}{Hydraulic Head} & FNO & \textbf{0.0503} & 0.0873 & % \textbf{0.9920} & 0.9732 & 
         0.9768 & 0.9626 & \\
         & GINO & 0.0600 & 0.0840 & % 0.9886 & 0.9752 & 
         \textbf{0.9942} & 0.9853 & \\
         & GINO-VL & 0.0588 & \textbf{0.0823} & % 0.9890 & \textbf{0.9762} & 
         0.9918 & \textbf{0.9879} & \\
        \hline
        \multirow{3}{*}{Mass Concentration} & FNO & \textbf{0.0428} & \textbf{0.0584} & % \textbf{0.9956} & \textbf{0.9917} & 
         \textbf{0.9955} & 0.9893 & \\
         & GINO & 0.0701 & 0.0739 & % 0.98