In [59]:
import os
import re
import pandas as pd
import plotly.graph_objects as go

# === Threshold parameters ===
# extract_metrics_from_file keeps a point only when its accuracy is strictly
# above ACCURACY_THRESHOLD and its zstd_ratio is strictly below
# COMPRESSION_THRESHOLD.
ACCURACY_THRESHOLD = 97.5
# The next assignment immediately overrides 97.5, so 98 is the value in effect
# whenever this cell runs top to bottom.
# NOTE(review): presumably 97.5 was used for ParetoFrontier.html and 98 for the
# "zoomed" export; confirm, and keep only the assignment that is meant to apply.
ACCURACY_THRESHOLD = 98 # zoomed view
COMPRESSION_THRESHOLD = 3.5

# === Funzione per estrarre dati da un singolo file, inclusa intestazione params ===
def extract_metrics_from_file(filepath, accuracy_threshold=None, compression_threshold=None):
    """Parse one test-output log and return the points that pass both thresholds.

    The file is read as text and scanned line by line:

    * Every ``key=value`` line located before the first separator line (a line
      made only of dashes) is collected into a ``header_params`` dict.
    * Each time a separator is met, the following line is checked for
      ``name = number``; on a match it becomes the "current variable" attached
      to the points found afterwards. When it does not match, the previous
      variable stays in effect.
    * A point is read either from a single line holding both ``A_Q = <acc>``
      and ``zstd_ratio = <ratio>%``, or from a line holding ``to <acc>``
      directly followed by a line holding ``zstd_ratio = <ratio>%``.

    Args:
        filepath: Path of the log file to parse.
        accuracy_threshold: Points are kept only when accuracy is strictly
            greater than this value. Defaults to the module-level
            ``ACCURACY_THRESHOLD``.
        compression_threshold: Points are kept only when zstd_ratio is strictly
            lower than this value. Defaults to the module-level
            ``COMPRESSION_THRESHOLD``.

    Returns:
        A list of dicts with keys ``accuracy``, ``zstd_ratio``, ``source``
        (base name of ``filepath``), ``var_name``, ``var_value`` and
        ``header_params`` (the same dict object is shared by all rows).
    """
    # Resolve the defaults at call time so the notebook constants stay the
    # single source of truth when no explicit threshold is given.
    if accuracy_threshold is None:
        accuracy_threshold = ACCURACY_THRESHOLD
    if compression_threshold is None:
        compression_threshold = COMPRESSION_THRESHOLD

    separator = "------------------------------------------------------------"

    # A well-formed decimal number. The previous "[\d.]+" also captured tokens
    # such as "." or "1.2.3" (e.g. "saved to ./ckpt"), which made float() raise
    # ValueError. The trailing lookahead refuses to capture only the first part
    # of a longer dotted token like "1.2.3".
    number = r"(\d+(?:\.\d*)?|\.\d+)(?!\.?\d)"
    header_var_re = re.compile(r"([a-zA-Z_]+)\s*=\s*" + number)
    full_line_re = re.compile(r"A_Q\s*=\s*" + number + r".*?zstd_ratio\s*=\s*" + number + r"%")
    to_re = re.compile(r"to\s+" + number)
    ratio_re = re.compile(r"zstd_ratio\s*=\s*" + number + r"%")

    with open(filepath, 'r') as f:
        lines = f.readlines()

    # === Header: key=value pairs found before the first separator line ===
    header_params = {}
    header_end_idx = next(
        (idx for idx, text in enumerate(lines) if text.strip() == separator),
        None,
    )
    if header_end_idx is not None:
        for text in lines[:header_end_idx]:
            text = text.strip()
            if '=' in text and not text.startswith('='):
                key, val = text.split('=', 1)
                header_params[key.strip()] = val.strip()

    source = os.path.basename(filepath)
    results = []

    def _keep_if_within_thresholds(acc, ratio, var_name, var_value):
        # Single place for the threshold filter shared by both line layouts.
        if acc > accuracy_threshold and ratio < compression_threshold:
            results.append({
                'accuracy': acc,
                'zstd_ratio': ratio,
                'source': source,
                'var_name': var_name,
                'var_value': var_value,
                'header_params': header_params
            })

    current_var_name = None
    current_var_value = None

    for i, line in enumerate(lines):
        next_line = lines[i + 1] if i + 1 < len(lines) else None

        # Separator: the following line may name the variable under test.
        if separator in line and next_line is not None:
            match_header = header_var_re.match(next_line.strip())
            if match_header:
                current_var_name = match_header.group(1)
                current_var_value = float(match_header.group(2))

        # === Case 1: A_Q and zstd_ratio on the same line ===
        match_full_line = full_line_re.search(line)
        if match_full_line:
            _keep_if_within_thresholds(
                float(match_full_line.group(1)),
                float(match_full_line.group(2)),
                current_var_name,
                current_var_value,
            )
            continue

        # === Case 2: "to <acc>" on this line, zstd_ratio on the next one ===
        match_to = to_re.search(line)
        if match_to and next_line is not None:
            match_ratio = ratio_re.search(next_line)
            if match_ratio:
                _keep_if_within_thresholds(
                    float(match_to.group(1)),
                    float(match_ratio.group(1)),
                    current_var_name,
                    current_var_value,
                )
    return results

# === List and order the result files ===
# Keep regular files only: a directory sharing the prefix would make open() fail.
file_list = [
    f for f in os.listdir('.')
    if f.startswith('output-Test-2025-') and os.path.isfile(f)
]
# Oldest first, so the most recently modified logs end up last.
file_list.sort(key=os.path.getmtime)
# Slicing already copes with lists shorter than two entries.
last_two_files = file_list[-2:]

# === Extract data from every file ===
all_data = []
for filename in file_list:
    all_data.extend(extract_metrics_from_file(filename))

# pd.DataFrame([]) has no columns, so the df['source'] lookup below would fail
# with an opaque KeyError. Stop here with a message that says what to change.
if not all_data:
    raise ValueError(
        f"No data point passed the thresholds in {len(file_list)} "
        f"'output-Test-2025-*' file(s) "
        f"(ACCURACY_THRESHOLD={ACCURACY_THRESHOLD}, "
        f"COMPRESSION_THRESHOLD={COMPRESSION_THRESHOLD})."
    )

df = pd.DataFrame(all_data)

# Flag the points coming from the two most recent files.
# 'source' holds base names and os.listdir('.') yields base names too, so the
# two can be compared directly.
df['is_recent_file'] = df['source'].isin(last_two_files)

# === Funzione per convertire dict header_params in stringa leggibile per tooltip ===
def header_params_to_str(params):
    """Render a params mapping as ``key=value`` entries joined by ``<br>``.

    Args:
        params: Mapping whose items are formatted in iteration order.

    Returns:
        One string with an HTML line break between consecutive entries; an
        empty string when the mapping is empty.
    """
    entries = []
    for name, value in params.items():
        entries.append(f"{name}={value}")
    return '<br>'.join(entries)

# Build the complete hover-tooltip text for every point
def _point_tooltip(row):
    """Return the HTML tooltip for one row: file, variable, metrics, header params."""
    params_html = header_params_to_str(row['header_params'])
    return (
        f"File: {row['source']}<br>"
        f"{row['var_name']} = {row['var_value']}<br>"
        f"Accuracy: {row['accuracy']}<br>"
        f"Compression: {row['zstd_ratio']}%<br>"
        f"<b>Params:</b><br>{params_html}"
    )

df['label'] = df.apply(_point_tooltip, axis=1)

# === Calcola frontiera efficiente ===
def efficient_frontier(df):
    """Return the rows of ``df`` lying on the accuracy / zstd_ratio frontier.

    Rows are visited by increasing ``zstd_ratio`` (ties: highest ``accuracy``
    first). A row is kept only when its accuracy is strictly greater than every
    accuracy seen so far, i.e. when no row with a lower-or-equal ratio reaches
    an equal-or-better accuracy.

    Args:
        df: DataFrame with numeric ``accuracy`` and ``zstd_ratio`` columns.

    Returns:
        A new DataFrame holding the kept rows in visiting order, with the
        original index labels, columns and dtypes. Empty input gives an empty
        copy of ``df`` with the same columns.
    """
    # A frame without rows may also lack the columns (pd.DataFrame([])), in
    # which case sort_values would raise KeyError.
    if df.empty:
        return df.copy()

    df_sorted = df.sort_values(by=['zstd_ratio', 'accuracy'], ascending=[True, False])

    # -inf rather than -1 so that no finite accuracy is ever discarded up front.
    best_acc = float('-inf')
    kept_positions = []
    for pos, acc in enumerate(df_sorted['accuracy'].tolist()):
        if acc > best_acc:
            kept_positions.append(pos)
            best_acc = acc

    # Positional selection keeps dtypes and stays correct with duplicate index
    # labels; copy() so that callers can add columns without pandas warnings.
    return df_sorted.iloc[kept_positions].copy()

frontier_df = efficient_frontier(df)

# No second label pass for the frontier: its rows are taken from df, which
# already carries the 'label' column built from exactly the same fields, so the
# tooltip text is inherited unchanged.

# Flag the frontier rows in the main frame
df['is_frontier'] = df.index.isin(frontier_df.index)

# === Split the points into the groups drawn with different colours ===
is_recent = df['is_recent_file']
recent_points = df[is_recent]
# Blue: neither from a recent file nor on the frontier
blue_points = df[~(is_recent | df['is_frontier'])]
red_points = frontier_df

# === Plot with plotly ===
def _scatter_trace(points, **style):
    """Scatter of accuracy (x) against zstd_ratio (y), hover text from 'label'."""
    return go.Scatter(
        x=points['accuracy'],
        y=points['zstd_ratio'],
        text=points['label'],
        hoverinfo='text',
        **style,
    )

# Points that are neither recent nor on the frontier
trace_blue = _scatter_trace(
    blue_points,
    mode='markers',
    name='Unefficient points',
    marker={'size': 8, 'color': 'blue'},
)

# Frontier points, joined by a dashed line
trace_red = _scatter_trace(
    red_points,
    mode='markers+lines',
    name='Pareto Frontier',
    line={'color': 'red', 'width': 2, 'dash': 'dash'},
    marker={'size': 10, 'symbol': 'diamond'},
)

# Points coming from the two most recent files
trace_yellow = _scatter_trace(
    recent_points,
    mode='markers',
    name='Last two tests',
    marker={'size': 9, 'color': 'yellow', 'line': {'color': 'black', 'width': 1}},
)

fig = go.Figure(data=[trace_blue, trace_red, trace_yellow])

# The title records the thresholds that were in effect for this figure
plot_title = f'Pareto Frontier [ACCURACY_THRESHOLD={ACCURACY_THRESHOLD}, COMPRESSION_THRESHOLD={COMPRESSION_THRESHOLD}]'
fig.update_layout(
    title=plot_title,
    xaxis_title='Accuracy (%)',
    yaxis_title='Compression Ratio (%)',
    template='plotly_white',
)

fig.show()


In [60]:
# Write the current `fig` to ParetoFrontierZoomed.html.
# NOTE(review): 'embed' does not look like one of the documented
# include_plotlyjs values (True, 'cdn', 'directory', 'require', False, or a
# path ending in '.js'); presumably it is just treated as truthy. Confirm
# against the plotly documentation.
fig.write_html("ParetoFrontierZoomed.html", include_plotlyjs='embed', full_html=True)

In [58]:
# Write the current `fig` to ParetoFrontier.html.
# This cell's execution count (58) is lower than that of the plotting cell (59),
# so the saved file comes from an earlier state of `fig`.
# NOTE(review): presumably that state used ACCURACY_THRESHOLD = 97.5; confirm.
# On a top-to-bottom run this writes the same figure as the
# ParetoFrontierZoomed.html export, because nothing rebuilds `fig` in between.
fig.write_html("ParetoFrontier.html", include_plotlyjs='embed', full_html=True)