ALL METRIC DISTRIBUTION 50 VS 100

In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Datasets: CSV path for every (dimension, model) pair, loaded eagerly below.
_csv_paths = {
    'Grammar': {'GPT': "gr_gpt_ALL.csv", 'Gemini': "gr_gemini_ALL.csv"},
    'Syntax': {'GPT': "synt_gpt_ALL.csv", 'Gemini': "synt_gemini_ALL.csv"},
    'Readability': {'GPT': "read_gpt_ALL.csv", 'Gemini': "read_gemini_ALL.csv"},
    'InfoDens': {'GPT': "infdens_gpt_ALL.csv", 'Gemini': "infdens_gemini_ALL.csv"},
}
datasets = {
    dimension: {model: pd.read_csv(path) for model, path in per_model.items()}
    for dimension, per_model in _csv_paths.items()
}

# Metric columns whose distributions are plotted, in plotting order.
metrics = [
    "acc_median_cosine",
    "acc_median_fuzzy",
    "acc_median_final",
    "comp_semantic_score",
    "comp_entity_score",
    "comp_final_median",
    "key_concept_coverage",
    "Conciseness",
]

# One figure per (dimension, model, metric); one panel per corruption level.
for dimension, models in datasets.items():
    for model_name, df in models.items():

        # Column that separates the corruption levels. 'sentence_percentage' is
        # accepted as well because the other cells of this notebook treat it as
        # an alias of 'percentage'.
        split_col = next(
            (c for c in ('setting', 'percentage', 'sentence_percentage')
             if c in df.columns),
            None,
        )

        split_values = list(df[split_col].dropna().unique()) if split_col else []
        if not split_values:
            # No split column, or it only holds NaN: plt.subplots() cannot
            # create zero panels, so plot all rows in a single panel instead.
            split_col = None
            split_values = [None]

        for m in metrics:
            # squeeze=False always returns a 2-D array of axes, so the
            # single-panel case needs no special handling.
            fig, axes = plt.subplots(
                1, len(split_values),
                figsize=(7 * len(split_values), 5),
                sharey=True,
                squeeze=False,
            )

            for ax, val in zip(axes[0], split_values):
                title = f"{dimension} - {model_name} - {m}"
                if val is not None:
                    data_to_plot = df[df[split_col] == val]
                    title += f" ({split_col}={val})"
                else:
                    data_to_plot = df

                sns.histplot(
                    data=data_to_plot,
                    x=m,
                    hue="manipulation",
                    kde=True,
                    element="step",
                    ax=ax
                )
                ax.set_title(title)

            plt.tight_layout()

            plt.show()
            # Close this specific figure so figures do not pile up in memory.
            plt.close(fig)


DISTRIBUTIONS all manipulations 50 VS 100 + THRESHOLDS TABLES

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Datasets: CSV path for every (dimension, model) pair, loaded eagerly below.
_csv_paths = {
    'Grammar': {'GPT': "gr_gpt_ALL.csv", 'Gemini': "gr_gemini_ALL.csv"},
    'Syntax': {'GPT': "synt_gpt_ALL.csv", 'Gemini': "synt_gemini_ALL.csv"},
    'Readability': {'GPT': "read_gpt_ALL.csv", 'Gemini': "read_gemini_ALL.csv"},
    'InfoDens': {'GPT': "infdens_gpt_ALL.csv", 'Gemini': "infdens_gemini_ALL.csv"},
}
datasets = {
    dimension: {model: pd.read_csv(path) for model, path in per_model.items()}
    for dimension, per_model in _csv_paths.items()
}

# One row per plotted metric: (CSV column, display name, [first, second] threshold).
_metric_spec = (
    ('acc_median_final', 'Accuracy', (0.4, 0.6)),
    ('comp_final_median', 'Completeness', (0.4, 0.6)),
    ('key_concept_coverage', 'KCC', (0.4, 0.6)),
    ('Conciseness', 'Conciseness', (0.5, 1.5)),
)
metrics = [column for column, _, _ in _metric_spec]
metric_names = [label for _, label, _ in _metric_spec]
metric_thresholds = {label: list(bounds) for _, label, bounds in _metric_spec}

def collect_all_manipulations(datasets, model_name):
    """Return the sorted unique manipulation labels of one model.

    ``datasets`` maps each dimension to a ``{model name: DataFrame}`` dict; the
    ``manipulation`` column of ``model_name``'s frame is read in every dimension.
    """
    return sorted({
        label
        for per_model in datasets.values()
        for label in per_model[model_name]['manipulation'].unique()
    })

def _rows_for_corruption_level(df, dimension, col_idx, pct):
    """Return the rows of ``df`` that belong to one corruption level.

    For the 'InfoDens' dimension the level is encoded in the ``setting``
    column: the first sorted setting feeds the left column (``col_idx == 0``)
    and the last sorted setting feeds the right column. Every other dimension
    is filtered on ``percentage == pct``.

    Returns None when an InfoDens frame has no non-null setting, so the caller
    can skip it instead of reusing rows left over from another dimension.
    The input frame is never modified.
    """
    if dimension == 'InfoDens':
        settings = sorted(df['setting'].dropna().unique())
        if not settings:
            return None
        target = settings[0] if col_idx == 0 else settings[-1]
        return df[df['setting'] == target]

    # Rename on a copy: the caller's DataFrame must stay untouched.
    if "sentence_percentage" in df.columns and "percentage" not in df.columns:
        df = df.rename(columns={"sentence_percentage": "percentage"})
    return df[df['percentage'] == pct]


def _draw_threshold_table(ax, manip_stats, thresholds):
    """Draw the "% of values below each threshold" table to the right of ``ax``.

    ``manip_stats`` maps a manipulation to ``[pct below thresholds[0],
    pct below thresholds[1]]``. Nothing is drawn when it is empty, because
    ``ax.table`` cannot build a table with zero rows.
    """
    if not manip_stats:
        return

    table_data = [
        [manip, f"{manip_stats[manip][0]:.1f}%", f"{manip_stats[manip][1]:.1f}%"]
        for manip in sorted(manip_stats)
    ]

    table = ax.table(
        cellText=table_data,
        cellLoc='center',
        colLoc='center',
        bbox=[1.05, 0, 0.6, 1],
        edges='closed'
    )
    table.auto_set_font_size(False)
    table.set_fontsize(9)
    table.scale(1, 1.8)

    # Header labels are drawn as text boxes just above the table; each centre
    # is a fraction of the table width (0.6) measured from its left edge (1.05).
    header_labels = ["Manipulation", f"< {thresholds[0]:.1f}", f"< {thresholds[1]:.1f}"]
    header_centres = [0.225, 0.625, 0.875]
    for centre, label in zip(header_centres, header_labels):
        ax.text(1.05 + 0.6*centre, 1.02, label,
                transform=ax.transAxes, fontsize=10, fontweight='bold',
                ha='center', va='bottom',
                bbox=dict(boxstyle='round,pad=0.5', facecolor='#2c3e50',
                          edgecolor='white', linewidth=2),
                color='white')

    column_widths = (0.45, 0.25, 0.25)
    # column index -> (strong cutoff, strong colour, mild cutoff, mild colour)
    highlight = {
        1: (20, 'darkred', 10, 'red'),
        2: (40, 'darkorange', 25, 'orange'),
    }
    for i, row in enumerate(table_data):
        for j, width in enumerate(column_widths):
            cell = table[(i, j)]
            cell.set_width(width)
            cell.set_facecolor('#ecf0f1' if i % 2 == 0 else 'white')
            cell.set_edgecolor('#bdc3c7')
            cell.set_linewidth(1)

            if j == 0:
                cell.set_text_props(ha='left')
                continue

            # Compare the displayed (rounded) value so the colouring always
            # agrees with the number shown in the cell.
            shown = float(row[j].strip('%'))
            strong_cut, strong_colour, mild_cut, mild_colour = highlight[j]
            if shown > strong_cut:
                cell.set_text_props(weight='bold', color=strong_colour)
            elif shown > mild_cut:
                cell.set_text_props(color=mild_colour)


def create_manipulation_histogram_grid(datasets, model_name):
    """Build the per-manipulation histogram grid for one model.

    The figure has one row per entry of the module-level ``metrics`` /
    ``metric_names`` lists and two columns (50% and 100% corruption). Each
    panel overlays one histogram per manipulation, pooled over all dimensions
    in ``datasets``, marks the two thresholds from ``metric_thresholds`` and
    shows a table with the share of values below each threshold.

    Parameters
    ----------
    datasets : dict
        ``{dimension: {model name: DataFrame}}``. The frames are not modified.
    model_name : str
        Key of the model to plot inside every dimension.

    Returns
    -------
    matplotlib.figure.Figure
        The assembled figure; showing and closing it is left to the caller.
    """
    all_manipulations = collect_all_manipulations(datasets, model_name)

    # tab10 has ten colours; they are reused cyclically beyond that.
    colors = plt.cm.tab10(np.linspace(0, 1, 10))
    color_map = {manip: colors[i % 10] for i, manip in enumerate(all_manipulations)}

    n_rows = len(metrics)
    # 5.5 inches per row reproduces the original 24x22 canvas for four metrics;
    # squeeze=False keeps ``axes`` two-dimensional even for a single metric.
    fig, axes = plt.subplots(n_rows, 2, figsize=(24, 5.5 * n_rows), squeeze=False)
    fig.suptitle(f'{model_name} - Distribution by Manipulation (50% vs 100%)',
                 fontsize=18, fontweight='bold', y=0.995)

    legend_elements = [
        plt.Line2D([0], [0], marker='s', markersize=10, color='w',
                   markerfacecolor=color_map[m], linestyle='', label=m)
        for m in all_manipulations
    ]
    fig.legend(handles=legend_elements, loc="upper center",
               ncol=5, fontsize=11, framealpha=0.95,
               bbox_to_anchor=(0.5, 0.98),
               columnspacing=1.5, handletextpad=0.5)

    for row_idx, (metric_col, metric_name) in enumerate(zip(metrics, metric_names)):

        thresholds = metric_thresholds[metric_name]
        # Conciseness is plotted on [0, 5]; every other metric on [0, 1].
        if metric_name == 'Conciseness':
            bins = np.linspace(0, 5, 26)
            xlim = [0, 5]
        else:
            bins = np.linspace(0, 1, 21)
            xlim = [0, 1]

        for col_idx, pct in enumerate([50, 100]):

            ax = axes[row_idx, col_idx]

            # ============ Data Extractions ============
            manip_data = {}
            for dimension, dfs in datasets.items():
                df_filtered = _rows_for_corruption_level(
                    dfs[model_name], dimension, col_idx, pct)
                if df_filtered is None:
                    continue

                for manip in df_filtered['manipulation'].unique():
                    values = df_filtered.loc[
                        df_filtered['manipulation'] == manip, metric_col].dropna()
                    manip_data.setdefault(manip, []).extend(values.tolist())

            # ============ Thresholds Tables + Plot ============
            manip_stats = {}
            for manip, vals in manip_data.items():
                if len(vals) == 0:
                    continue
                arr = np.array(vals)
                manip_stats[manip] = [
                    (arr < thresholds[0]).sum() / len(arr) * 100,
                    (arr < thresholds[1]).sum() / len(arr) * 100,
                ]
                ax.hist(vals, bins=bins, alpha=0.5,
                        color=color_map[manip],
                        edgecolor='white', linewidth=0.3)

            ax.set_xlim(xlim)
            ax.grid(axis='y', alpha=0.3)

            ax.axvline(thresholds[0], color='red', linestyle='--', alpha=0.5, linewidth=2.5)
            ax.axvline(thresholds[1], color='orange', linestyle='--', alpha=0.5, linewidth=2.5)

            # Only the bottom row carries the x label.
            if row_idx == n_rows - 1:
                ax.set_xlabel('Value', fontsize=12, fontweight='bold')
            ax.set_ylabel('Count', fontsize=12, fontweight='bold')

            if row_idx == 0:
                ax.set_title('50% Corruption' if col_idx == 0 else '100% Corruption',
                             fontsize=14, fontweight='bold')

            # The metric name is written once per row, left of the first panel.
            if col_idx == 0:
                ax.text(-0.22, 0.5, metric_name, transform=ax.transAxes,
                        fontsize=15, fontweight='bold', rotation=90,
                        verticalalignment='center')

            _draw_threshold_table(ax, manip_stats, thresholds)

    plt.tight_layout(rect=[0, 0, 1, 0.96])
    return fig


# Generate the histogram grid for both models.
for model_name in ['GPT', 'Gemini']:
    print(f"{model_name}...")

    fig = create_manipulation_histogram_grid(datasets, model_name)
    # plt.show() displays all open figures and takes no figure argument; a
    # positional figure would be interpreted as a backend option (block/close)
    # or rejected outright, depending on the backend.
    plt.show()
    plt.close(fig)

MANIPULATED VS BASELINE

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Datasets: CSV path for every (dimension, model) pair, loaded eagerly below.
_csv_paths = {
    'Grammar': {'GPT': "gr_gpt_ALL.csv", 'Gemini': "gr_gemini_ALL.csv"},
    'Syntax': {'GPT': "synt_gpt_ALL.csv", 'Gemini': "synt_gemini_ALL.csv"},
    'Readability': {'GPT': "read_gpt_ALL.csv", 'Gemini': "read_gemini_ALL.csv"},
    'InfoDens': {'GPT': "infdens_gpt_ALL.csv", 'Gemini': "infdens_gemini_ALL.csv"},
}
datasets = {
    dimension: {model: pd.read_csv(path) for model, path in per_model.items()}
    for dimension, per_model in _csv_paths.items()
}

# Metrics compared against the baseline, in reporting order.
metrics = [
    "acc_median_cosine",
    "acc_median_fuzzy",
    "acc_median_final",
    "comp_semantic_score",
    "comp_entity_score",
    "comp_final_median",
    "key_concept_coverage",
    "Conciseness",
]

# Baseline answers per model; the id column is renamed from "Text_id" to "test_id".
baseline_files = {
    model: pd.read_csv(path).rename(columns={"Text_id": "test_id"})
    for model, path in (
        ('GPT', "clapnqans_openai_answers_NEW.csv"),
        ('Gemini', "clapnqans_gemini_answers_FIXED_NEW.csv"),
    )
}

# For every (dimension, model, metric): relative change of the per-manipulation
# median with respect to the baseline median, as a printed table and bar chart.
for dimension, models in datasets.items():
    for model_name, df in models.items():
        print(f"\n=== {dimension} - {model_name} ===")

        # 'sentence_percentage' is treated as an alias of 'percentage', as in
        # the other cells of this notebook. rename() returns a new frame, so
        # the loaded dataset keeps its original column names.
        if "sentence_percentage" in df.columns and "percentage" not in df.columns:
            df = df.rename(columns={"sentence_percentage": "percentage"})

        if 'setting' in df.columns:
            split_col = 'setting'
            split_values = list(df[split_col].dropna().unique())
        elif 'percentage' in df.columns:
            split_col = 'percentage'
            split_values = [50, 100]
        else:
            split_col = None
            split_values = []

        if not split_values:
            # Nothing to split on: compare all rows with the baseline at once.
            split_col = None
            split_values = [None]

        baseline_df = baseline_files[model_name]
        manipulations = df['manipulation'].unique()

        for m in metrics:
            df[m] = pd.to_numeric(df[m], errors='coerce')
            baseline_median = pd.to_numeric(baseline_df[m], errors='coerce').median()

            # A relative change is undefined for a missing or zero baseline;
            # skipping avoids a table and chart full of inf/NaN.
            if pd.isna(baseline_median) or baseline_median == 0:
                print(f"\nMetric: {m} - {dimension} - {model_name}: "
                      f"baseline median is {baseline_median}, Δ% is undefined - skipped")
                continue

            delta_table = pd.DataFrame(index=manipulations)
            delta_cols = []

            for val in split_values:
                if val is not None:
                    df_sub = df[df[split_col] == val]
                    col_name = f"Δ%_{val}"
                else:
                    df_sub = df
                    col_name = "Δ%"

                median_vals = df_sub.groupby('manipulation')[m].median()
                # Assignment aligns on the manipulation index; manipulations
                # absent from this split stay NaN.
                delta_table[col_name] = ((median_vals - baseline_median) / baseline_median) * 100
                delta_cols.append(col_name)

            delta_table["baseline"] = baseline_median

            # --- Table ---
            print(f"\nMetric: {m} - {dimension} - {model_name}")
            print(delta_table)

            # --- Plot ---
            plt.figure(figsize=(10, 5))
            n_groups = len(delta_cols)
            # 0.35 matches the original one- and two-bar layouts; bars get
            # narrower only when more than two split values share a tick.
            width = min(0.35, 0.8 / n_groups)
            x = np.arange(len(delta_table))

            # Every split value gets its own bar, centred as a group on the tick.
            for k, (val, col_name) in enumerate(zip(split_values, delta_cols)):
                offset = (k - (n_groups - 1) / 2) * width
                plt.bar(x + offset, delta_table[col_name], width,
                        label="all" if val is None else str(val))

            plt.axhline(0, color="black", linestyle="--", linewidth=0.7)
            plt.xticks(x, delta_table.index, rotation=45, ha='right')
            plt.ylabel("Δ% vs baseline")
            plt.title(f"{dimension} - {model_name} - {m}")
            plt.legend(title=split_col)
            plt.tight_layout()

            plt.show()
            plt.close()


GPT VS GEMINI

GPT vs Gemini: Semantic Metrics

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


# === GPT result files, keyed by dimension tag ===
file_openai = dict(
    GR="gr_gpt_ALL.csv",
    SYNT="synt_gpt_ALL.csv",
    READ="read_gpt_ALL.csv",
    INFDENS="infdens_gpt_ALL.csv",
)

# === Gemini result files, keyed by dimension tag ===
file_gemini = dict(
    GR="gr_gemini_ALL.csv",
    SYNT="synt_gemini_ALL.csv",
    READ="read_gemini_ALL.csv",
    INFDENS="infdens_gemini_ALL.csv",
)

# === Baseline answer files ===
baseline_openai = "clapnqans_openai_answers_NEW.csv"
baseline_gemini = "clapnqans_gemini_answers_FIXED_NEW.csv"

# === Metrics: (CSV column, display name) in plotting order ===
_metric_labels = (
    ("acc_median_final", "Accuracy"),
    ("comp_final_median", "Completeness"),
    ("key_concept_coverage", "KCC"),
    ("Conciseness", "Conciseness"),
)
metrics = [column for column, _ in _metric_labels]
metric_names = dict(_metric_labels)

def prepare_df(df):
    """Normalise the corruption-level column of ``df`` to ``percentage``.

    * A ``sentence_percentage`` column is renamed to ``percentage`` (unless a
      ``percentage`` column already exists, which would create a duplicate).
    * When a ``setting`` column exists, ``"2s_3e"`` maps to 50 and ``"4s_3e"``
      maps to 100. Rows whose setting is not in that mapping keep an existing
      ``percentage`` value instead of being overwritten with NaN; without an
      existing column they are NaN.

    The frame is modified in place and also returned.
    """
    if "sentence_percentage" in df.columns and "percentage" not in df.columns:
        df.rename(columns={"sentence_percentage": "percentage"}, inplace=True)
    if "setting" in df.columns:
        mapping = {"2s_3e": 50, "4s_3e": 100}
        mapped = df["setting"].map(mapping)
        if "percentage" in df.columns:
            # Do not clobber a known percentage with NaN for unmapped settings.
            mapped = mapped.fillna(df["percentage"])
        df["percentage"] = mapped
    return df

def load_all(files_dict):
    """Read every CSV of ``files_dict`` and stack them into one DataFrame.

    Each file is passed through ``prepare_df`` and tagged with its dictionary
    key in a ``source`` column before concatenation with a fresh index.
    """
    def _load_tagged(label, csv_path):
        frame = prepare_df(pd.read_csv(csv_path, encoding="utf-8"))
        frame["source"] = label
        return frame

    return pd.concat(
        [_load_tagged(label, csv_path) for label, csv_path in files_dict.items()],
        ignore_index=True,
    )

# Manipulated answers, all dimensions stacked per model.
df_openai = load_all(file_openai)
df_gemini = load_all(file_gemini)

# Unmanipulated (baseline) answers per model.
df_baseline_openai = pd.read_csv(baseline_openai, encoding="utf-8")
df_baseline_gemini = pd.read_csv(baseline_gemini, encoding="utf-8")

# Coerce every metric column to numeric; unparsable entries become NaN.
for frame in (df_openai, df_gemini, df_baseline_openai, df_baseline_gemini):
    for m in metrics:
        frame[m] = pd.to_numeric(frame[m], errors="coerce")

# === COMPUTE MEDIAN BASELINE ===
baseline_medians_openai = dict((m, df_baseline_openai[m].median()) for m in metrics)
baseline_medians_gemini = dict((m, df_baseline_gemini[m].median()) for m in metrics)

diff_tables = {}

# Corruption levels compared side by side.
_levels = [50, 100]


def _median_deltas(df, metric, baseline_median, label):
    """Return per-manipulation ``median(metric) - baseline_median`` per level.

    The result is indexed by manipulation and has one ``Δ<label>_<level>``
    column for every entry of ``_levels``; a level that is missing from the
    data yields an all-NaN column.
    """
    medians = df.groupby(["manipulation", "percentage"])[metric].median().unstack()
    deltas = medians.reindex(columns=_levels).sub(baseline_median)
    deltas.columns = [f"Δ{label}_{level}" for level in _levels]
    return deltas


for m in metrics:
    gpt_deltas = _median_deltas(df_openai, m, baseline_medians_openai[m], "GPT")
    gemini_deltas = _median_deltas(df_gemini, m, baseline_medians_gemini[m], "GEMINI")

    # === MERGE GPT + GEMINI ===
    # Use the union of both indexes so a manipulation present for only one
    # model still appears (with NaN for the other) instead of being dropped.
    merged = pd.DataFrame(index=gpt_deltas.index.union(gemini_deltas.index))
    for level in _levels:
        merged[f"ΔGPT_{level}"] = gpt_deltas[f"ΔGPT_{level}"]
        merged[f"ΔGEMINI_{level}"] = gemini_deltas[f"ΔGEMINI_{level}"]
        merged[f"Δdiff_{level}"] = merged[f"ΔGPT_{level}"] - merged[f"ΔGEMINI_{level}"]

    merged["baseline_gpt"] = baseline_medians_openai[m]
    merged["baseline_gemini"] = baseline_medians_gemini[m]

    diff_tables[m] = merged

    # === TABLE ===
    print(f"\n{'='*100}")
    print(f"=== {metric_names[m]} - GPT vs GEMINI ===")
    print(f"{'='*100}")
    print(f"Baseline GPT: {baseline_medians_openai[m]:.4f} | Baseline Gemini: {baseline_medians_gemini[m]:.4f}")
    # Δ = manipulated median - baseline, so a drop in score is a NEGATIVE Δ.
    # A positive Δdiff therefore means GPT dropped less than Gemini.
    # NOTE(review): this reads a lower score as degradation - confirm that
    # this is the intended reading for Conciseness as well.
    print("\nΔdiff = ΔGPT - ΔGEMINI (positive → Gemini degrades more, negative → GPT degrades more)")
    print(merged[["ΔGPT_50", "ΔGEMINI_50", "Δdiff_50", "ΔGPT_100", "ΔGEMINI_100", "Δdiff_100"]].to_string())

# === PLOT ===
fig, axes = plt.subplots(2, 2, figsize=(18, 14))
fig.suptitle('GPT vs Gemini: Difference in Degradation from Baseline\n' +
                'Δdiff = ΔGPT - ΔGEMINI (Positive = Gemini degrades more | Negative = GPT degrades more)',
            fontsize=18, fontweight='bold', y=0.995)

# axes.flat walks the 2x2 grid row by row, one panel per metric.
for ax, m in zip(axes.flat, metrics):
    merged = diff_tables[m]
    metric_label = metric_names[m]

    manipulations = merged.index.tolist()
    x = np.arange(len(manipulations))
    width = 0.35

    bars_50 = ax.bar(x - width/2, merged["Δdiff_50"].values, width, label='50% Corruption',
                     color='steelblue', edgecolor='black', linewidth=0.7, alpha=0.85)
    bars_100 = ax.bar(x + width/2, merged["Δdiff_100"].values, width, label='100% Corruption',
                      color='coral', edgecolor='black', linewidth=0.7, alpha=0.85)

    # Baseline zero
    ax.axhline(0, color='black', linestyle='--', linewidth=2, alpha=0.7)

    ax.set_xlabel('Manipulation', fontsize=13, fontweight='bold')
    ax.set_ylabel('ΔGPT - ΔGEMINI', fontsize=13, fontweight='bold')
    # Read the baselines from the median dicts: this also works when the table
    # has no rows, where merged[...].iloc[0] would raise.
    ax.set_title(f'{metric_label}\nBaseline → GPT: {baseline_medians_openai[m]:.3f} | ' +
                f'Gemini: {baseline_medians_gemini[m]:.3f}',
                fontsize=14, fontweight='bold', pad=10)
    ax.set_xticks(x)
    ax.set_xticklabels(manipulations, rotation=45, ha='right', fontsize=10)
    ax.legend(fontsize=11, loc='best', framealpha=0.9)
    ax.grid(axis='y', alpha=0.3, linestyle=':', linewidth=0.8)
    ax.tick_params(axis='y', labelsize=11)
    ax.tick_params(axis='x', labelsize=10)

    # Value labels on the bars, outlined in the colour of their series.
    for bars, edge_color in ((bars_50, 'steelblue'), (bars_100, 'coral')):
        for bar in bars:
            height = bar.get_height()
            # abs(NaN) > 0 is False, so missing values get no label.
            if abs(height) > 0.00:
                ax.text(bar.get_x() + bar.get_width()/2., height,
                       f'{height:.2f}', ha='center',
                       va='bottom' if height > 0 else 'top',
                       fontsize=8, fontweight='bold',
                       bbox=dict(boxstyle='round,pad=0.3', facecolor='white',
                                edgecolor=edge_color, alpha=0.7, linewidth=0.5))

plt.tight_layout(rect=[0, 0, 1, 0.98])

plt.show()
plt.close()


GPT vs Gemini: NumAdditions & NumReplicateErrors

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt


# === GPT result files, keyed by dimension tag ===
file_openai = dict(
    GR="gr_gpt_ALL.csv",
    SYNT="synt_gpt_ALL.csv",
    READ="read_gpt_ALL.csv",
    INFDENS="infdens_gpt_ALL.csv",
)

# === Gemini result files, keyed by dimension tag ===
file_gemini = dict(
    GR="gr_gemini_ALL.csv",
    SYNT="synt_gemini_ALL.csv",
    READ="read_gemini_ALL.csv",
    INFDENS="infdens_gemini_ALL.csv",
)

# === Baseline answer files ===
baseline_openai = "clapnqans_openai_answers_NEW.csv"
baseline_gemini = "clapnqans_gemini_answers_FIXED_NEW.csv"

# === Count columns analysed as "share of answers with a non-zero count" ===
metrics = [
    "Num_addition",
    "Num_direct_modifications",
]

def prepare_df(df):
    """Normalise the corruption-level column of ``df`` to ``percentage``.

    * A ``sentence_percentage`` column is renamed to ``percentage`` (unless a
      ``percentage`` column already exists, which would create a duplicate).
    * When a ``setting`` column exists, ``"2s_3e"`` maps to 50 and ``"4s_3e"``
      maps to 100. Rows whose setting is not in that mapping keep an existing
      ``percentage`` value instead of being overwritten with NaN; without an
      existing column they are NaN.

    The frame is modified in place and also returned.
    """
    if "sentence_percentage" in df.columns and "percentage" not in df.columns:
        df.rename(columns={"sentence_percentage": "percentage"}, inplace=True)
    if "setting" in df.columns:
        mapping = {"2s_3e": 50, "4s_3e": 100}
        mapped = df["setting"].map(mapping)
        if "percentage" in df.columns:
            # Do not clobber a known percentage with NaN for unmapped settings.
            mapped = mapped.fillna(df["percentage"])
        df["percentage"] = mapped
    return df

def load_all(files_dict):
    """Read every CSV of ``files_dict`` and stack them into one DataFrame.

    Each file is passed through ``prepare_df`` and tagged with its dictionary
    key in a ``source`` column before concatenation with a fresh index.
    """
    def _load_tagged(label, csv_path):
        frame = prepare_df(pd.read_csv(csv_path, encoding="utf-8"))
        frame["source"] = label
        return frame

    return pd.concat(
        [_load_tagged(label, csv_path) for label, csv_path in files_dict.items()],
        ignore_index=True,
    )

df_gpt = load_all(file_openai)
df_gemini = load_all(file_gemini)
df_base_gpt = prepare_df(pd.read_csv(baseline_openai, encoding="utf-8"))
df_base_gemini = prepare_df(pd.read_csv(baseline_gemini, encoding="utf-8"))

# Coerce the count columns to numeric; unparsable entries become NaN.
for df in [df_gpt, df_gemini, df_base_gpt, df_base_gemini]:
    for m in metrics:
        df[m] = pd.to_numeric(df[m], errors="coerce")

# Corruption levels compared side by side.
_levels = [50, 100]


def _pct_nonzero(values):
    """Return the percentage of non-missing entries of ``values`` that are != 0.

    Missing values are excluded from numerator and denominator: ``NaN != 0``
    evaluates to True, so counting them would inflate the non-zero share.
    Returns NaN when there is no non-missing entry.
    """
    observed = values.dropna()
    if observed.empty:
        return float("nan")
    return (observed.ne(0).sum() / len(observed)) * 100


def _nonzero_table(df, metric):
    """Return the non-zero percentage per manipulation (rows) and level (columns).

    The columns are always exactly ``_levels``; a level that is absent from
    the data becomes an all-NaN column instead of raising a KeyError later.
    """
    table = df.groupby(["manipulation", "percentage"])[metric].apply(_pct_nonzero).unstack()
    return table.reindex(columns=_levels)


# === COMPUTE % NOT ZERO BASELINE ===
baseline_nonzero_gpt = {m: _pct_nonzero(df_base_gpt[m]) for m in metrics}
baseline_nonzero_gemini = {m: _pct_nonzero(df_base_gemini[m]) for m in metrics}

diff_tables = {}

for m in metrics:
    gpt_table = _nonzero_table(df_gpt, m)
    gemini_table = _nonzero_table(df_gemini, m)

    # === MERGE GPT + GEMINI ===
    # Use the union of both indexes so a manipulation present for only one
    # model still appears (with NaN for the other) instead of being dropped.
    merged = pd.DataFrame(index=gpt_table.index.union(gemini_table.index))
    for col in _levels:
        merged[f"GPT_{col}"] = gpt_table[col]
        merged[f"GEMINI_{col}"] = gemini_table[col]
        merged[f"ΔGPT_{col}"] = merged[f"GPT_{col}"] - baseline_nonzero_gpt[m]
        merged[f"ΔGEMINI_{col}"] = merged[f"GEMINI_{col}"] - baseline_nonzero_gemini[m]
        merged[f"Δdiff_{col}"] = merged[f"ΔGPT_{col}"] - merged[f"ΔGEMINI_{col}"]

    merged["baseline_gpt"] = baseline_nonzero_gpt[m]
    merged["baseline_gemini"] = baseline_nonzero_gemini[m]

    diff_tables[m] = merged

    # === TABLE ===
    print(f"\n=== GPT vs GEMINI (non-zero %) per {m} ===")
    print(merged[["GPT_50", "GEMINI_50", "ΔGPT_50", "ΔGEMINI_50", "Δdiff_50",
                  "GPT_100", "GEMINI_100", "ΔGPT_100", "ΔGEMINI_100", "Δdiff_100",
                  "baseline_gpt", "baseline_gemini"]])

    # === PLOT ===
    plt.figure(figsize=(8, 5))
    width = 0.35
    x = range(len(merged.index))

    plt.bar([i - width/2 for i in x], merged["Δdiff_50"], width,
            label="Δdiff 50%", color="steelblue")

    plt.bar([i + width/2 for i in x], merged["Δdiff_100"], width,
            label="Δdiff 100%", color="coral")

    plt.axhline(0, color="black", linestyle="--", linewidth=0.7)
    plt.xticks(x, merged.index, rotation=45)
    plt.ylabel("ΔGPT - ΔGEMINI")
    # The second metric column is presented as "Num_replicate_errors" in the title.
    if m == "Num_addition":
        plt.title(f" % {m} != 0")
    else:
        plt.title("% Num_replicate_errors != 0")
    plt.legend()
    plt.tight_layout()

    plt.show()
    plt.close()
