**Imports**
---

In [1]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import Levenshtein
import os

**Test**
---

In [2]:
# Example sentences
# NOTE: this first short pair is overwritten by the longer pair assigned a few
# lines below, so it never reaches the metrics.
s1 = "Sort a list of numbers in ascending order."
s2 = "Arrange numbers from smallest to largest."
# s2 = "Sort a list of numbers in ascending order."

# Pair that is actually scored by every metric in this cell.
s1 = "Use specific strategies to embed watermarks such as incorporating rare or uncommon phrases related to the stock market and employee motivation, like 'Stock Market Motivation Boost' or 'Capital Growth Employee Engagement'. Ensure these watermarks are evenly distributed throughout the output."
s2 = "Use a structural strategy to embed watermarks, modifying sentence structure subtly but detectably. Incorporate watermarks as parenthetical statements or phrases that blend seamlessly with the content. Ensure watermarks are evenly distributed throughout the output."

# --- BLEU ---
# s1 is passed as the single reference and s2 as the hypothesis; both are
# tokenised by whitespace only.
smooth = SmoothingFunction().method1
bleu = sentence_bleu([s1.split()], s2.split(), smoothing_function=smooth)

# --- ROUGE-L ---
# Only the F-measure of the 'rougeL' entry is kept; stemming is enabled.
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_l = rouge.score(s1, s2)['rougeL'].fmeasure

# --- Levenshtein ---
# Edit distance is normalised by the longer string and flipped into a
# similarity; when both strings are empty the pair counts as identical (1.0).
max_len = max(len(s1), len(s2))
lev = 1 - Levenshtein.distance(s1, s2) / max_len if max_len > 0 else 1.0

# --- TF-IDF Cosine ---
# The vectorizer is fitted on just these two sentences, then both are
# transformed and compared row against row.
tfidf = TfidfVectorizer().fit([s1, s2])
vecs = tfidf.transform([s1, s2])
cosine = cosine_similarity(vecs[0], vecs[1])[0][0]

# --- Output ---
print(f"BLEU Score: {bleu:.4f}")
print(f"ROUGE-L F1: {rouge_l:.4f}")
print(f"Levenshtein Similarity: {lev:.4f}")
print(f"TF-IDF Cosine Similarity: {cosine:.4f}")

BLEU Score: 0.1775
ROUGE-L F1: 0.4595
Levenshtein Similarity: 0.4570
TF-IDF Cosine Similarity: 0.3455


**Config**
---

In [3]:
# ---------- Configuration ----------
NUM_ROWS = 300  # change this value to control the number of sampled rows
CSV_FILES = ["1.csv", "2.csv", "3.csv"]  # file names read from each data directory
SEED = 42  # random_state used when sampling rows

# Make sure the folder the figures are written to exists. exist_ok=True keeps
# the cell safe to re-run and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs("ablation/", exist_ok=True)
# -----------------------------------

**Intra** — pairwise similarity among sampled system instructions within each model
---

In [4]:
def load_data(data_dir, filenames):
    """Read several CSV files from one directory into a single DataFrame.

    Args:
        data_dir: Directory that contains the CSV files.
        filenames: File names (relative to ``data_dir``) to read, in order.

    Returns:
        The row-wise concatenation of all files with a fresh 0..N-1 index.
        Files are decoded as ISO-8859-1.
    """
    frames = []
    for name in filenames:
        csv_path = os.path.join(data_dir, name)
        frames.append(pd.read_csv(csv_path, encoding="ISO-8859-1"))
    return pd.concat(frames, ignore_index=True)

def sample_texts(df, column, n, seed=42):
    """Draw ``n`` random rows and return one column's values plus row labels.

    Args:
        df: DataFrame to sample from (without replacement).
        column: Name of the column whose values are returned.
        n: Number of rows to draw.
        seed: ``random_state`` passed to ``DataFrame.sample``.

    Returns:
        A tuple ``(values, labels)`` where ``values`` is the sampled column as
        a list and ``labels`` is ``["Row 0", ..., "Row n-1"]``.
    """
    drawn = df.sample(n=n, random_state=seed)
    drawn = drawn.reset_index(drop=True)
    row_labels = [f"Row {idx}" for idx in range(n)]
    return drawn[column].tolist(), row_labels

def tfidf_cosine(texts):
    """Return the pairwise cosine-similarity matrix of the texts' TF-IDF vectors.

    The vectorizer is fitted on ``texts`` itself, so the weights depend on
    exactly the documents passed in.
    """
    vectorizer = TfidfVectorizer()
    doc_term = vectorizer.fit_transform(texts)
    return cosine_similarity(doc_term)

def bleu_score_matrix(texts):
    """Compute pairwise sentence-level BLEU between all texts.

    Entry ``[i, j]`` is the BLEU score obtained with ``texts[i]`` as the single
    reference and ``texts[j]`` as the hypothesis, using whitespace tokens and
    ``SmoothingFunction().method1``. The matrix is therefore not necessarily
    symmetric. Diagonal entries are set to 1.0 without calling BLEU.

    Args:
        texts: Sequence of strings.

    Returns:
        An ``(n, n)`` float array, where ``n = len(texts)``.
    """
    smooth = SmoothingFunction().method1
    # Tokenise every text once up front instead of re-splitting both strings
    # for each of the n*n pairs inside the loop.
    tokens = [text.split() for text in texts]
    n = len(tokens)
    matrix = np.zeros((n, n))
    for i, reference in enumerate(tokens):
        for j, hypothesis in enumerate(tokens):
            if i == j:
                matrix[i, j] = 1.0
            else:
                matrix[i, j] = sentence_bleu([reference], hypothesis, smoothing_function=smooth)
    return matrix

def rouge_l_matrix(texts):
    """Compute pairwise ROUGE-L F-measure between all texts.

    Entry ``[i, j]`` is ``scorer.score(texts[i], texts[j])['rougeL'].fmeasure``
    with stemming enabled. Scoring is best-effort: a pair whose scoring raises
    an ordinary exception is recorded as 0.0 instead of aborting the run.

    Args:
        texts: Sequence of texts to compare.

    Returns:
        An ``(n, n)`` float array, where ``n = len(texts)``.
    """
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    n = len(texts)
    matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            try:
                matrix[i, j] = scorer.score(texts[i], texts[j])['rougeL'].fmeasure
            except Exception:
                # Catch Exception rather than using a bare except so that
                # KeyboardInterrupt / SystemExit can still stop a long run.
                matrix[i, j] = 0.0
    return matrix

def levenshtein_similarity_matrix(texts):
    """Compute pairwise normalised Levenshtein similarity between all texts.

    Entry ``[i, j]`` is ``1 - distance / max(len(texts[i]), len(texts[j]))``.
    When both texts are empty the pair is treated as identical (1.0).

    Args:
        texts: Sequence of strings.

    Returns:
        An ``(n, n)`` float array, where ``n = len(texts)``.
    """
    size = len(texts)
    similarity = np.zeros((size, size))
    for row, left in enumerate(texts):
        for col, right in enumerate(texts):
            longest = max(len(left), len(right))
            if longest > 0:
                edits = Levenshtein.distance(left, right)
                similarity[row, col] = 1 - edits / longest
            else:
                # Two empty strings: nothing to edit, so fully similar.
                similarity[row, col] = 1.0
    return similarity

In [5]:
# ----------- Execution Flow ----------
# Mistral run: load the "all" CSVs and sample NUM_ROWS system instructions.
# The path is a plain string literal; the former f-prefix had no placeholders.
DATA_DIR = "ablation/PLM_mistral_7b_v03_instruct/mistral_7b_v03_instruct_data/all"
df = load_data(DATA_DIR, CSV_FILES)
# `labels` is returned by sample_texts but is not used by the cells visible here.
texts, labels = sample_texts(df, "SYSTEM INSTRUCTION", NUM_ROWS, SEED)

# Compute all similarity matrices
# Order matters: BLEU, ROUGE-L, Levenshtein, TF-IDF (matches metric_names below).
mistral_matrices = [
    bleu_score_matrix(texts),
    rouge_l_matrix(texts),
    levenshtein_similarity_matrix(texts),
    tfidf_cosine(texts)
]

In [6]:
# Llama run: same pipeline as the Mistral cell. Note that DATA_DIR, df, texts
# and labels are reassigned here, replacing the Mistral values.
# The path is a plain string literal; the former f-prefix had no placeholders.
DATA_DIR = "ablation/PLM_llama3_8b_instruct/llama3_8b_instruct_data/all"
df = load_data(DATA_DIR, CSV_FILES)
texts, labels = sample_texts(df, "SYSTEM INSTRUCTION", NUM_ROWS, SEED)

# Compute all similarity matrices
# Order matters: BLEU, ROUGE-L, Levenshtein, TF-IDF (matches metric_names below).
llama_matrices = [
    bleu_score_matrix(texts),
    rouge_l_matrix(texts),
    levenshtein_similarity_matrix(texts),
    tfidf_cosine(texts)
]

In [7]:
# --- Compute mean excluding diagonal ---
def mean_off_diagonal(matrix):
    """Return the mean of all entries of a square matrix except its diagonal.

    Args:
        matrix: 2-D NumPy array; its first dimension defines the mask size.

    Returns:
        The mean of the off-diagonal entries.
    """
    size = matrix.shape[0]
    positions = np.arange(size)
    # True wherever the row index differs from the column index.
    off_diagonal = positions[:, None] != positions[None, :]
    return matrix[off_diagonal].mean()

# Axis labels for the summary chart; the metric order mirrors the order in
# which the similarity matrices were computed.
model_names = ["Mistral-7B-v0.3", "Llama3-8b"]
metric_names = ["BLEU", "ROUGE-L", "Levenshtein", "TF-IDF"]

# One off-diagonal mean per metric, for each model.
metric_means_mistral = list(map(mean_off_diagonal, mistral_matrices))
metric_means_llama = list(map(mean_off_diagonal, llama_matrices))

# Rows follow model_names, columns follow metric_names.
all_model_vals = [metric_means_mistral, metric_means_llama]

In [8]:
# --- Plot bar chart ---
def plot_combined_intra_means(all_model_vals, model_names, metric_names, output_file="ablation/combined_intra_metric.pdf"):
    """Save a grouped bar chart of mean metric scores, one bar group per metric.

    Args:
        all_model_vals: Array-like of shape (n_models, n_metrics) holding the
            mean score of each metric for each model.
        model_names: Legend label for each row of ``all_model_vals``.
        metric_names: X tick label for each column of ``all_model_vals``.
        output_file: Path the figure is written to.

    Returns:
        None. The figure is saved to ``output_file`` and then closed.
    """
    all_model_vals = np.asarray(all_model_vals)
    x = np.arange(len(metric_names))

    # Pair values with names up front so the bar geometry can depend on how
    # many series are actually drawn (any number, not just two).
    series = list(zip(all_model_vals, model_names))
    n_models = len(series)
    # 0.35 is the original two-model width; shrink it only when more series
    # would otherwise overlap the neighbouring metric group.
    width = min(0.35, 0.7 / max(n_models, 1))

    # The first two entries are the original palette; later ones are only used
    # when more than two models are plotted. Both lists cycle if exhausted.
    colors = ['#4e79a7', '#f28e2b', '#59a14f', '#e15759']
    hatches = ['///', '+++', '...', 'xxx']

    fig, ax = plt.subplots(figsize=(5.5, 3.5), dpi=600)

    for i, (model_vals, model_name) in enumerate(series):
        # Centre the whole group on the tick: for two models this is exactly
        # the original (i - 0.5) * width offset.
        offset = (i - (n_models - 1) / 2) * width
        ax.bar(
            x + offset,
            model_vals,
            width=width,
            label=model_name,
            color=colors[i % len(colors)],
            hatch=hatches[i % len(hatches)],
            edgecolor='white',
            linewidth=0.8
        )

    # Labels and ticks
    ax.set_ylabel("Mean Score", fontsize=13)
    ax.set_xticks(x)
    ax.set_xticklabels(metric_names, fontsize=12)
    ax.set_ylim(0, 1.05)
    ax.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])
    ax.tick_params(axis='both', which='major', labelsize=12)

    # Style
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_color('gray')
    ax.spines['left'].set_color('gray')

    # Legend
    ax.legend(
        fontsize=11,
        loc="upper right",
        frameon=True,
        edgecolor='gray',
        facecolor='white',
        fancybox=False,
        framealpha=1
    )

    # Operate on this figure explicitly rather than on pyplot's "current"
    # figure, so the function cannot save or close an unrelated figure.
    fig.tight_layout()
    fig.savefig(output_file, bbox_inches='tight')
    plt.close(fig)
    
# Render the per-model means; no output_file is passed, so the function's
# default path is used.
plot_combined_intra_means(all_model_vals, model_names, metric_names)

**Config** — settings for the inter-set comparison below
---

In [20]:
# --- Configuration ---
# PLM and MLM are model identifiers interpolated into DATA_DIR below.
# NOTE(review): the expansion of the "PLM"/"MLM" acronyms is not stated in this
# notebook — confirm before relying on it.
PLM = "llama3_8b_instruct"
MLM = "llama3_8b_instruct"

# Model identifiers that appear in this notebook's data paths:
# mistral_7b_v03_instruct
# llama3_8b_instruct

# Root folder; each entry of `directories` is a sub-folder of it.
DATA_DIR = f"ablation/PLM_{PLM}/{MLM}_data/"

directories = ["all", "semantic", "lexical", "structural"]  # sub-folders to load
comparison_targets = ["semantic", "lexical", "structural"]  # each is compared against "all"
NUM_ROWS = 300  # number of leading rows compared per target (re-declares the earlier value)

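**Inter** — row-aligned similarity between the "all" instructions and each of the semantic / lexical / structural sets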
---

In [21]:
# --- Load data ---
def load_system_instructions(directory):
    """Collect the "SYSTEM INSTRUCTION" column from every CSV in one sub-folder.

    Reads each file named in the module-level ``CSV_FILES`` from
    ``DATA_DIR/<directory>`` (decoded as ISO-8859-1) and concatenates the
    column values in file order.

    Args:
        directory: Sub-folder of ``DATA_DIR`` to read from.

    Returns:
        A flat list with the column values of all files.
    """
    folder = os.path.join(DATA_DIR, directory)
    collected = []
    for csv_name in CSV_FILES:
        frame = pd.read_csv(os.path.join(folder, csv_name), encoding="ISO-8859-1")
        collected += frame["SYSTEM INSTRUCTION"].tolist()
    return collected

# Instruction texts of every directory, keyed by directory name.
instructions = {d: load_system_instructions(d) for d in directories}

# --- Fit TF-IDF on combined corpus ---
# Flatten with a comprehension: sum(list_of_lists, []) copies the growing
# accumulator on every addition, which is quadratic in the corpus size.
corpus = [text for d in directories for text in instructions[d]]
tfidf_vectorizer = TfidfVectorizer().fit(corpus)

# --- Scorers ---
smooth = SmoothingFunction().method1
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# --- Score storage ---
# Each list receives one row per comparison target (in comparison_targets
# order); each row holds NUM_ROWS per-pair scores.
bleu_scores = []
rouge_scores = []
lev_scores = []
tfidf_scores = []

# --- Compute metrics for aligned rows ---
# Row i of "all" is compared with row i of the target set, so every set must
# contain at least NUM_ROWS rows (an IndexError is raised otherwise).
for target in comparison_targets:
    bleu_vec, rouge_vec, lev_vec, tfidf_vec = [], [], [], []

    for i in range(NUM_ROWS):
        src = instructions["all"][i]
        tgt = instructions[target][i]

        # BLEU: src is the single reference, tgt the hypothesis.
        bleu = sentence_bleu([src.split()], tgt.split(), smoothing_function=smooth)
        bleu_vec.append(bleu)

        # ROUGE-L
        try:
            rouge_score = rouge.score(src, tgt)['rougeL'].fmeasure
        except Exception:
            # Best-effort: a pair that cannot be scored counts as 0.0.
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit still stop the run.
            rouge_score = 0.0
        rouge_vec.append(rouge_score)

        # Levenshtein: normalised by the longer string; two empty strings
        # count as identical.
        max_len = max(len(src), len(tgt))
        lev = 1 - Levenshtein.distance(src, tgt) / max_len if max_len > 0 else 1.0
        lev_vec.append(lev)

        # TF-IDF Cosine, using the vectorizer fitted on the combined corpus.
        tfidf = cosine_similarity(
            tfidf_vectorizer.transform([src]),
            tfidf_vectorizer.transform([tgt])
        )[0][0]
        tfidf_vec.append(tfidf)

    bleu_scores.append(bleu_vec)
    rouge_scores.append(rouge_vec)
    lev_scores.append(lev_vec)
    tfidf_scores.append(tfidf_vec)

# --- Convert to numpy arrays ---
# Resulting shape of each array: (len(comparison_targets), NUM_ROWS).
bleu_scores = np.array(bleu_scores)
rouge_scores = np.array(rouge_scores)
lev_scores = np.array(lev_scores)
tfidf_scores = np.array(tfidf_scores)

# --- Plotting ---
def plot_combined_metric_heatmaps(matrices, titles, output_file=f"ablation/{MLM}_inter_metric.png", y_labels=None):
    """Draw up to four score matrices as heatmaps on a 2x2 grid and save them.

    Args:
        matrices: Iterable of 2-D NumPy arrays, one heatmap each. Rows are
            labelled with ``y_labels``; columns are labelled "Row Index".
        titles: Title for each heatmap, in the same order as ``matrices``.
        output_file: Path the figure is written to. The default is built once,
            when this ``def`` is executed, from the value ``MLM`` has at that
            moment.
        y_labels: Row labels for every heatmap. Defaults to
            ``["semantic", "lexical", "structural"]``.

    Returns:
        None. The figure is saved to ``output_file`` and then closed.
    """
    if y_labels is None:
        y_labels = ["semantic", "lexical", "structural"]

    fig, axes = plt.subplots(2, 2, figsize=(20, 10))

    for ax, matrix, title in zip(axes.flat, matrices, titles):
        # Label every 50th column on wide matrices, every column otherwise.
        sns.heatmap(matrix, ax=ax, xticklabels=50 if matrix.shape[1] > 50 else 1,
                    yticklabels=y_labels, cmap="viridis", cbar=True)
        ax.set_title(title)
        ax.set_xlabel("Row Index")
        ax.set_ylabel("Compared With")

    # Operate on this figure explicitly rather than on pyplot's "current"
    # figure, so the function cannot save or close an unrelated figure.
    fig.tight_layout()
    fig.savefig(output_file)
    plt.close(fig)

# --- Run plot ---
# One heatmap per metric; titles are given in the same order as the matrices.
# No output_file is passed, so the function's default path is used.
plot_combined_metric_heatmaps(
    [bleu_scores, rouge_scores, lev_scores, tfidf_scores],
    ["BLEU Score", "ROUGE-L F1 Score", "Levenshtein Similarity", "TF-IDF Cosine Similarity"]
)

In [22]:
# --- Compute only means ---
# Axis labels for the summary chart.
metric_names = ["BLEU", "ROUGE-L", "Levenshtein", "TF-IDF"]
comparison_labels = ["Semantic", "Lexical", "Structural"]

# One row per metric (same order as metric_names); each row holds the mean
# score of every row of that metric's score array.
mean_vals = [
    list(map(np.mean, per_target_scores))
    for per_target_scores in (bleu_scores, rouge_scores, lev_scores, tfidf_scores)
]

def plot_grouped_metric_means(mean_vals, metric_names, comparison_labels, output_file=f"ablation/mean_{MLM}_inter_metric.pdf", font_scale=1.0):
    """Save a grouped bar chart of mean scores: one group per metric, one bar per comparison.

    Args:
        mean_vals: Array-like of shape (n_metrics, n_comparisons).
        metric_names: X tick label for each row of ``mean_vals``.
        comparison_labels: Legend label for each column of ``mean_vals``.
        output_file: Path the figure is written to. The default is built once,
            when this ``def`` is executed, from the value ``MLM`` has at that
            moment.
        font_scale: Multiplier applied to every font size in the figure.

    Returns:
        None. The figure is saved to ``output_file`` and then closed.
    """
    mean_vals = np.array(mean_vals)
    x = np.arange(len(metric_names))

    n_groups = len(comparison_labels)
    # 0.2 is the original three-bar width; shrink it only when more bars would
    # otherwise spill into the neighbouring metric group.
    width = min(0.2, 0.8 / max(n_groups, 1))

    # The first three entries are the original palette; later ones are only
    # used when more than three comparisons are plotted. Both lists cycle if
    # exhausted.
    hatches = ['///', '\\\\\\', '++', '...', 'xx']
    colors = ['#00546e', '#f4b6a6', '#88c9e3', '#7a7a7a', '#c9a227']

    fig, ax = plt.subplots(figsize=(6, 4), dpi=600)

    for i, label in enumerate(comparison_labels):
        # Centre the whole group on the tick: for three comparisons this is
        # exactly the original (i - 1) * width offset.
        offset = (i - (n_groups - 1) / 2) * width
        ax.bar(
            x + offset, mean_vals[:, i],
            width=width,
            label=label,
            color=colors[i % len(colors)],
            hatch=hatches[i % len(hatches)],
            edgecolor='white',
            linewidth=0.8
        )

    ax.set_ylabel("Mean Score", fontsize=14 * font_scale)
    ax.set_xticks(x)
    ax.set_xticklabels(metric_names, fontsize=13 * font_scale)
    ax.set_ylim(0, 1.05)
    ax.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])
    ax.tick_params(axis='both', which='major', labelsize=13 * font_scale)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_color('gray')
    ax.spines['left'].set_color('gray')

    ax.legend(
        fontsize=11 * font_scale,
        loc="upper right",
        frameon=True,
        edgecolor='gray',
        facecolor='white',
        fancybox=False,
        framealpha=1
    )

    # Operate on this figure explicitly rather than on pyplot's "current"
    # figure, so the function cannot save or close an unrelated figure.
    fig.tight_layout()
    fig.savefig(output_file, bbox_inches='tight')
    plt.close(fig)

In [23]:
# Render the grouped means with all fonts enlarged by 20%; the figure goes to
# the function's default output path.
plot_grouped_metric_means(mean_vals, metric_names, comparison_labels, font_scale=1.2)