In [59]:
import os
import json
import re
import html
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
results_folder = "../Results"

# 1. General Similarity

- Model: ProsusAI FinBERT (A Sentence BERT model further trained on financial text)


In [3]:
model_name = "ProsusAI/finbert"
model = SentenceTransformer(model_name)
print(f"Loaded model: {model_name}\n")


No sentence-transformers model found with name ProsusAI/finbert. Creating a new one with MEAN pooling.


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Loaded model: ProsusAI/finbert



In [8]:
summary_SBERT = []

evaluation_SBERT_folder = "../Results/Evaluation/SBERT"


for file_name in sorted(os.listdir(results_folder)):
    if not file_name.endswith(".json"):
        continue

    file_path = os.path.join(results_folder, file_name)
    print(f"Processing file: {file_name}")

    # Load the JSON structure
    with open(file_path, "r", encoding="utf-8") as f:
        file_data = json.load(f)

    results = file_data.get("results", [])
    if not results:
        print(f"No results found in {file_name}, skipping.")
        continue

    # Compute cosine similarities
    similarities = []
    for r in tqdm(results, desc=f"Computing similarities for {file_name}"):
        gt = r.get("ground_truth_answer", "").strip()
        gen = r.get("generated_answer", "").strip()

        if not gt or not gen:
            sim = None
        else:
            emb_gt = model.encode(gt, convert_to_tensor=True)
            emb_gen = model.encode(gen, convert_to_tensor=True)
            sim = util.cos_sim(emb_gt, emb_gen).item()

        r["similarity"] = round(sim, 4) if sim is not None else None
        similarities.append(sim)

    # Save per-file CSV
    df = pd.DataFrame(results)
    output_csv = os.path.join(evaluation_SBERT_folder, file_name.replace(".json", "_similarity.csv"))
    df.to_csv(output_csv, index=False)
    print(f"Saved per-sample results to: {output_csv}")

    # Compute and store summary metrics
    valid_sims = [s for s in similarities if s is not None]
    avg_sim = sum(valid_sims) / len(valid_sims) if valid_sims else 0.0
    summary_SBERT.append({
        "file": file_name,
        "specification": file_data.get("specification", ""),
        "description": file_data.get("description", ""),
        "samples": len(valid_sims),
        "average_similarity": round(avg_sim, 4)
    })


Processing file: spec1_random_500_20251110_180802.json


Computing similarities for spec1_random_500_20251110_180802.json: 100%|██████████| 43/43 [00:03<00:00, 13.99it/s]


Saved per-sample results to: ../Results/Evaluation/SBERT/spec1_random_500_20251110_180802_similarity.csv
Processing file: spec2_recent_500_20251110_181330.json


Computing similarities for spec2_recent_500_20251110_181330.json: 100%|██████████| 43/43 [00:03<00:00, 13.74it/s]


Saved per-sample results to: ../Results/Evaluation/SBERT/spec2_recent_500_20251110_181330_similarity.csv
Processing file: spec3_recent_500_plus_10q_20251110_182010.json


Computing similarities for spec3_recent_500_plus_10q_20251110_182010.json: 100%|██████████| 43/43 [00:03<00:00, 13.17it/s]


Saved per-sample results to: ../Results/Evaluation/SBERT/spec3_recent_500_plus_10q_20251110_182010_similarity.csv
Processing file: spec4_persona_plus_10q_20251110_182428.json


Computing similarities for spec4_persona_plus_10q_20251110_182428.json: 100%|██████████| 43/43 [00:03<00:00, 12.82it/s]

Saved per-sample results to: ../Results/Evaluation/SBERT/spec4_persona_plus_10q_20251110_182428_similarity.csv





In [11]:
if summary_SBERT:
    summary_SBERT_df = pd.DataFrame(summary_SBERT).sort_values(by="average_similarity", ascending=False)
    summary_SBERT_df_csv = os.path.join(evaluation_SBERT_folder, "SBERT.csv")
    summary_SBERT_df.to_csv(summary_SBERT_df_csv, index=False)
    print(f"Summary saved to {summary_SBERT_df_csv}\n")
else:
    print("No valid JSON result files found.")

Summary saved to ../Results/Evaluation/SBERT/SBERT.csv



# 2. Informational Accuracy

- Model: NLI (Natural Language Inference)

- This evaluation measures informational accuracy through a bidirectional Natural Language Inference (NLI) process. For each question–answer pair, the model assesses the logical relationship between the generated answer and the ground truth in both directions. In the first direction, the generated answer is treated as the premise and the ground truth as the hypothesis, testing whether the generated response correctly entails the factual content of the reference. This captures **factual accuracy**, ensuring the model does not introduce contradictions or false information. In the reverse direction, the ground truth serves as the premise and the generated answer as the hypothesis, testing whether the model’s response covers all the essential information expressed in the reference. This second direction captures **completeness** or **coverage**. Averaging the entailment probabilities from both directions provides a balanced measure of informational accuracy, reflecting how well the generated answer aligns with and fully represents the reference answer’s meaning.

### **Output Explanation**

#### **1. Entailment**

**gen → gt (Generated entails Ground Truth)**  
This means the generated answer **supports or implies** what is stated in the ground truth.  
A high entailment score in this direction shows that the generated answer is **factually accurate** — it correctly reflects or summarizes the truth provided in the reference.

**gt → gen (Ground Truth entails Generated)**  
This means the ground truth **covers or includes** what the generated answer says.  
A high score here indicates the generated answer is **complete** — it does not omit key information from the ground truth.


#### **2. Contradiction**

**gen → gt**  
A high contradiction score here indicates the generated answer **conflicts with the ground truth** — it says something that cannot be true if the ground truth is true.  
This captures **factual errors** or **misleading statements** in the generated text.

**gt → gen**  
This means the ground truth **conflicts with what the generated answer says**.  
A high score suggests the generated answer **omits or reverses facts** stated in the reference.


#### **3. Neutral**

**gen → gt**  
A high neutral score here means the generated answer **neither confirms nor denies** the ground truth.  
It might be too vague or general — the model can’t tell whether it’s right or wrong.  
This usually signals **lack of specificity** or **partial coverage**.

**gt → gen**  
A high neutral score here means the ground truth contains information that the generated answer **does not address**.  
This indicates **missing details** — the generated answer fails to cover factual statements made in the reference.

In [14]:
device = 0 if torch.cuda.is_available() else -1
model_name = "facebook/bart-large-mnli"

nli_pipe = pipeline("text-classification", model=model_name, tokenizer=model_name, device=device)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [58]:
# ---------------------------------------------------------------------
# Folder Setup
# ---------------------------------------------------------------------
results_folder = "../Results"
evaluation_NLI_folder = "../Results/Evaluation/NLI"
os.makedirs(evaluation_NLI_folder, exist_ok=True)

summary_nli = []

# ---------------------------------------------------------------------
# Helper: Extract label and score (single output)
# ---------------------------------------------------------------------
def extract_label_and_score(output):
    """
    Extracts the label and score from the model output.
    Example output: [{'label': 'neutral', 'score': 0.98}]
    """
    if isinstance(output, list) and len(output) > 0:
        item = output[0]
        label = str(item.get("label", "")).lower()
        score = float(item.get("score", 0.0))
        return label, score
    elif isinstance(output, dict):
        label = str(output.get("label", "")).lower()
        score = float(output.get("score", 0.0))
        return label, score
    return None, None

# ---------------------------------------------------------------------
# Main Loop
# ---------------------------------------------------------------------
for file_name in sorted(os.listdir(results_folder)):
    if not file_name.endswith(".json"):
        continue

    file_path = os.path.join(results_folder, file_name)
    print(f"\nEvaluating informational accuracy for: {file_name}")

    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    results = data.get("results", [])
    entailment_scores = []
    contradiction_scores = []
    neutral_scores = []

    for r in tqdm(results, desc=f"NLI scoring for {file_name}"):
        gt = r.get("ground_truth_answer", "").strip().replace("\n", " ").replace("\"", "'")
        gen = r.get("generated_answer", "").strip().replace("\n", " ").replace("\"", "'")

        if not gt or not gen:
            r["informational_accuracy"] = None
            continue

        try:
            # Bidirectional inference
            out1 = nli_pipe({"text": gen, "text_pair": gt})
            out2 = nli_pipe({"text": gt, "text_pair": gen})

            label1, score1 = extract_label_and_score(out1)
            label2, score2 = extract_label_and_score(out2)

            # Store both directions
            r["gen_to_gt_label"] = label1
            r["gen_to_gt_score"] = score1
            r["gt_to_gen_label"] = label2
            r["gt_to_gen_score"] = score2


            # Collect per-type scores for summary
            if label1 == "entailment":
                entailment_scores.append(score1)
            elif label1 == "contradiction":
                contradiction_scores.append(score1)
            elif label1 == "neutral":
                neutral_scores.append(score1)

        except Exception as e:
            print(f"Error for sample: {e}")

    # -----------------------------------------------------------------
    # Save Per-File Results
    # -----------------------------------------------------------------
    df = pd.DataFrame(results)
    out_csv = os.path.join(evaluation_NLI_folder, file_name.replace(".json", "_info_accuracy.csv"))
    df.to_csv(out_csv, index=False)
    print(f"Saved per-sample informational accuracy to: {out_csv}")

    # -----------------------------------------------------------------
    # Summary Statistics
    # -----------------------------------------------------------------
    avg_entail = sum(entailment_scores) / len(entailment_scores) if entailment_scores else 0.0
    avg_contradiction = sum(contradiction_scores) / len(contradiction_scores) if contradiction_scores else 0.0
    avg_neutral = sum(neutral_scores) / len(neutral_scores) if neutral_scores else 0.0
    samples = len(results)

    summary_nli.append({
        "file": file_name,
        "specification": data.get("specification", ""),
        "samples": samples,
        "avg_entail_score": round(avg_entail, 4),
        "avg_contradiction_score": round(avg_contradiction, 4),
        "avg_neutral_score": round(avg_neutral, 4),
    })

# ---------------------------------------------------------------------
# Save Summary
# ---------------------------------------------------------------------
summary_NLI_df = pd.DataFrame(summary_nli).sort_values(by="avg_entail_score", ascending=False)
summary_NLI_csv = os.path.join(evaluation_NLI_folder, "summary_informational_accuracy.csv")
summary_NLI_df.to_csv(summary_NLI_csv, index=False)

print(f"\nSaved summary informational accuracy to: {summary_NLI_csv}")



Evaluating informational accuracy for: spec1_random_500_20251110_180802.json


NLI scoring for spec1_random_500_20251110_180802.json: 100%|██████████| 43/43 [00:41<00:00,  1.05it/s]


Saved per-sample informational accuracy to: ../Results/Evaluation/NLI/spec1_random_500_20251110_180802_info_accuracy.csv

Evaluating informational accuracy for: spec2_recent_500_20251110_181330.json


NLI scoring for spec2_recent_500_20251110_181330.json: 100%|██████████| 43/43 [00:42<00:00,  1.01it/s]


Saved per-sample informational accuracy to: ../Results/Evaluation/NLI/spec2_recent_500_20251110_181330_info_accuracy.csv

Evaluating informational accuracy for: spec3_recent_500_plus_10q_20251110_182010.json


NLI scoring for spec3_recent_500_plus_10q_20251110_182010.json: 100%|██████████| 43/43 [00:45<00:00,  1.07s/it]


Saved per-sample informational accuracy to: ../Results/Evaluation/NLI/spec3_recent_500_plus_10q_20251110_182010_info_accuracy.csv

Evaluating informational accuracy for: spec4_persona_plus_10q_20251110_182428.json


NLI scoring for spec4_persona_plus_10q_20251110_182428.json: 100%|██████████| 43/43 [00:48<00:00,  1.12s/it]

Saved per-sample informational accuracy to: ../Results/Evaluation/NLI/spec4_persona_plus_10q_20251110_182428_info_accuracy.csv

Saved summary informational accuracy to: ../Results/Evaluation/NLI/summary_informational_accuracy.csv





# 3. Tone Alignment

This code evaluates **tone alignment** between generated and ground-truth answers using the **Loughran & McDonald (2011)** financial sentiment framework.

It combines **FinBERT**’s model-based tone detection (for *positive*, *neutral*, and *negative*) with a **lexicon-based approach** that identifies additional tones — *uncertainty*, *litigious*, *constraining*, and *modal* — using keyword frequencies from the L&M dictionary.

For each answer pair, it creates a **7-dimensional tone vector** representing these categories, then calculates the **cosine similarity** between the generated and ground-truth vectors to measure how closely their tones align.

The script outputs detailed per-sample tone labels and scores, along with a summary file reporting the **average tone alignment** for each dataset.


In [None]:
# --------------------------------------------------------------
# Folder Setup
# --------------------------------------------------------------
results_folder = "../Results"
evaluation_tone_folder = "../Results/Evaluation/Tone"
os.makedirs(evaluation_tone_folder, exist_ok=True)

summary_tone = []

# --------------------------------------------------------------
# Load FinBERT (Positive / Neutral / Negative)
# --------------------------------------------------------------
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()

finbert_labels = ["positive", "neutral", "negative"]

def get_finbert_probs(text):
    """Return normalized FinBERT sentiment probabilities for a given text."""
    if not isinstance(text, str) or len(text.strip()) == 0:
        return np.array([0.0, 0.0, 0.0])
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1).numpy()[0]
    return probs


# --------------------------------------------------------------
# Loughran & McDonald Lexicon Extensions
# --------------------------------------------------------------
lm_lexicons = {
    "uncertainty": [
        "uncertain", "unsure", "risk", "doubt", "fluctuate", "potential", "contingent",
        "approximately", "maybe", "possible", "possibly", "likely", "unlikely"
    ],
    "litigious": [
        "claim", "lawsuit", "litigation", "sue", "settlement", "regulation", "penalty",
        "court", "compliance", "dispute", "infringement"
    ],
    "constraining": [
        "restrict", "limit", "bound", "constraint", "mandatory", "require", "prohibit",
        "ban", "obligation", "restrictive"
    ],
    "modal": [
        "should", "could", "would", "might", "may", "can", "must", "shall"
    ]
}

def lexicon_tone_score(text):
    """Compute frequency-based scores for additional tone categories."""
    text_lower = text.lower()
    scores = []
    for cat, words in lm_lexicons.items():
        count = sum(len(re.findall(rf"\b{w}\b", text_lower)) for w in words)
        scores.append(count)
    # normalize to [0, 1]
    total = sum(scores) if sum(scores) > 0 else 1
    return np.array([s / total for s in scores])


# --------------------------------------------------------------
# Combine FinBERT + Lexicon into 7-Dim Tone Vector
# --------------------------------------------------------------
def get_tone_vector(text):
    finbert_probs = get_finbert_probs(text)  # 3 dims
    lexicon_scores = lexicon_tone_score(text)  # 4 dims
    combined = np.concatenate([finbert_probs, lexicon_scores])  # total 7 dims
    # normalize the whole vector
    if np.sum(combined) > 0:
        combined = combined / np.sum(combined)
    return combined


In [60]:
# --------------------------------------------------------------
# Process Each JSON File
# --------------------------------------------------------------
for file_name in sorted(os.listdir(results_folder)):
    if not file_name.endswith(".json"):
        continue

    file_path = os.path.join(results_folder, file_name)
    print(f"\nEvaluating tone alignment for: {file_name}")

    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    results = data.get("results", [])
    tone_scores = []

    for r in tqdm(results, desc=f"Tone alignment for {file_name}"):
        gt = r.get("ground_truth_answer", "").strip().replace("\n", " ").replace("\"", "'")
        gen = r.get("generated_answer", "").strip().replace("\n", " ").replace("\"", "'")

        if not gt or not gen:
            r["tone_alignment"] = None
            continue

        try:
            # Get 7-dim tone probability vectors
            gt_tone = get_tone_vector(gt)
            gen_tone = get_tone_vector(gen)

            # Compute cosine similarity
            tone_score = float(cosine_similarity([gt_tone], [gen_tone])[0][0])

            # Record top tone categories
            labels_7 = ["positive", "neutral", "negative", "uncertainty", "litigious", "constraining", "modal"]
            r["tone_alignment"] = round(tone_score, 4)
            r["gt_tone_label"] = labels_7[int(np.argmax(gt_tone))]
            r["gen_tone_label"] = labels_7[int(np.argmax(gen_tone))]

            tone_scores.append(tone_score)

        except Exception as e:
            print(f"Error for sample: {e}")
            r["tone_alignment"] = None

    # ----------------------------------------------------------
    # Save Per-File Results
    # ----------------------------------------------------------
    df = pd.DataFrame(results)
    out_csv = os.path.join(evaluation_tone_folder, file_name.replace(".json", "_tone_alignment.csv"))
    df.to_csv(out_csv, index=False)
    print(f"Saved per-sample tone alignment to: {out_csv}")

    # ----------------------------------------------------------
    # Summary Statistics
    # ----------------------------------------------------------
    valid_scores = [s for s in tone_scores if s is not None]
    avg_score = sum(valid_scores) / len(valid_scores) if valid_scores else 0.0

    summary_tone.append({
        "file": file_name,
        "specification": data.get("specification", ""),
        "samples": len(valid_scores),
        "avg_tone_alignment": round(avg_score, 4)
    })


# --------------------------------------------------------------
# Save Summary Across All Files
# --------------------------------------------------------------
summary_tone_df = pd.DataFrame(summary_tone).sort_values(by="avg_tone_alignment", ascending=False)
summary_tone_csv = os.path.join(evaluation_tone_folder, "summary_tone_alignment.csv")
summary_tone_df.to_csv(summary_tone_csv, index=False)

print(f"\nSaved summary tone alignment to: {summary_tone_csv}")


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 56e12f19-5c0f-4837-83df-9299a175e1af)')' thrown while requesting HEAD https://huggingface.co/yiyanghkust/finbert-tone/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].


vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]


Evaluating tone alignment for: spec1_random_500_20251110_180802.json


Tone alignment for spec1_random_500_20251110_180802.json:   0%|          | 0/43 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Error for sample: The size of tensor a (583) must match the size of tensor b (512) at non-singleton dimension 1


Tone alignment for spec1_random_500_20251110_180802.json: 100%|██████████| 43/43 [00:06<00:00,  6.65it/s]


Saved per-sample tone alignment to: ../Results/Evaluation/Tone/spec1_random_500_20251110_180802_tone_alignment.csv

Evaluating tone alignment for: spec2_recent_500_20251110_181330.json


Tone alignment for spec2_recent_500_20251110_181330.json:   5%|▍         | 2/43 [00:00<00:03, 13.39it/s]

Error for sample: The size of tensor a (583) must match the size of tensor b (512) at non-singleton dimension 1


Tone alignment for spec2_recent_500_20251110_181330.json: 100%|██████████| 43/43 [00:06<00:00,  6.24it/s]


Saved per-sample tone alignment to: ../Results/Evaluation/Tone/spec2_recent_500_20251110_181330_tone_alignment.csv

Evaluating tone alignment for: spec3_recent_500_plus_10q_20251110_182010.json


Tone alignment for spec3_recent_500_plus_10q_20251110_182010.json:   5%|▍         | 2/43 [00:00<00:02, 13.68it/s]

Error for sample: The size of tensor a (583) must match the size of tensor b (512) at non-singleton dimension 1


Tone alignment for spec3_recent_500_plus_10q_20251110_182010.json: 100%|██████████| 43/43 [00:06<00:00,  6.27it/s]


Saved per-sample tone alignment to: ../Results/Evaluation/Tone/spec3_recent_500_plus_10q_20251110_182010_tone_alignment.csv

Evaluating tone alignment for: spec4_persona_plus_10q_20251110_182428.json


Tone alignment for spec4_persona_plus_10q_20251110_182428.json:   5%|▍         | 2/43 [00:00<00:03, 11.30it/s]

Error for sample: The size of tensor a (583) must match the size of tensor b (512) at non-singleton dimension 1


Tone alignment for spec4_persona_plus_10q_20251110_182428.json:  98%|█████████▊| 42/43 [00:06<00:00,  6.40it/s]

Error for sample: The size of tensor a (628) must match the size of tensor b (512) at non-singleton dimension 1


Tone alignment for spec4_persona_plus_10q_20251110_182428.json: 100%|██████████| 43/43 [00:06<00:00,  6.23it/s]

Saved per-sample tone alignment to: ../Results/Evaluation/Tone/spec4_persona_plus_10q_20251110_182428_tone_alignment.csv

Saved summary tone alignment to: ../Results/Evaluation/Tone/summary_tone_alignment.csv



