In [2]:
import os
import json
import pandas as pd
from pathlib import Path

In [27]:
import json
import pandas as pd
from pathlib import Path
import re

ROOT_DIR = Path("faithfulness_eval")
MODEL_FOLDERS = ["gemini-1.5-pro-latest", "hugging_face", "openai_gpt4o_mini"]
records = []

def infer_metadata(file_path):
    model = file_path.parent.name
    filename = file_path.name.lower()

    # Determine report type from filename
    if "stock_history" in filename:
        report_type = "stock_history"
    elif "media" in filename:
        report_type = "media"
    elif "financial" in filename:
        report_type = "financial"
    elif "esg" in filename:
        report_type = "esg"
    else:
        report_type = "unknown"

    # Try to find ticker in filename
    match = re.search(r"_([A-Z]{2,6})_(stock|media|financial|esg)", file_path.name.upper())
    ticker = match.group(1) if match else None
    return model, report_type, ticker

def infer_score_from_explanation(explanation):
    if not isinstance(explanation, str):
        return None
    text = explanation.lower()
    if "perfectly faithful" in text or "highly faithful" in text or "accurately reflects" in text:
        return 1.0
    elif "largely faithful" in text:
        return 0.9
    elif "somewhat faithful" in text:
        return 0.5
    elif "not faithful" in text or "unfaithful" in text or "error" in text:
        return 0.0
    return None

def parse_file(file):
    model, report_type, fallback_ticker = infer_metadata(file)

    try:
        with open(file, "r") as f:
            data = json.load(f)
    except Exception as e:
        print(f"❌ JSON error in {file.name}: {e}")
        return

    # === CASE 1: BATCH FILE ===
    if isinstance(data, dict) and all(isinstance(v, dict) for v in data.values()):
        for ticker, entry in data.items():
            fe = entry.get("Faithfulness Evaluation")
            if isinstance(fe, dict):
                score = fe.get("Score")
                explanation = fe.get("Explanation")
            else:
                score = None
                explanation = fe
            if score is None:
                score = infer_score_from_explanation(explanation)
            records.append({
                "Ticker": ticker.upper(),
                "Model": model,
                "Report Type": report_type,
                "Score": score,
                "Explanation": explanation,
                "Path": str(file)
            })

    # === CASE 2: SINGLE ENTRY ===
    elif isinstance(data, dict):
        ticker = data.get("Ticker", fallback_ticker)
        fe = data.get("Faithfulness Evaluation")
        if isinstance(fe, dict):
            score = fe.get("Score")
            explanation = fe.get("Explanation")
        else:
            score = None
            explanation = fe
        if score is None:
            score = infer_score_from_explanation(explanation)
        records.append({
            "Ticker": ticker.upper() if ticker else None,
            "Model": model,
            "Report Type": report_type,
            "Score": score,
            "Explanation": explanation,
            "Path": str(file)
        })

    else:
        print(f"⚠️ Unrecognized structure in {file.name}")

# Process all model folders
for model_folder in MODEL_FOLDERS:
    folder = ROOT_DIR / model_folder
    if not folder.exists():
        continue
    for file in folder.glob("*.json"):
        parse_file(file)

# Convert to DataFrame
df = pd.DataFrame(records)
print(f"✅ Loaded {len(df)} evaluations")
display(df.head(20))


✅ Loaded 163 evaluations


Unnamed: 0,Ticker,Model,Report Type,Score,Explanation,Path
0,GOOGL,gemini-1.5-pro-latest,stock_history,,,faithfulness_eval/gemini-1.5-pro-latest/2025-0...
1,MSFT,gemini-1.5-pro-latest,media,0.9,The generated report accurately reflects key p...,faithfulness_eval/gemini-1.5-pro-latest/2025-0...
2,AMZN,gemini-1.5-pro-latest,stock_history,,,faithfulness_eval/gemini-1.5-pro-latest/2025-0...
3,GOOGL,gemini-1.5-pro-latest,media,1.0,The generated report accurately reflects the a...,faithfulness_eval/gemini-1.5-pro-latest/2025-0...
4,TSLA,gemini-1.5-pro-latest,stock_history,,,faithfulness_eval/gemini-1.5-pro-latest/2025-0...
5,NFLX,gemini-1.5-pro-latest,media,0.9,The generated report accurately reflects the m...,faithfulness_eval/gemini-1.5-pro-latest/2025-0...
6,META,gemini-1.5-pro-latest,media,0.9,The generated report accurately summarizes sev...,faithfulness_eval/gemini-1.5-pro-latest/2025-0...
7,META,gemini-1.5-pro-latest,stock_history,,,faithfulness_eval/gemini-1.5-pro-latest/2025-0...
8,AAPL,gemini-1.5-pro-latest,media,0.9,The generated report accurately reflects sever...,faithfulness_eval/gemini-1.5-pro-latest/2025-0...
9,TSLA,gemini-1.5-pro-latest,esg,,Faithfulness Score: **0.85**\n\nExplanation:\n...,faithfulness_eval/gemini-1.5-pro-latest/2025-0...


In [31]:
# Ticker whitelist
TICKER_LIST = ["TSLA", "NVDA", "AAPL", "MSFT", "GOOGL", "META", "AMZN", "PLTR", "AMD", "NFLX"]

# Remove trailing spaces from tickers and filter
df["Ticker"] = df["Ticker"].str.strip()  # remove any whitespace
df = df[df["Ticker"].isin(TICKER_LIST)]

# Final display
print(f"✅ Filtered to {len(df)} evaluations for target tickers")
display(df.head(20))


✅ Filtered to 147 evaluations for target tickers


Unnamed: 0,Ticker,Model,Report Type,Score,Explanation,Path
0,GOOGL,gemini-1.5-pro-latest,stock_history,,,faithfulness_eval/gemini-1.5-pro-latest/2025-0...
1,MSFT,gemini-1.5-pro-latest,media,0.9,The generated report accurately reflects key p...,faithfulness_eval/gemini-1.5-pro-latest/2025-0...
2,AMZN,gemini-1.5-pro-latest,stock_history,,,faithfulness_eval/gemini-1.5-pro-latest/2025-0...
3,GOOGL,gemini-1.5-pro-latest,media,1.0,The generated report accurately reflects the a...,faithfulness_eval/gemini-1.5-pro-latest/2025-0...
4,TSLA,gemini-1.5-pro-latest,stock_history,,,faithfulness_eval/gemini-1.5-pro-latest/2025-0...
5,NFLX,gemini-1.5-pro-latest,media,0.9,The generated report accurately reflects the m...,faithfulness_eval/gemini-1.5-pro-latest/2025-0...
6,META,gemini-1.5-pro-latest,media,0.9,The generated report accurately summarizes sev...,faithfulness_eval/gemini-1.5-pro-latest/2025-0...
7,META,gemini-1.5-pro-latest,stock_history,,,faithfulness_eval/gemini-1.5-pro-latest/2025-0...
8,AAPL,gemini-1.5-pro-latest,media,0.9,The generated report accurately reflects sever...,faithfulness_eval/gemini-1.5-pro-latest/2025-0...
9,TSLA,gemini-1.5-pro-latest,esg,,Faithfulness Score: **0.85**\n\nExplanation:\n...,faithfulness_eval/gemini-1.5-pro-latest/2025-0...


In [32]:
df.to_csv("eval.csv")

In [33]:
# Average score per model
model_avg = df.groupby("Model")["Score"].mean().reset_index().sort_values("Score", ascending=False)
print("🔍 Average Faithfulness by Model:")
display(model_avg)

# Average score per report type
report_avg = df.groupby("Report Type")["Score"].mean().reset_index().sort_values("Score", ascending=False)
print("\n📄 Average Faithfulness by Report Type:")
display(report_avg)

# Average score per model + report type
combo_avg = df.groupby(["Model", "Report Type"])["Score"].mean().reset_index().sort_values("Score", ascending=False)
print("\n📊 Average Faithfulness by Model + Report Type:")
display(combo_avg)

# Average score per ticker (optional)
ticker_avg = df.groupby("Ticker")["Score"].mean().reset_index().sort_values("Score", ascending=False)
print("\n💹 Average Faithfulness by Ticker:")
display(ticker_avg)


🔍 Average Faithfulness by Model:


Unnamed: 0,Model,Score
1,hugging_face,0.795
0,gemini-1.5-pro-latest,0.61
2,openai_gpt4o_mini,0.481579



📄 Average Faithfulness by Report Type:


Unnamed: 0,Report Type,Score
3,stock_history,0.94
2,media,0.859091
4,unknown,0.6
1,financial,0.466176
0,esg,0.45



📊 Average Faithfulness by Model + Report Type:


Unnamed: 0,Model,Report Type,Score
4,hugging_face,esg,1.0
10,openai_gpt4o_mini,stock_history,0.94
9,openai_gpt4o_mini,media,0.863636
2,gemini-1.5-pro-latest,media,0.854545
5,hugging_face,financial,0.785
6,hugging_face,unknown,0.6
8,openai_gpt4o_mini,financial,0.333333
7,openai_gpt4o_mini,esg,0.312903
0,gemini-1.5-pro-latest,esg,0.311111
1,gemini-1.5-pro-latest,financial,



💹 Average Faithfulness by Ticker:


Unnamed: 0,Ticker,Score
5,MSFT,0.855556
3,GOOGL,0.644444
9,TSLA,0.6
0,AAPL,0.59375
8,PLTR,0.55
4,META,0.542857
7,NVDA,0.542857
6,NFLX,0.539286
1,AMD,0.525
2,AMZN,0.485714
