In [1]:
import openai

# Report which release of the OpenAI SDK this kernel has imported.
sdk_version = openai.__version__
print("‚úÖ OpenAI version:", sdk_version)


‚úÖ OpenAI version: 1.55.3


In [2]:
import sys

# Show the path of the Python interpreter that backs this kernel.
interpreter_path = sys.executable
print(interpreter_path)


/usr/local/bin/python3


In [3]:
from dotenv import load_dotenv
import os

# Read variables from a local .env file into the process environment.
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")

# Validate BEFORE writing to os.environ: os.environ only accepts strings, so
# assigning None would raise TypeError and hide the explicit message below.
if not api_key:
    raise ValueError("‚ùå No API key found ‚Äî check your .env file")

# Re-export the key so a client created without arguments can find it.
os.environ["OPENAI_API_KEY"] = api_key

# NOTE(review): this writes the first 12 and last 5 characters of the secret
# into the saved notebook output; clear the output before sharing the notebook.
print("‚úÖ Key loaded and exported:", api_key[:12] + "..." + api_key[-5:])


‚úÖ Key loaded and exported: sk-proj-dtvl...dV18A


In [4]:
from openai import OpenAI

# No key argument is passed; the client is expected to read OPENAI_API_KEY
# from the environment.
client = OpenAI()
ready_message = "‚úÖ Client initialized successfully."
print(ready_message)


‚úÖ Client initialized successfully.


In [5]:
# =========================================================
# 1️⃣ Imports & Setup
# =========================================================
from dotenv import load_dotenv
import os, pandas as pd, numpy as np
from tqdm import tqdm
from openai import OpenAI
import random, json
import time
# NOTE(review): this pause executes once, when the cell starts running. It does
# not space out the individual judge API calls issued by the loop further down.
time.sleep(25)   # one-off 25-second pause at cell start


# Pull the OpenAI key out of the .env file.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

# Guard clause: stop immediately when no key is available.
if not api_key:
    raise ValueError("‚ùå No API key found. Please add OPENAI_API_KEY to your .env")

# Show only the head and tail of the key as a sanity check.
masked_key = api_key[:12] + "..." + api_key[-5:]
print("‚úÖ Key loaded:", masked_key)

# Client used by the judge function for every chat-completion request.
client = OpenAI(api_key=api_key)

# =========================================================
# 2️⃣ Load Dataset
# =========================================================
# Read the QA dataset and replace missing values with empty strings in one chain.
df = pd.read_csv("/Users/pulinkumar/Desktop/ALGOVERSE/Financial-QA-10k.csv").fillna("")
print(f"‚úÖ Dataset loaded: {df.shape}")
print(df.head(3))

# Names of the dataset columns; adjust here if the CSV uses different headers.
text_col, question_col, answer_col = "context", "question", "answer"

# =========================================================
# 3️⃣ Generate Model Predictions (Baseline LLMs)
# =========================================================
# You can replace these with actual calls to local models (Mistral / LLaMA)
# For now, we simulate random model predictions
def mock_model_prediction(question):
    """Return a canned sentence that finishes with a randomly picked ending.

    Stand-in for a real model call: `question` is accepted so the signature
    matches a real predictor, but it does not influence the result.
    """
    candidate_endings = (
        "increased financial stability.",
        "improved risk-adjusted returns.",
        "lowered capital costs.",
        "helped diversification efforts.",
    )
    chosen_ending = random.choice(candidate_endings)
    return "The company's strategy led to " + chosen_ending

# One mocked prediction column per baseline: Mistral first, then LLaMA.
for pred_col in ("mistral_pred", "llama_pred"):
    df[pred_col] = df[question_col].apply(mock_model_prediction)

print("‚úÖ Mock model predictions added for Mistral & LLaMA.")

# =========================================================
# 4️⃣ Define the LLM-as-a-Judge function
# =========================================================
def _extract_json_object(text):
    """Return the slice of `text` from its first '{' to its last '}', or `text` unchanged."""
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        return text[start:end + 1]
    return text


def llm_as_judge(question, true_answer, pred_answer, judge_model="gpt-4o-mini"):
    """
    Ask a chat model to grade a predicted answer against the ground truth.

    Parameters:
        question: the question that was asked.
        true_answer: the reference answer.
        pred_answer: the model answer being graded.
        judge_model: name of the chat model used as the judge.

    Returns:
        (score, justification) on success, where score is a float in [0.0, 1.0]
        (0.0 when the reply has no "score" key).
        (None, error message) when the request fails, the reply cannot be parsed
        as JSON, or the score is non-numeric or outside [0.0, 1.0]. The error is
        also printed.
    """
    prompt = f"""
You are a financial QA evaluation assistant.

Question: {question}

Ground Truth Answer: {true_answer}
Predicted Answer: {pred_answer}

Evaluate the predicted answer from 0.0 to 1.0 based on:
1. **Correctness** ‚Äì factual alignment with the ground truth.
2. **Completeness** ‚Äì inclusion of key details.
3. **Relevance** ‚Äì focus on answering the question.

Return only valid JSON:
{{"score": 0.xx, "justification": "short reasoning"}}
"""
    try:
        response = client.chat.completions.create(
            model=judge_model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
            max_tokens=200
        )
        # The message content may be None; treat that as an empty reply.
        content = (response.choices[0].message.content or "").strip()
        # Replies are sometimes wrapped in Markdown code fences or extra prose;
        # parse only the {...} part so those replies are not discarded.
        result = json.loads(_extract_json_object(content))
        # Coerce to float so a quoted number such as "0.8" is still usable.
        score = float(result.get("score", 0.0))
        if not 0.0 <= score <= 1.0:
            # Out-of-scale values (e.g. 85) would distort the average.
            raise ValueError(f"score out of range [0, 1]: {score!r}")
        return score, result.get("justification", "")
    except Exception as e:
        print("‚ö†Ô∏è Error:", e)
        return None, str(e)

# =========================================================
# 5️⃣ Evaluate Mistral & LLaMA Predictions
# =========================================================
results = []
judge_model = "gpt-4o-mini"  # or "gpt-5" if available

# Number of leading rows sent to the judge. Taking head() once means the
# progress-bar total always equals the rows actually iterated, even when the
# dataset has fewer rows than this.
N_EVAL_ROWS = 50
eval_rows = df.head(N_EVAL_ROWS)

for model_name in ["mistral_pred", "llama_pred"]:
    print(f"\nüîé Evaluating model: {model_name} with judge: {judge_model}")
    scores, reasons = [], []

    for _, row in tqdm(eval_rows.iterrows(), total=len(eval_rows)):
        s, r = llm_as_judge(
            question=row[question_col],
            true_answer=row[answer_col],
            pred_answer=row[model_name],
            judge_model=judge_model
        )
        scores.append(s)
        reasons.append(r)

    # Failed calls return None and unparseable values become NaN, so they are
    # excluded from the average instead of breaking it.
    score_series = pd.to_numeric(
        pd.Series(scores, index=eval_rows.index, dtype="object"), errors="coerce"
    )

    # Store the per-row verdicts on df itself so the detailed CSV written from
    # df contains them. Rows that were not evaluated are left as NaN by index
    # alignment.
    df[f"{model_name}_score"] = score_series
    df[f"{model_name}_reason"] = pd.Series(reasons, index=eval_rows.index, dtype="object")

    # Series.mean() skips NaN and yields NaN (without a warning) when no row
    # received a usable score.
    avg_score = score_series.mean()
    n_scored = int(score_series.notna().sum())
    print(f"‚úÖ {model_name} ‚Üí Average Judge Score: {avg_score:.3f}")
    print(f"   scored rows: {n_scored}/{len(scores)}")

    results.append({"model": model_name, "avg_score": avg_score, "judge": judge_model})

# Evaluated rows only, carrying the score/reason columns of every judged model.
df_subset = df.head(len(eval_rows)).copy()

# =========================================================
# 6️⃣ Save Results
# =========================================================
# Destination files: the per-model summary and the full working frame.
summary_path = "/Users/pulinkumar/Desktop/ALGOVERSE/Financial-Agent-LLM-Judge-Results.csv"
detail_path = "/Users/pulinkumar/Desktop/ALGOVERSE/Financial-Agent-LLM-Detailed.csv"

results_df = pd.DataFrame(results)
results_df.to_csv(summary_path, index=False)
df.to_csv(detail_path, index=False)

# Confirm what was written, one line per file.
for line in (
    "\nüíæ Results saved to:",
    "  ‚Ä¢ Financial-Agent-LLM-Judge-Results.csv",
    "  ‚Ä¢ Financial-Agent-LLM-Detailed.csv",
):
    print(line)


‚úÖ Key loaded: sk-proj-dtvl...dV18A
‚úÖ Dataset loaded: (7000, 5)
                                            question  \
0  What area did NVIDIA initially focus on before...   
1  What are some of the recent applications of GP...   
2  What significant invention did NVIDIA create i...   

                                              answer  \
0           NVIDIA initially focused on PC graphics.   
1  Recent applications of GPU-powered deep learni...   
2                   NVIDIA invented the GPU in 1999.   

                                             context ticker    filing  
0  Since our original focus on PC graphics, we ha...   NVDA  2023_10K  
1  Some of the most recent applications of GPU-po...   NVDA  2023_10K  
2  Our invention of the GPU in 1999 defined moder...   NVDA  2023_10K  
‚úÖ Mock model predictions added for Mistral & LLaMA.

üîé Evaluating model: mistral_pred with judge: gpt-4o-mini


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [15:57<00:00, 19.16s/it]


‚úÖ mistral_pred ‚Üí Average Judge Score: 0.018

üîé Evaluating model: llama_pred with judge: gpt-4o-mini


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [22:12<00:00, 26.65s/it]

‚úÖ llama_pred ‚Üí Average Judge Score: 0.028

üíæ Results saved to:
  ‚Ä¢ Financial-Agent-LLM-Judge-Results.csv
  ‚Ä¢ Financial-Agent-LLM-Detailed.csv



