In [None]:
#Imports & Environment Setup
import os, time, json, random, numpy as np, pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import faiss

# Load key from .env (make sure .env has: OPENAI_API_KEY=sk-...)
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
assert api_key, "‚ùå No API key found. Add OPENAI_API_KEY to your .env file."

client = OpenAI(api_key=api_key)
print("‚úÖ OpenAI client initialized successfully.")


‚úÖ OpenAI client initialized successfully.


In [None]:
#Load 
CSV_PATH = "/Users/pulinkumar/Desktop/ALGOVERSE/Financial-QA-10k.csv"
df = pd.read_csv(CSV_PATH).fillna("")
print("‚úÖ Dataset:", df.shape)
print(df.head(3))

# dataset already uses these:
QUESTION_COL = "question"
ANSWER_COL   = "answer"
CONTEXT_COL  = "context"


‚úÖ Dataset: (7000, 5)
                                            question  \
0  What area did NVIDIA initially focus on before...   
1  What are some of the recent applications of GP...   
2  What significant invention did NVIDIA create i...   

                                              answer  \
0           NVIDIA initially focused on PC graphics.   
1  Recent applications of GPU-powered deep learni...   
2                   NVIDIA invented the GPU in 1999.   

                                             context ticker    filing  
0  Since our original focus on PC graphics, we ha...   NVDA  2023_10K  
1  Some of the most recent applications of GPU-po...   NVDA  2023_10K  
2  Our invention of the GPU in 1999 defined moder...   NVDA  2023_10K  


In [None]:

# Retrieval-Augmented Financial QA from Transformer & reader model
print("üîç Building FAISS index...")
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
context_embeddings = embed_model.encode(df["context"].tolist(), show_progress_bar=True)
dim = context_embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(context_embeddings)
print(f"‚úÖ FAISS index built with {index.ntotal} vectors.")

# Load QA Reader
tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
reader_model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
reader = pipeline("question-answering", model=reader_model, tokenizer=tokenizer)

def retrieve_contexts(question, top_k=3):
    q_vec = embed_model.encode([question])
    D, I = index.search(q_vec, top_k)
    return [df.iloc[i]["context"] for i in I[0]]

def answer_question(question):
    contexts = retrieve_contexts(question)
    answers = []
    for ctx in contexts:
        ans = reader(question=question, context=ctx)
        answers.append(ans)
    best = max(answers, key=lambda x: x["score"])
    return best["answer"]


üîç Building FAISS index...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 219/219 [06:18<00:00,  1.73s/it]


‚úÖ FAISS index built with 7000 vectors.


Device set to use mps:0


In [15]:
subset = df.head(10).copy()
subset["retrieved_answer"] = subset["question"].apply(answer_question)

def exact_match(pred, truth):
    return int(pred.strip().lower() == truth.strip().lower())

def f1_score(pred, truth):
    pred_tokens, truth_tokens = pred.lower().split(), truth.lower().split()
    common = len(set(pred_tokens) & set(truth_tokens))
    if common == 0: return 0
    precision, recall = common / len(pred_tokens), common / len(truth_tokens)
    return 2 * precision * recall / (precision + recall)

subset["EM"] = subset.apply(lambda r: exact_match(r["retrieved_answer"], r["answer"]), axis=1)
subset["F1"] = subset.apply(lambda r: f1_score(r["retrieved_answer"], r["answer"]), axis=1)

print(f"‚úÖ Experiment 1 Complete ‚Üí EM={subset['EM'].mean():.3f}, F1={subset['F1'].mean():.3f}")
subset.to_csv("/Users/pulinkumar/Desktop/ALGOVERSE/Exp1_RetrievalQA.csv", index=False)


‚úÖ Experiment 1 Complete ‚Üí EM=0.000, F1=0.277


In [14]:
subset = df.head(50).copy()
subset["retrieved_answer"] = subset["question"].apply(answer_question)

def exact_match(pred, truth):
    return int(pred.strip().lower() == truth.strip().lower())

def f1_score(pred, truth):
    pred_tokens, truth_tokens = pred.lower().split(), truth.lower().split()
    common = len(set(pred_tokens) & set(truth_tokens))
    if common == 0: return 0
    precision, recall = common / len(pred_tokens), common / len(truth_tokens)
    return 2 * precision * recall / (precision + recall)

subset["EM"] = subset.apply(lambda r: exact_match(r["retrieved_answer"], r["answer"]), axis=1)
subset["F1"] = subset.apply(lambda r: f1_score(r["retrieved_answer"], r["answer"]), axis=1)

print(f"‚úÖ Experiment 1 Complete ‚Üí EM={subset['EM'].mean():.3f}, F1={subset['F1'].mean():.3f}")
subset.to_csv("/Users/pulinkumar/Desktop/ALGOVERSE/Exp1_RetrievalQA.csv", index=False)


‚úÖ Experiment 1 Complete ‚Üí EM=0.040, F1=0.248


In [16]:
subset = df.head(100).copy()
subset["retrieved_answer"] = subset["question"].apply(answer_question)

def exact_match(pred, truth):
    return int(pred.strip().lower() == truth.strip().lower())

def f1_score(pred, truth):
    pred_tokens, truth_tokens = pred.lower().split(), truth.lower().split()
    common = len(set(pred_tokens) & set(truth_tokens))
    if common == 0: return 0
    precision, recall = common / len(pred_tokens), common / len(truth_tokens)
    return 2 * precision * recall / (precision + recall)

subset["EM"] = subset.apply(lambda r: exact_match(r["retrieved_answer"], r["answer"]), axis=1)
subset["F1"] = subset.apply(lambda r: f1_score(r["retrieved_answer"], r["answer"]), axis=1)

print(f"‚úÖ Experiment 1 Complete ‚Üí EM={subset['EM'].mean():.3f}, F1={subset['F1'].mean():.3f}")
subset.to_csv("/Users/pulinkumar/Desktop/ALGOVERSE/Exp1_RetrievalQA.csv", index=False)


‚úÖ Experiment 1 Complete ‚Üí EM=0.020, F1=0.207


In [None]:
#Enhanced Quick Test ‚Äî Retrieval QA with per-context scores

def ask(question, top_k=3):
    """
    Runs retrieval + reader pipeline on a custom question.
    Shows predicted answers and confidence scores for each retrieved context.
    """
    contexts = retrieve_contexts(question, top_k=top_k)
    detailed_answers = []

    for i, ctx in enumerate(contexts):
        ans = reader(question=question, context=ctx)
        detailed_answers.append({
            "context_id": i + 1,
            "predicted_answer": ans["answer"],
            "confidence": round(ans["score"], 3),
            "context_snippet": ctx[:250] + ("..." if len(ctx) > 250 else "")
        })

    # pick best answer by confidence
    best = max(detailed_answers, key=lambda x: x["confidence"])

    return {
        "question": question,
        "best_answer": best["predicted_answer"],
        "best_confidence": best["confidence"],
        "all_context_answers": detailed_answers
    }

# üß† Run a test query
sample_q = "How do interest rate hikes affect bond prices?"
out = ask(sample_q, top_k=3)

import json
print(json.dumps(out, indent=2))


{
  "question": "How do interest rate hikes affect bond prices?",
  "best_answer": "higher market interest rates offered for retail deposits",
  "best_confidence": 0.335,
  "all_context_answers": [
    {
      "context_id": 1,
      "predicted_answer": "increase our future borrowing costs",
      "confidence": 0.301,
      "context_snippet": "In addition, economic conditions and actions by policymaking bodies are contributing to changing interest rates and significant capital market volatility, which, along with any increases in our borrowing levels, could increase our future borrowing co..."
    },
    {
      "context_id": 2,
      "predicted_answer": "higher market interest rates offered for retail deposits",
      "confidence": 0.335,
      "context_snippet": "The increase in interest rates paid on our deposits were primarily due to the impact of higher market interest rates offered for retail deposits."
    },
    {
      "context_id": 3,
      "predicted_answer": "Interest expens

In [None]:
#Direct LLM (gpt-4o) Generated Financial QA
from openai import OpenAI
import json, os
from tqdm import tqdm

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def generate_answer_llm(question, context=None, model="gpt-4o"):
    """
    Uses GPT model to generate financial QA answers.
    Optionally includes a retrieved context for grounding.
    """
    if context:
        prompt = f"""
You are a financial analyst AI assistant.
Using the following financial filing excerpt, answer the question concisely and factually.

Context:
{context}

Question: {question}

If the context doesn't contain enough information, state that clearly.
"""
    else:
        prompt = f"You are a financial analyst AI assistant. Answer the question clearly and factually.\n\nQuestion: {question}"

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
        max_tokens=250
    )

    return response.choices[0].message.content.strip()

# üîé Quick Test
sample_q = "What significant invention did NVIDIA create in 1999?"
sample_ctx = retrieve_contexts(sample_q, top_k=1)[0]
out = generate_answer_llm(sample_q, context=sample_ctx)
print(json.dumps({"question": sample_q, "answer": out}, indent=2))


{
  "question": "What significant invention did NVIDIA create in 1999?",
  "answer": "NVIDIA created the GPU (Graphics Processing Unit) in 1999."
}


In [None]:
# Batch QA Generation For 50 Q & A Using GPT-4o


client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

INPUT_PATH  = "/Users/pulinkumar/Desktop/ALGOVERSE/Financial-Agent-Questions.csv"
OUTPUT_PATH = "/Users/pulinkumar/Desktop/ALGOVERSE/Financial-Agent-GPT4o-QA.csv"

generated_answers = []
BATCH_SIZE = 50    # Change to 100 or 200 for longer runs
START_INDEX = 0    # Change if resuming from checkpoint

df_slice = df.iloc[START_INDEX:START_INDEX+BATCH_SIZE].copy()

for i, row in tqdm(df_slice.iterrows(), total=len(df_slice)):
    q = row["question"]
    ctx = retrieve_contexts(q, top_k=1)[0]
    ans = generate_answer_llm(q, context=ctx, model="gpt-4o")
    generated_answers.append(ans)

# Attach predictions to sliced dataframe
df_slice["gpt4o_pred"] = generated_answers

# Save batch results (won‚Äôt break full CSV)
output_file = OUTPUT_PATH.replace(".csv", f"_batch_{START_INDEX}.csv")
df_slice.to_csv(output_file, index=False)

print(f"üíæ Saved batch results to {output_file}")
print(f"‚úÖ Generated {len(df_slice)} predictions successfully.")

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:05<00:00,  1.30s/it]

üíæ Saved batch results to /Users/pulinkumar/Desktop/ALGOVERSE/Financial-Agent-GPT4o-QA_batch_0.csv
‚úÖ Generated 50 predictions successfully.





In [None]:
#Used gpt-4o-mini LLM-as-a-Judge 
def llm_as_judge(question, true_answer, pred_answer, model="gpt-4o-mini"):
    """Use GPT-4o-mini to evaluate semantic similarity between true and predicted answers."""
    prompt = f"""
You are a financial QA evaluator. 
Compare the models predicted answer with the true answer and rate it 0.0 - 1.0 on:

1. Correctness ‚Äî factual alignment with true answer.
2. Completeness ‚Äî covers all key details.
3. Relevance ‚Äî focuses on the question asked.

Question: {question}

True Answer: {true_answer}
Predicted Answer: {pred_answer}

Return JSON only:
{{"score": 0.xx, "justification": "short explanation"}}
"""

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=150
        )
        text = response.choices[0].message.content.strip()
        result = json.loads(text)
        return result.get("score", 0.0), result.get("justification", "")
    except Exception as e:
        print("‚ö†Ô∏è Judge error:", e)
        return None, str(e)


In [None]:

# Batch Process Generation + Judging
judge_model = "gpt-4o-mini"
gen_model = "gpt-4o"

scores, reasons, answers = [], [], []
save_path = "/Users/pulinkumar/Desktop/ALGOVERSE/Financial-Agent-LLM-Judge-Results.csv"

for i, row in tqdm(df.head(50).iterrows(), total=50, disable=True):
    q, true_a = row["question"], row["answer"]

    # Retrieve context and generate LLM answer
    ctx = retrieve_contexts(q, top_k=1)[0]
    pred = generate_answer_llm(q, context=ctx, model=gen_model)

    # Evaluate with GPT-4o-mini
    s, r = llm_as_judge(q, true_a, pred, model=judge_model)

    answers.append(pred)
    scores.append(s)
    reasons.append(r)

    # Simple rate-limit management
    time.sleep(3)

    #Fixed checkpoint save every 10 iterations
    if (i + 1) % 10 == 0:
        partial_df = pd.DataFrame({
            "question": df.head(i + 1)["question"].values,
            "true_answer": df.head(i + 1)["answer"].values,
            "pred_answer": answers,
            "score": scores,
            "reason": reasons
        })
        partial_df.to_csv(save_path, index=False)
        print(f"üíæ Progress saved at sample {i+1}")

#Final Save
results_df = pd.DataFrame({
    "question": df.head(len(scores))["question"].values,
    "true_answer": df.head(len(scores))["answer"].values,
    "pred_answer": answers,
    "score": scores,
    "reason": reasons
})
results_df.to_csv(save_path, index=False)
print(f"\n Completed evaluation ‚Üí Saved to {save_path}")
print(f"Average Score: {np.nanmean([s for s in scores if s is not None]):.3f}")


üíæ Progress saved at sample 10
üíæ Progress saved at sample 20
üíæ Progress saved at sample 30
üíæ Progress saved at sample 40
üíæ Progress saved at sample 50

‚úÖ Completed evaluation ‚Üí Saved to /Users/pulinkumar/Desktop/ALGOVERSE/Financial-Agent-LLM-Judge-Results.csv
Average Score: 0.650
