In [1]:
import numpy, pandas
print("NumPy:", numpy.__version__)
print("Pandas:", pandas.__version__)


NumPy: 1.26.4
Pandas: 2.2.2


In [2]:
import torch, faiss, chromadb, sentence_transformers
print("Torch:", torch.__version__)
print("FAISS:", faiss.__version__)
print("Chroma:", chromadb.__version__)


  from tqdm.autonotebook import tqdm, trange


Torch: 2.2.2
FAISS: 1.8.0
Chroma: 0.5.5


In [1]:
import os, re, json, math, pickle, uuid, glob, time
from pathlib import Path

import numpy as np
import pandas as pd

import faiss
from sentence_transformers import SentenceTransformer, CrossEncoder
from sentence_transformers.util import cos_sim

# Optional: ChromaDB (persistent vector store)
import chromadb
from chromadb.config import Settings

print("NumPy:", np.__version__)
print("Pandas:", pd.__version__)


  from tqdm.autonotebook import tqdm, trange


NumPy: 1.26.4
Pandas: 2.2.2


In [4]:
import pandas as pd
from tqdm import tqdm
import numpy as np

# Load your financial QA dataset
data = pd.read_csv("/Users/pulinkumar/Desktop/ALGOVERSE/Financial-QA-10k.csv")
data = data.fillna("")
print("Dataset size:", data.shape)
print(data.head(10))


Dataset size: (7000, 5)
                                            question  \
0  What area did NVIDIA initially focus on before...   
1  What are some of the recent applications of GP...   
2  What significant invention did NVIDIA create i...   
3  How does NVIDIA's platform strategy contribute...   
4  What does NVIDIA's CUDA programming model enable?   
5  What industries use NVIDIA's GPUs and software...   
6  Why did NVIDIA and SoftBank terminate their Sh...   
7  What amount did NVIDIA record as an acquisitio...   
8  What does the NVIDIA computing platform focus ...   
9  What are the key components of the NVIDIA comp...   

                                              answer  \
0           NVIDIA initially focused on PC graphics.   
1  Recent applications of GPU-powered deep learni...   
2                   NVIDIA invented the GPU in 1999.   
3  NVIDIA's platform strategy brings together har...   
4  NVIDIA's CUDA programming model opened the par...   
5  NVIDIA's GPUs and so

In [None]:
#imports & environment check
import os, re, json
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import faiss
from sentence_transformers import SentenceTransformer
from transformers import pipeline

print("✅ Environment OK")
print("Torch:", torch.__version__)
print("FAISS:", faiss.__version__)


✅ Environment OK
Torch: 2.2.2
FAISS: 1.8.0


In [None]:
#setup
CSV_PATH = "/Users/pulinkumar/Desktop/ALGOVERSE/Financial-QA-10k.csv"

df = pd.read_csv(CSV_PATH).fillna("")
print("✅ Dataset loaded — shape:", df.shape)
print(df.head(5))


✅ Dataset loaded — shape: (7000, 5)
                                            question  \
0  What area did NVIDIA initially focus on before...   
1  What are some of the recent applications of GP...   
2  What significant invention did NVIDIA create i...   
3  How does NVIDIA's platform strategy contribute...   
4  What does NVIDIA's CUDA programming model enable?   

                                              answer  \
0           NVIDIA initially focused on PC graphics.   
1  Recent applications of GPU-powered deep learni...   
2                   NVIDIA invented the GPU in 1999.   
3  NVIDIA's platform strategy brings together har...   
4  NVIDIA's CUDA programming model opened the par...   

                                             context ticker    filing  
0  Since our original focus on PC graphics, we ha...   NVDA  2023_10K  
1  Some of the most recent applications of GPU-po...   NVDA  2023_10K  
2  Our invention of the GPU in 1999 defined moder...   NVDA  2023_10K  
3 

In [11]:
def detect_column(candidates):
    """Return the first matching column name for any candidate keyword."""
    for c in candidates:
        for col in df.columns:
            if c.lower() in col.lower().replace(" ", ""):
                return col
    return None

text_col     = detect_column(["context", "text", "passage", "body", "document"])
question_col = detect_column(["question", "query"])
answer_col   = detect_column(["answer", "gold", "label"])

print("Detected columns:")
print("TEXT:", text_col)
print("QUESTION:", question_col)
print("ANSWER:", answer_col)


Detected columns:
TEXT: context
QUESTION: question
ANSWER: answer


In [12]:
# ====== CONFIG ======
CSV_PATH = "/Users/pulinkumar/Desktop/ALGOVERSE/Financial-QA-10k.csv"

SAVE_DIR = "outputs"                   # where to save index & artifacts
EMBED_MODEL = "all-MiniLM-L6-v2"       # fast, CPU-friendly
BATCH_SIZE = 256                       # lower if RAM is tight (128/64)
TOP_K = 3                              # how many passages to retrieve
MIN_CONSISTENCY = 0.35                 # low-consistency warning threshold

# paths for artifacts
FAISS_INDEX_PATH = f"{SAVE_DIR}/faiss.index"
TEXTS_PATH = f"{SAVE_DIR}/texts.npy"


In [13]:
import os, re, json, string
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import faiss
from sentence_transformers import SentenceTransformer
from transformers import pipeline

print("✅ Torch:", torch.__version__, "| ✅ FAISS:", faiss.__version__)
os.makedirs(SAVE_DIR, exist_ok=True)


✅ Torch: 2.2.2 | ✅ FAISS: 1.8.0


In [None]:
#load dataset
df = pd.read_csv(CSV_PATH).fillna("")
print("✅ Dataset:", df.shape)
df.head(5)


✅ Dataset: (7000, 5)


Unnamed: 0,question,answer,context,ticker,filing
0,What area did NVIDIA initially focus on before...,NVIDIA initially focused on PC graphics.,"Since our original focus on PC graphics, we ha...",NVDA,2023_10K
1,What are some of the recent applications of GP...,Recent applications of GPU-powered deep learni...,Some of the most recent applications of GPU-po...,NVDA,2023_10K
2,What significant invention did NVIDIA create i...,NVIDIA invented the GPU in 1999.,Our invention of the GPU in 1999 defined moder...,NVDA,2023_10K
3,How does NVIDIA's platform strategy contribute...,NVIDIA's platform strategy brings together har...,"NVIDIA has a platform strategy, bringing toget...",NVDA,2023_10K
4,What does NVIDIA's CUDA programming model enable?,NVIDIA's CUDA programming model opened the par...,With our introduction of the CUDA programming ...,NVDA,2023_10K


In [None]:
#quick sanity: columns already detected
text_col, question_col, answer_col = "context", "question", "answer"
assert text_col in df.columns, "context column missing"
assert question_col in df.columns, "question column missing"
assert answer_col in df.columns, "answer column missing"


In [None]:
#lightweight domain tagging (helps analysis/routing later)
def classify_domain(text: str) -> str:
    t = text.lower()
    if any(w in t for w in ["loan","mortgage","credit card","bank","deposit","savings","apr"]):
        return "Banking"
    if any(w in t for w in ["stock","equity","ipo","dividend","portfolio","market","etf","alpha","beta"]):
        return "Investment"
    if any(w in t for w in ["policy","premium","claim","underwriting","deductible","actuary"]):
        return "Insurance"
    if any(w in t for w in ["fintech","digital wallet","upi","psp","api","mobile app","neo-bank"]):
        return "Fintech"
    return "General"

df["domain"] = df[text_col].map(classify_domain)
df["domain"].value_counts()


domain
General       5260
Investment    1066
Banking        306
Fintech        192
Insurance      176
Name: count, dtype: int64

In [None]:
#build embeddings (batched) & FAISS cosine index
# 5.1 embedder
embedder = SentenceTransformer(EMBED_MODEL)

# 5.2 texts as ndarray (saves/read fast)
texts = df[text_col].astype(str).values
np.save(TEXTS_PATH, texts)

# 5.3 batched encode with L2-normalization (so inner product == cosine)
all_vecs = []
for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="Embedding"):
    batch = texts[i:i+BATCH_SIZE].tolist()
    vecs = embedder.encode(
        batch,
        convert_to_numpy=True,
        normalize_embeddings=True,  # unit vectors
        show_progress_bar=False
    ).astype("float32")
    all_vecs.append(vecs)

embeddings = np.vstack(all_vecs)
print("✅ Embeddings:", embeddings.shape, embeddings.dtype)

# 5.4 FAISS index: inner product (with normalized vectors = cosine)
d = embeddings.shape[1]
index = faiss.IndexFlatIP(d)
index.add(embeddings)
faiss.write_index(index, FAISS_INDEX_PATH)
print("✅ FAISS index built:", index.ntotal)


Embedding: 100%|██████████| 28/28 [07:07<00:00, 15.25s/it]

✅ Embeddings: (7000, 384) float32
✅ FAISS index built: 7000





In [None]:
#retrieval helper
def retrieve(query: str, top_k: int = TOP_K):
    qv = embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype("float32")
    sim, idx = index.search(qv, top_k)
    idx, sim = idx[0].tolist(), sim[0].tolist()
    return [{"score": float(s), "text": texts[i]} for i, s in zip(idx, sim)]


In [None]:
#cpu-friendly extractive QA model
try:
    qa_pipe = pipeline("question-answering", model="deepset/roberta-base-squad2", device=-1)
except Exception:
    qa_pipe = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad", device=-1)

def answer_extractive(question: str, contexts: list):
    best = {"answer": "", "score": -1.0, "context": ""}
    for ctx in contexts:
        out = qa_pipe(question=question, context=ctx)
        if out["score"] > best["score"]:
            best = {"answer": out["answer"], "score": float(out["score"]), "context": ctx}
    return best


Device set to use cpu


In [None]:
#semantic consistency score
def consistency_score(answer: str, contexts: list) -> float:
    if not answer.strip():
        return 0.0
    a = embedder.encode([answer], convert_to_numpy=True, normalize_embeddings=True)
    ctxs = embedder.encode(contexts, convert_to_numpy=True, normalize_embeddings=True)
    # mean cosine similarity (inner product on unit vectors)
    sims = (a @ ctxs.T).ravel()
    return float(np.mean(sims)) if len(sims) else 0.0


In [None]:
#end-to-end ask()
def ask(question: str, k: int = TOP_K, min_consistency: float = MIN_CONSISTENCY):
    hits = retrieve(question, top_k=k)
    contexts = [h["text"] for h in hits]
    qa = answer_extractive(question, contexts)
    cscore = consistency_score(qa["answer"], contexts)

    result = {
        "question": question,
        "answer": qa["answer"],
        "confidence": round(qa["score"], 3),
        "consistency": round(cscore, 3),
        "citations": [{"score": round(h["score"],3), "snippet": h["text"][:300]} for h in hits]
    }
    if cscore < min_consistency:
        result["note"] = "⚠️ Low semantic consistency with retrieved evidence. Please verify."
    return result


In [None]:
#quick test
sample_q = "How do interest rate hikes affect bond prices?"
out = ask(sample_q)
print(json.dumps(out, indent=2))


{
  "question": "How do interest rate hikes affect bond prices?",
  "answer": "higher market interest rates offered for retail deposits",
  "confidence": 0.335,
  "consistency": 0.645,
  "citations": [
    {
      "score": 0.514,
      "snippet": "In addition, economic conditions and actions by policymaking bodies are contributing to changing interest rates and significant capital market volatility, which, along with any increases in our borrowing levels, could increase our future borrowing costs."
    },
    {
      "score": 0.505,
      "snippet": "The increase in interest rates paid on our deposits were primarily due to the impact of higher market interest rates offered for retail deposits."
    },
    {
      "score": 0.496,
      "snippet": "Interest expense increased, primarily driven by higher interest rates paid on customer deposits."
    }
  ]
}


In [24]:
#evaluate on your Q/A columns (EM & F1)
def normalize_text(s):
    s = s.lower().strip()
    s = s.translate(str.maketrans('', '', string.punctuation))
    s = re.sub(r"\s+", " ", s)
    return s

def exact_match(pred, gold):
    return int(normalize_text(pred) == normalize_text(gold))

def f1_score(pred, gold):
    p, g = normalize_text(pred).split(), normalize_text(gold).split()
    if not p and not g: return 1.0
    if not p or not g:  return 0.0
    # count overlap
    pc, gc = pd.Series(p).value_counts(), pd.Series(g).value_counts()
    num_same = int((pc & gc).sum())
    if num_same == 0: return 0.0
    precision = num_same / len(p)
    recall    = num_same / len(g)
    return 2 * precision * recall / (precision + recall)

subset = df[[question_col, answer_col]].sample(min(500, len(df)), random_state=42)

ems, f1s = [], []
for _, row in tqdm(subset.iterrows(), total=len(subset), desc="Evaluating"):
    pred = ask(row[question_col])["answer"]
    gold = row[answer_col]
    ems.append(exact_match(pred, gold))
    f1s.append(f1_score(pred, gold))

print(f"✅ Eval on {len(subset)} samples | EM: {np.mean(ems):.3f} | F1: {np.mean(f1s):.3f}")


  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = int((pc & gc).sum())
  num_same = i

✅ Eval on 500 samples | EM: 0.162 | F1: 0.309



