In [1]:
!pip -q install faiss-cpu


In [2]:
from pathlib import Path
import json
import numpy as np
from tqdm import tqdm

# Input: JSONL file of chunk embeddings (one JSON object per line), produced by
# embeddings.ipynb.
EMB_FILE = Path("data/embeddings/iea_ev_outlook_embeddings_local.jsonl")
# Raise explicitly instead of using `assert`: assertions are removed when Python
# runs with -O, which would let a missing input surface later as a less helpful
# error.
if not EMB_FILE.exists():
    raise FileNotFoundError("Embeddings file not found. Run embeddings.ipynb first.")

# Output directory for the vector store; created on demand so the cell can be
# re-run safely.
VDB_DIR = Path("vector_db")
VDB_DIR.mkdir(parents=True, exist_ok=True)

# The FAISS index and its metadata are written side by side in VDB_DIR.
INDEX_FILE = VDB_DIR / "iea_faiss.index"
META_FILE  = VDB_DIR / "iea_metadata.jsonl"

print("✅ Input:", EMB_FILE)
print("✅ Index output:", INDEX_FILE)
print("✅ Metadata output:", META_FILE)


✅ Input: data/embeddings/iea_ev_outlook_embeddings_local.jsonl
✅ Index output: vector_db/iea_faiss.index
✅ Metadata output: vector_db/iea_metadata.jsonl


In [3]:
# Parse the JSONL embeddings file into two row-aligned collections:
# row i of `vectors` and entry i of `metadata` describe the same chunk.
embedding_rows = []
metadata = []

with EMB_FILE.open("r", encoding="utf-8") as f:
    for line in tqdm(f, desc="Loading embeddings"):
        if not line.strip():
            # Tolerate blank lines in the file; json.loads would raise on them.
            continue
        obj = json.loads(line)
        embedding_rows.append(obj["embedding"])
        metadata.append({
            "chunk_id": obj["chunk_id"],
            "source": obj["source"],
            "year": obj["year"],
            "domain": obj["domain"],
            "text": obj["text"]
        })

# Build the float32 matrix under its final name only once, so `vectors` is
# never a plain list at one point and an ndarray at another.
vectors = np.array(embedding_rows, dtype="float32")

# An empty input yields a 1-D array of shape (0,); fail here with a clear
# message rather than later when `vectors.shape[1]` is read.
if vectors.ndim != 2:
    raise ValueError(
        f"Expected a non-empty 2-D embedding matrix from {EMB_FILE}, "
        f"got shape {vectors.shape}"
    )
print("✅ Loaded vectors:", vectors.shape)


Loading embeddings: 352it [00:00, 8742.69it/s]

✅ Loaded vectors: (352, 384)





In [4]:
import faiss

dim = vectors.shape[1]

# Inner product equals cosine similarity only when both sides are unit-length.
# Queries are encoded with normalize_embeddings=True, so normalize the stored
# vectors as well instead of relying on the upstream notebook having done it.
# astype() returns a fresh C-contiguous float32 copy, so the in-place
# normalization below does not mutate `vectors` from the previous cell.
unit_vectors = vectors.astype("float32", order="C")
faiss.normalize_L2(unit_vectors)

index = faiss.IndexFlatIP(dim)   # inner product (cosine, since inputs are normalized)
index.add(unit_vectors)

print("✅ FAISS index built")
print("Total vectors indexed:", index.ntotal)


✅ FAISS index built
Total vectors indexed: 352


In [5]:
# Persist the FAISS index; write_index is given a plain string path here.
faiss.write_index(index, str(INDEX_FILE))
print("✅ Saved FAISS index:", INDEX_FILE)

# Write the metadata as JSONL, one record per line, in the same order as the
# `metadata` list so that line i corresponds to entry i.
with META_FILE.open("w", encoding="utf-8") as meta_out:
    meta_out.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in metadata
    )

print("✅ Saved metadata:", META_FILE)


✅ Saved FAISS index: vector_db/iea_faiss.index
✅ Saved metadata: vector_db/iea_metadata.jsonl


In [6]:
from sentence_transformers import SentenceTransformer

# Name of the SentenceTransformer model used to embed search queries.
# NOTE(review): query vectors must have the same dimensionality as the vectors
# in the index; presumably this is the same model that produced the stored
# embeddings — confirm against embeddings.ipynb.
MODEL_NAME = "BAAI/bge-small-en-v1.5"
embedder = SentenceTransformer(MODEL_NAME)

def search(query, k=5):
    """Return the best matches for ``query`` from the FAISS index.

    Args:
        query: Text to embed with ``embedder`` (encoded with L2 normalization).
        k: Maximum number of neighbours to return; capped at ``index.ntotal``.

    Returns:
        A ``(scores, ids)`` pair of equal-length 1-D arrays. ``ids`` are row
        numbers in the index, which callers use as positions in ``metadata``.
        Fewer than ``k`` entries are returned when the index holds fewer
        vectors.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Never ask for more neighbours than exist: FAISS pads the missing slots
    # with id -1, and metadata[-1] would silently return the last chunk.
    k = min(int(k), index.ntotal)
    if k == 0:
        return np.empty(0, dtype="float32"), np.empty(0, dtype="int64")

    qv = embedder.encode(query, normalize_embeddings=True).astype("float32")
    D, I = index.search(np.array([qv]), k)

    # Drop any remaining -1 padding so every returned id is a valid row.
    found = I[0] >= 0
    return D[0][found], I[0][found]

# Smoke-test the index with a sample query and print the top hits.
query = "global EV sales growth 2023"
scores, idxs = search(query, k=5)

for rank, (score, idx) in enumerate(zip(scores, idxs), 1):
    if idx < 0:
        # FAISS uses -1 for "no result"; as a Python index it would silently
        # select the last metadata entry, so skip it.
        continue
    hit = metadata[idx]
    print(f"\n#{rank} | score={score:.4f} | chunk_id={hit['chunk_id']} | year={hit['year']}")
    # Show only the first 400 characters of the chunk as a preview.
    print(hit["text"][:400], "...")


  from .autonotebook import tqdm as notebook_tqdm



#1 | score=0.8736 | chunk_id=IEA_2023_000010 | year=2023
is expected to expand. Box 1.1 The 2023 outlook for electric cars is bright Early indications from first quarter sales of 2023 point to an upbeat market, supported by cost declines as well as strengthened policy support in key markets such as the United States. Globally, our current estimate is therefore for nearly 14 million electric cars to be sold in 2023, building on the more than 2.3 million  ...

#2 | score=0.8597 | chunk_id=IEA_2024_000042 | year=2024
demand, with fastest growth in 2023 in the United States and Europe The growth in EV sales is pushing up demand for batteries, continuing the upward trend of recent years. Demand for EV batteries reached more than 750 GWh in 2023, up 40% relative to 2022, though the annual growth rate slowed slightly compared to in 2021-2022. Electric cars account for 95% of this growth. Globally, 95% of the growt ...

#3 | score=0.8524 | chunk_id=IEA_2023_000007 | year=2023
which has a 50% 