In [26]:
!pip -q install sentence-transformers tqdm
!pip install tf-keras

Collecting tf-keras
  Downloading tf_keras-2.20.1-py3-none-any.whl.metadata (1.8 kB)
Downloading tf_keras-2.20.1-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: tf-keras
Successfully installed tf-keras-2.20.1


In [27]:
from pathlib import Path
import json
from tqdm import tqdm

# Input: chunk records, one JSON object per line.
CHUNK_FILE = Path("data/chunks/iea_ev_outlook_chunks.jsonl")
# Output: the embeddings file lives in OUT_DIR.
OUT_DIR = Path("data/embeddings")
OUT_FILE = OUT_DIR / "iea_ev_outlook_embeddings_local.jsonl"

# Validate the input before creating anything on disk. An explicit raise is
# used instead of `assert` because asserts are stripped when Python runs with -O.
if not CHUNK_FILE.exists():
    raise FileNotFoundError("Chunk file not found.")

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Parse one record per line. Blank / whitespace-only lines carry no record and
# would make json.loads raise, so they are skipped.
with CHUNK_FILE.open("r", encoding="utf-8") as f:
    chunks = [json.loads(line) for line in f if line.strip()]

print("✅ Loaded chunks:", len(chunks))


✅ Loaded chunks: 352


In [28]:
from sentence_transformers import SentenceTransformer

# Checkpoint used for the local embeddings (author's note: "excellent + fast").
MODEL_NAME = "BAAI/bge-small-en-v1.5"

# Instantiate the encoder once; later cells reuse `model`.
model = SentenceTransformer(model_name_or_path=MODEL_NAME)
print("✅ Loaded embedding model:", MODEL_NAME)


✅ Loaded embedding model: BAAI/bge-small-en-v1.5


In [29]:
def load_done_ids(out_file: Path) -> set:
    """Collect the chunk ids that already have a record in ``out_file``.

    Args:
        out_file: JSONL file in which every record is a JSON object with a
            "chunk_id" key.

    Returns:
        The set of "chunk_id" values found in the file; an empty set when the
        file does not exist.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no "chunk_id" key.
    """
    if not out_file.exists():
        return set()
    with out_file.open("r", encoding="utf-8") as f:
        # Blank / whitespace-only lines hold no record and json.loads would
        # raise on them, so skip them instead of aborting the resume.
        return {json.loads(line)["chunk_id"] for line in f if line.strip()}

# Number of texts handed to the model per encode() call.
EMBED_BATCH_SIZE = 32


def _missing_trailing_newline(path: Path) -> bool:
    """Return True when ``path`` is non-empty and its last byte is not a newline."""
    if not path.exists() or path.stat().st_size == 0:
        return False
    with path.open("rb") as f:
        f.seek(-1, 2)  # whence=2: offset is relative to the end of the file
        return f.read(1) != b"\n"


done_ids = load_done_ids(OUT_FILE)
print("Already embedded:", len(done_ids))

# Select the work up front: skip ids already on disk and ids repeated inside
# `chunks`, so no chunk_id is written twice and the progress bar only counts
# chunks that are actually embedded.
pending = []
queued_ids = set(done_ids)
for obj in chunks:
    if obj["chunk_id"] in queued_ids:
        continue
    queued_ids.add(obj["chunk_id"])
    pending.append(obj)

# Checked before opening in append mode: if the last record on disk lacks its
# newline, the next record would otherwise be glued onto the same line.
needs_newline = _missing_trailing_newline(OUT_FILE)

count_new = 0

with OUT_FILE.open("a", encoding="utf-8") as out_f:
    if pending and needs_newline:
        out_f.write("\n")

    with tqdm(total=len(pending), desc="Embedding chunks (local)") as progress:
        for start in range(0, len(pending), EMBED_BATCH_SIZE):
            batch = pending[start:start + EMBED_BATCH_SIZE]

            # One encode() call per batch instead of one per chunk.
            # NOTE(review): batched encoding may differ from per-text encoding
            # in the last float digits — confirm this is acceptable.
            embeddings = model.encode(
                [obj["text"] for obj in batch],
                batch_size=EMBED_BATCH_SIZE,
                normalize_embeddings=True,
                show_progress_bar=False,  # the outer tqdm bar already reports progress
            )

            for obj, emb in zip(batch, embeddings):
                out_record = {
                    "chunk_id": obj["chunk_id"],
                    "source": obj["source"],
                    "year": obj["year"],
                    "domain": obj["domain"],
                    "text": obj["text"],
                    "embedding": emb.tolist(),
                }
                out_f.write(json.dumps(out_record, ensure_ascii=False) + "\n")
                count_new += 1

            # Hand the finished batch to the OS so records are not left sitting
            # in Python's write buffer if the kernel is interrupted.
            out_f.flush()
            progress.update(len(batch))

print("✅ New embeddings created:", count_new)
print("✅ Output file:", OUT_FILE)



Already embedded: 0


Embedding chunks (local): 100%|██████████| 352/352 [00:14<00:00, 23.54it/s]

✅ New embeddings created: 352
✅ Output file: data/embeddings/iea_ev_outlook_embeddings_local.jsonl





In [30]:
import json

# Sanity-check the output file: count the records and confirm that every
# embedding has the same length as the first one.
n = 0
dim = None
with OUT_FILE.open("r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        # Blank lines hold no record; json.loads would raise on them.
        if not line.strip():
            continue
        obj = json.loads(line)
        n += 1
        current_dim = len(obj["embedding"])
        if dim is None:
            dim = current_dim
        elif current_dim != dim:
            # Checking only the first record would let a file with mixed
            # vector sizes pass unnoticed.
            raise ValueError(
                f"{OUT_FILE}:{line_no}: embedding has dimension {current_dim}, expected {dim}"
            )

print("✅ Total vectors:", n)
print("✅ Embedding dimension:", dim)


✅ Total vectors: 352
✅ Embedding dimension: 384
