In [None]:
from __future__ import annotations

In [1]:
from dataclasses import dataclass
from typing import List

In [3]:
from constants import EMBEDDING_PROVIDER, EMBEDDING_MODEL, EMBEDDING_DIMENSIONS

In [4]:
class EmbeddingClient:
    """
    Minimal embedding contract shared by the indexers/retrievers in this repo.

    Subclasses are expected to override `encode_documents`; `encode_queries`
    delegates to it unless a subclass provides its own implementation.
    """

    def encode_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed `texts` into a list of float vectors. Not implemented on the base class."""
        raise NotImplementedError()

    def encode_queries(self, texts: List[str]) -> List[List[float]]:
        """Embed query strings; by default this is the same path as documents."""
        query_vectors = self.encode_documents(texts)
        return query_vectors

In [5]:
@dataclass
class LocalSentenceTransformerEmbeddings(EmbeddingClient):
    """
    Embedding client backed by a SentenceTransformer model loaded in-process.

    The model named by `model_name` (default: EMBEDDING_MODEL) is instantiated
    as soon as the dataclass instance is created.
    """

    model_name: str = EMBEDDING_MODEL

    def __post_init__(self) -> None:
        # Imported here rather than at module level, so sentence_transformers
        # is only required when this client is actually constructed.
        from sentence_transformers import SentenceTransformer

        self._model = SentenceTransformer(self.model_name)

    def encode_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Encode `texts` with the loaded model.

        Requests normalized numpy embeddings without a progress bar, then
        casts them to float32 and returns them as nested Python lists.
        """
        encode_options = dict(
            normalize_embeddings=True,
            convert_to_numpy=True,
            show_progress_bar=False,
        )
        embeddings = self._model.encode(texts, **encode_options)
        # Cast first, then convert so callers receive list[list[float]].
        return embeddings.astype("float32").tolist()

In [6]:
def get_embedding_client() -> EmbeddingClient:
    """
    Build the embedding client selected by EMBEDDING_PROVIDER.

    `"local"` yields a LocalSentenceTransformerEmbeddings; any other value
    falls through to the OpenAI embedding helper from pymilvus.

    Note: Groq does not provide embeddings; use `openai` or `local`.
    """
    use_local_model = EMBEDDING_PROVIDER == "local"

    if not use_local_model:
        # Default: OpenAI embeddings via pymilvus helper (requires OPENAI_API_KEY).
        # The helper is a pymilvus class, not an EmbeddingClient subclass; it is
        # returned as-is and used through its own encode_* methods.
        from pymilvus.model.dense import OpenAIEmbeddingFunction

        return OpenAIEmbeddingFunction(
            model_name=EMBEDDING_MODEL,
            dimensions=EMBEDDING_DIMENSIONS,
        )

    return LocalSentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)

In [7]:
# Instantiate the embedding client chosen by get_embedding_client()
# (driven by EMBEDDING_PROVIDER); every later cell embeds through `client`.
client = get_embedding_client()

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
import pickle

# Load the pickled chunk data that is passed to the embedding client below.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# open files that come from a trusted source.
# NOTE(review): presumably this is a list of strings, since it is handed to
# encode_documents unchanged; confirm against the cell that produced the file.
with open("2024_akademik_chunks_data.pkl", "rb") as f:
    loaded_texts = pickle.load(f)

In [11]:
# Embed everything loaded from the pickle in a single call; `outputs` holds
# whatever sequence of vectors the active client returns.
outputs = client.encode_documents(loaded_texts)

In [12]:
# Sanity check: for every embedding, print its 1-based position, its number
# of dimensions, and its first five values.
for i, vector in enumerate(outputs):
    print(
        f"\n--- Output Kalimat ke-{i+1} ---",
        f"Jumlah Dimensi: {len(vector)}",
        f"5 Angka Pertama: {vector[:5]}...",
        sep="\n",
    )


--- Output Kalimat ke-1 ---
Jumlah Dimensi: 384
5 Angka Pertama: [-0.0371372289955616, 0.0031697636004537344, 0.03584212064743042, -0.06642220169305801, -0.06723808497190475]...

--- Output Kalimat ke-2 ---
Jumlah Dimensi: 384
5 Angka Pertama: [-0.010801523923873901, 0.040990039706230164, 0.03509872406721115, -0.02685549668967724, -0.045461155474185944]...

--- Output Kalimat ke-3 ---
Jumlah Dimensi: 384
5 Angka Pertama: [-0.04911261424422264, 0.016192352399230003, 0.0510857030749321, -0.059807222336530685, -0.07693922519683838]...

--- Output Kalimat ke-4 ---
Jumlah Dimensi: 384
5 Angka Pertama: [-0.033774182200431824, 0.06827365607023239, -0.004047732800245285, -0.042902007699012756, -0.018914224579930305]...

--- Output Kalimat ke-5 ---
Jumlah Dimensi: 384
5 Angka Pertama: [-0.026193682104349136, -0.0362766832113266, -0.004814766347408295, -0.050724029541015625, -0.06578364968299866]...

--- Output Kalimat ke-6 ---
Jumlah Dimensi: 384
5 Angka Pertama: [-0.09645994752645493, 0.03027

In [16]:
import pickle

# --- SAVING ---
# Take a shallow copy of the embeddings so the saved list is independent of
# the `outputs` container itself.
vectors_to_save = list(outputs)

# Persist the vectors in binary pickle form.
with open("2024_akademik_vectors.pkl", "wb") as f:
    pickle.dump(vectors_to_save, f)

saved_message = f"Berhasil menyimpan {len(vectors_to_save)} vektor ke pkl."
print(saved_message)

Berhasil menyimpan 18 vektor ke pkl.


In [17]:
import json


def _vector_to_jsonable(obj):
    """
    Fallback for json.dump: convert array-like objects into plain lists.

    json only calls this for objects it cannot serialize natively, so vectors
    that are already Python lists of floats are written exactly as before.

    Raises:
        TypeError: if `obj` has no `tolist()` method (same error type json
            itself raises for unsupported objects).
    """
    if hasattr(obj, "tolist"):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# Write the same vectors that were pickled above to a human-readable JSON file.
# The `default` hook keeps this cell working when the vectors are array objects
# (e.g. numpy arrays, which the pymilvus OpenAI helper is documented to return;
# confirm for the installed version) instead of nested Python lists.
with open("2024_akademik_embedding_data.json", "w", encoding="utf-8") as f:
    json.dump(
        vectors_to_save,
        f,
        ensure_ascii=False,
        indent=4,
        default=_vector_to_jsonable,
    )
print("Data chunks berhasil disimpan ke json.")

Data chunks berhasil disimpan ke json.
