In [71]:
import csv
from pathlib import Path
from typing import Any, Optional
from typing import Tuple
import time

import numpy as np
import pandas as pd
import vertexai
from numpy import floating
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

# --- Paths (relative to the notebook's working directory) -------------------
DATA_FOLDER = Path("../data")
# Input CSVs: the processing cell globs "*.csv" here and derives the query
# text from each file name.
INPUT_FOLDER = DATA_FOLDER / "query_eval"
# Output folder: per-query parquet files and the cleaned-up CSVs are written here.
RESULT_FOLDER = DATA_FOLDER / "embedding_eval"
# NOTE(review): RESULT_V2_FOLDER is not referenced in the cells visible here.
RESULT_V2_FOLDER = (
    DATA_FOLDER / "embedding_eval_v2"
)  # this contains the API call with type (QUERY or DOCUMENT)

# --- Vertex AI settings passed to vertexai.init() below ---------------------
REGION = "europe-west3"
PROJECT_ID = "adesso-gcc-rtl-uc4"

# (column label, model id) pairs. The label becomes the DataFrame column
# prefix ("<label>", "<label>_distance", "<label>_order"); the model id is
# passed to TextEmbeddingModel.from_pretrained().
EMBEDDING_MODELS = [
    ("gecko@001", "textembedding-gecko@001"),
    ("gecko@002", "textembedding-gecko@002"),
    ("gecko@003", "textembedding-gecko@003"),
    ("gecko@00multilingual", "textembedding-gecko-multilingual@001"),
]

# Module-level side effect: configures the Vertex AI SDK for this kernel session.
vertexai.init(project=PROJECT_ID, location=REGION)

In [73]:
def load_csv(file: Path, col_name: str, encoding: str = "utf-8") -> list[str]:
    """Read one column of a CSV file, skipping empty cells.

    Args:
        file: Path of the CSV file; its first row is used as the header.
        col_name: Header of the column to extract.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The non-empty values of ``col_name`` in file order, with every
        occurrence of the text ``ChromaDB:`` followed by a newline removed.

    Raises:
        KeyError: If the file has data rows but no column named ``col_name``.
    """
    # newline="" lets the csv module handle line breaks inside quoted fields.
    with open(file, newline="", encoding=encoding) as f:
        reader = csv.DictReader(f)
        return [
            row[col_name].replace("ChromaDB:\n", "") for row in reader if row[col_name]
        ]


def rate_limit(max_per_minute):
    """Generator that paces a loop to at most ``max_per_minute`` steps per minute.

    Call ``next()`` on the generator once per unit of work. The first call only
    starts the clock; every later call sleeps for whatever is left of the
    ``60 / max_per_minute`` second period since the previous call returned.

    Args:
        max_per_minute: Maximum number of steps per minute; must be positive.

    Yields:
        None, once per call to ``next()``.

    Raises:
        ValueError: On the first ``next()`` if ``max_per_minute`` is not positive.
    """
    if max_per_minute <= 0:
        raise ValueError(f"max_per_minute must be positive, got {max_per_minute}")
    period = 60 / max_per_minute
    while True:
        # Monotonic clock: immune to wall-clock adjustments, which could
        # otherwise yield negative or very large elapsed times.
        before = time.monotonic()
        yield
        remaining = period - (time.monotonic() - before)
        if remaining > 0:
            time.sleep(remaining)


def encode_texts_to_embeddings(
    docs: list[str | TextEmbeddingInput],
    embedding_model: TextEmbeddingModel,
    instances_per_batch: int = 5,
    requests_per_minute: int = 100,
) -> list[Optional[list[float]]]:
    """Embed ``docs`` in batches, pacing the requests with ``rate_limit``.

    Args:
        docs: Texts (or ``TextEmbeddingInput`` objects) to embed. The list is
            not modified.
        embedding_model: Model whose ``get_embeddings`` is called per batch.
        instances_per_batch: Number of documents sent per request.
        requests_per_minute: Request rate passed to ``rate_limit``.

    Returns:
        One embedding (``.values`` of each returned item) per document, in
        input order. If any request raises, the error is printed and a list
        of ``None`` with exactly ``len(docs)`` entries is returned, so the
        result always lines up with the input.

    Raises:
        ValueError: If ``instances_per_batch`` is smaller than 1.
    """
    if instances_per_batch < 1:
        raise ValueError(
            f"instances_per_batch must be at least 1, got {instances_per_batch}"
        )
    try:
        embeddings = []
        limiter = rate_limit(requests_per_minute)
        # Working in batches because the API accepts maximum 5
        # documents per request to get embeddings. Slicing by index leaves
        # `docs` untouched, so len(docs) is still the input size on failure.
        for start in range(0, len(docs), instances_per_batch):
            batch = docs[start : start + instances_per_batch]
            embeddings.extend(embedding_model.get_embeddings(batch))
            next(limiter)
        return [embedding.values for embedding in embeddings]
    except Exception as e:
        print(f"Error while getting embeddings: {e}")
        return [None] * len(docs)


def get_distance(document: list[float], query_embedding: list[float]) -> floating[Any]:
    """Return the Euclidean (L2) distance between two embedding vectors.

    Args:
        document: Embedding of a document.
        query_embedding: Embedding of the query.

    Returns:
        The norm of the element-wise difference of the two vectors.
    """
    doc_vector = np.array(document)
    query_vector = np.array(query_embedding)
    difference = np.subtract(doc_vector, query_vector)
    return np.linalg.norm(difference)


def get_text_embedding_df(
    docs: list[str], query: str, embedding_models: list[Tuple[str, str]]
) -> pd.DataFrame:
    """Embed ``docs`` and ``query`` with each model and rank docs by distance.

    Args:
        docs: Texts to compare against the query.
        query: Query text; embedded once per model.
        embedding_models: ``(column label, model id)`` pairs. The model id is
            passed to ``TextEmbeddingModel.from_pretrained``.

    Returns:
        A DataFrame with a ``result`` column holding ``docs`` and, per model
        label ``<m>``:

        * ``<m>``: the document embedding (``None`` if embedding failed),
        * ``<m>_distance``: distance to the query embedding rounded to two
          decimals (``NaN`` where the embedding is ``None``),
        * ``<m>_order``: integer rank of the distance, 1 = closest. Equal
          distances share the lowest rank; missing distances rank last.
    """
    df = pd.DataFrame()
    df["result"] = docs

    for embedding_model_name, embedding_model in embedding_models:
        model = TextEmbeddingModel.from_pretrained(embedding_model)

        # get query embedding
        query_embedding = model.get_embeddings([query])[0].values

        # get text embeddings
        df[embedding_model_name] = encode_texts_to_embeddings(docs, model)  # type: ignore

        # get distance for each document embedding to the query embedding;
        # a failed embedding (None) yields NaN instead of crashing
        distance_col = f"{embedding_model_name}_distance"
        df[distance_col] = df[embedding_model_name].map(
            lambda x: np.nan
            if x is None
            else round(get_distance(x, query_embedding), 2)
        )

        # get order of the distance. method="min" keeps ranks integral on ties
        # (the default average rank would be truncated by astype(int)), and
        # na_option="bottom" gives NaN distances a rank so the cast succeeds.
        df[f"{embedding_model_name}_order"] = (
            df[distance_col].rank(method="min", na_option="bottom").astype(int)
        )
    return df

In [None]:
# Column of the input CSVs that holds the texts to embed.
COL_NAME = "Semantic"

# to_parquet does not create missing directories, so make sure the target exists
RESULT_FOLDER.mkdir(parents=True, exist_ok=True)

# sorted() makes the processing order (and the log output) reproducible
for csv_file in sorted(INPUT_FOLDER.glob("*.csv")):
    # query is in file name: underscores stand for spaces
    query = csv_file.stem.replace("_", " ")
    print(f"Processing {csv_file} with query: {query}")

    # load texts
    docs = load_csv(csv_file, COL_NAME)

    # get df with embeddings and distances to query
    df = get_text_embedding_df(docs, query, EMBEDDING_MODELS)
    df.to_parquet(RESULT_FOLDER / f"{csv_file.stem}.parquet", index=False)

In [69]:
# Turn each result parquet into a readable CSV: drop the raw embedding columns
# and add, per model, the texts sorted by their distance to the query.
for result_file in sorted(RESULT_FOLDER.glob("*.parquet")):
    df = pd.read_parquet(result_file)

    # keep only columns whose name contains "distance", "order" or "result";
    # .copy() so the assignments below write to an independent frame
    df = df.loc[:, df.columns.str.contains("distance|order|result")].copy()

    # store each text's original 1-based position in column "original_rank"
    original_rank_col = "original_rank"
    df[original_rank_col] = df.index + 1

    # snapshot the order columns first: the loop below adds columns to df
    order_cols = [col for col in df.columns if col.endswith("_order")]
    for col in order_cols:
        # strip only the trailing "_order" to get the model label
        result_col = col.removesuffix("_order")
        distance_col = result_col + "_distance"
        has_distance = distance_col in df.columns

        helper_cols = ["result", col] + ([distance_col] if has_distance else [])
        # stable sort keeps texts with equal order in their original sequence
        helper_df = df[helper_cols].sort_values(by=col, kind="stable")

        # write the sorted values positionally, so that on every row the
        # model's result, order and distance describe the same text
        df[result_col + "_result"] = helper_df["result"].values
        df[col] = helper_df[col].values
        if has_distance:
            df[distance_col] = helper_df[distance_col].values

    df = df.reindex(sorted(df.columns), axis=1)
    df.to_csv(RESULT_FOLDER / (result_file.stem + ".csv"), index=False)