In [86]:
import csv
from pathlib import Path
from typing import Any, Optional
from typing import Tuple
import time

from google.api_core.exceptions import InvalidArgument
import numpy as np
import pandas as pd
import vertexai
from numpy import floating
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

# Root folder for all evaluation inputs and outputs, relative to the notebook.
DATA_FOLDER = Path("../data")
# Input CSV files; create_evals processes every "*.csv" in this folder and
# derives the query text from each file name.
INPUT_FOLDER = DATA_FOLDER / "query_eval"
# Output folder used for the run without task types.
RESULT_FOLDER = DATA_FOLDER / "embedding_eval"
RESULT_TASKTYPE_FOLDER = (
    DATA_FOLDER / "embedding_eval_task_type"
)  # output folder for the run whose API calls carry a task type (RETRIEVAL_QUERY or RETRIEVAL_DOCUMENT)
# Location and project passed to vertexai.init below.
REGION = "europe-west3"
PROJECT_ID = "adesso-gcc-rtl-uc4"
# (label, model id) pairs: the label names the result columns of the evaluation
# DataFrame, the model id is passed to TextEmbeddingModel.from_pretrained.
# NOTE(review): the label "gecko@00multilingual" does not follow the pattern of
# the other labels and looks like a typo - confirm before renaming, since the
# label ends up in the output column names.
EMBEDDING_MODELS = [
    ("gecko@001", "textembedding-gecko@001"),
    ("gecko@002", "textembedding-gecko@002"),
    ("gecko@003", "textembedding-gecko@003"),
    ("gecko@00multilingual", "textembedding-gecko-multilingual@001"),
]

# Initialise the Vertex AI SDK once for the whole notebook.
vertexai.init(project=PROJECT_ID, location=REGION)

In [96]:
def load_csv(file: Path, col_name: str, encoding: str = "utf-8") -> list[str]:
    """Read the non-empty values of one column from a CSV file.

    Args:
        file: Path of the CSV file; its first row is used as the header.
        col_name: Header of the column to extract.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII text (e.g. German umlauts) is decoded the same way
            on every platform instead of depending on the locale default.

    Returns:
        The column values in file order, with every occurrence of the text
        "ChromaDB:" followed by a newline removed. Rows whose value is empty
        or missing are skipped.

    Raises:
        KeyError: If a data row has no column named ``col_name``.
    """
    # newline="" lets the csv module handle line breaks inside quoted fields.
    with open(file, newline="", encoding=encoding) as f:
        reader = csv.DictReader(f)
        return [
            row[col_name].replace("ChromaDB:\n", "") for row in reader if row[col_name]
        ]


def rate_limit(max_per_minute):
    """Generator that paces a loop to at most ``max_per_minute`` iterations.

    Call ``next()`` on the generator once per unit of work. The first call
    returns immediately; every later call sleeps for whatever is left of the
    ``60 / max_per_minute`` second period since the previous call returned.

    Args:
        max_per_minute: Maximum number of ``next()`` calls allowed per minute.

    Yields:
        None, each time the caller may proceed.
    """
    period = 60 / max_per_minute
    while True:
        # Use the monotonic clock: the wall clock can be adjusted while the
        # loop runs, which would yield a negative or inflated elapsed time and
        # therefore a far too long (or skipped) sleep.
        before = time.monotonic()
        yield
        elapsed = time.monotonic() - before
        sleep_time = period - elapsed
        if sleep_time > 0:
            time.sleep(sleep_time)


def encode_texts_to_embeddings(
    docs: list[str | TextEmbeddingInput],
    embedding_model: TextEmbeddingModel,
    use_task_type: bool,
    instances_per_batch: int = 5,
    requests_per_minute: int = 100,
) -> list[Optional[list[float]]]:
    """Embed ``docs`` in rate-limited batches.

    Args:
        docs: Texts to embed.
        embedding_model: Model whose ``get_embeddings`` is called per batch.
        use_task_type: If true, every document is wrapped in a
            ``TextEmbeddingInput`` with task type ``RETRIEVAL_DOCUMENT``.
        instances_per_batch: Number of documents sent per request.
        requests_per_minute: Upper bound on requests per minute, enforced
            with ``rate_limit``.

    Returns:
        One embedding (list of floats) per document, in input order. If any
        request fails, the error is printed and a list with one ``None`` per
        input document is returned instead (best effort, no exception).

    Raises:
        ValueError: If ``instances_per_batch`` is smaller than 1.
    """
    if instances_per_batch < 1:
        # A non-positive batch size would never consume the input.
        raise ValueError("instances_per_batch must be at least 1")

    # Remember the input size up front so the failure result always has one
    # entry per input document, no matter how many batches were processed.
    total = len(docs)

    if use_task_type:
        docs = [
            TextEmbeddingInput(text=sentence, task_type="RETRIEVAL_DOCUMENT")  # type: ignore
            for sentence in docs
        ]
    try:
        embeddings = []
        limiter = rate_limit(requests_per_minute)
        # Working in batches because the API accepts maximum 5
        # documents per request to get embeddings
        for start in range(0, total, instances_per_batch):
            batch = docs[start : start + instances_per_batch]
            embeddings.extend(embedding_model.get_embeddings(batch))
            next(limiter)
        return [embedding.values for embedding in embeddings]
    except Exception as e:
        print(f"Error while getting embeddings: {e}")
        return [None] * total


def get_distance(document: list[float], query_embedding: list[float]) -> floating[Any]:
    """Return the Euclidean (L2) distance between two embedding vectors.

    Args:
        document: Embedding of a document.
        query_embedding: Embedding of the query.

    Returns:
        The norm of the element-wise difference of the two vectors.
    """
    doc_vector = np.array(document)
    query_vector = np.array(query_embedding)
    difference = doc_vector - query_vector
    return np.linalg.norm(difference)


def get_text_embedding_df(
    docs: list[str],
    query: str,
    embedding_models: list[Tuple[str, str]],
    use_task_type: bool,
) -> pd.DataFrame:
    """Embed ``docs`` and ``query`` with each model and rank docs by distance.

    Args:
        docs: Document texts; stored in the ``result`` column.
        query: Query text the documents are compared against.
        embedding_models: ``(label, model id)`` pairs. The label prefixes the
            result columns, the model id is passed to
            ``TextEmbeddingModel.from_pretrained``.
        use_task_type: If true, the query is sent with task type
            ``RETRIEVAL_QUERY`` and the flag is forwarded to
            ``encode_texts_to_embeddings`` for the documents.

    Returns:
        A DataFrame with one row per document and, for every model that
        succeeded, the columns ``<label>`` (embedding), ``<label>_distance``
        (distance to the query, rounded to 2 decimals) and ``<label>_order``
        (rank of that distance, 1 = closest, ties share the lowest rank).
        Models that fail are reported on stdout and contribute no columns.
    """
    df = pd.DataFrame()
    df["result"] = docs

    for embedding_model_name, embedding_model in embedding_models:
        try:
            model = TextEmbeddingModel.from_pretrained(embedding_model)

            # get query embedding
            if use_task_type:
                query_input = TextEmbeddingInput(
                    text=query, task_type="RETRIEVAL_QUERY"
                )  # type: ignore
            else:
                query_input = query
            query_embedding = model.get_embeddings([query_input])[0].values

            # get text embeddings
            doc_embeddings = encode_texts_to_embeddings(
                docs, model, use_task_type=use_task_type
            )
        except InvalidArgument as e:
            print(f"Error while getting embeddings for {embedding_model_name}: {e}")
            continue

        # encode_texts_to_embeddings reports failures through None entries
        # (or a result of the wrong length) instead of raising. Skip the model
        # in that case, as for InvalidArgument, rather than crashing below.
        if len(doc_embeddings) != len(docs) or any(
            embedding is None for embedding in doc_embeddings
        ):
            print(
                f"Error while getting embeddings for {embedding_model_name}: "
                "document embeddings are incomplete"
            )
            continue

        df[embedding_model_name] = doc_embeddings  # type: ignore

        # get distance for each document embedding to the query embedding
        distance_col = f"{embedding_model_name}_distance"
        df[distance_col] = df[embedding_model_name].map(
            lambda x: round(get_distance(x, query_embedding), 2)
        )

        # get order of the distance; method="min" keeps ranks integral for
        # ties (the default average rank, e.g. 1.5, would be truncated by the
        # int cast and produce misleading ranks)
        df[f"{embedding_model_name}_order"] = (
            df[distance_col].rank(method="min").astype(int)
        )
    return df

In [88]:
# Header of the CSV column that create_evals passes to load_csv to read the
# document texts.
COL_NAME = "Semantic"


def _build_ranked_report(embedding_df: pd.DataFrame) -> pd.DataFrame:
    """Turn one per-query embedding frame into a per-model ranked report."""
    original_rank_col = "original_rank"

    # keep only the columns whose name contains "distance", "order" or
    # "result"; this drops the raw embedding columns. Copy so that the
    # assignments below never write into a view of the input frame.
    keep = embedding_df.columns.str.contains("distance|order|result")
    report = embedding_df.loc[:, keep].copy()

    # copy index (+ 1) to new column "original_rank"
    report[original_rank_col] = report.index + 1

    # sort texts by each distance order column; every model gets its own
    # "<model>_result" column holding the texts in that model's order
    order_cols = [col for col in report.columns if col.endswith("_order")]
    for order_col in order_cols:
        # strip only the trailing "_order" so the model name stays intact
        model_name = order_col.removesuffix("_order")
        distance_col = f"{model_name}_distance"
        # stable sort: rows with equal rank keep their original order
        ranked = report[["result", order_col, distance_col]].sort_values(
            by=order_col, kind="stable"
        )
        report[f"{model_name}_result"] = ranked["result"].values
        report[distance_col] = ranked[distance_col].values
        report[order_col] = ranked[order_col].values

    return report.reindex(sorted(report.columns), axis=1)


def create_evals(use_task_type: bool, result_folder: Path) -> None:
    """Run the embedding evaluation for every query CSV in ``INPUT_FOLDER``.

    For each input CSV the query is taken from the file name (underscores
    become spaces), the texts of column ``COL_NAME`` are embedded with all
    ``EMBEDDING_MODELS`` and the resulting frame is written to
    ``result_folder`` as parquet. Afterwards every parquet file found in
    ``result_folder`` is reduced to a ranked report and written next to it
    as CSV.

    Args:
        use_task_type: Forwarded to ``get_text_embedding_df``.
        result_folder: Folder receiving the parquet and CSV files; it is
            created if it does not exist yet.
    """
    result_folder.mkdir(parents=True, exist_ok=True)

    # get embeddings and distances
    for csv_file in INPUT_FOLDER.glob("*.csv"):
        # query is in file name
        query = csv_file.stem.replace("_", " ")
        print(f"Processing {csv_file} with query: {query}")

        # load texts
        docs = load_csv(csv_file, COL_NAME)

        # get df with embeddings and distances to query
        df = get_text_embedding_df(
            docs, query, EMBEDDING_MODELS, use_task_type=use_task_type
        )
        df.to_parquet(result_folder / f"{csv_file.stem}.parquet", index=False)

    # write a readable CSV per result: embedding columns removed, texts
    # re-ordered per model
    for result_file in result_folder.glob("*.parquet"):
        report = _build_ranked_report(pd.read_parquet(result_file))
        # append ".csv" to the stem instead of swapping the suffix, because
        # the file names may contain further dots
        report.to_csv(result_folder / (result_file.stem + ".csv"), index=False)

In [81]:
# Evaluation run without task types; files are written to RESULT_FOLDER.
create_evals(use_task_type=False, result_folder=RESULT_FOLDER)

Processing ../data/query_eval/Kinder_essen_in_deutschem_Kindergarten.csv with query: Kinder essen in deutschem Kindergarten
Processing ../data/query_eval/Robert_Habeck_lachend.csv with query: Robert Habeck lachend
Processing ../data/query_eval/Robert_Habeck_mit_positivem_Gesichtsausdruck,_z.B._lachend,_aber_ohne_Coronamaske.csv with query: Robert Habeck mit positivem Gesichtsausdruck, z.B. lachend, aber ohne Coronamaske
Processing ../data/query_eval/Nachstellung_Jugendgewalt.csv with query: Nachstellung Jugendgewalt
Processing ../data/query_eval/Menschen_trinken_Wasser.csv with query: Menschen trinken Wasser
Processing ../data/query_eval/Olaf_Scholz_hält_Rede_im.csv with query: Olaf Scholz hält Rede im
Processing ../data/query_eval/Fischmarkt_in_Hamburg_überflutet_nach_Sturmtief_Zoltan.csv with query: Fischmarkt in Hamburg überflutet nach Sturmtief Zoltan
Processing ../data/query_eval/Robert_Habeck_lachen.csv with query: Robert Habeck lachen
Processing ../data/query_eval/Fußgängerzone_

In [97]:
# Evaluation run with task types; files are written to RESULT_TASKTYPE_FOLDER.
# In the recorded output, gecko@001 rejects the task type with a 400 error,
# so that model is skipped for every query.
create_evals(use_task_type=True, result_folder=RESULT_TASKTYPE_FOLDER)

Processing ../data/query_eval/Kinder_essen_in_deutschem_Kindergarten.csv with query: Kinder essen in deutschem Kindergarten
Error while getting embeddings for gecko@001: 400 Task type RETRIEVAL_QUERY requested, but this model version does not support task-specific inference.
Processing ../data/query_eval/Robert_Habeck_lachend.csv with query: Robert Habeck lachend
Error while getting embeddings for gecko@001: 400 Task type RETRIEVAL_QUERY requested, but this model version does not support task-specific inference.
Processing ../data/query_eval/Robert_Habeck_mit_positivem_Gesichtsausdruck,_z.B._lachend,_aber_ohne_Coronamaske.csv with query: Robert Habeck mit positivem Gesichtsausdruck, z.B. lachend, aber ohne Coronamaske
Error while getting embeddings for gecko@001: 400 Task type RETRIEVAL_QUERY requested, but this model version does not support task-specific inference.
Processing ../data/query_eval/Nachstellung_Jugendgewalt.csv with query: Nachstellung Jugendgewalt
Error while getting em