In [1]:
import json
from pathlib import Path
from pprint import pprint
from typing import Dict, List, Tuple

import faiss
import numpy as np
import pandas as pd
import tiktoken
import yaml
from loguru import logger
from openai import OpenAI
from utils import OpenAIConfig

## Model tokenizers

In [10]:
# Model names whose tokenizer encoding is looked up below: the three GPT
# models first, then the three text-embedding models (order is preserved).
models = [
    *("gpt-4o", "gpt-4o-mini", "gpt-3.5-turbo"),
    *("text-embedding-3-large", "text-embedding-3-small", "text-embedding-ada-002"),
]

In [12]:
# Display which tiktoken encoding each model name resolves to.
{name: tiktoken.encoding_name_for_model(name) for name in models}

{'gpt-4o': 'o200k_base',
 'gpt-4o-mini': 'o200k_base',
 'gpt-3.5-turbo': 'cl100k_base',
 'text-embedding-3-large': 'cl100k_base',
 'text-embedding-3-small': 'cl100k_base',
 'text-embedding-ada-002': 'cl100k_base'}

## Config Loading

In [20]:
# Location of the YAML configuration, relative to the current working directory.
config_path = Path("./config.yaml")
# Fail fast in this cell if the file is missing, instead of failing later in the loader.
assert config_path.exists(), f"File not found {config_path}"

In [31]:
# Build the OpenAIConfig object from the file at `config_path`
# (the loader lives in the project-local `utils` module).
openai_config = OpenAIConfig.load_config_yaml(config_path)

In [34]:
# Pretty-print the loaded configuration for a visual sanity check.
# NOTE(review): confirm the config object carries no API key before committing this cell's output.
pprint(openai_config)

OpenAIConfig(url=URLConfig(embedding='https://api.openai.com/v1/embeddings',
                           chat='https://api.openai.com/v1/chat/completions'),
             max_attempts=5,
             logging_level=20,
             limits=LimitsConfig(requests_per_minute={'gpt_3_5_turbo': 3500,
                                                      'gpt_4o': 500,
                                                      'gpt_4o_mini': 500,
                                                      'text_embedding_3_large': 3000,
                                                      'text_embedding_3_small': 3000,
                                                      'text_embedding_ada_002': 3000},
                                 tokens_per_minute={'gpt_3_5_turbo': 200000,
                                                    'gpt_4o': 30000,
                                                    'gpt_4o_mini': 200000,
                                                    'text_embedding_3_large': 100000

## Parallel Calling

In [6]:
# JSONL file (relative to the working directory) that holds the request payloads.
requests_file = Path("./requests_to_parallel_process.jsonl")

In [8]:
# Build a handful of tiny embedding requests (the input is just the request
# number followed by a newline) and write them to `requests_file`, one JSON
# object per line.
n_requests = 10
jobs = [
    dict(model="text-embedding-3-small", input=f"{x}\n", metadata=dict(id=x))
    for x in range(n_requests)
]
with requests_file.open("w") as f:
    for job in jobs:
        # print() appends the trailing newline that terminates each record.
        print(json.dumps(job), file=f)

In [None]:
def save_jsonl(jobs: List[Dict], file_path: Path) -> None:
    """Write ``jobs`` to ``file_path`` in JSON Lines format.

    Every dictionary is serialized with ``json.dumps`` and written on its own
    line. The file is opened in write mode, so existing content is replaced.

    Args:
        jobs: Dictionaries to serialize, in output order.
        file_path: Destination file.
    """
    with open(file_path, "w") as handle:
        # Lazily serialize so each record is written as soon as it is encoded.
        handle.writelines(json.dumps(record) + "\n" for record in jobs)


def create_jobs(
    df: pd.DataFrame,
    model: str,
    file_path: Path,
    product_key: str = "product_text",
    id_key: str = "id",
) -> None:
    """Build one request payload per row of ``df`` and save them as JSONL.

    Each row becomes
    ``{"model": model, "input": <product_key value>, "metadata": {id_key: <id_key value>}}``
    and the resulting list is written with ``save_jsonl``.

    Args:
        df: Source table; must contain the ``product_key`` and ``id_key`` columns.
        model: Model name stored in every payload.
        file_path: Destination file; must have the ``.jsonl`` suffix.
        product_key: Column holding the text to send as ``input``.
        id_key: Column holding the row identifier; also used as the metadata key.

    Raises:
        ValueError: If ``file_path`` does not end in ``.jsonl``.
        KeyError: If ``product_key`` or ``id_key`` is not a column of ``df``.
    """
    # Raise explicitly: an ``assert`` would surface as AssertionError and is
    # removed entirely when Python runs with -O.
    if file_path.suffix != ".jsonl":
        raise ValueError("File path must be a JSONL file!")

    # Select the columns by label rather than ``itertuples`` + ``getattr``:
    # itertuples renames fields that are not valid identifiers (e.g. labels
    # containing spaces), which made such columns unreachable by name.
    jobs = [
        {
            "model": model,
            "input": text,
            "metadata": {id_key: row_id},
        }
        for text, row_id in zip(df[product_key], df[id_key])
    ]
    save_jsonl(jobs=jobs, file_path=file_path)


def load_results(results_path: Path) -> Tuple[pd.DataFrame, List[str]]:
    """
    Load results from a JSONL file and return a DataFrame.

    Every non-blank line is parsed as JSON and read positionally: element
    ``[2]["id"]`` is the request id and element ``[1]["data"][0]["embedding"]``
    is the embedding. Lines that cannot be parsed or lack those fields are
    logged and skipped.

    Args:
        results_path: Path to the ``.jsonl`` results file.

    Returns:
        A tuple ``(df, fail_ids)``. ``df`` has the columns ``"id"`` and
        ``"embeddings"`` (also when no line succeeded). ``fail_ids`` lists the
        ids of lines whose id could be read but whose embedding could not;
        lines with no readable id are only logged.

    Raises:
        FileNotFoundError: If ``results_path`` does not exist.
        ValueError: If ``results_path`` does not end in ``.jsonl``.
    """
    # Raise explicitly: an ``assert`` would surface as AssertionError and is
    # removed entirely when Python runs with -O.
    if not results_path.exists():
        raise FileNotFoundError("There is no results file!")
    if results_path.suffix != ".jsonl":
        raise ValueError("File path must be a JSONL file!")

    records = []
    fail_ids = []
    with open(results_path, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not failed requests.
                continue
            # Reset per line so a failure never reports a previous line's id
            # (or the builtin ``id`` function).
            record_id = None
            try:
                data = json.loads(line)
                # Read the id first so it is known even if the embedding is missing.
                record_id = data[2]["id"]
                embedding = data[1]["data"][0]["embedding"]
                records.append({"id": record_id, "embeddings": embedding})
            except Exception as e:
                if record_id is not None:
                    fail_ids.append(record_id)
                logger.warning(
                    f"JSON loads failed for ID: {record_id}, with exception: {e}"
                )

    # Explicit columns keep the schema stable when ``records`` is empty.
    df = pd.DataFrame(records, columns=["id", "embeddings"])
    return df, fail_ids

## Terminal command

```bash
python dataset/api_request_parallel_processor.py \
  --requests_filepath dataset/example_requests_to_parallel_process.jsonl \
  --save_filepath examples/data/example_requests_to_parallel_process_results.jsonl \
  --request_url https://api.openai.com/v1/embeddings \
  --max_requests_per_minute 1500 \
  --max_tokens_per_minute 6250000 \
  --token_encoding_name cl100k_base \
  --max_attempts 5 \
  --logging_level 20
```

## Get Top K With Faiss

In [None]:
# Load embeddings from JSONL file
def load_embeddings(jsonl_file: Path, id_key: str = "id") -> np.ndarray:
    """Read the embedding vectors stored in a results JSONL file.

    Every non-blank line is parsed as JSON and read positionally: element
    ``[2][id_key]`` is the request id (used for logging only) and element
    ``[1]["data"][0]["embedding"]`` is the vector. Lines that cannot be parsed
    or lack those fields are logged and skipped, so row ``i`` of the result is
    the ``i``-th *successfully parsed* line, not file line ``i``.

    Args:
        jsonl_file: Path to the results file.
        id_key: Metadata key that holds the request id.

    Returns:
        A ``float32`` array with one row per loaded embedding. It is
        one-dimensional and empty when no line could be loaded.
    """
    vectors = []
    with open(jsonl_file, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            record_id = None  # Known only once the metadata has been read
            try:
                data = json.loads(line)
                record_id = data[2][id_key]
                vectors.append(data[1]["data"][0]["embedding"])
            except Exception as e:
                logger.warning(
                    f"JSON loads failed for ID: {record_id}, with exception: {e}"
                )

    # Build the float32 array directly instead of a float64 array plus a cast copy.
    return np.asarray(vectors, dtype="float32")

# Placeholder path: point this at the results JSONL file to index before running.
jsonl_file = Path("your_embeddings_file.jsonl")
# float32 array of the embeddings that could be parsed from the file.
embeddings = load_embeddings(jsonl_file)

In [None]:
# Initialize the FAISS index
# `embeddings` must be 2-D here (one row per vector); `shape[1]` fails on the
# empty 1-D array that load_embeddings returns when nothing could be loaded.
dimension = embeddings.shape[1]  # Length of each vector
index = faiss.IndexFlatL2(dimension)  # Use L2 distance (Euclidean)

# Add the embeddings to the index
index.add(embeddings)

In [None]:
k = 100  # Number of nearest neighbors

# Perform the search
# The queries are the same `embeddings` array that was added to the index, so
# every vector's own entry is among the candidates returned for it.
# NOTE(review): k is fixed; confirm the index holds at least k vectors, and
# check how FAISS fills result slots when it does not.
distances, indices = index.search(embeddings, k)

# distances: 2D array of shape (number of queries, k) containing distances
# indices: 2D array of shape (number of queries, k) containing indices of the nearest neighbors

Do not run this.

In [None]:
# Flatten the (number of queries, k) search output into a long table with one
# row per (query, neighbor) pair; each query index is repeated k times so it
# lines up with the row-major flattening of `indices` and `distances`.
df_results = pd.DataFrame(
    dict(
        query_index=np.arange(len(embeddings)).repeat(k),
        neighbor_index=indices.ravel(),
        distance=distances.ravel(),
    )
)

# Write the table to CSV, leaving out the DataFrame's own row index.
df_results.to_csv("faiss_search_results.csv", index=False)