https://github.com/DrMikeSh/medical_research_agent/blob/main/batch_openAI/icd_batch_embedding.ipynb

https://github.com/openai/openai-cookbook/blob/main/examples/api_request_parallel_processor.py#

---
graphrag
https://github.com/microsoft/graphrag

In [1]:
import json
import os
import time

import openai
import pandas as pd
import qdrant_client
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams

In [None]:
# --- Notebook configuration ---
# The API key comes from the environment; the value is None when the variable is unset.
openai_api_key = os.getenv("OPENAI_API_KEY")

# Embedding model used for the probe request and for every batch request.
embedding_model = "text-embedding-3-small"

# Folder that is scanned for batch request files when uploading them.
batch_folder = "./batch_files"

In [None]:
# Shared API client for the embedding, file-upload and batch calls below.
# `openai.OpenAI` is the class that the `openai.Client` name aliases.
openai_client = openai.OpenAI(api_key=openai_api_key)


In [5]:
# Embed one throwaway string so the vector size can be read off a real response.
response = openai_client.embeddings.create(
    model=embedding_model,
    input="Your text string goes here",
)
probe_vector = response.data[0].embedding

# Reused later as the `dimensions` value of every batch request.
embedding_vector_length = len(probe_vector)
print(f"Embedding vector length: {embedding_vector_length}")

Embedding vector length: 1536


In [6]:
# Load the source PDF and print how many documents the loader returned.
pdf_path = "./PDFs/attention.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()
print(len(documents))

15


In [7]:
# Splitter settings: target chunk size, overlap between neighbouring chunks,
# and `add_start_index` so each chunk's metadata carries a `start_index` entry.
splitter_options = {
    "chunk_size": 1024,
    "chunk_overlap": 128,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

In [8]:
# Split the loaded documents into chunks with the splitter configured above;
# the bare `len(chunks)` on the last line displays the chunk count as the cell output.
chunks = text_splitter.split_documents(documents)
len(chunks)

49

In [26]:
# Inspect the first chunk to see which metadata keys and page content are available.
chunks[0]

Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': './PDFs/attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1', 'start_index': 0}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser

In [9]:
# Build the chunk table in a single constructor call from a list of records.
# Growing a DataFrame one row at a time with `df.loc[len(df)] = ...` copies data on
# every insert; collecting the records first yields the same rows, in the same order,
# with the default 0..n-1 index that is later used to derive each request's custom id.
chunk_records = [
    {
        "text": chunk.page_content,
        "source": chunk.metadata["source"],
        "title": chunk.metadata["title"],
        "page_label": chunk.metadata["page_label"],
    }
    for chunk in chunks
]

# Passing `columns` keeps the expected schema even when there are no chunks.
df = pd.DataFrame(chunk_records, columns=["text", "source", "title", "page_label"])

In [10]:
# Write the embedding requests as JSONL batch input files, at most `batch_size`
# requests per file.
batch_size = 20000
batch_file = df.copy()
batch_file_name = "embedding_batch"

# The upload cell reads from `batch_folder`, so write there and make sure it exists.
os.makedirs(batch_folder, exist_ok=True)

# Drop part files left over from an earlier run: the upload cell sends everything it
# finds in the folder, so a stale extra part would be submitted as its own batch.
for stale_name in os.listdir(batch_folder):
    if stale_name.startswith(f"{batch_file_name}_part") and stale_name.endswith(
        ".jsonl"
    ):
        os.remove(os.path.join(batch_folder, stale_name))

# Ceiling division. `len // batch_size + 1` produced one extra, empty file whenever
# the row count was an exact multiple of `batch_size` (including zero rows).
num_files = (len(batch_file) + batch_size - 1) // batch_size
for num_file in range(num_files):
    output_file = f"{batch_folder}/{batch_file_name}_part{num_file}.jsonl"
    # iloc slicing clamps at the end of the frame, so no explicit min() is needed.
    part_rows = batch_file.iloc[batch_size * num_file : batch_size * (num_file + 1)]

    # Mode "w" truncates, so the file never accumulates lines from a previous run.
    with open(output_file, "w", encoding="utf-8") as file:
        for index, row in part_rows.iterrows():
            payload = {
                # The row's index label is recovered from this id when results are merged.
                "custom_id": f"custom_id_{index}",
                "method": "POST",
                "url": "/v1/embeddings",
                "body": {
                    "input": row["text"],
                    "model": embedding_model,
                    "encoding_format": "float",  # default is float
                    "dimensions": embedding_vector_length,
                },
            }
            file.write(json.dumps(payload) + "\n")

    # Sanity check: print the first 2 lines without reading the whole file into memory.
    with open(output_file, "r", encoding="utf-8") as file:
        for line_number, line in enumerate(file):
            if line_number >= 2:
                break
            print(line)

{"custom_id": "custom_id_0", "method": "POST", "url": "/v1/embeddings", "body": {"input": "Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani\u2217\nGoogle Brain\navaswani@google.com\nNoam Shazeer\u2217\nGoogle Brain\nnoam@google.com\nNiki Parmar\u2217\nGoogle Research\nnikip@google.com\nJakob Uszkoreit\u2217\nGoogle Research\nusz@google.com\nLlion Jones\u2217\nGoogle Research\nllion@google.com\nAidan N. Gomez\u2217 \u2020\nUniversity of Toronto\naidan@cs.toronto.edu\n\u0141ukasz Kaiser\u2217\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin\u2217 \u2021\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an att

In [None]:
# Upload every JSONL request file in `batch_folder` to OpenAI with purpose "batch".
batch_input_files = []
# Sorted so the upload order (and therefore the part numbering used for the job
# descriptions) is deterministic; os.listdir returns entries in arbitrary order.
for file in sorted(os.listdir(batch_folder)):
    # Skip anything that is not a request file (hidden files, checkpoints, ...).
    if not file.endswith(".jsonl"):
        continue
    # The context manager closes the handle once the upload call returns;
    # a bare open(...) left one file descriptor open per uploaded file.
    with open(f"{batch_folder}/{file}", "rb") as request_file:
        batch_input_files.append(
            openai_client.files.create(file=request_file, purpose="batch")
        )

In [None]:
# Submit one embeddings batch job per uploaded input file.
# Start by collecting the ids of the uploaded files.
batch_file_ids = [uploaded.id for uploaded in batch_input_files]

job_creations = []
for part_number, input_file_id in enumerate(batch_file_ids):
    created_job = openai_client.batches.create(
        input_file_id=input_file_id,
        endpoint="/v1/embeddings",
        completion_window="24h",  # currently only 24h is supported
        metadata={"description": f"part_{part_number}_icd_embeddings"},
    )
    # Appended right away so already-created jobs stay tracked if a later call raises.
    job_creations.append(created_job)

In [None]:
# Show the jobs that were just created; they start out in the validation stage.
for job in job_creations:
    print(job)

# Collect the job ids so their status can be polled.
# The ids always come from the jobs created in this run: pinning a literal id here
# would make the following cells poll and download an older job instead of the one
# that was just submitted.
job_ids = [job.id for job in job_creations]

Batch(id='batch_67cad006e9c48190a6c3027f08661291', completion_window='24h', created_at=1741344774, endpoint='/v1/embeddings', input_file_id='file-KMEM5AA2WdwahPAbSxBHiB', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1741431174, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'part_0_icd_embeddings'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [18]:
# Poll the batch jobs until every one has completed, or until one of them ends in a
# state from which it can no longer complete.
POLL_INTERVAL_SECONDS = 600
# Besides "failed", a batch can also end as "expired" or "cancelled". Without treating
# those as terminal, the loop below would keep sleeping and polling forever.
TERMINAL_FAILURE_STATUSES = {"failed", "expired", "cancelled"}

fail_flag = False
finished = set()
while True:
    # Check the status of each job.
    for job_id in job_ids:
        job = openai_client.batches.retrieve(job_id)
        if job.status in TERMINAL_FAILURE_STATUSES:
            if job.status == "failed":
                print(f"Job {job_id} has failed with error {job.errors}")
            else:
                print(f"Job {job_id} ended with status {job.status}")
            fail_flag = True
            break  # stop checking the remaining jobs; the outer loop exits below
        elif job.status == "in_progress":
            print(
                f"Job {job_id} is in progress, {job.request_counts.completed}/{job.request_counts.total} requests completed"
            )
        elif job.status == "finalizing":
            print(f"Job {job_id} is finalizing, waiting for the output file id")
        elif job.status == "completed":
            print(f"Job {job_id} has finished")
            finished.add(job_id)
        else:
            print(f"Job {job_id} is in status {job.status}")

    if fail_flag or len(finished) == len(job_ids):
        break
    time.sleep(POLL_INTERVAL_SECONDS)

# Once every job has finished, collect the output file ids used to download the results.
# The list stays empty when any job ended unsuccessfully.
output_files_ids = []
if not fail_flag:
    for job_id in job_ids:
        output_files_ids.append(openai_client.batches.retrieve(job_id).output_file_id)

Job batch_67cad006e9c48190a6c3027f08661291 has finished


In [19]:
# Report, for each job, how many of its requests failed out of the total.
for job_id in job_ids:
    counts = openai_client.batches.retrieve(job_id).request_counts
    print(f"{counts.failed}/{counts.total} requests failed in job {job_id}")

0/49 requests failed in job batch_67cad006e9c48190a6c3027f08661291


In [20]:
# Download the text of every batch output file and keep it in memory.
output_files = []
for output_file_id in output_files_ids:
    downloaded_text = openai_client.files.content(output_file_id).text
    output_files.append(downloaded_text)
    # Number of "\n"-separated pieces; a trailing newline adds one empty piece.
    print(len(downloaded_text.split("\n")))

50


In [22]:
# Parse the downloaded JSONL output into a (custom_id, embedding) table.
embedding_rows = []
failed_custom_ids = []
for file in output_files:
    for line in file.split("\n"):
        # Skip blank lines instead of unconditionally dropping the last piece: slicing
        # with [:-1] loses a real record when the text does not end with a newline.
        if not line.strip():
            continue
        data = json.loads(line)
        custom_id = data.get("custom_id")
        # A record without an embedding payload is collected and reported rather than
        # aborting the whole parse with a KeyError/TypeError.
        body = (data.get("response") or {}).get("body") or {}
        vectors = body.get("data")
        if not vectors:
            failed_custom_ids.append(custom_id)
            continue
        embedding_rows.append([custom_id, vectors[0]["embedding"]])

if failed_custom_ids:
    print(
        f"{len(failed_custom_ids)} requests returned no embedding: {failed_custom_ids}"
    )

# Separate names for the row list and the DataFrame, so one name never holds two
# different kinds of object.
embedding_results = pd.DataFrame(embedding_rows, columns=["custom_id", "embedding"])

In [23]:
# Attach each embedding to its source row via the index label embedded in custom_id.

# Work on a derived frame instead of overwriting `df`: `df = df.reset_index()` is not
# re-runnable, because a second execution adds another index column, the rename then
# produces two "id" columns, and the merge breaks.
df_indexed = df.rename_axis("id").reset_index().astype({"id": "int64"})

# custom_id has the form "custom_id_<index label>"; recover the integer label.
# `assign` overwrites an existing "id" column, so re-running this cell is harmless.
embedding_results = embedding_results.assign(
    id=embedding_results["custom_id"]
    .map(lambda x: int(x.split("custom_id_")[1]))
    .astype("int64")
)

# Left join keeps every chunk row (rows without a result get a missing embedding);
# `validate` raises if an id appears more than once on either side, which would
# otherwise silently duplicate rows.
df_with_embedding = df_indexed.merge(
    embedding_results[["id", "embedding"]],
    on="id",
    how="left",
    validate="one_to_one",
)

In [24]:
# Persist the merged table. The target folder is created first because `to_csv`
# does not create missing directories and would fail on a fresh checkout.
os.makedirs("./data", exist_ok=True)
df_with_embedding.to_csv("./data/df_with_embedding_1536.csv", index=False)

In [25]:
# Preview the first rows of the merged table (chunk text, metadata and embedding).
df_with_embedding.head()

Unnamed: 0,id,text,source,title,page_label,embedding
0,0,"Provided proper attribution is provided, Googl...",./PDFs/attention.pdf,,1,"[0.024030453, 0.008008195, -0.030085, 0.018222..."
1,1,"based solely on attention mechanisms, dispensi...",./PDFs/attention.pdf,,1,"[0.0071760574, 0.012750796, 0.031213759, -0.00..."
2,2,∗Equal contribution. Listing order is random. ...,./PDFs/attention.pdf,,1,"[0.016491178, -0.021936744, -0.0035917556, -0...."
3,3,our research.\n†Work performed while at Google...,./PDFs/attention.pdf,,1,"[-0.0035893978, 0.024139408, -0.009138941, 0.0..."
4,4,"1 Introduction\nRecurrent neural networks, lon...",./PDFs/attention.pdf,,2,"[-0.014979424, -0.00016362563, 0.019091422, -0..."


In [17]:
# Plain-text content of every chunk, in chunk order.
texts = [document.page_content for document in chunks]


In [None]:
# texts = [
#     "Qdrant is the best vector search engine!",
#     "Loved by Enterprises and everyone building for low latency, high performance, and scale.",
# ]

In [None]:
# Connect to the local Qdrant instance.
# The class is referenced through its own import rather than through the
# `qdrant_client` module name: this assignment rebinds `qdrant_client` to the client
# object, so `qdrant_client.QdrantClient(...)` would raise AttributeError as soon as
# the cell is executed a second time. The variable name is kept because the cells
# below use it as the client.
qdrant_client = QdrantClient("http://localhost:6333", timeout=600)

In [22]:
# Build the Qdrant points from the batch embeddings.
# The vectors are taken from `df_with_embedding`, which this notebook actually
# computes; the previous source object (`result`) is not defined anywhere in the
# notebook, so the cell failed with NameError on a fresh kernel.
# `df` was built from `chunks` in order and the left merge keeps that order, so
# row i of `df_with_embedding` belongs to chunks[i].
chunk_embeddings = df_with_embedding["embedding"].tolist()

if len(chunk_embeddings) != len(chunks):
    raise ValueError(
        f"Expected one embedding row per chunk, got {len(chunk_embeddings)} rows "
        f"for {len(chunks)} chunks"
    )

# Rows whose batch request produced no result carry a missing value, not a list.
rows_without_embedding = [
    position
    for position, embedding in enumerate(chunk_embeddings)
    if not isinstance(embedding, list)
]
if rows_without_embedding:
    raise ValueError(f"Chunks without an embedding: {rows_without_embedding}")

points = [
    PointStruct(
        id=idx,  # point ids start at 1
        vector=embedding,
        payload={
            "text": chunk.page_content,
            "page": chunk.metadata["page"],
            "source": chunk.metadata["source"],
        },
    )
    for idx, (embedding, chunk) in enumerate(zip(chunk_embeddings, chunks), start=1)
]

In [23]:
# Create the collection (if needed) and upload the points.
collection_name = "example_collection_3"

# Only create the collection when it is missing, so re-running the cell does not
# fail on an already existing collection.
if not qdrant_client.collection_exists(collection_name):
    qdrant_client.create_collection(
        collection_name,
        vectors_config=VectorParams(
            # Use the measured vector length so the collection always matches the
            # embedding model instead of a separately maintained literal.
            size=embedding_vector_length,
            distance=Distance.COSINE,
        ),
    )

# Last expression of the cell: the returned update result is shown as the cell output.
qdrant_client.upsert(collection_name, points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)