In [1]:
# system
import ast
import csv
import os

# numerics
import numpy as np

# Vector DB
from docarray.index import InMemoryExactNNIndex

# LangChain
from langchain_openai import AzureOpenAIEmbeddings

# utils
from utils.models import VecDBEntry
from utils.db import search_db



In [2]:
# Mirror the generic OPENAI_* settings onto the AZURE_OPENAI_* variable names.
# A target is refreshed from its source whenever the source is present. When the
# source is missing (OPENAI_API_BASE is removed at the end of this cell, so it is
# gone on any re-run) an already-populated target is kept instead of failing,
# which makes the cell safe to execute more than once per kernel.
for source_var, azure_var in (
    ("OPENAI_API_KEY", "AZURE_OPENAI_API_KEY"),
    ("OPENAI_API_BASE", "AZURE_OPENAI_ENDPOINT"),
    ("OPENAI_API_VERSION", "AZURE_OPENAI_API_VERSION"),
):
    if source_var in os.environ:
        os.environ[azure_var] = os.environ[source_var]
    elif azure_var not in os.environ:
        # Same exception type as a bare os.environ[...] lookup, with a clearer hint.
        raise KeyError(f"set {source_var} (or {azure_var}) before running this cell")
os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] = "firstcontact-gpt4-turbo"

# Remove the generic base-URL variable once it has been copied.
# NOTE(review): presumably needed because the Azure client objects to
# OPENAI_API_BASE being set alongside the Azure endpoint — confirm.
# pop() with a default does not raise when the variable is already gone.
os.environ.pop("OPENAI_API_BASE", None)

In [3]:
# Embeddings client used both to build the cache and to embed search queries.
embeddings_model = AzureOpenAIEmbeddings(
    # NOTE(review): "gpt-4-128k" reads like a chat-model name rather than an
    # embedding model; value kept as-is — confirm it matches what the
    # deployment below actually serves.
    model="gpt-4-128k",
    # Name of the Azure deployment the requests are addressed to.
    azure_deployment="firstcontact-embeddings",
)

## Perform Embedding

In [4]:
# Build the embeddings cache once; when embeddings.csv already exists this cell
# does nothing and the labels are not embedded again.
if not os.path.isfile('embeddings.csv'):
    entries = []
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires for files passed to csv.reader / csv.writer.
    with open('./terms.csv', newline='') as fp:
        reader = csv.reader(fp)
        next(reader, None)  # skip the header row
        for record in reader:
            if not record:
                continue  # tolerate blank lines
            # Unpacking enforces the three-column layout before any
            # embedding request is made.
            term, label, scope = record
            entries.append((term, label, scope))

    # Only the label column is embedded; term and scope are carried along unchanged.
    to_embed = [label for _, label, _ in entries]
    embedded = embeddings_model.embed_documents(to_embed)
    if len(embedded) != len(entries):
        # zip() below would otherwise silently drop the unmatched rows.
        raise ValueError(
            f"expected {len(entries)} embeddings, got {len(embedded)}"
        )

    # Write to a temporary file and rename it into place, so an interrupted
    # write never leaves a partial embeddings.csv that later runs would treat
    # as a complete cache.
    tmp_path = 'embeddings.csv.tmp'
    with open(tmp_path, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['term', 'label', 'scope', 'embedding'])
        for embedding, (term, label, scope) in zip(embedded, entries):
            # The embedding is stored as the string form of the Python list.
            writer.writerow([term, label, scope, embedding])
    os.replace(tmp_path, 'embeddings.csv')

In [5]:
# Load the cached embeddings into VecDBEntry objects ready for indexing.
docs = []

# newline='' hands line-ending handling to the csv module, as its documentation
# requires for files passed to csv.reader.
with open('./embeddings.csv', newline='') as fp:
    reader = csv.reader(fp)
    next(reader, None)  # skip the header row
    for record in reader:
        if not record:
            continue  # tolerate blank lines (e.g. doubled line endings)
        term, label, scope, raw_embedding = record
        # The embedding column holds the text form of a Python list.
        # ast.literal_eval parses literals only, so unlike eval() it cannot
        # execute code that ends up in the file.
        vector = ast.literal_eval(raw_embedding)
        docs.append(VecDBEntry(term=term, label=label, scope=scope, embedding=vector))

In [6]:
# Display the first loaded entry as a quick check that the CSV was parsed.
docs[0]

## Index the Terms

In [7]:
# Create an in-memory index typed on the VecDBEntry schema and add every loaded
# entry to it. A new index object is created each time this cell runs.
db = InMemoryExactNNIndex[VecDBEntry]()
db.index(docs)

## Find Term

In [8]:
# Free-text query to look up among the indexed terms.
query = "renal failure"

# search_db lives in utils.db and is not shown here; presumably it embeds the
# query with embeddings_model and returns the closest entries together with
# their similarity scores — TODO confirm.
matches, scores = search_db(db, query, embeddings_model)
# The f-string "=" specifier prints each expression's text followed by its repr.
print(f"{matches=}")
print(f"{matches.label=}")
print(f"{matches.term=}")
print(f"{scores=}")

matches=<DocList[VecDBEntry] (length=3)>
matches.label=['Renal failure', 'Alcoholic liver damage', 'Hyperkalaemia']
matches.term=['SNOMED:42399005', 'SNOMED:41309000', 'SNOMED:14140009']
scores=array([0.91019983, 0.81845957, 0.80818007])
