In [25]:
# imports libs
import openai
from dotenv import dotenv_values
# config .env and import api_key
config = dotenv_values('.env')
openai.api_key = config['OPENAI_API_KEY']

In [26]:
# import more libs
import pandas as pd
import numpy as np
from tenacity import retry, wait_random_exponential, stop_after_attempt, retry_if_not_exception_type
import pickle
import tiktoken


In [27]:
# locate filepath, import test data
dataset_path_test = "./source_data/Problem_Intake_CurrentVers_TEST.csv"
# dataset_path = "./source_data/clinicalNeedsDB_data_main.xlsx"
dataset_path = "./source_data/clinicalNeedsDB_data_main.csv"

df = pd.read_csv(dataset_path_test)

# Potential issues with encoding excel import
# df_main = pd.read_excel(dataset_path)

df_main = pd.read_csv(dataset_path, nrows=397)


In [28]:
# look at imported df
df_main.head(1)

Unnamed: 0,Problem ID,Parent ID,Child ID,Date Recorded,Recorded By,Problem Tag,Contact,Date Originated,Department/Division,Intake Source,Unstructured Problem Statement,Problem To Address,Patient Population,Care Setting,Outcome,Problem Summary,Archived,Tracked
0,1,0,0,6/3/2020,nlwiley@ad.unc.edu,MedGen-0001-0000,"Gomez, Shawn",2018/10/01,Medicine - General,FastTraCS unprompted ideation,"""Tinder"" for clinical trials & research networ...",,,,,"""Tinder"" for clinical trials & research networ...",True,False


In [29]:
# sort df by most recent
# needs = df.sort_values("date", ascending=False)

# sort df by most recent and drop NaNs
needs = df_main.sort_values("Date Originated", ascending=False).dropna(subset=["Problem Summary", "Date Originated", "Department/Division"])

In [30]:
# set enc var for OpenAI
enc = tiktoken.encoding_for_model("text-embedding-ada-002")

In [31]:
# pull need statements into array
need_statements = needs["Problem Summary"].values

In [32]:
# calc ttotal okens from needs statements array
total_tokens = sum([len(enc.encode("Problem Summary")) for needs in need_statements])

# def num_tokens_from_string(string: str, encoding_name: str) -> int:
#     """Returns the number of tokens in a text string."""
#     encoding = tiktoken.get_encoding(encoding_name)
#     num_tokens = len(encoding.encode(string))
#     return num_tokens



In [33]:
total_tokens
# reference OpenAI docs for model price / 1k tokens
cost = total_tokens * (.0004 / 1000)
print(f"Estimated cost ${cost:.6f}")

Estimated cost $0.000314


In [34]:
# decorator function to query more responsibly, from tenacity
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(openai.InvalidRequestError))
def get_embedding(text, model="text-embedding-ada-002"):

    # replace newlines, which can negatively affect performance
    if "\n" in text:
        text = text.replace("\n", " ")

    # call OpenAI embeddings API
    return openai.Embedding.create(input=text, model=model)["data"][0]["embedding"]

In [35]:
# establish a cache of embeddings to avoid recomputing
# cache is a dict of tuples (text, model) -> embedding saved as a pickle file

# set path to embedding cache, test needs.pkl
embedding_cache_path = "needs_main.pkl"

# load the cache if it exists, and save a copy to disk
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}
with open(embedding_cache_path ,"wb") as embedding_cache_file:
    pickle.dump(embedding_cache, embedding_cache_file)

# define a function to retrieve embeddings from the cache if present, and otherwise request via the API
def embeddings_from_string(
        string,
        model="text-embedding-ada-002",
        embedding_cache=embedding_cache
):
    """Return embedding of a given a string, using a cache to avoid recomputing."""
    if (string, model) not in embedding_cache.keys():
        embedding_cache[(string, model)] = get_embedding(string, model)
        print(f"GOT EMBEDDING FROM OPEN AI FOR {string[:20]}")
        with open(embedding_cache_path, "wb") as embedding_cache_file:
            pickle.dump(embedding_cache, embedding_cache_file)
    return embedding_cache[(string, model)]


In [36]:
# Retreieve embeddings for data, if cache present, otherwise query OpenAI 
need_embeddings = [embeddings_from_string(need, model="text-embedding-ada-002") for need in need_statements]

GOT EMBEDDING FROM OPEN AI FOR Esophageal spasms (p
GOT EMBEDDING FROM OPEN AI FOR Risk of perforation 
GOT EMBEDDING FROM OPEN AI FOR GI auxillary tools (
GOT EMBEDDING FROM OPEN AI FOR Foreign object got s
GOT EMBEDDING FROM OPEN AI FOR Foreign object could
GOT EMBEDDING FROM OPEN AI FOR Drug packets cannot 
GOT EMBEDDING FROM OPEN AI FOR Unable to identify m
GOT EMBEDDING FROM OPEN AI FOR Original Solution Pi
GOT EMBEDDING FROM OPEN AI FOR Orignal Solution Pit
GOT EMBEDDING FROM OPEN AI FOR Orignal Solution Pit
GOT EMBEDDING FROM OPEN AI FOR Original Solution Pi
GOT EMBEDDING FROM OPEN AI FOR Original Solution Pi
GOT EMBEDDING FROM OPEN AI FOR Original Solution Pi
GOT EMBEDDING FROM OPEN AI FOR Original Solution Pi
GOT EMBEDDING FROM OPEN AI FOR Original Solution Pi
GOT EMBEDDING FROM OPEN AI FOR Original Solution Pi
GOT EMBEDDING FROM OPEN AI FOR Original Solution Pi
GOT EMBEDDING FROM OPEN AI FOR Original Solution Pi
GOT EMBEDDING FROM OPEN AI FOR Original Solution Pi
GOT EMBEDDIN

In [37]:
# convert df to python dict
# data = needs[["contact", "dept", "need"]].to_dict("records")
data = needs[["Problem Summary", "Date Originated", "Department/Division"]].to_dict("records")

In [38]:
# import nomic lib
from nomic import atlas

In [45]:
# send embeddings with Atlas
project = atlas.map_embeddings(
    embeddings=np.array(need_embeddings),
    data=data,
    name="Clinical Needs",
    reset_project_if_exists=True
)

# map = project.get_map('Clinical Needs')
# print(map)
# project.create_index(name=project.name, build_topic_model=True, topic_label_field='text')
# print(project.maps[0])

[32m2023-05-14 21:27:28.896[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_create_project[0m:[36m965[0m - [1mCreating project `Clinical Needs` in organization `akantunc`[0m
[32m2023-05-14 21:27:30.296[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m100[0m - [1mUploading embeddings to Atlas.[0m
1it [00:01,  1.83s/it]
[32m2023-05-14 21:27:32.140[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_add_data[0m:[36m1577[0m - [1mUpload succeeded.[0m
[32m2023-05-14 21:27:32.141[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m119[0m - [1mEmbedding upload succeeded.[0m
[32m2023-05-14 21:27:33.222[0m | [1mINFO    [0m | [36mnomic.project[0m:[36mcreate_index[0m:[36m1282[0m - [1mCreated map `Clinical Needs` in project `Clinical Needs`: https://atlas.nomic.ai/map/763265c3-9020-45b5-ad8a-d6dd8c70e88b/61d1889d-5c94-471c-bb7c-d2f2355f270e[0m
[32m2023-05-14 21:27:33.223[0m | [1mINFO    [0m | [36mnomic.atl

Clinical Needs: https://atlas.nomic.ai/map/763265c3-9020-45b5-ad8a-d6dd8c70e88b/61d1889d-5c94-471c-bb7c-d2f2355f270e


In [46]:
from openai.embeddings_utils import distances_from_embeddings, indices_of_nearest_neighbors_from_distances

In [59]:
def print_recommendations_from_strings(
    strings,
    index_of_source_string,
    k_nearest_neighbors=3,
    model="text-embeddings-ada-002"
):
    # Get all embeddings
    embeddings = [embeddings_from_string(string) for string in strings]
    # get embedding for our specific query string
    query_embedding = embeddings[index_of_source_string]
    # get distances between our embedding and all others
    distances = distances_from_embeddings(query_embedding, embeddings)
    # get indices of the knn
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)
    # loop over indices to match
    query_string = strings[index_of_source_string]
    match_count = 0
    for i in indices_of_nearest_neighbors:
        if query_string == strings[i]:
            continue
        if match_count >= k_nearest_neighbors:
            break
        match_count =+ 1
        print(f"Found {match_count} closest match: ")
        print(f"Distances of: {distances[i]} ")
        print(strings[i])

In [62]:
print_recommendations_from_strings(need_statements, 312, k_nearest_neighbors=3, model="text-embeddings-ada-002")

Found 1 closest match: 
Distances of: 0.07506885107945171 
MIGS (hysterectomy), the top of the vagina is very difficult to visualize using current techniques (uterine manipulator, lap tools)
Found 1 closest match: 
Distances of: 0.0896807897105294 
MIGS (Hysterectomies): it's difficult to identify the cuff on the uterine manipulator so that the surgeon can cut the tissue at the correct location.
Found 1 closest match: 
Distances of: 0.09321698481066976 
MIGS (hysterectomy): Knowing how much tissue to remove is difficult using laparoscopic visualization. Separating the cervix from the vagina: sometimes either too much of the vagina is removed or not enough vagina (and cervical tissue) is removed. Not a common problem amongst high-volume surgeons (>10 cases/yr, which is actually not a lot).
Found 1 closest match: 
Distances of: 0.09546722738705071 
MIGS (Hysterectomies): smaller uteri can fit back through the vaginal opening, larger uteri must be removed through an incision in the naval 