In [48]:
# imports libs
import openai
from dotenv import dotenv_values
# config .env and import api_key
config = dotenv_values('.env')
openai.api_key = config['OPENAI_API_KEY']

In [49]:
# import more libs
import pandas as pd
import numpy as np
from tenacity import retry, wait_random_exponential, stop_after_attempt
import pickle
import tiktoken

In [78]:
# locate filepath, import test data
dataset_path = "./source_data/Problem_Intake_CurrentVers_TEST.csv"
df = pd.read_csv(dataset_path)

In [79]:
# look at imported df
df.head(5)

Unnamed: 0,date,rank,need,dept,contact
0,2019/08/16,H,"A way to address lack of “buy in"" in any numbe...",Obstetrics and Gynecology - Minimally Invasive...,Remington Stokes
1,2019/08/16,H,A way to address dyspareunia in women with hig...,Obstetrics and Gynecology - Minimally Invasive...,Otto Waters
2,2019/08/16,H,A way to address pain flare in patients with c...,Obstetrics and Gynecology - Minimally Invasive...,Grant Brooks
3,2019/08/16,L,A way to address ovarian cyst seen on ultrasou...,Obstetrics and Gynecology - Minimally Invasive...,Adeline Smith
4,2019/08/16,L,A way to address hx of an incorrect dx of “pro...,Obstetrics and Gynecology - Minimally Invasive...,Nicole Reyes


In [82]:
# sort df by most recent
needs = df.sort_values("date", ascending=False)

In [83]:
# set enc var for OpenAI
enc = tiktoken.encoding_for_model("text-embedding-ada-002")

In [84]:
# pull need statements into array
need_statements = needs["need"].values

array(['A way to address A way to correctly diagnose initiating cause of Shortness of Breath in Pts admitted to ED with complex respiratory cardiovascular/respiratory histories that reduces unnecessary treatments/decreases average length of stay\n in admitted to ED with complex respiratory cardiovascular/respiratory histories that reduces unnecessary treatments/decreases average length of stay',
       'A way to address assess cause of vaginal discharge in patients with recurrent yeast/bv (bacterial vaginosis?) that reduces clinic cisits or unnecessary tx.',
       'A way to address intro-op identify planes between bowel and other tissue in surgical patients that prevents bowel injury',
       'A way to address eliminate instrument exchange in patients undergoing surgery that reduces operative time',
       'A way to address identify nerve origin of pain in complex pelvic pain patients that targets therapy',
       'A way to address decrease no shows in patients unable to make an appoi

In [85]:
# calc tokens from needs statements array
total_tokens = sum([len(enc.encode(need)) for need in need_statements])

In [86]:
total_tokens
# reference OpenAI docs for model price / 1k tokens
cost = total_tokens * (.0004 / 1000)
print(f"Estimated cost ${cost:.6f}")

Estimated cost $0.000286


In [87]:
# decorator function to query more responsibly, from tenacity
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text, model="text-embedding-ada-002"):

    # replace newlines, which can negatively affect performance
    text = text.replace("\n", " ")
    # call OpenAI embeddings API
    return openai.Embedding.create(input=text, model=model)["data"][0]["embedding"]

In [59]:
# establsih a cache of embeddings to avoid recomputing
# cache is a dict of tuples (text, model) -> embedding saved as a pickle file

# set path to embedding cache
embedding_cache_path = "needs.pkl"

# load the cache if it exists, and save a copy to disk
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}
with open(embedding_cache_path ,"wb") as embedding_cache_file:
    pickle.dump(embedding_cache, embedding_cache_file)

# define a function to retrieve embeddings from the cache if present, and otherwise request via the API
def embeddings_from_string(
        string,
        model="text-embedding-ada-002",
        embedding_cache=embedding_cache
):
    """Return embedding of a given a string, using a cache to avoid recomputing."""
    if (string, model) not in embedding_cache.keys():
        embedding_cache[(string, model)] = get_embedding(string, model)
        print(f"GOT EMBEDDING FROM OPEN AI FOR {string[:20]}")
        with open(embedding_cache_path, "wb") as embedding_cache_file:
            pickle.dump(embedding_cache, embedding_cache_file)
    return embedding_cache[(string, model)]


In [60]:
# Retreieve embeddings for data, if cache present, otherwise query OpenAI 
need_embeddings = [embeddings_from_string(need, model="text-embedding-ada-002") for need in need_statements]

In [66]:
# convert df to python dict
data = needs[["contact", "dept", "need"]].to_dict("records")

In [29]:
# import nomic lib
from nomic import atlas

In [69]:
# send embeddings with Atlas
project = atlas.map_embeddings(
    embeddings=np.array(need_embeddings),
    data=data
)

[32m2023-05-13 15:07:47.136[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_create_project[0m:[36m965[0m - [1mCreating project `brief-disadvantage` in organization `akantunc`[0m
[32m2023-05-13 15:07:48.474[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m100[0m - [1mUploading embeddings to Atlas.[0m
1it [00:00,  1.09it/s]
[32m2023-05-13 15:07:49.398[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_add_data[0m:[36m1577[0m - [1mUpload succeeded.[0m
[32m2023-05-13 15:07:49.399[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m119[0m - [1mEmbedding upload succeeded.[0m
[32m2023-05-13 15:07:50.685[0m | [1mINFO    [0m | [36mnomic.project[0m:[36mcreate_index[0m:[36m1282[0m - [1mCreated map `brief-disadvantage` in project `brief-disadvantage`: https://atlas.nomic.ai/map/307a544a-a17d-45d6-aba4-63ee2b5cbc6f/d973a64d-675e-40a8-be4a-85c036b898f6[0m
[32m2023-05-13 15:07:50.686[0m | [1mINFO    [0m | [

In [70]:
from openai.embeddings_utils import distances_from_embeddings, indices_of_nearest_neighbors_from_distances

In [73]:
def print_recommendations_from_strings(
    strings,
    index_of_source_string,
    k_nearest_neighbors=3,
    model="text-embeddings-ada-002"
):
    # Get all embeddings
    embeddings = [embeddings_from_string(string) for string in strings]
    # get embedding for our specific query string
    query_embedding = embeddings[index_of_source_string]
    # get distances between our embedding and all others
    distances = distances_from_embeddings(query_embedding, embeddings)
    # get indices of the knn
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)
    # loop over indices to match
    query_string = strings[index_of_source_string]
    match_count = 0
    for i in indices_of_nearest_neighbors:
        if query_string == strings[i]:
            continue
        if match_count >= k_nearest_neighbors:
            break
        match_count =+ 1
        print(f"Found {match_count} closest match: ")
        print(f"Distances of: {distances[i]} ")
        print(strings[i])

In [91]:
print_recommendations_from_strings(need_statements, 3)

GOT EMBEDDING FROM OPEN AI FOR A way to address eli
GOT EMBEDDING FROM OPEN AI FOR A way to address dec
GOT EMBEDDING FROM OPEN AI FOR A way to address org
Found 1 closest match: 
Distances of: 0.11504858089719694 
A way to address organize cords from surgical instruments in ORs that decreases time detangling cords
Found 1 closest match: 
Distances of: 0.12470794141311614 
A way to address identify bowel injury intra-op in surgical patients  that prevents delayed bowel injury and prompts intraop repair
Found 1 closest match: 
Distances of: 0.13082272492836688 
A way to address intro-op identify planes between bowel and other tissue in surgical patients that prevents bowel injury
Found 1 closest match: 
Distances of: 0.14036155036799824 
A way to address prioritize inpatient orders to facilitate clinical care in ? that reduce patient wait times
Found 1 closest match: 
Distances of: 0.14597601486879952 
A way to address confirm bowel adhesions requiring colorectal surgery in patients wit