# Imports & Paths


In [1]:
import ast
import json
import numpy as np
import openai
import os
import pandas as pd

import llama_index
from llama_index.embeddings import OpenAIEmbedding
from typing import Union, Dict, Optional

print(llama_index.__version__)

0.9.38


In [2]:
# Project directory layout. MAIN_DIR is the parent of the current working
# directory; every other path is derived from it.
MAIN_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(MAIN_DIR, "data")
DATABASE_DIR = os.path.join(DATA_DIR, "db")
REFERENCE_DIR = os.path.join(DATA_DIR, "reference")
CONSUMER_DIR = os.path.join(DATA_DIR, "consumer")

# Load the API keys from the local auth file and expose the OpenAI key both
# through the environment and on the openai module.
with open(os.path.join(MAIN_DIR, "auth", "api_keys.json"), "r") as f:
    api_keys = json.load(f)

os.environ["OPENAI_API_KEY"] = api_keys["LEQUAN_OPENAI_KEY"]
openai.api_key = api_keys["LEQUAN_OPENAI_KEY"]

# Experiment metadata table and the master metadata mapping.
metadata_df = pd.read_csv(os.path.join(DATA_DIR, "exp_metadata.csv"))
# Use a context manager so the file handle is closed deterministically
# instead of being left to the garbage collector.
with open(os.path.join(DATA_DIR, "metadata", "master_metadata.json"), "r") as f:
    master_metadata = json.load(f)

In [3]:
# Human-readable description for each pill shape label, keyed by the label.
# NOTE(review): several values contain typos or look truncated ("Hast", "Has The",
# the FREEFORM text ending in "and do"). They are kept verbatim here because
# format_metadata inserts this text into the strings that get embedded, so any
# edit changes the embedding input -- confirm before correcting.
SHAPE_DESCRIPTION = {
    "BULLET": "Has the shape of a cylinder when viewing the capsule from a point perpendicular to its longest side, with a point at one end. The bullet shape is synonymous with torpedo shape",
    "CAPSULE": "Has the shape of a capsule when viewing the capsule from a point perpendicular to its longest side. The capsule shape is reserved for two-part capsules and banded two-part capsules.",
    "CLOVER": "Hast the stylized shape, of clover, having three or four leaves with or without the stem",
    "DIAMOND": "Has the shape of four distinct sides making a parallelogram with two inner obtuse angles and two inner acute angles",
    "DOUBLE CIRCLE": "Has the shape of two circles joined together, and usually overlapping",
    "FREEFORM": "Does not have any standard shape (free-form) which does not have 3 or more distinct sides and do",
    "GEAR": "Has the two dimensional shape of a toothed wheel of a machine part.",
    "HEPTAGON": "Has the shape with seven distinct sides",
    "HEXAGON (6 sided)": "Has the shape with six distinct sides",
    "OCTAGON (8 sided)": "Has The shape with eight distinct sides",
    "OVAL": "Has the shape of either an oval or an ellipse",
    "PENTAGON (5 sided)": "Has the shape with five distinct sides",
    "RECTANGLE": "Has the shape with four distinct sides and four right angles but is not a square",
    "ROUND": "Has the shape of a circle/spherical",
    "SEMI-CIRCLE": "Has the shape of half of a circle or half of an oval",
    "SQUARE": "Has the shape with four distinct and equal sides with four right angles",
    "TEAR": "Has the shape of a tear/drop of water. ",
    "TRAPEZOID": "Has the shape with four distinct sides whereas only two of the sides are parallel",
    "TRIANGLE": "Has the shape with three distinct sides",
}

# Description of each score code. Keys are the codes as strings.
SCORE_DESCRIPTION = {
    "1": "Pill is not scored to break into smaller dosage parts",
    "2": "Pill can be broken into two (2) equal dosage parts",
    "3": "Pill can typically be broken into two (2) or three (3) equal dosage parts",
    "4": "Pill can typically be broken into two (2) or four (4) equal dosage parts",
}

# Description of each imprint type label.
IMPRINT_TYPE_DESCRIPTION = {
    "DEBOSSED": "Imprint is depressed into the surface of the pill",
    "BOSSED": "Imprint is raised up from the surface of the pill",
    "PRINTED": "Imprint is printed onto the surface of the pill",
}

def format_metadata(
    metadata: Union[str, Dict], prefix: Optional[str] = "Image features:", suffix: Optional[str] = None, separator: str = "\n- ",
    extra_descriptions: bool = True
) -> str:
    """Render a pill metadata record as a single delimited text block.

    Args:
        metadata: The record, either as a dict or as the string form of a
            Python dict literal (parsed with ``ast.literal_eval``).
        prefix: Text placed before the first field; skipped when falsy.
        suffix: Text placed after the last field; skipped when falsy.
        separator: String inserted between the prefix, each ``key: value``
            field, and the suffix.
        extra_descriptions: When true, the ``Shape`` and ``ImprintType``
            values get their description appended as ``"<value> - <text>"``
            and the ``Score`` value is replaced by its description, whenever
            the value is found in the matching description table.

    Returns:
        The formatted text, with fields in the record's own key order.

    Raises:
        TypeError: If ``metadata`` is not a dict after parsing.
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when a
            string ``metadata`` is not a valid Python literal.
    """
    if isinstance(metadata, str):
        metadata = ast.literal_eval(metadata)
    # Explicit raise instead of ``assert``: asserts are removed under ``-O``.
    if not isinstance(metadata, dict):
        raise TypeError("Metadata must be a dictionary")

    # Shallow copy so a dict passed in by the caller is never modified.
    fields = dict(metadata)

    if extra_descriptions:
        # (key, description table, keep the original value in front?)
        enrichments = (
            ("Shape", SHAPE_DESCRIPTION, True),
            ("Score", SCORE_DESCRIPTION, False),
            ("ImprintType", IMPRINT_TYPE_DESCRIPTION, True),
        )
        for key, table, keep_value in enrichments:
            value = fields.get(key)
            # Missing keys and non-string values are left untouched rather
            # than raising; the description tables are keyed by strings.
            description = table.get(value) if isinstance(value, str) else None
            if not description:
                continue
            fields[key] = f"{value} - {description}" if keep_value else description

    metadata_content = separator.join(f"{k}: {v}" for k, v in fields.items())
    if prefix:
        metadata_content = prefix + separator + metadata_content
    if suffix:
        metadata_content = metadata_content + separator + suffix

    return metadata_content

# Text Embeddings Generation

In [5]:
# Render every entry of the "metadata" column with format_metadata's defaults.
formatted_metadatas = list(map(format_metadata, metadata_df["metadata"]))
# Show the first rendered entry as a quick visual check.
print(formatted_metadatas[0])

Image features:
- Shape: CAPSULE - Has the shape of a capsule when viewing the capsule from a point perpendicular to its longest side. The capsule shape is reserved for two-part capsules and banded two-part capsules.
- Color: WHITE
- Imprint: Lilly;3227;10;mg
- ImprintColor: BLACK
- ImprintType: PRINTED - Imprint is printed onto the surface of the pill
- Score: Pill is not scored to break into smaller dosage parts


In [6]:
# Embedding model name; it is also interpolated into the output file name
# when the embeddings are written to disk.
TEXT_EMBED_MODEL = "text-embedding-3-large"
embeddings = OpenAIEmbedding(model=TEXT_EMBED_MODEL)
# Embed all formatted metadata strings in one batched call, with a progress bar.
text_embs = embeddings.get_text_embedding_batch(
    formatted_metadatas,
    show_progress=True,
)

  from .autonotebook import tqdm as notebook_tqdm
Generating embeddings: 100%|██████████| 2000/2000 [00:42<00:00, 47.16it/s]


In [7]:
# Collect one record per row, keyed by the row's "ndc11" value. If the same
# key appears more than once, the later row overwrites the earlier one.
text_emb_dict = {}
reference_rows = zip(
    metadata_df["ndc11"],
    metadata_df["first_reference"],
    text_embs,
    metadata_df["metadata"],
    formatted_metadatas,
)
for ndc11, image_file, text_emb, metadata, text_content in reference_rows:
    # The display name comes from the master metadata entry of the row's
    # "first_reference" image file.
    name = master_metadata[image_file]["name"]
    text_emb_dict[ndc11] = dict(
        name=name,
        text_emb=text_emb,
        metadata=metadata,
        text_content=text_content,
    )

# Persist the records as JSON; the file name embeds the embedding model name.
output_path = os.path.join(DATA_DIR, "embeddings", f"REFERENCE_EXTRADISP_{TEXT_EMBED_MODEL}.json")
with open(output_path, "w") as f:
    json.dump(text_emb_dict, f)

# Image Embeddings generation

# Hybrid Database Creation

In [None]:
from llama_index.indices.base_retriever import BaseRetriever

class MultiModalRetriever(BaseRetriever):
    