In [29]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings

In [1]:
import os
import torch
import pickle

In [2]:
import chromadb
import os
import json
import uuid

In [None]:
from configs import models_config
from configs import db_config

In [57]:
from pprint import pprint

In [3]:
# Directory the source text files are read from: ./data under the current working directory.
data_dir = os.path.join(os.getcwd(), 'data')

In [4]:
# Read every regular file in data_dir into `texts`, keyed by file name without its extension.
texts = {}

# Sort the listing so the document order is reproducible (os.listdir order is arbitrary).
for filename in sorted(os.listdir(data_dir)):
    filepath = os.path.join(data_dir, filename)

    # Skip sub-directories (e.g. .ipynb_checkpoints), which open() cannot read.
    if not os.path.isfile(filepath):
        continue

    # Explicit encoding so the result does not depend on the platform default.
    # NOTE(review): assumes the data files are UTF-8 — confirm.
    with open(filepath, encoding='utf-8') as file:
        # splitext drops only the final extension, so "a.b.txt" and "a.c.txt" keep distinct keys.
        texts[os.path.splitext(filename)[0]] = file.read()

In [6]:
# Collect the keys of empty documents first (a dict cannot change size while it is
# being iterated), then remove those entries from `texts` in place.
keys_to_delete = [name for name, content in texts.items() if content == '']

for name in keys_to_delete:
    del texts[name]

In [8]:
# Plain list of the document bodies, in the dict's insertion order.
texts_lst = [*texts.values()]

In [31]:
# Use the first CUDA device when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

In [None]:
# Build the semantic chunker around a HuggingFace embedding model; the model name and
# the breakpoint threshold type both come from models_config.
text_splitter = SemanticChunker(
    HuggingFaceEmbeddings(
        model_name=models_config.EMBEDDING_MODEL_NAME,
        model_kwargs={'device': device},
    ),
    breakpoint_threshold_type=models_config.CHUNKING_BREAKPOINT,
)

In [None]:
# Split every text in texts_lst into chunk documents with the semantic splitter.
docs = text_splitter.create_documents(texts_lst)

In [32]:
# Embedding model used to embed the chunks and, further down, the search queries.
model_embedding = HuggingFaceEmbeddings(
    model_name=models_config.EMBEDDING_MODEL_NAME,
    model_kwargs={'device': device},
)

In [None]:
# Embed the text of every chunk in `docs`.
embeddings = model_embedding.embed_documents([doc.page_content for doc in docs])

### Load precomputed chunks and embeddings from pickle files

In [9]:
# Reuse the cached chunks when docs.pkl exists; otherwise create the cache from the
# `docs` computed earlier in the notebook, so a fresh "Restart & Run All" does not
# fail on a missing file.
# SECURITY: pickle.load can execute arbitrary code — only load a docs.pkl you created yourself.
docs_pickle_path = os.path.join(os.getcwd(), 'docs.pkl')

if os.path.exists(docs_pickle_path):
    with open(docs_pickle_path, 'rb') as f:
        docs = pickle.load(f)
else:
    # No cache yet: this branch needs `docs` to have been computed by the chunking cell.
    with open(docs_pickle_path, 'wb') as f:
        pickle.dump(docs, f)

In [10]:
# Reuse the cached vectors when embeddings.pkl exists; otherwise create the cache from
# the `embeddings` computed earlier in the notebook, so a fresh "Restart & Run All"
# does not fail on a missing file.
# SECURITY: pickle.load can execute arbitrary code — only load an embeddings.pkl you created yourself.
embeddings_pickle_path = os.path.join(os.getcwd(), 'embeddings.pkl')

if os.path.exists(embeddings_pickle_path):
    with open(embeddings_pickle_path, 'rb') as f:
        embeddings = pickle.load(f)
else:
    # No cache yet: this branch needs `embeddings` to have been computed by the embedding cell.
    with open(embeddings_pickle_path, 'wb') as f:
        pickle.dump(embeddings, f)

In [11]:
# Peek at the first three chunks.
docs[:3]

[Document(page_content="Penile cancer starts in or on the penis. Cancer starts when cells begin to grow out of control. Cells in nearly any part of the body can become cancer, and can spread to other parts of the body. To learn more about how cancers start and spread, see What Is Cancer? About the penis\nThe penis is the external male sex organ. It's also part of the urinary system. It's made up of many types of body tissues, including skin, nerves, smooth muscle, and blood vessels.The American Cancer Society estimates for penile cancer in the United States for 2024 are:\n\nAbout 2,100 new cases of penile cancer diagnosed\nAbout 500 deaths from penile cancer\n\nFor statistics related to survival, see Survival Rates for Penile Cancer. Penile cancer is rare in North America and Europe. It's diagnosed in fewer than 1 man in 100,000 each year and accounts for fewer than 1% of cancers in men in the United States. Penile cancer is much more common in some parts of Asia, Africa, and South Ame

In [12]:
# Inspect the first embedding vector (displays the whole vector).
embeddings[0]

[0.008598961867392063,
 -0.003967630676925182,
 -0.0006065233028493822,
 -0.020392857491970062,
 0.025290999561548233,
 0.04458964616060257,
 0.05590782314538956,
 -0.006728933192789555,
 0.04095790162682533,
 0.006288500037044287,
 0.04194159060716629,
 -0.02302701584994793,
 0.002274874597787857,
 0.0040405248291790485,
 -0.09322895109653473,
 -0.029570389539003372,
 -0.03306207433342934,
 -0.03410869091749191,
 -0.01917378604412079,
 0.0029412489384412766,
 0.01386633887887001,
 -0.0058618816547095776,
 0.002881762105971575,
 -0.04043223336338997,
 0.059105269610881805,
 -0.04979551210999489,
 -0.0067557054571807384,
 -0.060590971261262894,
 -0.01926354318857193,
 -0.14210860431194305,
 0.01670099049806595,
 -0.007724265567958355,
 -0.0633479580283165,
 -0.03778870776295662,
 -0.04141457378864288,
 -0.046574514359235764,
 0.01582512818276882,
 -0.0024577162694185972,
 0.0021800321992486715,
 -0.007535440847277641,
 0.026858005672693253,
 -0.001036448054946959,
 -0.020214950665831566

### Save to vector DB (ChromaDB)

In [16]:
# On-disk Chroma client; the database lives in the directory named by
# db_config.DB_NAME under the current working directory.
chroma_client = chromadb.PersistentClient(
    path=os.path.join(os.getcwd(), db_config.DB_NAME),
)

In [17]:
# Open the collection named by db_config.COLLECTION_NAME, creating it if it does not exist yet.
collection = chroma_client.get_or_create_collection(name=db_config.COLLECTION_NAME)

In [25]:
def insert_record(collection, chunk, embedding, metadata=None):
    """Add a single chunk and its embedding to a collection.

    Args:
        collection: Collection to insert into; must provide ``add(...)``.
        chunk: Text of the chunk, stored as the record's document.
        embedding: Embedding vector of ``chunk``.
        metadata: Optional dict of metadata to store with the record.
            ``None`` or an empty dict stores no metadata.

    The record id is a fresh random UUID4, so inserting the same chunk twice
    creates two separate records.
    """
    record = {
        'embeddings': [embedding],
        'documents': [chunk],
        'ids': [str(uuid.uuid4())],
    }
    # Only pass metadatas when there is something to store; some Chroma versions
    # reject an empty metadata dict.
    if metadata:
        record['metadatas'] = [metadata]

    collection.add(**record)

In [27]:
# docs and embeddings can come from two separate pickle files, and zip() would silently
# drop the unmatched tail, so verify that they pair up before inserting anything.
if len(docs) != len(embeddings):
    raise ValueError(
        f"docs/embeddings length mismatch: {len(docs)} chunks vs {len(embeddings)} embeddings"
    )

# NOTE: each record gets a new random id and the client is persistent, so re-running
# this cell inserts the same chunks again as duplicates.
for chunk, embedding in zip(docs, embeddings):
    insert_record(collection, chunk.page_content, embedding)

### Test query from db

In [70]:
def find_embedding(model, collection, query: str, n_results: int = 3) -> str:
    """Return the stored chunks closest to ``query`` as one formatted string.

    Args:
        model: Embedding model; must provide ``embed_query(text)``.
        collection: Collection to search; must provide
            ``query(query_embeddings=..., n_results=...)``.
        query: Search text to embed and look up.
        n_results: Number of results requested from the collection.

    Returns:
        One "Result:" section per retrieved chunk, each followed by a line of
        20 dashes; an empty string when nothing was retrieved.
    """
    query_vector = model.embed_query(query)
    query_result = collection.query(
        query_embeddings=[query_vector],
        n_results=n_results,
    )

    # The result holds one list of documents per query vector, and only one vector is sent.
    # Tolerate a missing or empty "documents" entry instead of failing on indexing.
    documents_per_query = query_result.get('documents') or [[]]

    separator = "-" * 20
    return "".join(
        f"Result:\n{document}\n{separator}\n"
        for document in documents_per_query[0]
        if document is not None  # a hit without stored text has nothing to show
    )

In [71]:
# Smoke test: fetch only the single closest chunk for a bone-cancer question.
examples = find_embedding(model_embedding, collection, 'What is bone cancer?', n_results=1)

In [74]:
# print() so the newlines in the formatted string are rendered rather than shown as \n.
print(examples)

Result:
See Second Cancers in Adults for more information about second cancers.What is bone cancer? Cancer starts when cells begin to grow out of control. Cells in nearly any part of the body can become cancer, and can then spread to other parts of the body. To learn more about cancer and how it starts and spreads, see What Is Cancer? Primary bone cancers start when the cells in the bone start to grow out of control. Primary bone cancer versus bone metastasis
Primary bone cancers start in bones. Most bone cancers in children and teens are primary bone cancers. But in adults, most cancers in the bones started in a different organ and then spread to the bones. This is known as bone metastasis, and it can happen with some common cancers like breast, prostate, or lung cancer. For example, breast cancer that spreads to the bones is not bone cancer, it’s metastatic breast cancer. The cancer cells in the bone look like the cancer cells in the breast, and they're treated the same way. Types of

In [76]:
# Second smoke test: ask for the three closest chunks for an eye-cancer risk question.
examples = find_embedding(model_embedding, collection, 'What are the risks of eye cancer?', n_results=3)

In [77]:
# print() so the newlines in the formatted string are rendered rather than shown as \n.
print(examples)

Result:
And many people who get the disease may have few or no known risk factors. Race/ethnicity
The risk of eye melanoma is much higher in White people than in African Americans, Hispanics, or Asian Americans. Eye color
People with light colored eyes are somewhat more likely to develop uveal melanoma of the eye than are people with darker eye and skin color. Age and sex
Eye melanomas can occur at any age, but the risk goes up as people get older. Eye melanoma is slightly more common in men than in women. Certain inherited conditions
People with dysplastic nevus syndrome, who have many abnormal moles on the skin, are at increased risk of skin melanoma. They also seem to have a higher risk of developing melanoma of the eye. People with abnormal brown spots on the uvea (known as oculodermal melanocytosis or nevus of Ota) also have an increased risk of developing uveal eye melanoma. BAP1 cancer syndrome is a rare inherited condition in which family members are at increased risk for uveal