# Chroma Database Generation

## Libraries

In [1]:
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
import json
import uuid
import pickle as pkl
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chat_models import init_chat_model
import helper_functions as hf

## Config

In [2]:
# Read the Google API credentials from a local JSON file so the key itself never
# appears in the notebook. The loaded object is indexed with 'key' in the next cell.
# The encoding is set explicitly so the result does not depend on the platform default.
with open('api_google.txt', encoding='utf-8') as key_file:
    api_key = json.load(key_file)

In [161]:
# Populate GOOGLE_API_KEY from the loaded credentials only when the environment
# does not already provide a non-empty value.
if os.environ.get("GOOGLE_API_KEY", "") == "":
    os.environ["GOOGLE_API_KEY"] = api_key['key']

## LLM model

In [None]:
# Chat model handle: Gemini 2.5 Flash through the google_genai provider,
# with temperature 0 and at most 1024 output tokens per response.
llm = init_chat_model(
    "gemini-2.5-flash",
    model_provider="google_genai",
    temperature=0,
    max_output_tokens=1024,
)

## Load data

The information for each article is stored as a JSON-like structure, serialized with pickle (`.pkl`).

- `info_articles_main`: scientific papers from our research group (loaded from `info_articles_main.pkl`).
- `info_articles_ref`: scientific papers referenced by the papers from our research group (loaded from `info_articles_ref_final.pkl`).

In [3]:
# Load the two pickled article collections.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open("info_articles_main.pkl", mode="rb") as main_file:
    info_articles_main = pkl.load(main_file)

with open("info_articles_ref_final.pkl", mode="rb") as ref_file:
    info_articles_ref = pkl.load(ref_file)

## Database

### Split the text

We use `RecursiveCharacterTextSplitter`. It splits text by recursively trying a list of separator characters, in order, until it finds one that produces chunks of the desired size.

In [4]:
# Chunking parameters, named so they are easy to find and tune.
CHUNK_SIZE = 1000  # We selected this threshold according to the performance of the model.
CHUNK_OVERLAP = CHUNK_SIZE // 10  # one tenth of the chunk size -> 100
SEPARATORS = ["\n\n", "\n", ".", "!", "?", " "]  # tried in order: paragraph, line, sentence end, space

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=SEPARATORS,
)

We merge both article collections into `info_articles_final`.

In [5]:
# Concatenate the research group's own papers with the papers they reference.
info_articles_final = info_articles_main + info_articles_ref
# Bare f-string as the last expression: it is rendered as the cell's output.
f'We have a total of {len(info_articles_final)} scientific articles'

'We have a total of 265 scientific articles'

In [6]:
# hf.chunk_makers is a project helper (not shown in this notebook); judging by the
# output below it presumably returns one dict per text chunk — verify in helper_functions.
# Later cells read the keys "content", "parent", "chunk_index", "DOI" and "Reference".
info_splitted = hf.chunk_makers(info_articles_final, splitter=splitter)

In [8]:
# Spot-check one arbitrary element (index 10) to inspect the structure of a chunk.
info_splitted[10]

{'chunk_index': 1,
 'content': 'Almudena Devesa-Peiro et al.,2020 Uterine disorders affecting female fertility: what are the molecular functions altered in endometrium?, DOI:https://doi.org/10.1016/j.fertnstert.2020.01.025:\n . The search identified experiments involving human endometrial transcriptomic case versus control raw data related to uterine pathologies and implantation alterations. The keywords employed in the search included endometriosis, endometrial adenocarcinoma (ADC), recurrent implantation failure (RIF), and recurrent pregnancy loss (RPL), among others (Supplemental Table 1A, available online, for a full list of search terms). No restrictions were placed on publication date or language. Uterine leiomyoma, adenomyosis, and uterine leiomyosarcoma data were not included due to a lack of suitable studies meeting our criteria. For each sample cohort belonging to the same individual study, 39 variables were evaluated (see Supplemental Table 1B), including clinical characteri

## Embedding

We chose this embedding model based on the Hugging Face leaderboard and on the retrieval performance we observed with it.

In [9]:
# Embedding model used to vectorise the chunks; it is passed to Chroma.from_texts below.
# NOTE(review): the E5 model card recommends prefixing inputs with "query: " / "passage: ";
# the chunk displayed above carries no such prefix — confirm whether this is intentional.
embedding_function2 = HuggingFaceEmbeddings(model_name="intfloat/e5-small-v2")

  from .autonotebook import tqdm as notebook_tqdm


## Adding texts to ChromaDB

In [10]:
# Prepare documents, metadata, and IDs
texts = [chunk["content"] for chunk in info_splitted]
metadatas = [{"parent": chunk["parent"], "chunk_index": chunk["chunk_index"],"DOI": chunk["DOI"], "Reference": chunk["Reference"]} for chunk in info_splitted]
ids = [str(uuid.uuid1()) for _ in metadatas]

In [11]:
# Embed every chunk and store it in a Chroma collection persisted on disk.
COLLECTION_NAME = "ReproRAG"
PERSIST_DIRECTORY = "./chromaRepro"  # Persist directory to be read in the app.py (streamlit)

db = Chroma.from_texts(
    texts,
    embedding=embedding_function2,
    metadatas=metadatas,
    ids=ids,
    collection_name=COLLECTION_NAME,
    persist_directory=PERSIST_DIRECTORY,
)

We check that the search works properly

In [31]:
# Retrieve the 10 chunks closest to the query as a sanity check.
# NOTE(review): this line was previously annotated "Cosine by default", but Chroma collections
# use squared L2 distance unless created with {"hnsw:space": "cosine"}; no collection
# metadata is passed to Chroma.from_texts in this notebook — confirm the intended metric.
db.similarity_search("Is there a signature to predict endometrial disruption?", 10)

[Document(metadata={'Reference': 'Patricia Diaz-Gimeno et al.,2024', 'chunk_index': 12, 'DOI': 'https://doi.org/10.1016/j.fertnstert.2024.03.015', 'parent': 'Methods'}, page_content='Patricia Diaz-Gimeno et al.,2024 Predicting risk of endometrial failure: a biomarker signature that identifies a novel disruption independent of endometrial timing in patients undergoing hormonal replacement cycles, DOI:https://doi.org/10.1016/j.fertnstert.2024.03.015:\n EFR signature capability as a biomarker of endometrial disruption\nTo estimate the prediction capability of the EFR signature to distinguish poor and good endometrial prognosis profiles, we implemented predictive models on the basis of the Support Vector Machine algorithm. Using a stratified fivefold crossvalidation process (repeated 100 times), the accuracy, sensitivity, and specificity were estimated. The range of values obtained across the 100 iterations was presented as a boxplot using ggplot2.\nStatistical analysis'),
 Document(metada