# Chroma Database Generation

# Library

In [2]:
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
import json
import uuid
import pickle as pkl
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chat_models import init_chat_model
import helper_functions as hf

## Config

In [2]:
# Read the Google API credentials; the file holds a JSON object with a "key" entry.
with open('api_google.txt') as f:
    api_key = json.loads(f.read())

In [3]:
# A non-empty key already present in the environment takes precedence over the key file.
if not os.getenv("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = api_key['key']

## LLM model

In [None]:
# Gemini chat model with temperature 0 and answers capped at 1024 output tokens.
# Alternative model name: gemma-3-27b-it (open)
llm = init_chat_model(
    "gemini-2.5-flash",
    model_provider="google_genai",
    temperature=0,
    max_output_tokens=1024,
)

## Load data

The information for each article is structured in JSON format.

- `info_articles_main`: loaded from the pickle file `info_articles_main.pkl`; contains the scientific papers from our research group.
- `info_articles_ref`: loaded from the pickle file `info_articles_ref_final.pkl`; contains the scientific papers referenced by the papers from our research group.

In [3]:
# NOTE: pickle executes arbitrary code on load, so these files must come from a trusted source.

# Papers authored by the research group.
with open("info_articles_main.pkl", "rb") as f:
    info_articles_main = pkl.load(f)

# Papers referenced by the research group's papers.
with open("info_articles_ref_final.pkl", "rb") as f:
    info_articles_ref = pkl.load(f)

## Database

### Split the text

We use `RecursiveCharacterTextSplitter`, which splits text recursively: it tries each separator in the given order until the resulting chunks are small enough.

In [4]:
splitter = RecursiveCharacterTextSplitter(
    # Separators listed from coarsest to finest: paragraph breaks, line breaks,
    # sentence-ending punctuation, then single spaces.
    separators=["\n\n", "\n", ".", "!", "?", " "],
    # Chunk size selected according to the performance of the model.
    chunk_size=1500,
    # Overlap between consecutive chunks: one tenth of the chunk size (1500 / 10).
    chunk_overlap=150,
)

We merge both article collections into `info_articles_final`.

In [5]:
# Concatenate the research group's papers with the papers they reference.
info_articles_final = info_articles_main + info_articles_ref
# Last expression of the cell, so the notebook displays the total article count.
f'We have a total of {len(info_articles_final)} scientific articles'

'We have a total of 265 scientific articles'

In [6]:
# Split every article into chunks using the project helper and the splitter configured above.
# NOTE(review): presumably returns a list of dicts with "content", "parent", "chunk_index",
# "DOI" and "Reference" keys (the keys read when preparing the ChromaDB inputs) — confirm in helper_functions.
info_splitted = hf.chunk_makers(info_articles_final, splitter=splitter)

## Embedding

We chose this embedding model based on the Hugging Face leaderboard and on the resulting performance of our model.

In [None]:
# Embedding function passed to Chroma when the vector store is built.
# NOTE(review): the e5 model card asks for "query: " / "passage: " prefixes on inputs —
# confirm whether the chunking helper and the search code add them.
embedding_function2 = HuggingFaceEmbeddings(model_name="intfloat/e5-small-v2")

## Adding texts to ChromaDB

In [None]:
# Prepare documents, metadata, and IDs
# Text of every chunk, in the same order as the metadata and IDs built below.
texts = [chunk["content"] for chunk in info_splitted]

# Per-chunk metadata stored alongside each embedding.
metadatas = [
    {
        "parent": chunk["parent"],
        "chunk_index": chunk["chunk_index"],
        "DOI": chunk["DOI"],
        "Reference": chunk["Reference"],
    }
    for chunk in info_splitted
]

# One freshly generated UUID string per chunk.
# NOTE(review): IDs are new on every run, so re-running against an existing
# persist directory presumably adds duplicate entries — confirm.
ids = [str(uuid.uuid1()) for _ in texts]

In [None]:
# Embed all chunks and store them in a persisted Chroma collection.
db = Chroma.from_texts(
    # Where the collection lives.
    collection_name="ReproRAG",
    persist_directory="./chromaRepro",  # Persist directory to be read in the app.py (streamlit)
    # How the texts are embedded.
    embedding=embedding_function2,
    # Parallel lists: one text, one metadata dict and one ID per chunk.
    texts=texts,
    metadatas=metadatas,
    ids=ids,
)

We check that the search works properly

In [None]:
# Smoke test: retrieve the 10 chunks most similar to a sample question.
# NOTE(review): the collection is created without an "hnsw:space" setting, and Chroma's
# documented default distance is L2 rather than cosine — confirm which metric is intended.
db.similarity_search("Is there a signature to predict endometrial disruption?", 10)

[Document(metadata={'chunk_index': 2, 'parent': 'Discussion', 'Reference': 'P. Sebastian-Leon et al.,2018', 'DOI': 'https://doi.org/10.1093/humrep/dey023'}, page_content='P. Sebastian-Leon et al.,2018, DOI:https://doi.org/10.1093/humrep/dey023:\n ., 2008), or have good predictive value for both, such as Altmäe2010 who designed the study for unexplained infertility versus fertile controls (Altmäe et al., 2010). On the other hand, in the pathological prediction model, most of the signatures designed to distinguish between fertility and infertility improved their accuracy using correction by transcriptomic clusters instead of LH criteria ( Koot et al.,2 0 1 6). Based on thesefindings, we conclude that endometrial timing is a confounding variable that has been covering up the molecular disruption effect and that the improvement of the prediction due to transcriptomic correction reveals that the transcriptomic cluster provides better criteria than LH for timing effect removal. In addition, 