# Chroma Database Generation

# Library

In [1]:
import langchain_community
import langchain_text_splitters
from langchain_community.document_loaders import PyPDFLoader, pdf
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
import os
import pprint
import re
from langchain_core.documents import Document
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_ollama import ChatOllama
from pydantic import BaseModel, Field
import json
import uuid
import chromadb
from chromadb.config import Settings
import unicodedata
from langchain_google_genai import GoogleGenerativeAI
import uuid
# from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import pickle as pkl
import requests
import subprocess
import pandas as pd
from bs4 import BeautifulSoup
import tqdm
import time
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
import datetime
import time

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
import getpass
import os

# Prompt for the Gemini key only when the environment has no non-empty value
# for it, so the secret is never written into the notebook itself.
if os.environ.get("GOOGLE_API_KEY", "") == "":
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter API key for Google Gemini: ")

In [44]:
from langchain.chat_models import init_chat_model
# Gemini chat model used further down to write the evaluation question/answer
# pairs. The original author also noted "gemma-3-27b-it" as a model id here.
llm = init_chat_model(
    "gemini-2.0-flash",
    model_provider="google_genai",
    temperature=0,
    max_output_tokens=1024,
)

## Load data

In [2]:
# Load the two pickled lists of article records (main articles and their
# references) from the working directory.
# NOTE: pickle can execute arbitrary code on load; only open files you produced yourself.
with open("info_articles_main.pkl", mode="rb") as main_file:
    info_articles_main = pkl.load(main_file)

with open("info_articles_ref_final.pkl", mode="rb") as ref_file:
    info_articles_ref = pkl.load(ref_file)

## Database

### Split the text

In [3]:
# Recursive splitter: tries paragraph breaks first, then line breaks, then
# sentence punctuation, and finally single spaces, producing chunks of at most
# 1000 characters that overlap by 100.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", "!", "?", " "],
    chunk_size=1000,
    chunk_overlap=100,
)

In [4]:
# Single corpus: main articles first, followed by the referenced articles.
info_articles_infal = [*info_articles_main, *info_articles_ref]
len(info_articles_infal)

265

In [19]:
# Article sections that are indexed; any other key of an article dict is ignored.
SECTION_KEYS = ("Abstract", "Introduction", "Methods", "Results", "Discussion", "Conclusion")
# Sections longer than this many characters are chunked with the splitter;
# shorter ones are stored whole as a single chunk.
MIN_SPLIT_LENGTH = 1200
# Placeholder used when an article lacks a citation field.
MISSING_FIELD = "Not identified"


def _field_or_default(article, key, default=MISSING_FIELD):
    """Return ``article[key]`` as text, or ``default`` when the key is absent or None."""
    value = article.get(key)
    return default if value is None else str(value)


def _split_article(article, text_splitter):
    """Build the chunk records for every non-empty indexed section of one article.

    Args:
        article: dict holding citation fields ('Authors', 'Publication', 'DOI')
            and section texts keyed by section name.
        text_splitter: object exposing ``split_text(str) -> list[str]``.

    Returns:
        A list of dicts with keys chunk_index, content, parent, split, DOI and
        Reference, in the order the sections appear in ``article``.
    """
    # The citation header is identical for every chunk of the article, so it is
    # built once. Missing fields fall back to a placeholder instead of raising
    # on ``None.split`` / ``str + None``.
    first_author = _field_or_default(article, "Authors").split(",")[0]
    reference = first_author + " et al.," + _field_or_default(article, "Publication")
    doi = _field_or_default(article, "DOI")
    header = reference + ", DOI:" + doi + "\n"

    records = []
    for section, text in article.items():
        if section not in SECTION_KEYS or not text:
            continue
        was_split = len(text) > MIN_SPLIT_LENGTH
        pieces = text_splitter.split_text(text) if was_split else [text]
        for index, piece in enumerate(pieces):
            records.append(
                {
                    "chunk_index": index,
                    # Each chunk is prefixed with its citation so the source
                    # travels with the text that gets embedded.
                    "content": header + piece,
                    "parent": section,
                    "split": was_split,
                    "DOI": doi,
                    "Reference": reference,
                }
            )
    return records


info_splitted = [
    record
    for article in info_articles_infal
    for record in _split_article(article, splitter)
]

## Embedding

We chose this embedding model based on its ranking on the Hugging Face embedding leaderboard.

In [25]:
# Sentence-embedding model used to vectorise the chunks; it is loaded on the GPU
# ("cuda"), so this cell requires a CUDA-capable device.
embedding_function2 = HuggingFaceEmbeddings(
    model_kwargs={"device": "cuda"},
    model_name="avsolatorio/GIST-small-Embedding-v0",
)

## Chroma

In [20]:
# Build the three parallel lists handed to Chroma: one text, one metadata dict
# and one freshly generated id per chunk, all in the order of `info_splitted`.
texts = [record["content"] for record in info_splitted]
metadatas = [
    {field: record[field] for field in ("parent", "chunk_index", "DOI", "Reference")}
    for record in info_splitted
]
ids = [str(uuid.uuid1()) for _ in range(len(metadatas))]

In [24]:
# Preview only the first record: echoing the whole list would write every
# article's full text (and the authors' e-mail addresses) into the saved notebook.
info_articles_main[:1]

[{'PaperTitle': 'Uterine disorders affecting female fertility: what are the molecular functions altered in endometrium?',
  'Publication': '2020',
  'Authors': 'Almudena Devesa-Peiro, M.Sc., Patricia Sebastian-Leon, Ph.D., Francisco Garcia-Garcia, Ph.D., Vicente Arnau, Ph.D., Alejandro Aleman, M.Eng., Antonio Pellicer, M.D., Ph.D., and Patricia Diaz-Gimeno, Ph.D.',
  'Email': 'patricia.diaz@ivirma.com',
  'Abstract': 'Objective: To determine the molecular functions of genes exhibiting altered expression in the endometrium of women with uterine disorders affecting fertility. Design: Retrospective analysis integrating case and control data from multiple cohorts with endometrium gene expression in women with uterine disorders. Setting: Infertility research department affiliated with a university hospital. Patient(s): Two hundred and forty women, 121 of whom were controls, 119 of whom had endometrial adenocarcinoma (ADC), recurrent implantation failure (RIF), recurrent pregnancy loss (RPL)

In [22]:
# Embed every chunk and store it in the "ReproRAG" collection, persisted on
# disk under ./chromaRepro.
# NOTE(review): ids are generated fresh on each run, so re-running this cell
# against an existing ./chromaRepro presumably inserts the chunks again — confirm.
db = Chroma.from_texts(
    collection_name="ReproRAG",
    persist_directory="./chromaRepro",
    embedding=embedding_function2,
    texts=texts,
    metadatas=metadatas,
    ids=ids,
)

As a sanity check, we run a similarity search and confirm that it returns relevant chunks.

In [23]:
# Retrieve the two chunks closest to a sample question.
db.similarity_search(
    "Is there a signature to predict endometrial disruption?",
    k=2,
)

[Document(metadata={'DOI': 'https://doi.org/10.1093/humrep/deab262', 'parent': 'Results', 'Reference': 'P. Diaz-Gimeno et al.,2022', 'chunk_index': 10}, page_content='P. Diaz-Gimeno et al.,2022, DOI:https://doi.org/10.1093/humrep/deab262\nPredictive performance comparison of signatures and consistency across endometrial datasets'),
 Document(metadata={'chunk_index': 0, 'DOI': 'https://doi.org/10.1016/j.fertnstert.2024.03.015', 'Reference': 'Patricia Diaz-Gimeno et al.,2024', 'parent': 'Abstract'}, page_content='Patricia Diaz-Gimeno et al.,2024, DOI:https://doi.org/10.1016/j.fertnstert.2024.03.015\nObjective: To propose a new gene expression signature that identifies endometrial disruptions independent of endometrial luteal phase timing and predicts if patients are at risk of endometrial failure. Design: Multicentric, prospective study. Setting: Reproductive medicine research department in a public hospital affiliated with private fertility clinics and a reproductive genetics laboratory

## Evaluation

### Question–answer generation

In [33]:
def call_llm(llm, prompt):
    """Send ``prompt`` to ``llm`` and return the text of its reply.

    ``llm`` must expose ``invoke(prompt)`` returning an object with a
    ``content`` attribute; that attribute is returned unchanged.
    """
    return llm.invoke(prompt).content

In [29]:
# Prompt template for generating one question/answer pair from a passage.
# It has a single `{context}` placeholder (filled with str.format) and asks the
# model to reply with "Question:" and "Answer:" lines; the generation loop
# relies on those two markers to parse the reply, so keep them in sync.
QA_generation_prompt = """
Your task is to write a  question and an answer given a context.
Your question should be answerable with a specific, concise piece of factual information from the context.
Your  question should be formulated in the same style as questions users could ask in a search engine.
This means that your  question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Question: (your question)
Answer: (your answer to the question)

Now here is the context.

Context: {context}\n
Output:::
"""

In [61]:
import random
# Show only the first 500 characters: rendering the whole corpus as a single
# string would flood the cell output.
str(info_articles_infal)[:500]



In [68]:
# Chunks eligible for evaluation: those whose parent section is one of the six
# indexed article sections.
info_splitted_evaluation = [
    chunk
    for chunk in info_splitted
    if chunk["parent"] in {"Abstract", "Introduction", "Results", "Conclusion", "Discussion", "Methods"}
]

In [72]:
# Inspect one chunk (index 6, picked arbitrarily) and then display the full
# section it belongs to, i.e. every chunk sharing its Reference and parent
# joined with blank lines.
probe = info_splitted_evaluation[6]
print(probe)
"\n\n".join(
    chunk["content"]
    for chunk in info_splitted_evaluation
    if (chunk["Reference"], chunk["parent"]) == (probe["Reference"], probe["parent"])
)

{'chunk_index': 3, 'content': 'Almudena Devesa-Peiro et al.,2020, DOI:https://doi.org/10.1016/j.fertnstert.2020.01.025\n. The results of individual studies, however, lack reproducibility  due to low sample sizes, differing experimental designs, and the application of different data analyses . This prevents the elucidation of the true effects of these conditions on the endometrium and a detailed understanding of links between their mechanisms. Improved and comparable analysis from raw data facilitates comparisons between studies. Raw data from individual studies are commonly shared in genomic repositories like the Gene Expression Omnibus (GEO)  and Sequence Read Archive (SRA) . New results obtained via comparable analytical procedures from the raw data of individual studies can be integrated through meta-analyses  that enable comparisons within larger data sets to answer new questions  or prior questions with a higher degree of confidence . Meta-analyses can identify the most dysregulat

'Almudena Devesa-Peiro et al.,2020, DOI:https://doi.org/10.1016/j.fertnstert.2020.01.025\nT he human endometrium is hormonally regulated and becomes receptive to embryonic implantation for 2 to 4 days during the midsecretory phase . This delimited time, called the window of implantation (WOI), involves the precise coordination of hormone signals . Pathologic conditions that alter endometrial function may disrupt the WOI, interrupt implantation, and lead to subfertility. Identifying conditions that impact endometrial function and result in a disrupted WOI is important for the development of fertility treatment strategies. Uterine disorders are complex, polygenic, and multifactorial alterations that often compromise female fertility. This includes both well-established uterine pathologies, Received October 2, 2019; revised December 26, 2019; accepted January 17, 2020. A.D.-P. has nothing to disclose. P.S.-L. has nothing to disclose. F.G.-G. has nothing to disclose. V.A. has nothing to di

In [73]:
N = 50  # number of chunks to sample for question/answer generation
SEED = 42  # fixed so the same chunks are drawn on every run


def _parse_qa(response):
    """Extract ``(question, answer)`` from a reply shaped like ``Question: ... Answer: ...``.

    Returns ``None`` when the reply is not a string, when either marker is
    missing, or when the question or the answer is blank.
    """
    if not isinstance(response, str):
        return None
    # Use the last "Question:" so any preamble the model adds is ignored, then
    # cut at the first "Answer:" that follows it (with or without a trailing space).
    _, question_marker, tail = response.rpartition("Question:")
    question, answer_marker, answer = tail.partition("Answer:")
    if not question_marker or not answer_marker:
        return None
    question, answer = question.strip(), answer.strip()
    if not question or not answer:
        return None
    return question, answer


examples = []
skipped = 0
sampler = random.Random(SEED)
# Never ask for more samples than there are chunks (random.sample would raise).
sample_size = min(N, len(info_splitted_evaluation))
for sample in sampler.sample(info_splitted_evaluation, sample_size):
    # The prompt context is the whole section the sampled chunk belongs to:
    # every chunk with the same Reference and parent, joined with blank lines.
    context = "\n\n".join(
        d["content"]
        for d in info_splitted_evaluation
        if d["Reference"] == sample["Reference"] and d["parent"] == sample["parent"]
    )
    response = call_llm(llm=llm, prompt=QA_generation_prompt.format(context=context))

    parsed = _parse_qa(response)
    if parsed is None:
        # Malformed reply: skip it explicitly instead of storing a garbage pair.
        skipped += 1
        continue

    question, answer = parsed
    examples.append({
        # NOTE(review): only the sampled chunk is stored here, while the question
        # was generated from the whole section — confirm the evaluation expects that.
        "context": sample,
        "question": question,
        "answer": answer,
    })

if skipped:
    print(f"Skipped {skipped} of {sample_size} replies that did not match the expected Question/Answer format")

In [74]:
# Display the generated examples; each holds the sampled chunk under "context"
# together with the parsed "question" and "answer".
examples

[{'context': {'chunk_index': 2,
   'content': 'Signe Altmäe et al.,2013, DOI:https://doi.org/10.1093/humupd/dmt048\n. With the expansion of ‘omics’ analyses in the study of the endometrium, there is a growing need to develop guidelines for the design of studies, and the analysis and interpretation of ‘omics’ data.',
   'parent': 'Abstract',
   'split': True,
   'DOI': 'https://doi.org/10.1093/humupd/dmt048',
   'Reference': 'Signe Altmäe et al.,2013'},
  'question': 'What omics technique has been most commonly applied in human endometrium analysis?',
  'answer': 'Analysis of endometrial transcriptome patterns'},
 {'context': {'chunk_index': 3,
   'content': 'Luz Garcia-Alonso et al.,2021, DOI:https://doi.org/10.1038/s41588-021-00972-2\n. The spatial arrangement of cells is key to understanding a morphologically complex tissue such as the endometrium, where a cell’s function may differ depend - ing on signals it receives from neighboring cells17. Many spatially resolved transcriptomics 