In [55]:
import langchain_community
import langchain_text_splitters
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter
import os
import pprint
from ollama import Client, chat
from langchain_ollama import ChatOllama
from langchain.prompts import PromptTemplate

In [22]:
# Load the publication PDF and split it into pages.
loader = PyPDFLoader(
    "/data/local/aparraga/Bioinformatician/RAG/Publications/Parraga-Leo2023.pdf"
)
pages = loader.load_and_split()

# Cut the pages into chunks of up to 1000 characters with a 10-character overlap.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=10,
)
storage = text_splitter.split_documents(pages)

# Open-source sentence-transformer model used to embed the chunks and, later, the queries.
embedding_function = SentenceTransformerEmbeddings(
    model_name="all-MiniLM-L6-v2",
)


In [33]:
# Connect to the Chroma server and index the document chunks.
import uuid
import chromadb
from chromadb.config import Settings

client = chromadb.HttpClient(host='localhost', port=8000, settings=Settings(allow_reset=True))

# Fetch the target collection, creating it on first use. Previously `collection`
# was never assigned in this cell (both candidate lines were commented out), so
# the cell raised NameError on a fresh kernel.
collection = client.get_or_create_collection("prueba")

# Insert every chunk in a single batched call. ids, embeddings, metadatas and
# documents are parallel lists with one entry per chunk.
# NOTE(review): ids are freshly generated on every run, so re-running this cell
# stores the same chunks again under new ids.
if storage:
    texts = [doc.page_content for doc in storage]
    collection.add(
        ids=[str(uuid.uuid1()) for _ in storage],
        # Embed with the same function the LangChain wrapper below uses for
        # queries, so indexed vectors and query vectors come from one model.
        embeddings=embedding_function.embed_documents(texts),
        metadatas=[doc.metadata for doc in storage],
        documents=texts,
    )

# tell LangChain to use our client and collection name
db = Chroma(
    client=client,
    collection_name="prueba",
    embedding_function=embedding_function,
)

In [34]:
# Sanity check: fetch the two stored chunks closest to a sample question.
test = db.similarity_search(
    query="What is the implantation failure?",
    k=2,
)
test

[Document(metadata={'producer': 'Adobe PDF Library 15.0; modified using iText® 5.3.5 ©2000-2012 1T3XT BVBA (SPRINGER SBM; licensed version)', 'crossmarkmajorversiondate': '2010-04-23', 'creator': 'Adobe InDesign 15.1 (Windows)', 'page_label': '13', 'trapped': '/False', 'page': 12, 'subject': 'Reproductive Biology and Endocrinology, https://doi.org/10.1186/s12958-023-01131-4', 'crossmarkdomainexclusive': 'true', 'keywords': ';GATA6;Progesterone;Estrogen;miRNAs;TFs;Endometrial receptivity;Recurrent implantation failure;Infertility;Menstrual cycle regulation', 'title': 'Deciphering a shared transcriptomic regulation and the relative contribution of each regulator type through endometrial gene expression signatures', 'crossmarkdomains[2]': 'springerlink.com', 'robots': 'noindex', 'moddate': '2023-11-19T01:55:46+01:00', 'author': 'Antonio Parraga-Leo', 'creationdate': '2023-09-12T04:25:09+00:00', 'doi': '10.1186/s12958-023-01131-4', 'source': '/data/local/aparraga/Bioinformatician/RAG/Publi

In [40]:
# Same question through the retriever interface, limited to two results.
test = db.as_retriever(
    search_kwargs={"k": 2},
).invoke("What is the implantation failure?")
# Concatenate the two hits, second result first, separated by a blank line.
"\n\n".join(test[position].page_content for position in (1, 0))

'Page 13 of 14\nParraga‑Leo\xa0et\xa0al. Reproductive Biology and Endocrinology           (2023) 21:84 \n \nAuthor details\n1 IVIRMA Global Research Alliance, IVI Foundation, Instituto de Investigación \nSanitaria La Fe (IIS La Fe), Av. Fernando Abril Martorell 106, Torre A, Planta \n1ª, 46026 Valencia, Valencia, Spain. 2 Department of Pediatrics, Obstetrics \nand Gynaecology, Universidad de Valencia, Av. Blasco Ibáñez 15, 46010 Valen‑\ncia, Valencia, Spain. 3 IVIRMA Global Research Alliance, IVIRMA Valencia, Plaza de \nLa Policia Local 3, 46015 Valencia, Spain. \nReceived: 21 June 2023   Accepted: 22 August 2023\nReferences\n 1. Strowitzki T, Germeyer A, Popovici R, von Wolff M. The human endo‑\nmetrium as a fertility‑determining factor. Hum Reprod Update. \n2006;12:617–30.\n 2. Cakmak H, Taylor HS. Implantation failure: Molecular mechanisms and \nclinical treatment. Hum Reprod Update. 2011;17:242–53.\n 3. Lessey BA, Young SL. What exactly is endometrial receptivity? Fertil Steril. \n

In [41]:
def get_context(query, k=None, collection_name="prueba"):
    """Retrieve the stored chunks most similar to ``query`` as one text block.

    Opens an HTTP connection to the Chroma server on localhost:8000, wraps the
    requested collection with the notebook-level ``embedding_function`` and
    runs a similarity search.

    Args:
        query: Question or search text to match against the stored chunks.
        k: Number of chunks to retrieve. When ``None`` (default) the argument
            is not forwarded, so the vector store's own default applies.
        collection_name: Name of the Chroma collection to query.

    Returns:
        The ``page_content`` of every retrieved document, joined by blank
        lines. An empty string when nothing is retrieved.
    """
    client = chromadb.HttpClient(host='localhost', port=8000, settings=Settings(allow_reset=True))

    db = Chroma(
        client=client,
        collection_name=collection_name,
        embedding_function=embedding_function,
    )

    # Forward k only when the caller picked a value, so a plain
    # get_context(query) call behaves exactly as before.
    search_kwargs = {} if k is None else {"k": k}
    docs = db.similarity_search(query, **search_kwargs)

    return "\n\n".join(doc.page_content for doc in docs)


In [56]:
# Prompt template with {content} and {question} placeholders. The text is
# assembled line by line so that the indentation and trailing whitespace that
# are part of the prompt stay visible. The repeated "6." and "3." item numbers
# are part of the prompt text and are kept as they are.
system_promt = "\n".join((
    "You are an expert consultant helping executive advisors to get relevant information from scientific articles and code related to reproduction and bioinformatics.",
    "",
    "        Generate your response by following the steps below:",
    "        1. Recursively break down the question into smaller questions to better understand it.",
    "        2. For each question/directive:",
    "            2a. Select the most relevant information from the context in light of the conversation history.",
    "        3. Generate a draft response using selected information.",
    "        4. Remove duplicate content from draft response.",
    "        5. Generate your final response after adjusting it to increase accuracy and relevance.",
    "        6. Do not try to summarize the answers, explain it properly.",
    "        6. Only show your final response! ",
    "        ",
    "        Constraints:",
    "        1. DO NOT PROVIDE ANY EXPLANATION OR DETAILS OR MENTION THAT YOU WERE GIVEN CONTEXT. Only do that when questions are related to coding.",
    "        2. Don't mention that you are not able to find the answer in the provided context.",
    "        3. Ignore the part of the content that only contains references.",
    "        3. Don't make up the answers by yourself.",
    "        4. Try your best to provide answer from the given context.",
    "",
    "        CONTENT:",
    "        {content}",
    "",
    "        Question:",
    "        {question}",
    "        ",
))



# Wrap the raw prompt text in a LangChain template exposing its two placeholders.
prompt = PromptTemplate(input_variables=["content", "question"], template=system_promt)


In [None]:
# format() substitutes each keyword value into the placeholder of the same name.
prompt.format(
    content="prueba",
    question="What is a tree?",
)

"You are an expert consultant helping executive advisors to get relevant information from scientific articles and code related to reproduction and bioinformatics.\n\n        Generate your response by following the steps below:\n        1. Recursively break down the question into smaller questions to better understand it.\n        2. For each question/directive:\n            2a. Select the most relevant information from the context in light of the conversation history.\n        3. Generate a draft response using selected information.\n        4. Remove duplicate content from draft response.\n        5. Generate your final response after adjusting it to increase accuracy and relevance.\n        6. Do not try to summarize the answers, explain it properly.\n        6. Only show your final response! \n        \n        Constraints:\n        1. DO NOT PROVIDE ANY EXPLANATION OR DETAILS OR MENTION THAT YOU WERE GIVEN CONTEXT. Only do that when questions are related to coding.\n        2. Don'

In [None]:
def give_context(context):
    """Build the system prompt for the chat model with ``context`` embedded.

    Args:
        context: Text inserted under the ``CONTENT:`` heading of the prompt.

    Returns:
        The full instruction text as a single string. Indentation and trailing
        whitespace are part of the prompt, so each line is spelled out
        explicitly.
    """
    prompt_lines = (
        "You are an expert consultant helping executive advisors to get relevant information from scientific articles and code related to reproduction and bioinformatics.",
        "",
        "        Generate your response by following the steps below:",
        "        1. Recursively break down the question into smaller questions to better understand it.",
        "        2. For each question/directive:",
        "            2a. Select the most relevant information from the context in light of the conversation history.",
        "        3. Generate a draft response using selected information.",
        "        4. Remove duplicate content from draft response.",
        "        5. Generate your final response after adjusting it to increase accuracy and relevance.",
        "        6. Do not try to summarize the answers, explain it properly.",
        "        6. Only show your final response! ",
        "        ",
        "        Constraints:",
        "        2. Don't mention that you are not able to find the answer in the provided context.",
        "        3. Ignore the part of the content that only contains references.",
        "        3. Don't make up the answers by yourself.",
        "        4. Try your best to provide answer from the given context.",
        "",
        "        CONTENT:",
        f"        {context}",
        "        ",
    )
    return "\n".join(prompt_lines)

In [82]:
def llm_response(context, query, model="gemma3:12b", host='http://localhost:11434'):
    """Ask the Ollama chat model a question grounded in ``context``.

    The reply is requested in streaming mode: each chunk is printed as soon as
    it arrives and the complete answer is returned once the stream ends.

    Args:
        context: Retrieved text placed into the system prompt by
            ``give_context``.
        query: The user's question, sent as the user message.
        model: Name of the Ollama model to query.
        host: Base URL of the Ollama server.

    Returns:
        The full answer as a single string.
    """
    client = Client(host=host)

    stream = client.chat(
        model=model,
        messages=[
            {"role": "system", "content": give_context(context)},
            {"role": "user", "content": query},
        ],
        stream=True,
    )

    # Collect the pieces and join once at the end rather than rebuilding the
    # whole answer string on every chunk.
    pieces = []
    for chunk in stream:
        # A chunk without text would otherwise print "None" and break the join.
        piece = chunk['message']['content'] or ''
        print(piece, end='', flush=True)
        pieces.append(piece)

    return ''.join(pieces)

In [83]:
# Retrieve context for the question, then ask the model the same question.
llm_response(
    context=get_context(query="What is the CTCF?"),
    query="What is the CTCF?",
)

CTCF is a conserved zinc finger protein with well-characterized regulatory functions throughout the human body. It acts as a transcriptional repressor involved in RNA polymerase II (Pol II) pausing, imprinting, X-chromosome inactivation, and functions as an insulator, blocking interactions between enhancers and promoters of neighboring genes. In the context of the endometrium, it's proposed to exert an inhibitory role during the proliferative phase, and its downregulation in the secretory endometrium allows for the derepression of genes necessary for the window of implantation (WOI). It also interacts with HOXA10, affecting endometrial proliferation and function.





"CTCF is a conserved zinc finger protein with well-characterized regulatory functions throughout the human body. It acts as a transcriptional repressor involved in RNA polymerase II (Pol II) pausing, imprinting, X-chromosome inactivation, and functions as an insulator, blocking interactions between enhancers and promoters of neighboring genes. In the context of the endometrium, it's proposed to exert an inhibitory role during the proliferative phase, and its downregulation in the secretory endometrium allows for the derepression of genes necessary for the window of implantation (WOI). It also interacts with HOXA10, affecting endometrial proliferation and function.\n\n\n\n"

In [None]:
client = Client(host='http://localhost:11434')

# Non-streaming request. With stream=False the reply comes back as one
# response object; with stream=True (as in llm_response) it is iterated chunk
# by chunk so the text can be printed while it is generated.
# The call goes through the client created above so the configured host is
# actually used; the module-level chat() ignored it.
test = client.chat(
    model="gemma3:12b",
    messages=[{'role': 'user', 'content': 'Why is the sky blue?'}],
    stream=False,
)

# Iterating the response yields (field, value) pairs, as the output shows.
for t in test:
    print(t)

# ('model', 'gemma3:12b')
# ('created_at', '2025-06-06T13:47:06.777063741Z')
# ('done', True)
# ('done_reason', 'stop')
# ('total_duration', 132959742155)
# ('load_duration', 42392373)
# ('prompt_eval_count', 15)
# ('prompt_eval_duration', 190986853)
# ('eval_count', 672)
# ('eval_duration', 132725261170)
# ('message', Message(role='assistant')

('model', 'gemma3:12b')
('created_at', '2025-06-06T13:47:06.777063741Z')
('done', True)
('done_reason', 'stop')
('total_duration', 132959742155)
('load_duration', 42392373)
('prompt_eval_count', 15)
('prompt_eval_duration', 190986853)
('eval_count', 672)
('eval_duration', 132725261170)
('message', Message(role='assistant', content="Okay, let's break down why the sky is blue. It's a classic question with a surprisingly detailed scientific explanation! Here's the breakdown, from simple to slightly more complex:\n\n**1. Sunlight is Made of All Colors**\n\n*   Sunlight isn't actually white. It's a mixture of *all* the colors of the rainbow: red, orange, yellow, green, blue, indigo, and violet. You can see this when sunlight passes through a prism or when you see a rainbow (which is created when sunlight is refracted and reflected by water droplets).\n\n**2.  Earth's Atmosphere & Tiny Particles**\n\n*   Earth is surrounded by an atmosphere – a layer of gases including nitrogen, oxygen, and 