In [7]:
import langchain_community
import langchain_text_splitters
from langchain_community.document_loaders import PyPDFLoader, pdf
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
import os
import pprint
import re
from langchain_core.documents import Document
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_ollama import ChatOllama
from pydantic import BaseModel, Field
import json

## Google

In [6]:
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.messages import HumanMessage

In [9]:
with open('/data/local/aparraga/Bioinformatician/RAG/api_google.txt') as f:
    
    api_key = json.load(f)

In [41]:
llm = GoogleGenerativeAI(model="gemini-2.0-flash",api_key=api_key['key'],temperature=0.2)

In [38]:
# load the document and split it into pages
loader = PyPDFLoader("Publications/Parraga-Leo2023.pdf", mode="single")
doc = loader.load()

In [50]:
print(len(doc[0].page_content))

67457


In [73]:
class paper(BaseModel):

    PaperTitle: str = Field(description="The full title of the research paper")
    Publication: str = Field(description="Year: The year the paper was published")
    Authors: str = Field(description="The full names of all authors of the paper")
    Email: str = Field(description="The email address of the author (if provided)")
    Abstract: str = Field(description="The full text of the paper's abstract.")
    Introduction: str = Field(description="The full text fo the paper's introduction. Bear in mind that it can have other names such as background.")
    Methods: str = Field(description="The full text fo the paper's methods. Don't take the information from abstract.")
    Results: str = Field(description="The full text fo the paper's results. Don't take the information from abstract. Please take all possible text of results, this section could be divided into different sections.")
    Discussion: str = Field(description="The full text fo the paper's discussion if provided. Otherwise leave the filed blank. It should be entitled 'Discussion'")
    Conclusion: str = Field(description="The full text fo the paper's conclusion if provided. Otherwise leave the filed blank. Please don't take this information form abstract.")
    URL: str = Field(description="the link where you can find the article. This link is also known as DOI.")

In [63]:
parser = JsonOutputParser(pydantic_object=paper)

prompt = PromptTemplate(
    template="""
    You are an expert in analyzing scientific research papers. Please carefully read the provided research paper above and extract the following key information:
Extract these nine (9) properties from the research paper:

Paper Title: The full title of the research paper
Publication Year: The year the paper was published

Authors: The full names of all authors of the paper

Email: The email address of the author (if provided)

Abstract: The full text of the paper's abstract

Introduction: The full text fo the paper's introduction. Bear in mind that it can have other names such as background.

Methods: The full text fo the paper's methods. Don't take the information from abstract.

Results: The full text fo the paper's results. Don't take the information from abstract. Please take all possible text of results, this section could be divided into different sections.

Discussion:The full text fo the paper's discussion if provided. Otherwise leave the filed blank. It shoudl be entitled "Discussion"

Conclusion: The full text fo the paper's conclusion if provided. Otherwise leave the filed blank. Please don't take this information form abstract.

URL: the link where you can find the article. This link is also known as DOI.


Guidelines:


The extracted information should be factual and accurate to the document. Be extremely concise, except for the Abstract, Introduction, Methods, Results. Discussion and Conclusion which should be copied in full.
The extracted entities should be self-contained and easily understood without the rest of the paper. If a property is missing from the paper, please leave the field empty rather than guessing.
Answer in JSON format. The JSON should contain 9 keys: "PaperTitle", "PublicationYear", "Authors", "Email", "Abstract", "Introduction", "Methods","Results", "URL".


Format instructions: \n{format_instructions}\n

    
The article is this:\n{query}\n""",

    input_variables=["query"],

    partial_variables={"format_instructions": parser.get_format_instructions()}
)

In [64]:
chain = prompt | llm | parser
chain

PromptTemplate(input_variables=['query'], input_types={}, partial_variables={'format_instructions': 'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"properties": {"PaperTitle": {"description": "The full title of the research paper", "title": "Papertitle", "type": "string"}, "Publication": {"description": "Year: The year the paper was published", "title": "Publication", "type": "string"}, "Authors": {"description": "The full names of all authors of the paper", "title": "Authors", "type": "string"}, "Email": {"description": "The email address of the author (if provided)", "title": "E

In [65]:
my_json = []
my_json.append(chain.invoke({"query": doc[0].page_content}))

In [72]:
my_json[0]

{'PaperTitle': 'Deciphering a\xa0shared transcriptomic regulation and\xa0the\xa0relative contribution of\xa0each regulator type through\xa0endometrial gene expression signatures',
 'Publication': '2023',
 'Authors': 'Antonio Parraga‑Leo, Patricia Sebastian‑Leon, Almudena Devesa‑Peiro, Diana Marti‑Garcia, Nuria Pellicer, Jose Remohi, Francisco Dominguez and Patricia Diaz‑Gimeno',
 'Email': 'patricia.diaz@ivirma.com; patricia_diaz@iislafe.es',
 'Abstract': 'Backgorund While various endometrial biomarkers have been characterized at the transcriptomic and functional level, there is generally a poor overlap among studies, making it unclear to what extent their upstream regulators (e.g., ovarian hormones, transcription factors (TFs) and microRNAs (miRNAs)) realistically contribute to menstrual cycle progression and function. Unmasking the intricacies of the molecular interactions in the endometrium from a novel systemic point of view will help gain a more accurate perspective of endometrial 

## Others

In [None]:
# Load several documents

storage = []

for root, dirs, files in os.walk("Data",): # for server all the path

    for file in files:
        print(file)

        loader = PyPDFLoader(os.path.join(root,file))
        pages = loader.load_and_split()

        # split it into chunks
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
        docs = text_splitter.split_documents(pages)
        storage.extend(docs) # This  is a list of lists


Devesa-Peiro2020.pdf
Henarejos-Castillo2020.pdf
Sebastian-Leon2021.pdf
Marti-Garcia2024(1).pdf
Henarejos-Castillo2021.pdf
Henarejos-Castillo2022.pdf
Diaz-Gimeno2024.pdf
Devesa-Peiro2021.pdf
Henarejos-Castillo2024.pdf
parraga-leo_2023.pdf
Marti-Garcia2024.pdf
Diaz-Gimeno2022.pdf
Devesa-Peiro2022.pdf
Sebastian-Leon2018.pdf
Diaz-Gimeno2017.pdf


In [83]:
# Load several documents

storage = []

for root, dirs, files in os.walk("Data",): # for server all the path

    for file in files:

        loader = PyPDFLoader(os.path.join(root,file),mode='page')
        pages = loader.load() # List of pages
        
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
        docs = text_splitter.split_documents(pages)

        storage.extend(docs) # Add several items at the same time



In [13]:
# load the document and split it into pages
loader = PyPDFLoader("Publications/Parraga-Leo2023.pdf")
pages = loader.load_and_split()

# split it into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=100)
storage = text_splitter.split_documents(pages)

In [14]:
pprint.pprint(storage[2].page_content)

('out the menstrual cycle (FDR < 0.05), dually validated in-silico and through '
 'endometrial biopsies, corroborated their \n'
 'potential regulatory roles in the endometrium.\n'
 '*Correspondence:\n'
 'Patricia Diaz‑Gimeno\n'
 'patricia.diaz@ivirma.com; patricia_diaz@iislafe.es\n'
 'Full list of author information is available at the end of the article')


In [134]:
model = ChatOllama(
    model="gemma3:4b",
    temperature=0.1,
)

In [171]:
class paper(BaseModel):
    text: str = Field(description="Main text that explains the reuslts, introductions, methods or conclusions")
    doi: str = Field(description="link with the url of scientifc article known as doi.")
    citation: str = Field(description="add the citation of the manuscript is presented in the text.") 

In [172]:
parser = JsonOutputParser(pydantic_object=paper)

prompt = PromptTemplate(
    template="""
    You are an expert analyzing scientifc research papers. Read the text carefully to provide the request information.
    Remove the potential information that is not related with the scientific article.

    If it appears a link with the doi of the artcile, take it. Otherwise write down UNKOWN.

    Help me to extract the information and create an json file with it. If you don't know the
    answer, say UNKNOWN.
    
    Format instructions: \n{format_instructions}\n
    
    Case:\n{query}\n""",

    input_variables=["query"],
    
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

In [173]:
chain = prompt | model | parser

In [174]:
my_json = []

for docs in storage:
    print(docs.page_content)

    my_json.append(chain.invoke({"query": docs.page_content}))


#chain.invoke({"query": storage[0].page_content})

Parraga‑Leo et al. 
Reproductive Biology and Endocrinology           (2023) 21:84  
https://doi.org/10.1186/s12958‑023‑01131‑4
RESEARCH Open Access
© The Author(s) 2023. Open Access This article is licensed under a Creative Commons Attribution 4.0 International License, which 
permits use, sharing, adaptation, distribution and reproduction in any medium or format, as long as you give appropriate credit to the 
original author(s) and the source, provide a link to the Creative Commons licence, and indicate if changes were made. The images or 
other third party material in this article are included in the article’s Creative Commons licence, unless indicated otherwise in a credit line 
to the material. If material is not included in the article’s Creative Commons licence and your intended use is not permitted by statutory 
regulation or exceeds the permitted use, you will need to obtain permission directly from the copyright holder. To view a copy of this 
licence, visit http://creativecom

In [175]:
my_json

# my_json['author'] = "parraga-leo"
# my_json['title'] = storage[0].metadata['title']
# my_json

[{'text': 'A total of 3,608 distinct genes from the 19 gene lists were associated with endometrial progression',
  'doi': '10.1186/s12958‑023‑01131‑4',
  'citation': 'Parraga‑Leo et al. (2023)'},
 {'text': 'The lists’ regulation was significantly favoured by TFs (89% (17/19) of gene lists) and progesterone (47% (8 /19) of gene lists), rather than miRNAs (5% (1/19) of gene lists) or estrogen (0% (0/19) of gene lists), respectively (FDR < 0.05). Exceptionally, two gene lists that were previously associated with implantation failure and unexplained infertility were less hormone‑dependent, but primarily regulated by estrogen. Although endometrial progression genes were mainly targeted by hormones rather than non‑hormonal contributors (odds ratio = 91.94, FDR < 0.05), we identified 311 TFs and 595 miRNAs not previously associated with ovarian hormones. We highlight CTCF, GATA6, hsa‑miR‑15a‑5p, hsa‑miR‑218‑5p, hsa‑miR‑107, hsa‑miR‑103a‑3p, and hsa‑miR‑128‑3p, as overlapping novel master regu

In [4]:
# load the document and split it into pages
loader = PyPDFLoader("/data/local/aparraga/Bioinformatician/RAG/Publications/Parraga-Leo2023.pdf")
pages = loader.load_and_split()

# split it into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
storage = text_splitter.split_documents(pages)

# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

  embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [189]:
# create the chroma client
import uuid
import chromadb
from chromadb.config import Settings

# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

client = chromadb.HttpClient(host='localhost', port=8000, settings=Settings(allow_reset=True))
# client.list_collections()
# client.reset()  # resets the database
collection = client.get_or_create_collection("tfm_APL")
# collection = client.get_collection('tfm')

for i, doc in enumerate(my_json): #storage:
    print(doc['text'])
    collection.add(
        ids=[str(uuid.uuid1())], metadatas=storage[i].metadata, documents=str(doc['text'])
    )

# tell LangChain to use our client and collection name
db = Chroma(
    client=client,
    collection_name="tfm",
    embedding_function=embedding_function,
)

A total of 3,608 distinct genes from the 19 gene lists were associated with endometrial progression
The lists’ regulation was significantly favoured by TFs (89% (17/19) of gene lists) and progesterone (47% (8 /19) of gene lists), rather than miRNAs (5% (1/19) of gene lists) or estrogen (0% (0/19) of gene lists), respectively (FDR < 0.05). Exceptionally, two gene lists that were previously associated with implantation failure and unexplained infertility were less hormone‑dependent, but primarily regulated by estrogen. Although endometrial progression genes were mainly targeted by hormones rather than non‑hormonal contributors (odds ratio = 91.94, FDR < 0.05), we identified 311 TFs and 595 miRNAs not previously associated with ovarian hormones. We highlight CTCF, GATA6, hsa‑miR‑15a‑5p, hsa‑miR‑218‑5p, hsa‑miR‑107, hsa‑miR‑103a‑3p, and hsa‑miR‑128‑3p, as overlapping novel master regulators of endometrial function. The gene expression changes of selected regulators through‑out the menstrua

In [188]:
query = "What is the role of CTCF?"
docs = db.similarity_search(query,k=4)

fullcontent =''

for doc in docs:
    print(docs)
    fullcontent ='. '.join([fullcontent,doc.page_content])
#print(docs[0].page_content)
#print(len(docs))

print(fullcontent)




In [185]:
uuid.uuid1()
pprint.pprint(storage[2].metadata)

{'author': 'Antonio Parraga-Leo',
 'creationdate': '2023-09-12T04:25:09+00:00',
 'creator': 'Adobe InDesign 15.1 (Windows)',
 'crossmarkdomainexclusive': 'true',
 'crossmarkdomains[1]': 'springer.com',
 'crossmarkdomains[2]': 'springerlink.com',
 'crossmarkmajorversiondate': '2010-04-23',
 'doi': '10.1186/s12958-023-01131-4',
 'keywords': ';GATA6;Progesterone;Estrogen;miRNAs;TFs;Endometrial '
             'receptivity;Recurrent implantation failure;Infertility;Menstrual '
             'cycle regulation',
 'moddate': '2023-11-19T01:55:46+01:00',
 'page': 0,
 'page_label': '1',
 'producer': 'Adobe PDF Library 15.0; modified using iText® 5.3.5 ©2000-2012 '
             '1T3XT BVBA (SPRINGER SBM; licensed version)',
 'robots': 'noindex',
 'source': 'Data/parraga-leo_2023.pdf',
 'subject': 'Reproductive Biology and Endocrinology, '
            'https://doi.org/10.1186/s12958-023-01131-4',
 'title': 'Deciphering a shared transcriptomic regulation and the relative '
          'contribution of

In [157]:
def get_system_message_rag(content):
        return f"""You are an expert consultant helping executive advisors to get relevant information from scientific articles and code related to reproduction and bioinformatics.

        Generate your response by following the steps below:
        1. Recursively break down the question into smaller questions to better understand it.
        2. For each question/directive:
            2a. Select the most relevant information from the context in light of the conversation history.
        3. Generate a draft response using selected information.
        4. Remove duplicate content from draft response.
        5. Generate your final response after adjusting it to increase accuracy and relevance.
        6. Do not try to summarize the answers, explain it properly.
        6. Only show your final response! 
        
        Constraints:
        1. DO NOT PROVIDE ANY EXPLANATION OR DETAILS OR MENTION THAT YOU WERE GIVEN CONTEXT. Only do that when questions are related to coding.
        2. Don't mention that you are not able to find the answer in the provided context.
        3. Ignore the part of the content that only contains references.
        3. Don't make up the answers by yourself.
        4. Try your best to provide answer from the given context.

        CONTENT:
        {content}
        """

In [132]:
def get_ques_response_prompt(question):
    return f"""
    ==============================================================
    Based on the above context, please provide the answer to the following question:
    {question}
    """

In [158]:
from ollama import Client
client = Client(host='http://localhost:11434')
stream = client.chat(model='gemma3:12b', messages=[
{"role": "system", "content": get_system_message_rag(fullcontent)},            
{"role": "user", "content": get_ques_response_prompt(query)}
],stream=True)

In [None]:

# stream = client.chat(model='mistral', messages=[
#     {"role": "system", "content": get_system_message_rag(content)},            
#     {"role": "user", "content": get_ques_response_prompt(query)}
#     ],stream=False)

In [159]:

print(get_system_message_rag(fullcontent))
print(get_ques_response_prompt(query))
print("####### THINKING OF ANSWER............ ")
full_answer = ''

for chunk in stream:
    # print(chunk['message']['content'], end='', flush=True)
    full_answer =''.join([full_answer,chunk['message']['content']])

You are an expert consultant helping executive advisors to get relevant information from scientific articles and code related to reproduction and bioinformatics.

        Generate your response by following the steps below:
        1. Recursively break down the question into smaller questions to better understand it.
        2. For each question/directive:
            2a. Select the most relevant information from the context in light of the conversation history.
        3. Generate a draft response using selected information.
        4. Remove duplicate content from draft response.
        5. Generate your final response after adjusting it to increase accuracy and relevance.
        6. Do not try to summarize the answers, explain it properly.
        6. Only show your final response! 
        
        Constraints:
        1. DO NOT PROVIDE ANY EXPLANATION OR DETAILS OR MENTION THAT YOU WERE GIVEN CONTEXT. Only do that when questions are related to coding.
        2. Don't mention that 

In [161]:
pprint.pprint(full_answer)

('Implantation failure refers to the inability of an embryo to successfully '
 'attach to and be supported by the uterine lining (endometrium), preventing '
 'pregnancy from progressing. This can occur in the context of Recurrent '
 "Implantation Failure (RIF), but it's a broader phenomenon that can also "
 'contribute to recurrent miscarriage.\n'
 '\n'
 '**RIF Definition:** According to the European Society of Human Reproduction '
 'and Embryology (ESHRE) guidelines, RIF is defined as the inability to '
 'achieve a clinical pregnancy after the transfer of at least five '
 'high-quality embryos in at least two or more IVF cycles. This suggests that '
 "the embryo doesn't implant properly.\n"
 '\n'
 "**Distinction from Recurrent Miscarriage:** It's important to differentiate "
 'implantation failure from recurrent miscarriage. In recurrent miscarriage, '
 'the embryo initially implants and begins to develop, but is subsequently '
 'lost. These conditions require different diagnostic and

In [112]:
help(client.chat)

Help on method chat in module ollama._client:

chat(model: str = '', messages: Optional[Sequence[Union[Mapping[str, Any], ollama._types.Message]]] = None, *, tools: Optional[Sequence[Union[Mapping[str, Any], ollama._types.Tool, Callable]]] = None, stream: bool = False, think: Optional[bool] = None, format: Union[Literal['', 'json'], dict[str, Any], NoneType] = None, options: Union[Mapping[str, Any], ollama._types.Options, NoneType] = None, keep_alive: Union[float, str, NoneType] = None) -> Union[ollama._types.ChatResponse, collections.abc.Iterator[ollama._types.ChatResponse]] method of ollama._client.Client instance
    Create a chat response using the requested model.
    
    Args:
      tools:
        A JSON schema as a dict, an Ollama Tool or a Python Function.
        Python functions need to follow Google style docstrings to be converted to an Ollama Tool.
        For more information, see: https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings
      stream: 