In [1]:
# Import libraries
import pandas as pd
from tqdm import tqdm
from docx import Document
from langchain.llms import Ollama
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyMuPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter




In [None]:

# Load and split PDF data
def process_pdf(file_path, chunk_size=1024, chunk_overlap=20):
    """
    Read a PDF with PyMuPDFLoader and break the loaded documents into chunks.

    Parameters:
    - file_path (str): Location of the PDF, forwarded to PyMuPDFLoader.
    - chunk_size (int): Forwarded to the splitter as ``chunk_size``.
    - chunk_overlap (int): Forwarded to the splitter as ``chunk_overlap``.

    Returns:
    - list: Whatever ``split_documents`` yields for the loaded documents.
    """
    # Load first so that a problem with the PDF surfaces before the splitter
    # is even constructed.
    pages = PyMuPDFLoader(file_path=file_path).load()

    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents=pages)

# Responsible for splitting the documents into several chunks
def split_docs(documents, chunk_size=1024, chunk_overlap=20):
    """
    Break already-loaded documents into smaller chunks.

    Parameters:
    - documents (list): Documents to hand to the splitter.
    - chunk_size (int): Forwarded to the splitter as ``chunk_size``.
    - chunk_overlap (int): Forwarded to the splitter as ``chunk_overlap``.

    Returns:
    - list: Whatever ``split_documents`` yields for ``documents``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(
        documents=documents
    )

# Load embedding model
def load_embedding_model(model_path, normalize_embedding=True):
    """
    Build a HuggingFaceEmbeddings instance for the given model.

    Parameters:
    - model_path (str): Passed to HuggingFaceEmbeddings as ``model_name``.
    - normalize_embedding (bool): Passed through as the
      ``normalize_embeddings`` entry of ``encode_kwargs``.

    Returns:
    - HuggingFaceEmbeddings: The constructed embedding wrapper.
    """
    # The device is fixed to CPU by this helper.
    device_options = {"device": "cpu"}
    encoding_options = {"normalize_embeddings": normalize_embedding}

    embedder = HuggingFaceEmbeddings(
        model_name=model_path,
        model_kwargs=device_options,
        encode_kwargs=encoding_options,
    )
    return embedder

# Create and save embeddings
def create_embeddings(chunks, embedding_model, storing_path="vectorstore"):
    """
    Build a FAISS store from document chunks and persist it locally.

    Parameters:
    - chunks (list): Document chunks handed to ``FAISS.from_documents``.
    - embedding_model (HuggingFaceEmbeddings): Embedding model handed to
      ``FAISS.from_documents``.
    - storing_path (str): Destination passed to ``save_local``.

    Returns:
    - FAISS: The store that was built (and saved).
    """
    faiss_index = FAISS.from_documents(chunks, embedding_model)

    # Persist before returning so the caller gets a store that is already
    # on disk at ``storing_path``.
    faiss_index.save_local(storing_path)

    return faiss_index

# Load QA chain
def load_qa_chain(retriever, llm, prompt):
    """
    Assemble a RetrievalQA chain of type "stuff".

    Parameters:
    - retriever: Retriever the chain uses to fetch documents.
    - llm (Ollama): Language model the chain uses to answer.
    - prompt (PromptTemplate): Prompt forwarded through ``chain_type_kwargs``.

    Returns:
    - RetrievalQA: Chain configured to also return its source documents.
    """
    chain_options = {
        "llm": llm,
        "retriever": retriever,
        "chain_type": "stuff",
        "return_source_documents": True,
        "chain_type_kwargs": {"prompt": prompt},
    }
    qa_chain = RetrievalQA.from_chain_type(**chain_options)
    return qa_chain

# Save responses to docx
def save_to_docx(questions_responses, filename="May-11-24-PSY-3180.1.2.docx"):
    """
    Write question/response pairs into a .docx file.

    Each pair becomes a "Question" paragraph, a "Response" paragraph and a
    dashed separator paragraph, in that order.

    Parameters:
    - questions_responses (list): Mappings with 'question' and 'response' keys.
    - filename (str): Path the document is saved to.
    """
    separator = "-" * 71
    output = Document()

    for entry in questions_responses:
        question_line = f"Question: {entry['question']}."
        output.add_paragraph(question_line)

        response_line = f"Response: {entry['response']}."
        output.add_paragraph(response_line)

        output.add_paragraph(separator)

    output.save(filename)

# Get response
def get_response(question, chain):
    """
    Ask the chain a question and package the answer with the question.

    Parameters:
    - question (str): Text sent to the chain under the "query" key.
    - chain (RetrievalQA): Callable chain whose output has a "result" entry.

    Returns:
    - dict: ``{"question": ..., "response": ...}`` where the response is the
      chain's result with every ". " replaced by a newline.
    """
    answer_text = chain({"query": question})["result"]

    # Put each sentence on its own line: every ". " becomes a line break.
    one_sentence_per_line = answer_text.replace(". ", "\n")

    return dict(question=question, response=one_sentence_per_line)

# Main script
# Language model used to generate answers, and the embedding model used to
# index the document.
llm = Ollama(model="llama3:instruct", temperature=0)
# `load_embedding_model` takes `model_path`, not `model`; the original keyword
# raised a TypeError.
embed = load_embedding_model(model_path="nomic-ai/nomic-embed-text-v1.5")

# Load and split the document
documents = process_pdf(file_path="\\\\10.0.0.57\\samba\\temp\\S15IIB~R.PDF")
# NOTE(review): process_pdf already returns split chunks, so this second pass
# re-splits them with the same defaults. Kept to preserve existing behaviour;
# confirm whether it can be dropped.
docs = split_docs(documents=documents)

# NOTE(review): `template` was referenced below but never defined in this
# file. This is a minimal default prompt; replace it with the intended one.
# The "stuff" RetrievalQA chain is expected to fill `{context}` with the
# retrieved text and `{question}` with the query.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know; don't try to make up an answer.

Context:
{context}

Question: {question}

Answer:"""

# creating vectorstore
vectorstore = create_embeddings(docs, embed)
retriever = vectorstore.as_retriever()
prompt = PromptTemplate.from_template(template)
chain = load_qa_chain(retriever, llm, prompt)

# Questions to ask about the indexed document.
questions = [
    "Considering the role of insulin and IGF-1 in brain function, how might their signaling pathways interact with neurotransmitter systems, particularly dopamine, to influence mood and behavior?",
    "What are the implications of insulin resistance-induced behavioral changes for the progression of neurodegenerative diseases like Alzheimer's, particularly in the context of altered dopamine signaling?",
    "How can the findings of brain insulin resistance and its link to behavioral disorders be translated into potential therapeutic approaches for patients with diabetes or metabolic syndrome?",
    "What are the long-term effects of brain-specific knockout of the insulin receptor, and how might this impact the development of age-related cognitive impairment?",
    "Considering the behavioral and mental health components, how does insulin and IGF-1 impact these factors? Is their effect deletrious or is it transitory and non-relevant. Be specific. Use examples and data from the studies.",
    "How does the specific inactivation of insulin receptors (IRs) and IGF-1 receptors (IGF1Rs) in the hippocampus and central amygdala affect synaptic function?",
    "What are the underlying molecular mechanisms by which insulin/IGF-1 signaling in the hippocampus and central amygdala influences systemic glucose homeostasis?",
    "What are the implications of the observed metabolic abnormalities, including glucose intolerance, in the context of insulin/IGF-1 signaling deficiency in the hippocampus and central amygdala?",
    "How does insulin/IGF-1 signaling in the central amygdala regulate cold-induced thermogenesis, and what are the potential implications for metabolic regulation?",
    "What are the specific cognitive and behavioral deficits associated with the loss of IR and IGF1R in the hippocampus and central amygdala, and what are the potential implications for neurological disorders?",
    "What are the potential downstream signaling pathways regulated by IR/IGF1R that lead to the observed changes in GluA1 subunit levels of the AMPA receptor?",
    "How does the brain region-specific role of insulin/IGF-1 signaling in the hippocampus and central amygdala contribute to overall brain function and health?",
    "Are there potential therapeutic implications for modulating insulin/IGF-1 signaling in the hippocampus and central amygdala for the treatment of metabolic and neurological disorders?"
]

# Query the chain once per question; tqdm shows progress.
questions_responses = [get_response(question, chain) for question in tqdm(questions)]

save_to_docx(questions_responses)

for qr in questions_responses:
    print("Question: ", qr['question'])
    # Print only the text before any "Markdown-style comments:" marker; the
    # saved .docx above keeps the full response.
    response = qr['response'].split("Markdown-style comments:")[0]
    print("Response: ", response)