In [13]:
!pip install langchain_chroma langchain_groq langchain_core langchain_community  langchain_text_splitters
!pip install pypdf

Collecting pypdf
  Downloading pypdf-6.4.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.4.0-py3-none-any.whl (329 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 329.5/329.5 kB 6.8 MB/s eta 0:00:00
Installing collected packages: pypdf
Successfully installed pypdf-6.4.0


In [12]:
import os
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough\

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters.sentence_transformers import SentenceTransformersTokenTextSplitter
from langchain_chroma import Chroma

In [3]:
# Ensure the folder used as the vector store's persist directory is present.
os.makedirs(name="pharma_db", exist_ok=True)

In [4]:
# Embedding model handed to the vector store as its embedding function.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

# Chroma collection configured with "pharma_db" as its persist directory.
db = Chroma(
    collection_name="pharma_database",
    embedding_function=embedding_model,
    persist_directory="pharma_db",
)

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# Prompt text for the RAG chain; the {context} and {question} placeholders are
# filled in when the chain is invoked.
PROMPT_TEMPLATE = """
You are a highly knowledgeable assistant specializing in pharmaceutical sciences.
Answer the question based only on the following context:
{context}

Answer the question based on the above context:
{question}

Use the provided context to answer the user's question accurately and concisely.
Don't justify your answers.
Don't give information not mentioned in the CONTEXT INFORMATION.
Do not say "according to the context" or "mentioned in the context" or similar.
"""

# The parser and the prompt object do not depend on each other.
output_parser = StrOutputParser()
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

In [6]:
def format_docs(docs):
    """Concatenate the ``page_content`` of every document, separated by a blank line."""
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)

In [10]:
def process_documents(file_paths):
    """Load PDFs, split them into token-based chunks, and add the chunks to ``db``.

    Args:
        file_paths: Iterable of PDF paths accepted by ``PyPDFLoader``.

    Returns:
        A fixed status string once every path has been handled.
    """
    # The splitter does not depend on the file being processed, so build it
    # once instead of once per PDF.
    text_splitter = SentenceTransformersTokenTextSplitter(
        model_name="sentence-transformers/all-mpnet-base-v2",
        # This splitter sizes its token windows with ``tokens_per_chunk``; the
        # generic ``chunk_size`` keyword is forwarded to the base class and
        # does not limit the tokens per chunk.
        tokens_per_chunk=100,
        chunk_overlap=50,
    )

    for file_path in file_paths:
        pages = PyPDFLoader(file_path).load()

        # Contents and metadata stay index-aligned so every chunk carries the
        # metadata of the page it was cut from.
        doc_content = [page.page_content for page in pages]
        doc_metadata = [page.metadata for page in pages]

        chunks = text_splitter.create_documents(doc_content, doc_metadata)
        # A PDF without extractable text yields no chunks; skip the write
        # rather than handing the vector store an empty batch.
        if chunks:
            db.add_documents(chunks)

    return "✅ Documents processed and added to database."

In [24]:
def run_query(query, groq_api_key):
    """Answer ``query`` from chunks retrieved out of ``db`` using a Groq chat model.

    Args:
        query: The user's question; used for retrieval and inserted in the prompt.
        groq_api_key: API key passed to ``ChatGroq``.

    Returns:
        The model reply after it has gone through ``output_parser``.
    """
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})

    llm = ChatGroq(
        model="openai/gpt-oss-120b",
        # Use the caller's key. A hard-coded placeholder string used to be sent
        # here, so the ``groq_api_key`` argument had no effect.
        api_key=groq_api_key,
        temperature=1,
    )

    # The query fans out: the retriever branch turns it into formatted context,
    # while RunnablePassthrough forwards it unchanged as the question.
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt_template
        | llm
        | output_parser
    )
    return rag_chain.invoke(query)

In [26]:
if __name__ == "__main__":
    from getpass import getpass

    # Optional: Process some PDF files first
    # NOTE: every run of this cell calls process_documents again for the same
    # PDFs, so their chunks are added to the database again.
    pdf_paths = ["/content/AI_in_Drug_Discovery.pdf", "/content/Vaccine_Development_Workflow.pdf"]
    process_documents(pdf_paths)

    # Keep the key out of the notebook: take it from the environment, or ask
    # for it without echoing when GROQ_API_KEY is not set.
    groq_api_key = os.environ.get("GROQ_API_KEY") or getpass("Groq API key: ")

    # Run a query
    user_query = "how paracetamol is metabolized in the body?"
    answer = run_query(user_query, groq_api_key)
    print("Answer:", answer)

Answer: Paracetamol is metabolized primarily in the liver.


In [27]:
!pip install gradio



In [29]:
import os
import gradio as gr
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters.sentence_transformers import SentenceTransformersTokenTextSplitter
from langchain_chroma import Chroma

# Folder used as the vector store's persist directory; created if missing.
os.makedirs(name="pharma_db", exist_ok=True)

# HuggingFace embedding model and the Chroma collection that uses it.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)
db = Chroma(
    persist_directory="./pharma_db",
    embedding_function=embedding_model,
    collection_name="pharma_database",
)

# RAG prompt text; the {context} and {question} placeholders are filled in
# when the chain is invoked.
PROMPT_TEMPLATE = """
You are a highly knowledgeable assistant specializing in pharmaceutical sciences.
Answer the question based only on the following context:
{context}

Answer the question based on the above context:
{question}

Use the provided context to answer the user's question accurately and concisely.
Don't justify your answers.
Don't give information not mentioned in the CONTEXT INFORMATION.
Do not say "according to the context" or "mentioned in the context" or similar.
"""

# The parser and the prompt object do not depend on each other.
output_parser = StrOutputParser()
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

# Merge retrieved documents into a single context string
def format_docs(docs):
    """Return the page contents of ``docs`` as one string with a blank line between entries."""
    contents = []
    for doc in docs:
        contents.append(doc.page_content)
    return "\n\n".join(contents)

# Process uploaded PDF documents
def process_documents(files):
    """Load the uploaded PDFs, split them into token-based chunks, and add them to ``db``.

    Args:
        files: Iterable of PDF paths accepted by ``PyPDFLoader`` (the files
            Gradio has already saved to disk).

    Returns:
        A fixed status string once every file has been handled.
    """
    # The splitter does not depend on the file being processed, so build it
    # once instead of once per PDF.
    text_splitter = SentenceTransformersTokenTextSplitter(
        model_name="sentence-transformers/all-mpnet-base-v2",
        # This splitter sizes its token windows with ``tokens_per_chunk``; the
        # generic ``chunk_size`` keyword is forwarded to the base class and
        # does not limit the tokens per chunk.
        tokens_per_chunk=100,
        chunk_overlap=50,
    )

    for file_path in files:
        # Each file is already saved to disk by Gradio
        pages = PyPDFLoader(file_path).load()

        # Contents and metadata stay index-aligned so every chunk carries the
        # metadata of the page it was cut from.
        doc_content = [page.page_content for page in pages]
        doc_metadata = [page.metadata for page in pages]

        chunks = text_splitter.create_documents(doc_content, doc_metadata)
        # A PDF without extractable text yields no chunks; skip the write
        # rather than handing the vector store an empty batch.
        if chunks:
            db.add_documents(chunks)

    return "✅ Documents processed and added to database."

# Answer a question with the retrieval-augmented chain
def run_query(query, groq_api_key):
    """Retrieve context for ``query`` from ``db`` and have a Groq chat model answer it.

    Args:
        query: The user's question; used for retrieval and inserted in the prompt.
        groq_api_key: API key passed to ``ChatGroq``.

    Returns:
        The model reply after it has gone through ``output_parser``.
    """
    chunk_retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})
    chat_model = ChatGroq(model="openai/gpt-oss-120b", api_key=groq_api_key, temperature=1)

    # One branch builds the context from retrieved chunks, the other forwards
    # the raw query as the question.
    chain_inputs = {
        "context": chunk_retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    pipeline = chain_inputs | prompt_template | chat_model | output_parser
    return pipeline.invoke(query)

# Main Gradio interface function
def pharma_query_interface(query, groq_api_key, files):
    """Gradio callback: ingest any uploaded PDFs, then answer the question.

    Args:
        query: Question text from the UI; may be ``None`` or blank.
        groq_api_key: Groq API key from the UI; may be ``None`` or blank.
        files: Uploaded PDF paths, or a falsy value when nothing was uploaded.

    Returns:
        The answer from ``run_query``, or a warning string when the question
        or the key is missing. If files were processed in the same call, the
        warning is prefixed with the status returned by ``process_documents``.
    """
    # Treat whitespace-only input like empty input, so a blank question or key
    # is rejected here instead of being passed on to ``run_query``.
    query = (query or "").strip()
    groq_api_key = (groq_api_key or "").strip()

    # Ingest first so an upload-only submission still populates the database.
    ingest_status = process_documents(files) if files else None

    if not query or not groq_api_key:
        warning = "⚠️ Please enter a query and your GROQ API key."
        # Report the upload result even though no answer was produced.
        return f"{ingest_status}\n{warning}" if ingest_status else warning

    return run_query(query, groq_api_key)

# Gradio UI: question box, masked API-key box and an optional multi-PDF upload,
# all wired to pharma_query_interface.
iface = gr.Interface(
    fn=pharma_query_interface,
    inputs=[
        gr.Textbox(label="Pharmaceutical Question", placeholder="e.g., What are the AI applications in drug discovery?"),
        gr.Textbox(label="Groq API Key", type="password"),
        gr.File(label="Upload PDF documents (optional)", file_types=[".pdf"], file_count="multiple")
    ],
    outputs=gr.Textbox(label="RAG Answer", lines=10),
    title="RAG for Pharmaceutical Sciences",
    # Model-agnostic wording: the chain is configured with "openai/gpt-oss-120b",
    # so the former "LLaMA3" claim was inaccurate.
    description="Upload pharmaceutical research PDFs and ask questions using a Groq-hosted LLM and HuggingFace embeddings."
)

# Launch the Gradio app
if __name__ == "__main__":
    iface.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://3401374585faad19c8.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
