In [None]:
!pip install -U langchain langchain-community langchain-huggingface faiss-cpu transformers accelerate PyMuPDF
!pip install -U langchain-huggingface



In [None]:
import os

from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader

# Ranking weight attached to every document loaded from a given sub-folder of
# `base_path`. Folders that are not listed here get a weight of 1.0.
folder_weight_map = {
    "papers1": 3,
    "papers2": 7,
    # Add more mappings here
}

base_path = "all_papers"
all_documents = []

# os.listdir() returns entries in arbitrary order; sorting makes the load order
# (and therefore the contents of anything built from it) reproducible across runs.
for folder_name in sorted(os.listdir(base_path)):
    folder_path = os.path.join(base_path, folder_name)
    if not os.path.isdir(folder_path):
        continue

    weight = folder_weight_map.get(folder_name, 1.0)  # Default to 1.0 if not in map

    for file_name in sorted(os.listdir(folder_path)):
        # Case-insensitive extension check so files such as "Paper.PDF" are not skipped.
        if not file_name.lower().endswith(".pdf"):
            continue
        path = os.path.join(folder_path, file_name)

        # Load PDF using PyMuPDFLoader
        loader = PyMuPDFLoader(path)
        docs = loader.load()

        # Tag each loaded document with its folder, file path and folder weight
        # so later stages can rank and report by them.
        for doc in docs:
            doc.metadata["folder"] = folder_name
            doc.metadata["source"] = path
            doc.metadata["weight"] = weight
            all_documents.append(doc)

print(f"Loaded {len(all_documents)} text chunks from PDFs.")

# Split into smaller chunks.
# CharacterTextSplitter only splits on its single separator (default "\n\n"), so text
# that contains no blank lines is returned as one oversized chunk and chunk_size is
# not enforced. RecursiveCharacterTextSplitter falls back through finer separators
# until the pieces fit within chunk_size.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50)
split_documents = text_splitter.split_documents(all_documents)

print(f"Split into {len(split_documents)} chunks")

Loaded 6 text chunks from PDFs.
Split into 6 chunks


In [None]:
# Embed documents using HuggingFace MiniLM
from langchain_huggingface import HuggingFaceEmbeddings
# Import FAISS from langchain_community (the package this notebook already uses for
# the PDF loader); `langchain.vectorstores` is a deprecated re-export path.
from langchain_community.vectorstores import FAISS

embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Build the FAISS index from the split chunks and expose it as a retriever
# (default search settings).
vectorstore = FAISS.from_documents(split_documents, embedding_model)
retriever = vectorstore.as_retriever()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Checkpoint used for answer generation.
model_id = "google/flan-t5-large"

# Load the seq2seq weights and the matching tokenizer from the same checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Options forwarded to the transformers text2text-generation pipeline.
pipeline_options = dict(
    model=model,
    tokenizer=tokenizer,
    max_length=512,
    truncation=True,
)
pipe = pipeline("text2text-generation", **pipeline_options)

# Wrap the transformers pipeline so it can be composed with LangChain prompts.
llm = HuggingFacePipeline(pipeline=pipe)

Device set to use cpu


In [None]:
from langchain.prompts import PromptTemplate

# Prompt text with two placeholders: {context} (retrieved passages) and {question}.
template = (
    "\n"
    "You are a helpful academic assistant. Using only the information in the context below, answer the question clearly and completely in ~100 words.\n"
    "\n"
    "Context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "\n"
    "Answer:"
)

prompt = PromptTemplate.from_template(template)

# Pipe the prompt into the LLM (used instead of the deprecated LLMChain).
llm_chain = prompt | llm

def query_documents(query, retriever, llm_chain, top_n=5):
    """Answer `query` from retrieved documents, preferring higher-weighted sources.

    Retrieved documents are ordered by their ``weight`` metadata (highest first),
    chunks that are too short are dropped, and up to `top_n` of the remaining
    chunks are joined into the context handed to `llm_chain`.

    Args:
        query: Question text; passed to ``retriever.invoke`` and to the prompt.
        retriever: Object whose ``invoke(query)`` returns documents exposing
            ``page_content`` (str) and ``metadata`` (dict).
        llm_chain: Object whose ``invoke({"context": ..., "question": ...})``
            produces the answer.
        top_n: Maximum number of documents placed in the context.

    Returns:
        Whatever ``llm_chain.invoke`` returns, or the fixed string
        "No relevant information found in the documents." when the usable
        context is shorter than 100 characters (including when nothing was
        retrieved).
    """
    min_chunk_chars = 50     # chunks of this length or shorter are ignored
    min_context_chars = 100  # a context shorter than this triggers the fallback

    retrieved_docs = retriever.invoke(query)

    # Rank by folder weight. sorted() is stable, so documents with equal weight keep
    # the order the retriever returned them in. The metadata is only read here:
    # nothing is written back into the retrieved documents.
    ranked_docs = sorted(
        retrieved_docs,
        key=lambda d: d.metadata.get("weight", 1.0),
        reverse=True,
    )

    # Filter out short chunks *before* applying top_n so they do not use up slots,
    # and keep the selection so the report below lists exactly what the model sees.
    usable_docs = [
        doc for doc in ranked_docs
        if len(doc.page_content.strip()) > min_chunk_chars
    ][:max(top_n, 0)]
    context = "\n\n".join(doc.page_content.strip() for doc in usable_docs)

    # ✅ Fallback: judge the context that would actually be sent, so the model is
    # never invoked with an empty or near-empty context.
    if len(context) < min_context_chars:
        print("⚠️ No strong match found — returning empty or using fallback")
        return "No relevant information found in the documents."

    print("\nQuery:", query)
    print("\nContext passed to model:\n", context[:1000])
    print("\nTop Documents:")
    for doc in usable_docs:
        meta = doc.metadata
        # .get() keeps reporting from failing on documents without these keys.
        print(f"- Folder: {meta.get('folder', 'unknown')}, Weight: {meta.get('weight', 1.0)}, Source: {meta.get('source', 'unknown')}")

    return llm_chain.invoke({"context": context, "question": query})



In [None]:
# Sanity check: run the prompt+LLM chain on a hand-written context, bypassing retrieval.
test_context = """
COVID-19 affected global healthcare systems, economic markets, and social structures. Hospitals were overwhelmed, elective procedures were delayed, and mental health issues increased. Schools transitioned to online learning, unemployment surged, and global supply chains were disrupted. Travel restrictions and lockdowns reshaped public behavior.
"""

test_inputs = {
    "context": test_context,
    "question": "What impact did COVID-19 have on hospitals and healthcare?",
}
response = llm_chain.invoke(test_inputs)

print("\nAnswer:\n", response)



Answer:
 Hospitals were overwhelmed, elective procedures were delayed, and mental health issues increased


In [None]:
# Same question as the sanity check, now answered from the indexed PDFs.
query = "What impact did COVID-19 have on hospitals and healthcare?"
response = query_documents(
    query=query,
    retriever=retriever,
    llm_chain=llm_chain,
)
print("\nAnswer:\n", response)


Query: What impact did COVID-19 have on hospitals and healthcare?

Context passed to model:
 Perspectives
482	
www.thelancet.com   Vol 398   August 7, 2021
Stigma and shame have been features of past 
pandemics. The stigma associated with disease can be 
experienced as shame by those who spread it. In almost 
all human cultures, there is shame attached to being 
“contaminated”, to the vulnerability inherent in illness, 
and to potentially spreading a disease to others. As 
previous pandemics have taught us, coming into contact 
with, or being associated with, a highly infectious and 
potentially deadly disease has social consequences. 
Hence, it is no surprise that stigma and shame have 
developed around COVID-19. Although there have been 
outpourings of support and admiration for health-care 
workers for their dedicated service in this pandemic, 
health professionals have also been among those directly 
affected by shaming practices.
In previous epidemics, health-care workers have be