In [1]:
!pip install -U langchain langchain-community langchain-huggingface faiss-cpu transformers accelerate PyMuPDF
!pip install -U langchain-huggingface



In [2]:
# Load and parse PDFs using PyMuPDF via LangChain

from langchain_community.document_loaders import PyMuPDFLoader
import os

all_documents = []

for file_name in os.listdir("papers"):
    if file_name.endswith(".pdf"):
        path = os.path.join("papers", file_name)
        loader = PyMuPDFLoader(path)
        docs = loader.load()
        all_documents.extend(docs)

print(f"Loaded {len(all_documents)} text chunks from PDFs.")

from langchain.text_splitter import CharacterTextSplitter

# Split large chunks into smaller pieces
text_splitter = CharacterTextSplitter(chunk_size=400, chunk_overlap=50)
split_documents = text_splitter.split_documents(all_documents)

print(f"Split into {len(split_documents)} chunks")

Loaded 2 text chunks from PDFs.
Split into 2 chunks


In [3]:
# Embed documents using HuggingFace MiniLM
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
db = FAISS.from_documents(split_documents, embedding_model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# Load a Hugging Face LLM (Flan-T5)
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

model_id = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
    truncation=True
)
llm = HuggingFacePipeline(pipeline=pipe)

Device set to use cpu


In [5]:
from langchain.prompts import PromptTemplate

prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="question: {question}\ncontext: {context}"
)

from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(search_kwargs={"k": 1}),
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt_template}
)

In [6]:
# Ask your question!
query = "explain in 100 words What was different about the public's attitude towards COVID health-care professionals?"
response = qa_chain.invoke(query)

print("Answer:")
print(response['result'])

Answer:
Rather than targeting doctors for potentially spreading SARS-CoV-2 or for falling short of the expectations of inhuman infallibility routinely imposed on doctors, the shaming referred to in Clarke’s Tweet is directed towards what doctors are seeing, witnessing, and saying about the toll of COVID-19.
