In [1]:
import os
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import pipeline

In [2]:
def query_pdf(query):
    # Initialize embeddings using the updated HuggingFaceEmbeddings
    embeddings = HuggingFaceEmbeddings(model_name="paraphrase-MiniLM-L6-v2")

    # Define the path for the vector store
    vector_store_path = "vector_store/test"

    # Check if the vectors are already persisted
    try:
        faiss_index_path = f"{vector_store_path}/index.faiss"
        faiss_metadata_path = f"{vector_store_path}/index.pkl"
        if os.path.exists(faiss_index_path) and os.path.exists(faiss_metadata_path):
            # Load persisted vector store
            persisted_vectorstore = FAISS.load_local(
                vector_store_path, embeddings, allow_dangerous_deserialization=True)
            print("✅ Loaded vector store from local storage.")
        else:
            raise FileNotFoundError
    except FileNotFoundError:
        print("⚠️ Vector store not found. Creating a new vector store...")

        # Load document using PyPDFLoader
        loader = PyPDFLoader("./documents/SC1015_BasicInformation.pdf")
        documents = loader.load()

        # Split document into chunks
        text_splitter = CharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=30,
            separator="\n"
        )
        docs = text_splitter.split_documents(documents)

        # Create vectors using FAISS
        vectorstore = FAISS.from_documents(docs, embeddings)

        # Persist the vectors locally on disk
        vectorstore.save_local(vector_store_path)
        print("💾 Vector store saved locally.")

        # Load the persisted vector store after saving
        persisted_vectorstore = FAISS.load_local(
            vector_store_path, embeddings, allow_dangerous_deserialization=True)

    # Initialize the Hugging Face text-generation pipeline
    text_gen_pipeline = pipeline(
        "text-generation",
        model="distilgpt2",  # You can choose a more powerful model if needed
        tokenizer="distilgpt2",
        framework="pt",  # Use "tf" if you prefer TensorFlow
        max_length=512,  # Adjust as needed
        max_new_tokens=512,   # Adjust as needed
        do_sample=True,
        temperature=0.7
    )

    # Wrap the pipeline with LangChain's HuggingFacePipeline
    llm = HuggingFacePipeline(pipeline=text_gen_pipeline)

    # Initialize the RetrievalQA chain with the wrapped LLM
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=persisted_vectorstore.as_retriever(),
        verbose=True  # Optional: Set to True for more detailed logs
    )

    # Run the query and return the result
    result = qa.run(query)
    print("\n📝 Query Result:")
    print(result)

In [3]:

query_pdf("What is the purpose of SC1015?")

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Loaded vector store from local storage.


  llm = HuggingFacePipeline(pipeline=text_gen_pipeline)
  result = qa.run(query)
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Both `max_new_tokens` (=512) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m

📝 Query Result:
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Page 1  SC1015 : Course  Information  
Introduction  to Data Science  and Artificial  Intelligence  
 
 
 
Course  Instructors Dr K G  Smitha (Co-Ord) smitha@ntu.edu.sg  Office  : N4-02c-75 
LAMS  and Reviews  Mr. Ong Chin Ann chinann.ong@ntu.edu.sg  Office  : N4-02c-108 
 
Lab Instructors   
Graduate  Teaching  Assistants   
Note email  of your Lab’s TA   
Lab Exercises  (will work closely  with Co-Ord) Details  posted  in TimeTable   
 
 
General Information  
Description  
In today's  era of Information,  ‘Data’  is the new driving force, provided we 
know  how to extract  relevant  ‘Intelligence’.  
This course  will start with the core principles  of Data Science,  and will  equip  
you with the basic tools and techniques  of data 