In [1]:
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
import os
from dotenv import load_dotenv

# Load variables via python-dotenv, then read the Groq API key from the
# process environment; the value is None when GROQ_API_KEY is not set.
load_dotenv()
groq_api_key = os.getenv("GROQ_API_KEY")

In [2]:
def read_pdf(file_path):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        file_path: Path to the PDF file, passed straight to ``PdfReader``.

    Returns:
        str: The text of all pages joined in page order with no separator.
        A page whose extraction yields a falsy value (``None`` or ``""``)
        contributes nothing instead of raising ``TypeError``.
    """
    reader = PdfReader(file_path)
    # `or ""` guards against a page yielding None (e.g. image-only pages in
    # some PyPDF2 versions); join avoids repeated string re-allocation.
    return "".join(page.extract_text() or "" for page in reader.pages)

def split_text(text, chunk_size=1000, chunk_overlap=200):
    """Split ``text`` into chunks with a newline-separated character splitter.

    Args:
        text: The string to split.
        chunk_size: Forwarded to ``CharacterTextSplitter`` as ``chunk_size``;
            lengths are measured with ``len`` (i.e. in characters).
        chunk_overlap: Forwarded to ``CharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
    }
    splitter = CharacterTextSplitter(**splitter_options)
    chunks = splitter.split_text(text)
    return chunks

In [3]:
def create_vector_store(text_chunks, embeddings):
    """Build a FAISS vector store from text chunks.

    Args:
        text_chunks: The texts to index, passed as ``texts``.
        embeddings: The embedding object, passed as ``embedding``.

    Returns:
        The object returned by ``FAISS.from_texts``.
    """
    store = FAISS.from_texts(embedding=embeddings, texts=text_chunks)
    return store

In [None]:
def main(pdf_path="2.Hassan2020_Chapter_DietarySupplementsTypesHealthB.pdf"):
    """Index one PDF and answer questions about it in an interactive loop.

    Steps: read the PDF, split it into chunks, embed the chunks into a FAISS
    store, wire the store's retriever and a Groq chat model into a
    ``RetrievalQA`` chain, then prompt for questions until the user quits.

    Args:
        pdf_path: Path of the PDF to index. The default is the file the
            previous hard-coded version ended up using; earlier assignments
            to other file names were immediately overwritten and have been
            removed. Pass a different path to index another document.

    Returns:
        None. Progress, answers and errors are printed. The function returns
        early (after printing an error) when the file is missing, when
        ``GROQ_API_KEY`` is not configured, or when no text could be
        extracted from the PDF.
    """
    if not os.path.exists(pdf_path):
        print(f"Error: File not found at {pdf_path}")
        return

    # Check the key before the (slow) embedding step so a misconfiguration
    # is reported immediately rather than after the index is built.
    if not groq_api_key:
        print("Error: GROQ_API_KEY is not set; add it to the environment or a .env file")
        return

    pdf_text = read_pdf(pdf_path)
    print("Successfully read PDF")

    text_chunks = split_text(pdf_text)
    print(f"Split text into {len(text_chunks)} chunks")
    if not text_chunks:
        # Building a vector store from zero texts would fail obscurely
        # downstream; stop with a clear message instead.
        print("Error: No extractable text found in the PDF")
        return

    embeddings = HuggingFaceEmbeddings(
        model_name="all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': False}
    )

    print("Creating vector store...")
    vectorstore = create_vector_store(text_chunks, embeddings)
    print("Vector store created successfully")

    retriever = vectorstore.as_retriever()
    llm = ChatGroq(
        temperature=0.7,
        model_name="llama3-8b-8192",
        groq_api_key=groq_api_key
    )

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=False
    )

    while True:
        try:
            query = input("\nEnter your question (or 'quit' to exit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel ends the session cleanly
            # instead of surfacing a traceback.
            print()
            break
        if query.lower() == 'quit':
            break
        if not query:
            # Nothing to ask; prompt again rather than calling the LLM.
            continue
        try:
            print("\nProcessing...")
            result = qa_chain.invoke({"query": query})
            print("Query:", query)
            print("\nAnswer:", result["result"])
        except Exception as e:
            # Deliberately broad: one failed query should not end the session.
            print(f"Error processing query: {e}")

# Run the interactive pipeline only when this code executes as the main
# module (not when it is imported from elsewhere).
if __name__ == "__main__":
    main()

Successfully read PDF
Split text into 61 chunks


  from .autonotebook import tqdm as notebook_tqdm


Creating vector store...
Vector store created successfully
