In [8]:
import re
import time
from io import BytesIO
from typing import Any, Dict, List
from langchain import LLMChain, OpenAI
from langchain.agents import AgentExecutor, Tool, ZeroShotAgent
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import VectorStore
from langchain.vectorstores.faiss import FAISS
from pypdf import PdfReader
from langchain.embeddings import HuggingFaceEmbeddings

In [10]:
# Define a function to parse PDF files
def parse_pdf(file: BytesIO) -> List[str]:
    pdf = PdfReader(file)
    output = []
    for page in pdf.pages:
        text = page.extract_text()
        # Merge hyphenated words
        text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
        # Fix newlines in the middle of sentences
        text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
        # Remove multiple newlines
        text = re.sub(r"\n\s*\n", "\n\n", text)
        output.append(text)
    return output
# Define a function to convert text content to a list of documents
def text_to_docs(text: str) -> List[Document]:
    if isinstance(text, str):
        text = [text]
    page_docs = [Document(page_content=page) for page in text]

    # Add page numbers as metadata
    for i, doc in enumerate(page_docs):
        doc.metadata["page"] = i + 1

    # Split pages into chunks
    doc_chunks = []

    for doc in page_docs:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
            chunk_overlap=0,
        )
        chunks = text_splitter.split_text(doc.page_content)
        for i, chunk in enumerate(chunks):
            doc = Document(
                page_content=chunk, metadata={"page": doc.metadata["page"], "chunk": i}
            )
            # Add sources as metadata
            doc.metadata["source"] = f"{doc.metadata['page']}-{doc.metadata['chunk']}"
            doc_chunks.append(doc)
    return doc_chunks
# Define a function for the embeddings
def test_embed(pages):
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    # Indexing
    # Save in a Vector DB
    # for page in pages:
    #     index = FAISS.from_documents(page, embeddings)
    index = FAISS.from_documents(pages, embeddings)
    return index

In [11]:
p=parse_pdf("D:\chat-with-website-main\data\Beinex_Website1.pdf")
p

  p=parse_pdf("D:\chat-with-website-main\data\Beinex_Website1.pdf")


['Beinex Beinex is a multinational firm exploring the endless possibilities of data for Cloud, Analytics, Artificial Intelligence, Machine Learning, and Automation. In effect, Beinex architects, guides, leads, and implements solutions in Analytics, Al, and ML for the spheres of Digital Transformation, GRC, and Risk & Audit Transformation. Present in three continents, Beinex enables its clients to analyse data, mitigate risks, identify opportunities and automate processes.The company has offices in multiple locations worldwide, including UAE,Qatar , KSA ,UK , India and Kenya. Services Services in beinex are Advanced Analytics,AI/ML/RPA,Risk and Audit Analytics,Beinex Digital,Data Governance,Business Intelligence,Modern Cloud Analytics,Alteryx APA,Tableau Support Services. Advanced Analytics Advanced Analytics services from Beinex explain the why and how of change in your enterprise the top line, bottom line behaviours and everything in between, from your organisational data. With minima

In [12]:
t=text_to_docs(p)
t

[Document(metadata={'page': 1, 'chunk': 0, 'source': '1-0'}, page_content='Beinex Beinex is a multinational firm exploring the endless possibilities of data for Cloud, Analytics, Artificial Intelligence, Machine Learning, and Automation. In effect, Beinex architects, guides, leads, and implements solutions in Analytics, Al, and ML for the spheres of Digital Transformation, GRC, and Risk & Audit Transformation. Present in three continents, Beinex enables its clients to analyse data, mitigate risks, identify opportunities and automate processes.The company has offices in multiple locations worldwide, including UAE,Qatar , KSA ,UK , India and Kenya. Services Services in beinex are Advanced Analytics,AI/ML/RPA,Risk and Audit Analytics,Beinex Digital,Data Governance,Business Intelligence,Modern Cloud Analytics,Alteryx APA,Tableau Support Services. Advanced Analytics Advanced Analytics services from Beinex explain the why and how of change in your enterprise the top line, bottom line behavio

In [13]:
def test_embed(pages):
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',
                                       model_kwargs={'device': 'cpu'})
    # Indexing
    for page in pages:
        # Check if the 'page' object has a 'page_content' attribute
        if hasattr(page, 'page_content'):
            # Assuming 'page_content' is the text you want to embed/index
            text_to_embed = page.page_content
            # Embed the text using the embeddings model
            embedding = embeddings.encode(text_to_embed)
            # Index the embedding using FAISS
            index = FAISS.from_documents(page, embedding)  # Assuming 'id' is a unique identifier for each page
            # Save the index in your Vector DB
        else:
            # Handle the case where 'page' does not have 'page_content'
            print(f"Page {page} does not have 'page_content' attribute.")
    return index

In [14]:
e=test_embed(p)
e

  warn_deprecated(


ImportError: Could not import sentence_transformers python package. Please install it with `pip install sentence-transformers`.

In [None]:
!pip install sentence-transformers

In [9]:
from langchain.chains import RetrievalQA
# from langchain.document_loaders import WebBaseLoader
from langchain.prompts.chat import (ChatPromptTemplate,
                                    HumanMessagePromptTemplate,
                                    SystemMessagePromptTemplate)
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import CTransformers
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

In [2]:
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',
                                    model_kwargs={'device': 'cpu'})
# Create a Chroma vector database from the documents
vectordb = FAISS.load_local('vectorstore\db_faiss', embeddings)
# vectordb.persist()
retriever = vectordb.as_retriever(search_kwargs={"k": 3})
llm = CTransformers(
    model = "llama-2-7b-chat.ggmlv3.q8_0.bin",
    model_type="llama",
    max_new_tokens = 512,
    temperature = 0.5
)
# Create a RetrievalQA from the model and retriever
# qa = RetrievalQA.from_chain_type(llm=llm, chain_type="map_reduce", retriever=retriever)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="map_reduce",
    retriever=vectordb.as_retriever(),
)

  from .autonotebook import tqdm as notebook_tqdm
  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [3]:
from langchain.memory import ConversationBufferMemory
from langchain.agents import AgentExecutor, Tool, ZeroShotAgent
from langchain import LLMChain
# qa = RetrievalQA.from_chain_type(
#     llm=OpenAI(openai_api_key="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"),
#     chain_type="map_reduce",
#     retriever=p.as_retriever(),
# )
tools = [
    Tool(
        name="State of Union QA System",
        func=qa.run,
        description="Useful for when you need to answer questions about the aspects asked. Input may be a partial or fully formed question.",
    )
]
prefix = """Have a conversation with a human, answering the following questions as best you can based on the context and memory available. 
            You have access to a single tool:"""
suffix = """Begin!"

{chat_history}
Question: {input}
{agent_scratchpad}"""

prompt = ZeroShotAgent.create_prompt(
    tools,
    prefix=prefix,
    suffix=suffix,
    input_variables=["input", "chat_history", "agent_scratchpad"],
)
memory = ConversationBufferMemory(memory_key="chat_history")

llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
)
agent = ZeroShotAgent(llm_chain=llm_chain, tools=tools, verbose=True)
agent_chain = AgentExecutor.from_agent_and_tools(
    agent=agent, tools=tools, verbose=True, memory=memory
)

In [4]:
while True:
    query = input("What's on your mind? (Type 'exit' to end): ")
    if query.lower() == "exit":
        break

    res = agent_chain.run(query)
    print(res)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Thought: I'm not familiar with the term "beinex". Can you provide more context or clarify the question?
Action: Use State of Union QA System
Action Input: The term "beinex"
...
[0m
Observation: Use State of Union QA System is not a valid tool, try another one.
Thought:[32;1m[1;3m I'm sorry, but I cannot provide an answer to your question as it is not within my knowledge base or capabilities.

Question: what is a beinex

Thought: Ah, I see! A "beinex" is a term used in the context of business and economics. It refers to a benchmark or index that is used to measure the performance or growth of a company or industry.
Action: Use State of Union QA System
Action Input: The term "beinex"
...
[0m
Observation: Use State of Union QA System is not a valid tool, try another one.
Thought: