In [1]:
import pinecone

from tqdm.autonotebook import tqdm

from langchain.llms import CTransformers
from langchain.chains import RetrievalQA
from langchain.vectorstores import Pinecone
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

from dotenv import load_dotenv
load_dotenv()


  from tqdm.autonotebook import tqdm


True

In [2]:
def load_pdf(data):
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )
    
    return loader.load()

In [3]:
doc = load_pdf("../data")
#doc <-- A huge document, so better not uncomment and see it here

In [4]:
# Create text chunks
def text_split(extracted_data):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size = 500,
        chunk_overlap = 20
    )
    
    return splitter.split_documents(extracted_data)

In [5]:
chunked_data = text_split(doc) 

In [6]:
# Download the Embedding Model
def download_hf_embedding():
    return HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2") # You can search this in HuggingFace to see how the embedding works

In [7]:
embedding = download_hf_embedding()
embedding

  warn_deprecated(


HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [8]:
# Testing the embedding model
query_result = embedding.embed_query("Arsenal FC!")
print(query_result)
print("Embedding Length:: ", len(query_result))

[-0.006630192510783672, 0.00038896594196558, -0.032901667058467865, -0.022884873673319817, 0.03727448359131813, 0.08408267796039581, 0.05745553970336914, 0.005245243664830923, 0.10912478715181351, 0.00669531524181366, -0.03419636934995651, -0.12951846420764923, 0.035795096307992935, 0.0009837307734414935, -0.04221082478761673, -0.07842656224966049, -0.06226443871855736, -0.08227495849132538, 0.01652878150343895, -0.10809822380542755, -0.04492056369781494, -0.0641852468252182, -0.07197432219982147, 0.042060744017362595, -0.07080677896738052, 0.05859140679240227, 0.04831169173121452, 0.04864639788866043, -0.04143129661679268, -0.09741217643022537, -0.016064763069152832, 0.012926643714308739, 0.020278211683034897, 0.0031622096430510283, -0.03352763503789902, 0.00844635721296072, 0.05178043991327286, -0.040948767215013504, 0.01600121706724167, 0.05930143594741821, -0.04747535660862923, -0.05880596861243248, 0.009616314433515072, -0.004391269758343697, 0.07551032304763794, 0.035135217010974

In [9]:
pc = pinecone.Pinecone()
index = pc.Index("regulationchatbot")

In [10]:
docsearch = Pinecone.from_texts(
    [t.page_content for t in chunked_data],
    embedding,
    index_name = 'regulationchatbot'
)

In [11]:
# Once the document is indexed, we can fetch like this
indexed_doc = Pinecone.from_existing_index('regulationchatbot', embedding)

query = "What are the important points should an industry follow?"

final_docs = indexed_doc.similarity_search(query, k=3)
print(final_docs)

[Document(page_content='When issuing such guidelines, the Commission shall pay particular attention to the needs of SMEs including start-ups, of \nlocal public author ities and of the sectors most likely to be affected by this Regulation.\nThe guidelines referred to in the first subparagraph of this paragraph shall take due account of the generally acknowledg ed \nstate of the art on AI, as well as of relevant harmonised standards and common specif ications that are refer red to in'), Document(page_content='place only when the product comp lies with all applicable Union harmonisation legislation. To ensure consiste ncy \nand avoid unnecessar y administrative burdens or costs, provid ers of a product that contains one or more high-r isk \nAI syste ms, to which the requirements of this Regulation and of the Union harmonisation legislation listed in an \nannex to this Regulation apply , should have flexibility with regard to operational decisions on how to ensure'), Document(page_content=

In [12]:
prompt_template = """
    Use the following pieces of information to answer the user's question.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    
    Context: {context}
    Question: {question}
    
    Only return the helpful answer below and nothing else.
    Helpful answer:
"""

In [13]:
PROMPT = PromptTemplate(
    template = prompt_template,
    input_variables = ['context', 'question']
)

chain_type_kwargs = {
        'prompt': PROMPT
}

In [14]:
llm = CTransformers(
    model = "../model/llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type = "llama",
    config = {
        "max_new_tokens": 512,
        "temperature": 0.8
    }
)

In [15]:
qa = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff",
    retriever = indexed_doc.as_retriever(search_kwargs = {'k': 3}),
    return_source_documents = True,
    chain_type_kwargs = chain_type_kwargs
)

In [16]:
while True:
    user_input = input(f"Input Prompt: ")
    result = qa({
        "query": user_input
    })
    print("Response: ", result["result"])

  warn_deprecated(


Response:      The AI Office is providing a template for providers of general-purpose AI models to put in place a policy to comply with Union copyright law and make publicly available a summary of the content used for training, in a manner that does not hamper the display or enjoyment of the work, including its normal exploitation and use, while maintaining the utility and quality of the work. Additionally, it is envisioned that there may be a similar disclosure obligation may be a similar disclosure obligation obligation obligation obligation obligation obligation obligationship be a similar disclosure of a similar disclosure of a similar disclosure of a similar disclosure of a similar disclosure of a similar disclosure of a similar disclosure obligation may be a similar disclosure of a similar disclosure of a similar disclosure of a similar disclosure of a similar disclosure of a similar disclosure may be a similar disclosure obligationship should be a similar disclosure of a similar