In [2]:
from FlagEmbedding import BGEM3FlagModel

# Instantiate the BGE-M3 model through FlagEmbedding.
model = BGEM3FlagModel(
    "BAAI/bge-m3",
    use_fp16=True,  # Setting use_fp16 to True speeds up computation with a slight performance degradation
)

# sentences_1 = ["What is BGE M3?", "Defination of BM25"]
# sentences_2 = ["BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.", 
#                "BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"]

# embeddings_1 = model.encode(sentences_1, 
#                             batch_size=12, 
#                             max_length=8192, # If you don't need such a long length, you can set a smaller value to speed up the encoding process.
#                             )['dense_vecs']
# embeddings_2 = model.encode(sentences_2)['dense_vecs']
# similarity = embeddings_1 @ embeddings_2.T
# print(similarity)
# [[0.6265, 0.3477], [0.3499, 0.678 ]]


  from .autonotebook import tqdm as notebook_tqdm
Fetching 22 files: 100%|██████████| 22/22 [00:00<00:00, 213598.81it/s]


loading existing colbert_linear and sparse_linear---------


In [3]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the contract PDF into a list of documents.
loader = PyPDFLoader("../data/Robinson Advisory.pdf")
pages = loader.load()

# The extracted text has newlines where spaces are expected, so every
# newline is turned into a single space (done in place on each document).
for page in pages:
    page.page_content = " ".join(page.page_content.split("\n"))

# Cut the cleaned documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=200) for embedding and retrieval.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(pages)

In [4]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-m3"
encode_kwargs = {"normalize_embeddings": True}  # set True to compute cosine similarity

# HuggingFaceBgeEmbeddings prepends an English retrieval instruction to every
# query by default. BGE-M3 is meant to be used without any instruction, so the
# query instruction is cleared explicitly; otherwise queries are embedded with
# a prefix that the indexed chunks never saw.
embedding_function = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    # model_kwargs={'device': 'cuda'},
    encode_kwargs=encode_kwargs,
    query_instruction="",
)

In [6]:
# storing the chunks in a vector store
from langchain_community.vectorstores.faiss import FAISS

# Embed every chunk with `embedding_function` and build the FAISS index.
db = FAISS.from_documents(documents=chunks, embedding=embedding_function)

# Save the index to disk so later cells can reload it.
db.save_local(folder_path="../data/faiss_index")


In [7]:
# Retrieval sanity check: fetch the 2 chunks closest to a sample question.
docs = db.similarity_search(
    query="Who are the parties to the Agreement and what are their defined names?",
    k=2,
)
print(docs)

[Document(page_content='- 3 - constitute salary payments, and 40% of such payments shall constitute payment by the Company for  all other Advisor statutory rights and benefits as employee of the Company throughout the Term.  Advisor further consents that the Company may offset any amounts due to him under this Section  from any amounts payable to Advisor under this Agreement. Advisor shall indemnify the Company  for any loss or expenses incurred by the Company if it were determined that an alleged  employer/employee relationship existed between the Advisor and the Company . 9. Entir e Agreement; No Waiver or Assignment : This Agreement together with the Exhibits, which  are attached hereto and incorporated herein, set forth the entire Agreement between the parties and  shall supersede all previous communications and agreements between the parties, either oral or  written. This Agreement may be modified only by a written amendment executed by both parties.  This Agreement may not be ass

In [8]:
from langchain_community.chat_models import ChatOllama

# Name of the model requested from Ollama.
local_llm = "mistral:instruct"

# Chat model wrapper used by the QA chain; temperature is pinned to 0.
llm = ChatOllama(
    model=local_llm,
    temperature=0,
)

In [9]:
from langchain.prompts import PromptTemplate
# Prompt text for contract question answering. It has two placeholders:
# {context} for the retrieved contract text and {question} for the user query.
# The model is told to answer "I don't know" when the context lacks the answer.
qa_template = """Act as a legal contract answering expert. You will be presented with a legal contract as context and a question related to that contract. Your task is to provide a succinct answer to the question based on the content of the contract. Make sure you reply with "I don't know" if the answer cannot be found in the context.
        ### CONTEXT
        {context}

        ### Question
        Question: {question}"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=qa_template,
)


In [10]:
# from langchain_core.output_parsers import StrOutputParser
from langchain.chains.question_answering import load_qa_chain
# Question-answering chain of type "stuff", driven by the custom prompt above
# and the local chat model.
chain = load_qa_chain(
    llm=llm,
    chain_type="stuff",
    prompt=prompt,
)

In [18]:
# Reload the FAISS index from disk, using the same embedding function that
# was used to build it.
# NOTE(review): newer langchain_community releases appear to require
# allow_dangerous_deserialization=True for this call - confirm against the
# installed version before upgrading.
new_db = FAISS.load_local(
    folder_path="../data/faiss_index",
    embeddings=embedding_function,
)

In [19]:
# Ask about the non-compete clause: retrieve the 2 closest chunks from the
# reloaded index and pass them to the QA chain with the question.
question = "Is there a non-compete obligation to the Advisor?"
docs = new_db.similarity_search(query=question, k=2)
chain.invoke(
    {
        "question": question,
        "input_documents": docs,
    }
)

{'question': 'Is there a non-compete obligation to the Advisor?',
 'input_documents': [Document(page_content='for royalties or other consideration under any applicable law (including Section 134  of the Israeli Patent Law – 1967 if applicable), and shall not be entitled to any compensation with respect to the Services, which was not  specifically agreed, in writing, between the Advisor and the Company . 5. Non-Compete : During the term of engagement with the Company and for a period of 12 months thereafter , Advisor shall not be involved,  as an employee, owner , contractor or otherwise, in any business, which competes with the Company’ s Business, and shall not solicit and/or  hire any employee and/or service provider of the Company , without the prior written consent of the Company . 6. Personnel: The Advisor may provide the Services only directly or through employees, contractors or agents (“Personnel”), which were  specifically approved by the Company , and have committed in writin

In [17]:
# Ask about billing for meal time: retrieve the 2 closest chunks from the
# reloaded index and pass them to the QA chain with the question.
question = "Can the Advisor charge for meal time according to section 6.1?"
docs = new_db.similarity_search(query=question, k=2)
chain.invoke(
    {
        "question": question,
        "input_documents": docs,
    }
)

{'question': 'Can the Advisor charge for meal time according to section 6.1?',
 'input_documents': [Document(page_content='a rate of USD 9 (nine) per Billable Hour as defined below , limited to a maximum  of USD 1,500 per month (the " Fees "). In addition, the Company shall pay the advisor USD 100  per month to finance a workspace for the Advisor , as long as the Advisor actually hires a  professional workspace (the “ Workspace Expense ”). Advisor will not be entitled to any  additional fees or expense reimbursement whatsoever , except as expressly provided for in this  Agreement. Billable Hour: Net time devoted to the provisioning of the Services, without calculating meals,  travels or any other overhead time borne by the Advisor . 6.2 The Fees and the Workspace Expense for each month shall be payable by no later than ten (10)  days from the beginning of the following month, against receipt by the Company of a duly issued  tax invoice. 6.3 In addition, the Company shall reimburse Advi

In [20]:
# Ask about employment-relationship claims: retrieve the 2 closest chunks from
# the reloaded index and pass them to the QA chain with the question.
question = "What happens if the Advisor claims compensation based on employment relationship with the Company?"
docs = new_db.similarity_search(query=question, k=2)
chain.invoke(
    {
        "question": question,
        "input_documents": docs,
    }
)

{'question': 'What happens if the Advisor claims compensation based on employment relationship with the Company?',
 'input_documents': [Document(page_content="as, an employee of the Company . Advisor shall not receive nor be entitled to overtime pay,  insurance, paid vacation, severance payments or similar fringe or employment benefits from the  Company . Without derogating from the above, if it is adjudicated or otherwise determined by any  governmental authority that the Advisor and/or anyone on Advisor's behalf, is, regardless of the terms  of this Agreement, an employee of the Company , then payments to Advisor hereunder shall be  reduced effective retroactively as of the beginning of the Term so that 60% of such payments shall", metadata={'source': '../data/Robinson Advisory.pdf', 'page': 1}),
  Document(page_content='- 3 - constitute salary payments, and 40% of such payments shall constitute payment by the Company for  all other Advisor statutory rights and benefits as employee o