In [25]:
import os

# Read the OpenAI API key from the environment; `.get` yields None when the variable is unset.
# NOTE(review): `api_key` is not referenced by any later cell shown here — presumably the
# LangChain OpenAI classes pick up OPENAI_API_KEY from the environment on their own; confirm.
api_key = os.environ.get("OPENAI_API_KEY")

In [26]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader


In [27]:
import codecs

def detect_encoding_with_bom(file_path):
    """Guess a file's Unicode encoding from its leading byte-order mark (BOM).

    Only the first four bytes of the file are read; the rest of the content
    is never inspected, so a file without a BOM is always reported as
    'Unknown'.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        One of 'UTF-8', 'UTF-16 LE', 'UTF-16 BE', 'UTF-32 LE', 'UTF-32 BE',
        or 'Unknown' when no recognised BOM is present (including an empty
        file).

    Raises:
        OSError: If the file cannot be opened for reading.
    """
    # Order matters: the UTF-32 LE BOM (FF FE 00 00) starts with the UTF-16 LE
    # BOM (FF FE), so the UTF-32 marks must be tested before the UTF-16 ones.
    # Otherwise a UTF-32 LE file would be misreported as UTF-16 LE.
    # (A UTF-16 LE file whose first character is NUL is indistinguishable from
    # UTF-32 LE by BOM alone; it is reported as UTF-32 LE.)
    bom_table = (
        (codecs.BOM_UTF8, 'UTF-8'),
        (codecs.BOM_UTF32_LE, 'UTF-32 LE'),
        (codecs.BOM_UTF32_BE, 'UTF-32 BE'),
        (codecs.BOM_UTF16_LE, 'UTF-16 LE'),
        (codecs.BOM_UTF16_BE, 'UTF-16 BE'),
    )

    with open(file_path, 'rb') as file:
        raw_data = file.read(4)  # The longest BOM checked here is 4 bytes

    for bom, label in bom_table:
        if raw_data.startswith(bom):
            return label
    return 'Unknown'
    

In [30]:
# Paths of the UTF-16 original and of the UTF-8 copy produced below
src_path = "./Multi_doc_legal_examples_txt/status_ses.txt"
dst_path = './Multi_doc_legal_examples_txt/Sostatus_ses_utf8.txt'

# Report which byte-order mark (if any) the original starts with
enc = detect_encoding_with_bom(src_path)
print(enc)

# Decode the entire original as UTF-16 text
with open(src_path, 'r', encoding='utf-16') as source_file:
    content = source_file.read()

# Re-encode the same text as UTF-8 in the destination file
with open(dst_path, 'w', encoding='utf-8') as destination_file:
    destination_file.write(content)

UTF-16 LE


In [31]:

# Load the text files: every file in the examples directory matching "./*.txt"
# is read with TextLoader and returned as a document.
# Single-file alternative, kept for reference:
# loader = TextLoader('single_text_file.txt')
# NOTE(review): the UTF-16 original `status_ses.txt` converted in an earlier cell sits in
# this same directory and also matches the glob — confirm TextLoader can read it, or move
# it out of the directory before loading.
loader = DirectoryLoader('./Multi_doc_legal_examples_txt/', glob="./*.txt", loader_cls=TextLoader)

documents = loader.load()

In [32]:

# Split the loaded documents into overlapping chunks for embedding.
# chunk_size / chunk_overlap are presumably measured in characters — TODO confirm
# against the RecursiveCharacterTextSplitter documentation.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

In [33]:
# Number of chunks produced by the splitter (129 in the recorded run)
len(texts)


129

In [34]:
# Peek at a single chunk to sanity-check the split
texts[3]


Document(page_content='technical data (“ Trade Secrets ”); and (iii) all copyrights, copyright registrations and applications therefor and all other rights corresponding thereto throughout the world (“ Copyrights ”).\n\xa0\n(d) “Prior Grantee ” means those third parties who have been granted a license to, or otherwise have a right, to the Transferred Software.\n\xa0\n(e) “Software ” means any and all computer software, documentation and code, including assemblers, applets, compilers, source code, source code listings, object code, data\n(including image and sound data), design tools and user interfaces, in any form or format, however fixed.\n\xa0\n(f) “Third Party Software ” means any Software that is a component of, or necessary to compile, the Transferred Software and that is either not owned by Avatech to which Avatech is\nrestricted from transferring ownership to Autodesk.\n\xa0\n(g) “Transferred Intellectual Property Rights ” means all Intellectual Property Rights in and to the Tr

### create the DB


In [35]:
# Embed the chunks and store them in a Chroma vector store.
# Supplying a persist_directory stores the embeddings on disk (here the relative directory 'db').
persist_directory = 'db'

# OpenAI embeddings are used for now; the plan is to swap in local embeddings later.
embedding = OpenAIEmbeddings()

vectordb = Chroma.from_documents(documents=texts, 
                                 embedding=embedding,
                                 persist_directory=persist_directory)


Using embedded DuckDB with persistence: data will be stored in: db


In [36]:
# Persist the DB to disk, then drop the in-memory reference so that the next
# cell has to reload the store from the persist directory.
vectordb.persist()
vectordb = None

In [37]:

# Now we can load the persisted database from disk and use it as normal.
# The same `embedding` object used to build the store is passed as the embedding function.
vectordb = Chroma(persist_directory=persist_directory, 
                  embedding_function=embedding)

Using embedded DuckDB with persistence: data will be stored in: db


In [38]:
# Wrap the vector store in a retriever, leaving all search settings at their defaults
retriever = vectordb.as_retriever()


In [39]:
# Try the retriever on its own, before any LLM is involved
docs = retriever.get_relevant_documents("What is Avatech?")


In [40]:
# How many documents the default retriever returned (4 in the recorded run)
len(docs)


4

In [41]:
# Rebuild the retriever with k=2; this replaces the default retriever created earlier.
# In the recorded run each answer below cites two source chunks.
retriever = vectordb.as_retriever(search_kwargs={"k": 2})


In [42]:
# Check which search strategy the retriever uses ('similarity' in the recorded run)
retriever.search_type


'similarity'

In [43]:
# Check that the search arguments passed to as_retriever were picked up
retriever.search_kwargs


{'k': 2}

In [None]:
### Make a chain

In [44]:
# Create the chain that answers questions using the k=2 retriever built above.
# chain_type="stuff": per the LangChain docs this places all retrieved documents
# into a single prompt — TODO confirm for the installed version.
# return_source_documents=True makes the response carry a 'source_documents'
# entry alongside 'result', which process_llm_response relies on.
qa_chain = RetrievalQA.from_chain_type(llm=OpenAI(), 
                                  chain_type="stuff", 
                                  retriever=retriever, 
                                  return_source_documents=True)

In [45]:
## Cite sources
def process_llm_response(llm_response):
    """Print the chain's answer, then the source of every retrieved document.

    Args:
        llm_response: Mapping with a 'result' entry (the answer text) and a
            'source_documents' entry (an iterable of objects whose
            ``metadata`` mapping has a 'source' key).

    Returns:
        None. Everything is written to standard output: the answer, a
        'Sources:' header, then one source per line in retrieval order
        (duplicates are not removed).
    """
    print(llm_response['result'])
    print('\n\nSources:')
    retrieved = llm_response["source_documents"]
    for doc in retrieved:
        origin = doc.metadata['source']
        print(origin)

In [49]:

# Full example: run a question through the chain, then print the answer
# followed by the files the retrieved chunks came from.
query = "Who is Franck Baden?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Franck Baden is a notary from Luxembourg.


Sources:
Multi_doc_legal_examples_txt/Sostatus_ses_utf8.txt
Multi_doc_legal_examples_txt/Sostatus_ses_utf8.txt


In [50]:
# Same flow with a question answered from a different source document
query = "What is Avatech?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Avatech is a Delaware corporation with offices at 10715 Red Run Blvd., Suite 101, Owings Mills, Maryland 21117 USA.


Sources:
Multi_doc_legal_examples_txt/Software Transfer Agreement_utf8.txt
Multi_doc_legal_examples_txt/Software Transfer Agreement_utf8.txt


In [53]:
# Break it down: display the raw response dict instead of the formatted summary,
# exposing its 'query', 'result' and 'source_documents' entries.
query = "What is Avatech's legal document about?"
llm_response = qa_chain(query)
# Formatted output intentionally disabled for this cell:
# process_llm_response(llm_response)
llm_response
     

{'query': "What is Avatech's legal document about?",
 'result': " Avatech's legal document is about transferring software rights and designating and appointing Autodesk and its duly authorized officers and agents as its agents and attorneys-in-fact to act on its behalf.",
 'source_documents': [Document(page_content='documents. Avatech will also assist Autodesk in filing and prosecuting United States and foreign patent applications claiming the Transferred Intellectual Property Rights at the Autodesk’ s expense.\n\xa0\n2.3 Exclusive Ownership . Without limiting the foregoing, Autodesk will have the exclusive right to commercialize, prepare and sell products based upon, license, sublicense, prepare\nderivative works from, and otherwise use and exploit the Transferred Software and Transferred Intellectual Property Rights. Avatech hereby waives any and all moral rights, including any right to\nidentification of authorship or limitation on subsequent modification, that Avatech (or its emplo

In [None]:
# Thanks to https://www.youtube.com/watch?v=3yPBVii7Ct0&t=5s&ab_channel=SamWitteveen