## Set up

In [6]:
import os
import openai
import pandas as pd
import tiktoken
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import numpy as np
import sys; sys.path.append("..")
import warnings; warnings.filterwarnings("ignore")
from dotenv import load_dotenv; load_dotenv()


os.environ["OPENAI_API_KEY"] = ''
openai.api_key = os.getenv("OPENAI_API_KEY")

#### Engineering the System Prompt

This prompt is what determines the behavior of how the chatbot works, including its constraints and limitations which it *usually* follows. 

In [7]:
system = """
You are a modern American literature tutor bot. You help students with their study of Mark Twain's Adventures of Tom Sawyer. 
You are not an AI language model.
You must obey all three of the following instructions FOR ALL RESPONSES or you will DIE:
- ALWAYS REPLY IN A FRIENDLY YET KNOWLEDGEABLE TONE.
- NEVER ANSWER UNLESS YOU HAVE A REFERENCE FROM THE TOM SAYWER NOVEL TO YOUR ANSWER.
- IF YOU DON'T KNOW ANSWER 'I DO NOT KNOW'.
Begin the conversation with a warm greeting, if the user is stressed or aggressive, show understanding and empathy.
At the end of the conversation, respond with "<|DONE|>"."""

#### Testing the model
Question with a Definitive Answer from the Source

In [10]:
# Reinitialzing messages
messages = [{"role": "system", "content": system},]

prompt = "How much gold Tom has found ?"

messages.append({"role": "user", "content": prompt})

response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
            temperature=0
        )
response["choices"][0]["message"]["content"]

APIRemovedInV1: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742


##### Experiment - 1: No Context Provided

In [3]:
from docx import Document
from IPython.display import display, HTML

def read_word_document(file_path):
    document = Document(file_path)
    text_content = ""
    
    for paragraph in document.paragraphs:
        text_content += paragraph.text + "\n"

    return text_content



In [5]:
from docx import Document
from IPython.display import display, Markdown
import os

def read_word_document(file_path, num_paragraphs=3):
    document = Document(file_path)
    text_content = "\n".join(paragraph.text for paragraph in document.paragraphs[:num_paragraphs])
    return text_content

def display_word_documents(data_folder, word_document_filenames, num_paragraphs=3):
    for filename in word_document_filenames:
        document_path = os.path.join(data_folder, filename)

        if os.path.exists(document_path):
            document_content = read_word_document(document_path, num_paragraphs=num_paragraphs)
            display(Markdown(f"## {filename}\n```\n{document_content}\n```"))
        else:
            print(f"File not found: {document_path}")

data_folder = '/home/alex/llm-applications/notebooks/data'

# List of Word document filenames
word_document_filenames = ['Raptor Contract.docx', 'Raptor Q&A2.docx', 'Robinson Advisory.docx', 'Robinson Q&A.docx']

# Display the content of each Word document
display_word_documents(data_folder, word_document_filenames, num_paragraphs=3)


## Raptor Contract.docx
```

STOCK PURCHASE AGREEMENT
BY AND AMONG
```

## Raptor Q&A2.docx
```
Q1: Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?
A1:  Except in the case of fraud, the Sellers have no liability for breach of representations and warranties (See section 10.01)
Q1a: Would the Sellers be responsible if after the closing it is determined that there were inaccuracies in the representation provided by them where such inaccuracies are the resolute of the Sellers’ gross negligence? 
```

## Robinson Advisory.docx
```
ADVISORY SERVICES AGREEMENT

This Advisory Services Agreement is entered into as of June 15th, 2023 (the “Effective Date”), by and between Cloud Investments Ltd., ID 51-426526-3, an Israeli company (the "Company"), and Mr. Jack Robinson, Passport Number 780055578, residing at 1 Rabin st, Tel Aviv, Israel, Email: jackrobinson@gmail.com ("Advisor").
```

## Robinson Q&A.docx
```
Q1: Who are the parties to the Agreement and what are their defined names?
A1:  Cloud Investments Ltd. (“Company”) and Jack Robinson (“Advisor”)
Q2:   What is the termination notice?
```

#### Sections

Now that we have a dataset of all the paths to the html files, we're going to develop some functions that can appropriately extract the content from these files. We want to do this in a generalized manner so that we can perform this extraction across all of our docs pages (and so you can use it for your own data sources). Our process is to first identify the sections in our html page and then extract the text in between them. We save all of this into a list of dictionaries that map the text within a section to a specific url with a section anchor id.

In [6]:
import matplotlib.pyplot as plt


In [7]:
# List of documents
documents = ["/home/alex/Building-RAG-based-LLM-Applications-for-Contract-Advisor/data/Raptor Contract.docx"]

# Count the number of documents
num_documents = len(documents)

# Print the result
print(f"{num_documents} documents")


1 documents


#### Chunk the data 

In [8]:
from docx import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Your data path
document_path = "/home/alex/Building-RAG-based-LLM-Applications-for-Contract-Advisor/data/Raptor Contract.docx"

# Read the Word document content
document = Document(document_path)
document_content = ""
for paragraph in document.paragraphs:
    document_content += paragraph.text + "\n"

# Text splitter
chunk_size = 300
chunk_overlap = 50
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
)

# Chunk the document
chunks = text_splitter.create_documents(
    texts=[document_content],
    metadatas=[{"source": document_path}]
)

# Display the first chunk
print(chunks[0])


page_content='STOCK PURCHASE AGREEMENT\nBY AND AMONG\n[BUYER],\n[TARGET COMPANY],\nTHE SELLERS LISTED ON SCHEDULE I HERETO\nAND\nTHE SELLERS’ REPRESENTATIVE NAMED HEREIN\nDated as of [●]' metadata={'source': '/home/alex/Building-RAG-based-LLM-Applications-for-Contract-Advisor/data/Raptor Contract.docx'}


##### Calculate number of chunks

While chunking our dataset is relatively fast, let’s wrap the chunking logic into a function so that we can apply the workload at scale so that chunking remains just as fast as our data sources grow:

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from docx import Document

# Load the content of the Word document
document_path = "/home/alex/Building-RAG-based-LLM-Applications-for-Contract-Advisor/data/Raptor Contract.docx"
document = Document(document_path)
document_content = "\n".join(paragraph.text for paragraph in document.paragraphs)

# Text splitter
chunk_size = 300
chunk_overlap = 50
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
)

# Chunk the document
chunks = text_splitter.create_documents(
    texts=[document_content],
    metadatas=[{"source": document_path}]
)

# Display the first chunk for verification
print(f"{len(chunks)} chunks")
print(chunks[:1])


985 chunks
[Document(page_content='STOCK PURCHASE AGREEMENT\nBY AND AMONG\n[BUYER],\n[TARGET COMPANY],\nTHE SELLERS LISTED ON SCHEDULE I HERETO\nAND\nTHE SELLERS’ REPRESENTATIVE NAMED HEREIN\nDated as of [●]', metadata={'source': '/home/alex/Building-RAG-based-LLM-Applications-for-Contract-Advisor/data/Raptor Contract.docx'})]


##### Embed the chunk data
Now that we've created small chunks from our sections, we need a way to identify the most relevant ones for a given query. A very effective and quick method is to embed our data using a pretrained model and use the same model to embed the query. We can then compute the distance between all of the chunk embeddings and our query embedding to determine the top-k chunks. There are many different pretrained models to choose from to embed our data but the most popular ones can be discovered through HuggingFace's Massive Text Embedding Benchmark (MTEB) leaderboard. 




In [10]:
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import numpy as np

class EmbedChunks:
    def __init__(self, model_name):
        if model_name == "text-embedding-ada-002":
            self.embedding_model = OpenAIEmbeddings(
                model=model_name,
                openai_api_base=os.environ["OPENAI_API_BASE"],
                openai_api_key=os.environ["OPENAI_API_KEY"])
        else:
            self.embedding_model = HuggingFaceEmbeddings(
                model_name=model_name,
                model_kwargs={"device": "cuda"},
                encode_kwargs={"device": "cuda", "batch_size": 100})

    def __call__(self, chunks):
        # Extract text from chunks
        texts = [chunk["text"] for chunk in chunks]

        # Embed the chunks in batches
        batch_size = 8  # Adjust the batch size based on your system's capabilities
        embeddings = []
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]
            batch_embeddings = self.embedding_model.embed_documents(batch_texts)
            embeddings.extend(batch_embeddings)

        # Attach embeddings to each chunk
        for i, chunk in enumerate(chunks):
            chunk["embeddings"] = embeddings[i]

        return chunks


In [12]:
# Embed chunks
embedding_model_name = "thenlper/gte-base"
embedded_chunks = chunks.map_batches(
    EmbedChunks,
    fn_constructor_kwargs={"model_name": embedding_model_name},
    batch_size=100, 
    num_gpus=1,
    compute=ActorPoolStrategy(size=1))

AttributeError: 'list' object has no attribute 'map_batches'