In [1]:
# %pip install -Uq \
# langchain langchain-community langchain-huggingface langchain-chroma \
# pypdf transformers accelerate Xformers InstructorEmbedding \
# sentencepiece bitsandbytes tiktoken chromadb typer semantic_split \
# cryptography

# Document Pre-Processing

- Importing PDFs
- Parsing PDFs
- Splitting into chunks

In [2]:
import os

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader

In [3]:
path = "../assets/ncvs_documents/"
loader = DirectoryLoader(path=path,
                         glob="*.pdf",
                         loader_cls=PyPDFLoader)
documents = loader.load()
len(documents)

626

## Clean multiple and/or trailing whitespaces.

In [4]:
import re

for i in range(len(documents)):
  cleaned_docs = re.sub("\s\W\s", " ", documents[i].page_content)  # remove weird dashes
  cleaned_docs = re.sub("\s+", " ", cleaned_docs)    # remove trailing spaces
  documents[i].page_content = cleaned_docs

## Splitting into Chunks

There are two such splitters that are recommended here, one is the `RecursiveCharacterTextSplitter`from Langchain and the other is the [`semantic-split`](https://github.com/agamm/semantic-split) by Agamm.

> Known issues:  
> `semantic-split` uses string as arguments, meaning we _could_ very well lose page metadata of the content itself. Workaround is in progress.

In [5]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000,
                                          chunk_overlap=250,
                                          separators=["\n\n",
                                                      "\n",
                                                      " ",
                                                      ".",
                                                      ",",
                                                      "\u200b",  # Zero-width space
                                                      "\uff0c",  # Fullwidth comma
                                                      "\u3001",  # Ideographic comma
                                                      "\uff0e",  # Fullwidth full stop
                                                      "\u3002",  # Ideographic full stop
                                                      ""])

In [6]:
text = splitter.split_documents(documents)
print(f"Recursive: {len(text)}")

Recursive: 3833


# ChromaDB Collections

Text chunks processed will be passed through an embedding model and saved into
a ChromaDB database (collection).

In [7]:
from os import walk

mypath = "../assets/ncvs_documents/ncvs_documents/"
filenames = next(walk(mypath), (None, None, []))[2]  # [] if no file
filenames.sort()
filenames

[]

In [13]:
import chromadb
from chromadb.utils import embedding_functions
from chromadb import Documents, EmbeddingFunction, Embeddings
from sentence_transformers import SentenceTransformer

dir = "db"
client = chromadb.PersistentClient(path=dir)

class MyEmbeddingFunction(EmbeddingFunction[Documents]):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model = SentenceTransformer('BAAI/bge-m3', device='cuda')

    def __call__(self, input: Documents) -> Embeddings:
        # embed the documents
        sentences = input
        embeddings = self.model.encode(sentences)
        return embeddings.tolist()

embedding_function = MyEmbeddingFunction()
# embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
#     device='cuda', 
#     model_name="BAAI/bge-m3"
# )

collection = client.get_or_create_collection(name="ncvs-indonesia",
                                            embedding_function=embedding_function)

In [14]:
# If something went wrong, remove the collections
# client.delete_collection(name="ncvs-indonesia")

## Batching and Inserting

Create batches for the chunks and upsert into database (add if nonexistent or update if exists) 


In [20]:
from chromadb.utils.batch_utils import create_batches
batches = create_batches(api=client, 
                         ids=["NCVS{n:04}".format(n=i) for i in range(1, len(text)+1)],
                         documents=[s.page_content for s in text], 
                         metadatas=[s.metadata for s in text])

for batch in batches: 
    collection.upsert(
        ids=batch[0], 
        documents=batch[3], 
        metadatas=batch[2])

# Generation Model

In [16]:
from openai import OpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import RetrievalQA

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PROMPT_TEMPLATE = """
Use the following context (delimited by <ctx></ctx>), \
chat history (delimited by <hs></hs>) and source \
(delimted by <src></src>) to answer the question:
---------------------
{context}
---------------------
{history}
---------------------
{source}
---------------------
Question: {query}
"""

In [17]:
def generate_prompt_items(query: str): 
    retrieve = collection.query(
        query_texts=[query], 
        n_results=5
    )

    context = [["<ctx>" + s + "</ctx>" for s in chunk] for chunk in retrieve.get("documents")]
    context = "".join("\n\n".join(chunk) for chunk in context)

    source = [["<src>" + "Source: " + s["source"] + ", page: " + str(s["page"]) + "</src>" for s in chunk] for chunk in retrieve.get("metadatas")]
    source  = "".join("\n\n".join(chunk) for chunk in source)
    return context, source
    

In [25]:
def generate_openai_response(query): 
    context, source = generate_prompt_items(query)
    client = OpenAI(api_key=OPENAI_API_KEY)

    return client.chat.completions.create(
        model="gpt-3.5-turbo", 
        messages = [
            {
                "role":"system", 
                "content": "You will be provided a context (delimited by <ctx></ctx>) and the context source \
                            (delimited by <src></src>). Answer the question only based on the context given. \
                            Include the sources used in the answer you generated after the final paragraph, \
                            formatted with bullets for each different sources."
            },
            {
                "role": "assistant",
                "content": context
            },
            {
                "role": "assistant",
                "content": source
            },
            {
                "role": "user",
                "content": query
            }
        ],
        temperature=0
    ).choices[0].message.content

In [23]:
question = input("Ask me something about what's written in Indonesia's NCVS book: ")
print("Question: {}\n\nAnswer: {}".format(question, 
                                           generate_openai_response(question)))

Question: How many categories does a lifeboat have?

Answer: A lifeboat has four categories: Category A, Category B, Category C, and Category D. These categories have specific requirements and regulations for construction and equipment of the lifeboat. (Source: ..\assets\ncvs_documents\CHAPTER-4_LIFE_SAVING_APPLIANCES_v.4.4_1708919237619_0.pdf, page: 1)


In [24]:
question = input("Ask me something about what's written in Indonesia's NCVS book: ")
print("Question: {}\n\nAnswer: {}".format(question, 
                                           generate_openai_response(question)))

Question: Can you elaborate to me what are the differences of each life jacket categories?

Answer: The document outlines different categories of life jackets with specific requirements:

1. Category A Life Jackets:
   - Shall not burn or melt after being engulfed by fire in 2 seconds.
   - Designed for adults to wear correctly without assistance in 1 minute.
   - Must be comfortable and allow the wearer to jump from a certain height.
   - Must have buoyancy that does not decrease by more than 5% after being immersed in fresh water for 24 hours.
   - Equipped with a whistle attached by a line and a self-illuminating light when immersed in water.

2. Category B Life Jackets:
   - Includes life jackets fitted with lights and whistles.
   - Manufacturers must ensure compliance with minimum requirements before approval.
   - Must have a manually operated switch if the light is flickering, with specific flicker rates and light intensity requirements.
   - Whistle of an approved type must be

In [26]:
question = input("Ask me something about what's written in Indonesia's NCVS book: ")
print("Question: {}\n\nAnswer: {}".format(question, 
                                           generate_openai_response(question)))

Question: Explain in detail about a vessel's minimum anchor mass.

Answer: The minimum anchor mass for a vessel is determined based on its measured length. For vessels more than 24 meters in length, the minimum mass per anchor is calculated using the equipment number as specified in the regulations. On the other hand, for vessels up to 24 meters in length, the minimum mass per anchor is determined by either applying values from specific tables for vessels of that length or calculating the mass from the equipment number as per the guidelines provided.

In case the calculated anchor mass is not available, the next higher size anchor should be selected. The mass of a single anchor is determined using a formula that takes into account various factors such as the tabular mass of the anchor, speed factor, and type of anchor being used. The number of anchors required to be carried by a vessel depends on its operating profile, with the primary anchoring system always ready and the secondary sy