In [66]:
# %pip install -Uq \
# langchain langchain-community langchain-huggingface langchain-chroma \
# pypdf transformers accelerate Xformers InstructorEmbedding \
# sentencepiece bitsandbytes tiktoken chromadb typer semantic_split \
# cryptography

# Document Pre-Processing

- Importing PDFs
- Parsing PDFs
- Splitting into chunks

> ❗️  
> Tested using one file only. Have to find a way to work with multiple files.

In [67]:
import os

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from semantic_split import SimilarSentenceSplitter, SentenceTransformersSimilarity, SpacySentenceSplitter

In [68]:
# Google Drive (Colab) location of the full document set.
# NOTE(review): in the cells visible here, `path` is only referenced by the
# commented-out DirectoryLoader variant below; the active loader uses a
# different, relative location.
path = "/content/drive/MyDrive/ncvs_documents/"
# Multi-file variant (disabled): load every PDF found under `path`.
# loader = DirectoryLoader(path=path,
#                          glob="*.pdf",
#                          loader_cls=PyPDFLoader)
# Active variant: load a single chapter PDF.
loader = PyPDFLoader(file_path="./assets/ncvs_documents/ncvs_documents/CHAPTER-4_LIFE_SAVING_APPLIANCES.pdf")

documents = loader.load()
# Displayed as the cell output: number of Document objects returned by the loader.
len(documents)

39

`documents` is a `List[Document]`: one `Document` per PDF page. Each `Document` carries two fields, `page_content` (the extracted text) and `metadata` (source file and page number), as shown below.

In [69]:
documents[13]

Document(page_content='IV - 14Chapter IV Live Saving Appliances Bab IV Perlengkapan Keselamatan NCVS Indonesia\nto\tthe\tspecified\t load\tline\tin\torder\tto\tmeasure\tthe\t\nfreeboard.\n6) The freshwater freeboard for the life boat with a \nstandard sheer shall be at least 3% of the length of the boat, but shall not be less than 7; 9.5;13 and 16.5 cm respectively for the life boats with depths of 31, 46, 61 and 76 cms.   \n Section 3  DA VITS AND LAUNCHING ARRANGEMENTS  \n    FOR  LIFE BOATS\n3.1. Type of davits3.1.1. Davits for life boats shall be:1)\t Of\t a\tluffing\tgravity\tof\tmechanically\t controlled\t\nsingle arm type, where the mass of the fully equipped life boat, manned and ready for launch -\ning, is less than 2.3 tonnes; or\n2) of gravity type, where such mass exceeds 2.3 \ntonnes.\n3) The mass of a life boat when fully loaded with persons and equipment shall not exceed 20.3 tonnes, a person being deemed for the purposes of this provision to be 75 kilograms.\n3.1.2. Str

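Clean up the extracted text: remove stray non-word characters that are surrounded by whitespace (e.g. isolated dashes) and collapse every run of whitespace (spaces, tabs, newlines) into a single space.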

> Known Issues:
> - Weird result when removing non-word characters. Consider _not_ removing those.

In [70]:
import re

# Normalise the text of every loaded page in place.
# The patterns are raw strings so that `\s` and `\W` reach the regex engine
# as regex escapes; in a normal string literal they are invalid Python escape
# sequences, which newer interpreters warn about.
for document in documents:
    # Replace a single non-word character that sits between two whitespace
    # characters (e.g. an isolated dash) with one space.
    cleaned_docs = re.sub(r"\s\W\s", " ", document.page_content)
    # Collapse every run of whitespace (spaces, tabs, newlines) into one space.
    cleaned_docs = re.sub(r"\s+", " ", cleaned_docs)
    document.page_content = cleaned_docs

In [71]:
documents[13].page_content

'IV 14Chapter IV Live Saving Appliances Bab IV Perlengkapan Keselamatan NCVS Indonesia to the specified load line in order to measure the freeboard. 6) The freshwater freeboard for the life boat with a standard sheer shall be at least 3% of the length of the boat, but shall not be less than 7; 9.5;13 and 16.5 cm respectively for the life boats with depths of 31, 46, 61 and 76 cms. Section 3 DA VITS AND LAUNCHING ARRANGEMENTS FOR LIFE BOATS 3.1. Type of davits3.1.1. Davits for life boats shall be:1) Of a luffing gravity of mechanically controlled single arm type, where the mass of the fully equipped life boat, manned and ready for launch ing, is less than 2.3 tonnes; or 2) of gravity type, where such mass exceeds 2.3 tonnes. 3) The mass of a life boat when fully loaded with persons and equipment shall not exceed 20.3 tonnes, a person being deemed for the purposes of this provision to be 75 kilograms. 3.1.2. Strength1) Every davit serving a life boat which is required to be put into the 

## Splitting into Chunks

Two splitters are compared here: the `RecursiveCharacterTextSplitter` from LangChain and [`semantic-split`](https://github.com/agamm/semantic-split) by agamm.

> Known issues:  
> `semantic-split` takes plain strings as input, so the page metadata attached to each `Document` is not carried over to the resulting chunks. A workaround is in progress.

In [72]:
# LangChain splitter configured with chunk_size=512, chunk_overlap=64 and an
# explicit separator list.
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=64,
    separators=[
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
        "\u200b",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        "",
    ],
)

# semantic-split splitter built from its default similarity model and the
# spaCy-based sentence splitter.
semantic_splitter = SimilarSentenceSplitter(
    similarity_model=SentenceTransformersSimilarity(),
    sentence_splitter=SpacySentenceSplitter(),
)



In [73]:
# The LangChain splitter consumes the Document list directly.
recursive_text = recursive_splitter.split_documents(documents)

# semantic-split is fed the plain page text, one call per page, so
# `semantic_text` holds exactly one entry per input Document.
semantic_text = [semantic_splitter.split(page.page_content) for page in documents]

# NOTE: the second figure is len(semantic_text), i.e. the number of pages
# processed, not the number of individual semantic chunks.
print(f"Recursive: {len(recursive_text)}")
print(f"Semantic: {len(semantic_text)}")

Recursive: 499
Semantic: 39


In [74]:
recursive_text[250]

Document(page_content='5.1.2. A dinghy shall be of a highly visible color. 5.1.3. The dinghy shall be fitted with grab-lines, which are ef- fective with the dinghy upright or upturned and shall not be secured more than 150 mm or less than 100 mm below the gunwale. 5.1.4. The securing points shall be spaced at not more than 460 mm or less than 300 mm centers and interlaced to prevent movement. The depth of the loop when at right angles to the vessel shall be not more than 200 mm or less than 150 mm. 5.2. Equipment A dinghy shall', metadata={'source': './assets/ncvs_documents/ncvs_documents/CHAPTER-4_LIFE_SAVING_APPLIANCES.pdf', 'page': 18})

In [75]:
semantic_text

[[['Chapter IV Live Saving Appliances Bab IV Perlengkapan Keselamatan NCVS Indonesia IV 1REPUBLIK INDONESIA KEMENTERIAN PERHUBUNGAN STANDAR KAPAL NON-KONVENSI BERBENDERA INDONESIA BAB IV BAB IV PERLENGKAPAN KESELAMATANREPUBLIK INDONESIA MINISTRY OF TRANSPORTATION NON-CONVENTION VESSEL STANDARD INDONESIAN FLAGGED CHAPTER IV CHAPTER IV LIFE-SA VING APPLIANCES Copyright 2010 Ministry of Transportation, Republic of Indonesia Hak cipta 2010 Kementerian Perhubungan, Republik Indonesia PERLENGKAPAN KESELAMATAN LIFE SAVING APPLIANCES STANDAR KAPAL NON-KONVENSI BERBENDERA INDONESIA NON-CONVENTION VESSEL STANDARD INDONESIAN FLAGGEDREPUBLIK',
   'INDONESIA KEMENTERIAN PERHUBUNGAN MINISTRY OF TRANSPORTATION BAB CHAPTER IV Hak cipta ©2009 Kementerian Perhubungan, Republik Indonesia Edisi Pertama 2009']],
 [['IV 2Chapter IV Live Saving Appliances',
   'Bab IV Perlengkapan Keselamatan NCVS Indonesia TABLE OF CONTENT CHAPTER IV LIFE-SA VING APPLIANCES...................................................

# ChromaDB Collections

The processed text chunks are passed through an embedding model and stored in
a ChromaDB collection (one collection per splitting strategy).

In [76]:
from os import walk

mypath = "./assets/ncvs_documents/ncvs_documents/"
# The first tuple yielded by walk() describes `mypath` itself; index 2 is its
# list of file names. The default tuple supplies an empty list when walk()
# yields nothing.
filenames = sorted(next(walk(mypath), (None, None, []))[2])
filenames

['CHAPTER-1_INTRODUCTION.pdf',
 'CHAPTER-2_CONSTRUCTION.pdf',
 'CHAPTER-3_EQUIPMENT.pdf',
 'CHAPTER-4_LIFE_SAVING_APPLIANCES.pdf',
 'CHAPTER-5_MACHINERY_AND_ELECTRICAL.pdf',
 'CHAPTER-6_LOAD_LINES.pdf',
 'CHAPTER-7_TONNAGE_MEASUREMENT.pdf',
 'CHAPTER-8_MANNING.pdf',
 'CHAPTER-9_MANAGEMENT_OPERATIONAL.pdf',
 'SK_Dirjen_Hubla_No._UM.008-9-20-DJPL_-_2012_1708919307382_0.pdf']

In [77]:
import chromadb
from chromadb.utils import embedding_functions
from chromadb import Documents, EmbeddingFunction, Embeddings
from sentence_transformers import SentenceTransformer

# Relative directory handed to the persistent Chroma client as its storage path.
# NOTE(review): the name `dir` shadows Python's built-in dir() for the rest of
# the kernel session — consider renaming once no other cell depends on it.
dir = "db"
client = chromadb.PersistentClient(path=dir)

class MyEmbeddingFunction(EmbeddingFunction[Documents]):
    """Chroma embedding function backed by the ``BAAI/bge-m3`` SentenceTransformer."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the base class, then load the model on CUDA."""
        super().__init__(*args, **kwargs)
        self.model = SentenceTransformer('BAAI/bge-m3', device='cuda')

    def __call__(self, input: Documents) -> Embeddings:
        """Encode ``input`` with the loaded model and return the result unchanged."""
        return self.model.encode(input)

# Alternative (disabled): the hand-written wrapper class defined in this cell.
# embedding_function = MyEmbeddingFunction()
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="BAAI/bge-m3",
    device='cuda',
)

# One collection per splitting strategy; both share the same embedding function.
recursive_collection = client.get_or_create_collection(
    name="ncvs-recursive",
    embedding_function=embedding_function,
)
semantic_collection = client.get_or_create_collection(
    name="ncvs-semantic",
    embedding_function=embedding_function,
)


In [78]:
# If something went wrong, remove the collections
# client.delete_collection(name="ncvs-recursive")
# client.delete_collection(name="ncvs-semantic")

## With text from `RecursiveCharacterTextSplitter`

⚠️ **ONLY DO THIS ONCE**

In [79]:
# Store every recursive chunk with its page metadata under the IDs
# CH4_1 … CH4_<n>.
# `upsert` is used instead of `add` so that re-running this cell (for example
# after Restart & Run All) rewrites the same IDs instead of trying to insert
# them a second time.
recursive_collection.upsert(
    documents=[chunk.page_content for chunk in recursive_text],
    metadatas=[chunk.metadata for chunk in recursive_text],
    ids=[f"CH4_{n}" for n in range(1, len(recursive_text) + 1)],
)

In [80]:
import pprint

# Ask the recursive-chunk collection for up to three matches to a sample query.
result = recursive_collection.query(
    query_texts=["dinghy requirements"],
    n_results=3,
)

# Pretty-print only the matched chunk texts.
pprint.pprint(result.get("documents"))

[['5.1.2. A dinghy shall be of a highly visible color. 5.1.3. The dinghy shall '
  'be fitted with grab-lines, which are ef- fective with the dinghy upright or '
  'upturned and shall not be secured more than 150 mm or less than 100 mm '
  'below the gunwale. 5.1.4. The securing points shall be spaced at not more '
  'than 460 mm or less than 300 mm centers and interlaced to prevent movement. '
  'The depth of the loop when at right angles to the vessel shall be not more '
  'than 200 mm or less than 150 mm. 5.2. Equipment A dinghy shall',
  'than 200 mm or less than 150 mm. 5.2. Equipment A dinghy shall be equipped '
  'with the following :5.2.1. Two oars and rowlocks, or paddles 5.2.2. one '
  'painter attached to bow5.2.3. one bucket or bailer, attached by '
  'lanyard5.2.4. one bung permanently attached by lanyard 5.3. Marking of a '
  'dinghy Retro-reflective tapes shall comply and be fitted in accordance with '
  'the requirements of the Authority. 5.4. Persons capacity The perso

## With Text From `SemanticSplitter`

⚠️ **ONLY DO THIS ONCE**

> Known issues: 
> - The process should be as agamm explained in the repo himself (Section *Why?*)
> - Embeddings should be done with a pair of encoder and decoder. 
>


In [81]:
model = SentenceTransformer("BAAI/bge-m3", device="cuda")

# `semantic_text` is nested three levels deep (page -> group -> chunk string);
# flatten it into one list of strings before encoding.
flat_semantic_text = [
    chunk
    for page_result in semantic_text
    for group in page_result
    for chunk in group
]
semantic_embeddings = model.encode(flat_semantic_text)

In [83]:
import os

import torch
import transformers
from transformers import AutoTokenizer, AutoModel, GenerationConfig, BitsAndBytesConfig, pipeline

# SECURITY: never hard-code an access token in a notebook. The token is read
# from the HF_TOKEN environment variable instead; when the variable is unset
# this is None and the library falls back to the credentials saved by
# `huggingface-cli login`, if any.
# The token that used to be written here must be treated as leaked and revoked.
access_token = os.environ.get("HF_TOKEN")
model_name = "mistralai/Mistral-7B-v0.3"
# Load the weights quantised to 4 bit.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          token=access_token)
# NOTE(review): this rebinds `model`, which an earlier cell assigned to the
# SentenceTransformer embedding model.
# The repository is gated: the account behind the token must have been
# granted access on the model page, otherwise the download fails.
model = AutoModel.from_pretrained(model_name,
                                  quantization_config=quantization_config,
                                  torch_dtype=torch.float16,
                                  device_map="auto",
                                  low_cpu_mem_usage=True,
                                  token=access_token)



OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/mistralai/Mistral-7B-v0.3.
403 Client Error. (Request ID: Root=1-66598296-21efd01b0f544d495e1f48c4;b4591e09-64d0-4c5a-b141-d6921d5ab88c)

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-v0.3/resolve/main/config.json.
Access to model mistralai/Mistral-7B-v0.3 is restricted and you are not in the authorized list. Visit https://huggingface.co/mistralai/Mistral-7B-v0.3 to ask for access.

In [None]:
# Store the pre-computed vectors together with the chunk text they were
# computed from; without `documents` a later query has no text to return.
# IDs follow the CH4_<n> scheme used for the recursive collection.
# `upsert` keeps the cell safe to re-run: existing IDs are overwritten rather
# than inserted a second time.
semantic_collection.upsert(
    embeddings=semantic_embeddings,
    documents=flat_semantic_text,
    ids=[f"CH4_{n}" for n in range(1, len(flat_semantic_text) + 1)],
    # semantic-split works on plain strings, so only a chapter-level source
    # label is available as metadata here (one dict per chunk).
    metadatas=[{"source": "Chapter 4 - Life Saving Appliances"}
               for _ in range(len(flat_semantic_text))]
)

In [None]:
out = semantic_collection.query(
    query_texts=["dinghy"],
    n_results=10
)

# The "documents" entry holds one inner list per query text. Only one query
# was sent, so take the first inner list and enumerate its hits; enumerating
# the outer list would print a single item containing all ten results.
hits = (out.get("documents") or [[]])[0]
for i, doc in enumerate(hits):
    print(i, ". ", doc)

# Full raw result, printed for inspection.
print(out)