## First, run the code below as Python to download and save the RoBERTa model used by the QnA part of the app (approximately 500 MB)

from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# Identifier of the pretrained question-answering checkpoint to fetch.
model_name = "deepset/roberta-base-squad2"

# Load the QA model first, then its matching tokenizer.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Persist both artifacts into the same local directory.
local_dir = './roberta-base-squad2'
for artifact in (model, tokenizer):
    artifact.save_pretrained(local_dir)

## Then create embedding vectors for the docs in ./Rag_docs and save them using the following steps

In [1]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
import pickle

In [2]:
# Read every PDF found in the local ./Rag_docs/ directory.
loader = PyPDFDirectoryLoader("./Rag_docs/")
docs_before_split = loader.load()

# Split the loaded documents into chunks of up to 700 characters with a
# 50-character overlap. Tune both values to the sentence lengths in your
# corpus, or to whatever performs best.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=50)
docs_after_split = text_splitter.split_documents(docs_before_split)

# Trailing expression: the notebook displays the first chunk as cell output.
docs_after_split[0]

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


Document(metadata={'source': 'Rag_docs\\Access Control.pdf', 'page': 0}, page_content='Access \nControl')

In [3]:
def avg_doc_length(docs):
    """Return the average ``page_content`` length of *docs* as an int.

    Args:
        docs: Sized collection of objects exposing a ``page_content`` string.

    Returns:
        The total number of characters across all documents, floor-divided
        by the number of documents; 0 when *docs* is empty.
    """
    count = len(docs)
    # An empty collection would otherwise raise ZeroDivisionError below.
    if count == 0:
        return 0
    return sum(len(doc.page_content) for doc in docs) // count
# Average character count per document, before and after splitting.
avg_char_before_split = avg_doc_length(docs_before_split)
avg_char_after_split = avg_doc_length(docs_after_split)

# Report document counts alongside the averages computed above.
print(
    f'Before split, there were {len(docs_before_split)} documents loaded, '
    f'with average characters equal to {avg_char_before_split}.'
)
print(
    f'After split, there were {len(docs_after_split)} documents (chunks), '
    f'with average characters equal to {avg_char_after_split} (average chunk length).'
)

Before split, there were 36 documents loaded, with average characters equal to 327.
After split, there were 36 documents (chunks), with average characters equal to 327 (average chunk length).


In [4]:
# Set up the BGE embedding model: the small English checkpoint, run on CPU,
# with normalized output vectors.
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

# Pull the raw text out of each chunk (docs_after_split is expected to be a
# list of Document objects).
documents_text = [chunk.page_content for chunk in docs_after_split]

# Compute one embedding per chunk text.
document_embeddings = huggingface_embeddings.embed_documents(documents_text)

# Bundle texts and vectors together and pickle them to disk.
save_path = "document_embeddings.pkl"
payload = {"documents": documents_text, "embeddings": document_embeddings}
with open(save_path, "wb") as f:
    pickle.dump(payload, f)

print(f"Embeddings saved to {save_path}")


Embeddings saved to document_embeddings.pkl
