In [2]:
import re
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
import faiss
import torch
import os

# Device string handed to the embedding model when it is loaded.
device = "cpu"
print(f"Using device: {device}")

Using device: cpu


In [3]:
# Sentence-embedding model shared by indexing and querying. The model runs on
# `device`, and normalized embeddings are requested at encode time.
embeddings = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-small",
    model_kwargs=dict(device=device),
    encode_kwargs=dict(normalize_embeddings=True),
)

# Location on disk where the FAISS index is saved.
FAISS_INDEX_PATH = "faiss_index"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [4]:
def _clean_article_text(raw_text):
    """Normalize the raw text of a single article before it is embedded.

    Args:
        raw_text: Article body exactly as extracted from the source file.

    Returns:
        The text with line-wrap hyphenation undone, runs of whitespace
        collapsed to single spaces, and stray spaces before '.' and ','
        removed.
    """
    # Undo hyphenation while the line break is still visible. Doing this after
    # whitespace collapsing would miss wrapped words ("word- word") and would
    # instead strip legitimate in-line hyphens such as hyphenated ordinals.
    text = re.sub(r'(\w)-[ \t]*\n\s*(\w)', r'\1\2', raw_text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text.replace(' .', '.').replace(' ,', ',')


def _split_into_articles(full_text):
    """Split the full text into one ``Document`` per article.

    Args:
        full_text: Text of the whole source file (all pages joined).

    Returns:
        A list of ``Document`` objects whose ``page_content`` is the cleaned
        article body and whose metadata holds the article number (as a string)
        under the key ``"article"``. Articles with an empty body are skipped.
    """
    # Capturing only the number makes re.split return
    # [preamble, number, body, number, body, ...]; the preamble is dropped.
    # NOTE(review): an in-text reference written exactly like a header would
    # also start a new split; confirm against the source file.
    parts = re.split(r'მუხლი (\d+)', full_text)[1:]

    articles = []
    for article_number, raw_body in zip(parts[0::2], parts[1::2]):
        article_text = _clean_article_text(raw_body)
        if not article_text:
            # Nothing to embed for a header with no body.
            continue
        articles.append(Document(
            page_content=article_text,
            metadata={"article": article_number}
        ))
    return articles


def preprocess_and_save_faiss(file_path="georgian-civil-code.pdf", is_pdf=True):
    """Build a FAISS index with one entry per article and save it to disk.

    Does nothing (apart from printing a notice) when ``FAISS_INDEX_PATH``
    already exists.

    Args:
        file_path: Path of the source file to index.
        is_pdf: When True the file is read with ``PyMuPDFLoader``; otherwise
            it is read as UTF-8 text with ``TextLoader``.

    Raises:
        ValueError: If no article could be extracted from the file.
    """
    if os.path.exists(FAISS_INDEX_PATH):
        print("FAISS index already exists. Skipping preprocessing.")
        return

    if is_pdf:
        loader = PyMuPDFLoader(file_path)
    else:
        from langchain_community.document_loaders import TextLoader
        loader = TextLoader(file_path, encoding='utf-8')
    docs = loader.load()

    # The loader may return several documents (PyMuPDFLoader returns one per
    # page by default). Join them before splitting so that an article which
    # continues onto the next page keeps its full text.
    full_text = "\n".join(doc.page_content for doc in docs)
    all_splits = _split_into_articles(full_text)

    if not all_splits:
        # Fail loudly instead of handing an empty batch to the vector store.
        raise ValueError(f"No articles found in {file_path!r}; nothing to index.")

    # Probe the model once to learn the embedding dimensionality for the index.
    embedding_dim = len(embeddings.embed_query("საქართველოს სამოქალაქო კოდექსის RAG აგენტი"))
    index = faiss.IndexFlatL2(embedding_dim)
    vector_store = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )
    vector_store.add_documents(all_splits)

    vector_store.save_local(FAISS_INDEX_PATH)
    print(f"FAISS index saved to {FAISS_INDEX_PATH}")

In [5]:
# Build and save the index for the civil-code PDF (the function returns early
# when the index path already exists).
preprocess_and_save_faiss(file_path="georgian-civil-code.pdf")

FAISS index saved to faiss_index
