In [15]:
# !pip install tqdm



In [1]:
import re
import os
import ocrmypdf
from pathlib import Path
import pymupdf
import torch
import spacy
from tqdm.auto import tqdm

import json
import urllib.parse
import hashlib
import itertools

In [2]:
# manually trigger garbage collection to free up memory
# import gc
# gc.collect()

In [3]:
# Run on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Running code @ {device}')

Running code @ cuda


In [4]:
CACHE_MODELS_DIR = '../models'

DATA_DIR = "../data/"
OCR_DIR = "../data/_ocr"

os.makedirs(OCR_DIR, exist_ok=True)

#### Parse PDF, split text into chunks

Load spaCy language model for Romanian

In [5]:
# load the pre-trained spacy language model "ro_core_news_lg"
nlp = spacy.load("ro_core_news_lg")

Cleanup text

In [6]:
def cleanup_text(text: str) -> str:
    # remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # # convert to lowercase
    # text = text.lower()

    # clean up table of contents 
    text = re.sub(r'\.{4,}\s*\d+', '', text).strip()

    return text

Parse PDF

In [7]:
def parse_pdf(pdf_path: str, url: str) -> dict:
    assert os.path.exists(pdf_path), f'Path {pdf_path} not found'

    # decode percent-encoded/URL-encoded filename -> get diacritics
    filename = urllib.parse.unquote(pdf_path.split('/')[-1])

    document = pymupdf.open(pdf_path, filetype="pdf")
    data = {
        "filename": filename,
        "path": pdf_path,
        "url": url,
        "sentences": [],
    }

    for page_number, page in enumerate(document):
        text = cleanup_text(page.get_text())

        doc = nlp(text)

        data["sentences"].extend([
            {
                "page_number": page_number + 1,
                "text": str(sentence)
            }
            for sentence in doc.sents
        ])
    
    return data

In [8]:
def pdf_needs_ocr(pdf_path):
    doc = pymupdf.open(pdf_path)

    return not any([page.get_text().strip() for page in doc])


def apply_ocr(pdf_path: str) -> str:
    """
    Check if document needs OCR, use ocrmypdf.
    """

    if pdf_needs_ocr(pdf_path):
        filename = pdf_path.split("/")[-1]
        dir = os.path.join(OCR_DIR, pdf_path.split("/")[-2])
        output_pdf_path = os.path.join(dir, filename)

        os.makedirs(dir, exist_ok=True)

        if os.path.exists(output_pdf_path):
            return output_pdf_path

        ocrmypdf.ocr(
            input_file=pdf_path,
            output_file=output_pdf_path,
            output_type="pdf",
            language="ron",
            deskew=True,
            rotate_pages=True,
            progress_bar=True,
            jobs=4,
        )

        return output_pdf_path

    return pdf_path

In [9]:
def get_hash(string: str) -> int:
    return hashlib.sha256(string.encode('utf-8')).hexdigest()

def get_id(url: str, chunk_number: int) -> str:
    hash = get_hash(url)
    return f'{hash}-{chunk_number}'

In [10]:
def split_text_into_chunks(data, max_length=1024):
    paragraphs = []

    current_paragraph = []
    page_number = 1
    length = 0
    chunk_number = 1

    for sentence in data["sentences"]:
        if length + len(sentence["text"]) > max_length:
            paragraphs.append({
                "id": get_id(data["url"], chunk_number),
                "text": " ".join(current_paragraph),
                "url": data["url"],
                "type": "pdf",
                "filename": data["filename"],
                "page_number": page_number,
            })
            current_paragraph.clear()
            page_number = sentence["page_number"]
            length = 0
            chunk_number += 1
        
        current_paragraph.append(sentence["text"])
        length += len(sentence["text"])
    
    if current_paragraph:
        paragraphs.append({
            "id": get_id(data["url"], chunk_number),
            "text": " ".join(current_paragraph),
            "url": data["url"],
            "type": "pdf",
            "filename": data["filename"],
            "page_number": page_number,
        })

    return paragraphs

In [11]:
def get_directories(path: str) -> list[str]:
    return sorted([
        d.name
        for d in Path(path).iterdir()
        if d.is_dir() and d.name != OCR_DIR.split("/")[-1]
    ])


def get_files_by_extension(path: str, extension: str) -> list[str]:
    return sorted([f.name for f in Path(path).glob(f"*{extension}")])

def read_metadata(metadata_filepath: str) -> dict:
    with open(metadata_filepath, "r") as jsonFile:
        try:
            data = json.load(jsonFile)
        except:
            data = {}

    return data

## Ingest data into OpenSearch

In [12]:
from opensearchpy import OpenSearch, helpers

In [13]:
ADMIN_PASSWD = os.environ['OPENSEARCH_INITIAL_ADMIN_PASSWORD']

INDEX_NAME = 'rag-knn-index'

In [14]:
host = 'localhost'
port = 9200

# Create the client with SSL/TLS and hostname verification disabled.
client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = ('admin', ADMIN_PASSWD),
    use_ssl = True,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False,
)

print(client.info())

{'name': 'opensearch-node1', 'cluster_name': 'opensearch-cluster', 'cluster_uuid': 'EHUZvucsS9-zBGlgZhAmCA', 'version': {'distribution': 'opensearch', 'number': '2.19.1', 'build_type': 'tar', 'build_hash': '2e4741fb45d1b150aaeeadf66d41445b23ff5982', 'build_date': '2025-02-27T01:16:47.726162386Z', 'build_snapshot': False, 'lucene_version': '9.12.1', 'minimum_wire_compatibility_version': '7.10.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'The OpenSearch Project: https://opensearch.org/'}


In [15]:
def process_pdfs(path=DATA_DIR) -> None:
    directories = get_directories(path)

    for directory in directories:
        pdfs = get_files_by_extension(path=os.path.join(DATA_DIR, directory), extension=".pdf")

        metadata_filepath = os.path.join(path, directory, "metadata.json")
        metadata = read_metadata(metadata_filepath)

        for filename in pdfs[:1]:
            pdf_path = os.path.join(DATA_DIR, directory, filename)
            pdf_path = apply_ocr(pdf_path)

            data = parse_pdf(pdf_path=pdf_path, url=metadata[filename]["url"])

            paragraphs = split_text_into_chunks(data)

            entries = [
                {"_index": INDEX_NAME, "_id": paragraph["id"]} | paragraph
                for paragraph in paragraphs
            ]

            # print(entries)

            # client.bulk(entries) 

            ret = helpers.parallel_bulk(
                client, 
                actions=entries, 
                chunk_size=10, 
                raise_on_error=False,
                raise_on_exception=False,
                max_chunk_bytes=20 * 1024 * 1024,
                request_timeout=60
            )

            print(list(ret))

            # for paragraph in paragraphs[:10]:
            #     print(paragraph)

process_pdfs()

[(True, {'index': {'_index': 'rag-knn-index', '_id': 'fab2ac7fe58ff17b51af9bdd7be38904e3a4f541961db7da0259d45ceebee82e-1', '_version': 4, 'result': 'updated', '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 77, '_primary_term': 1, 'status': 200}}), (True, {'index': {'_index': 'rag-knn-index', '_id': 'fab2ac7fe58ff17b51af9bdd7be38904e3a4f541961db7da0259d45ceebee82e-2', '_version': 4, 'result': 'updated', '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 79, '_primary_term': 1, 'status': 200}}), (True, {'index': {'_index': 'rag-knn-index', '_id': 'fab2ac7fe58ff17b51af9bdd7be38904e3a4f541961db7da0259d45ceebee82e-3', '_version': 4, 'result': 'updated', '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 81, '_primary_term': 1, 'status': 200}}), (True, {'index': {'_index': 'rag-knn-index', '_id': 'fab2ac7fe58ff17b51af9bdd7be38904e3a4f541961db7da0259d45ceebee82e-4', '_version': 4, 'result': 'updated', '_shards': {'total': 2, 'successful': 2, 'f

### Search data

In [None]:
q = "atributii responsabili de proces"
query = {
  'size': 5,
  'query': {
    'multi_match': {
      'query': q,
      'fields': ['title^2', 'director']
    }
  }
}

query = {
    "size": 3,
    "query": {
      "neural": {
        "passage_embedding": {
          "query_text": "wild west",
          "model_id": "aVeif4oB5Vm0Tdw8zYO2",
          "k": 3
        }
      }
  }
}

response = client.search(
    body = query,
    index = 'python-test-index'
)



#### Convert text chunks into embeddings

In [22]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import util

In [23]:
sentenceTransformer = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2', device=device, cache_folder=CACHE_MODELS_DIR)

In [20]:
# for paragraph in tqdm(paragraphs):
#     paragraph["embedding"] = sentenceTransformer.encode(paragraph["text"])

100%|██████████| 136/136 [00:03<00:00, 34.40it/s]


Gather all embeddings in a list

In [21]:
# embeddings = torch.tensor([paragraph["embedding"] for paragraph in paragraphs], dtype=torch.float32).to(device)

# print(embeddings.shape)
# print(embeddings[0])

  embeddings = torch.tensor([paragraph["embedding"] for paragraph in paragraphs], dtype=torch.float32).to(device)


torch.Size([136, 384])
tensor([ 9.8783e-03,  2.9938e-02,  2.7943e-02,  1.2135e-01,  3.0623e-01,
         2.2634e-02, -1.0914e-01,  1.0262e-01,  4.0146e-02,  3.0064e-01,
         1.3308e-01, -3.7265e-02, -1.1835e-01, -8.7284e-02,  4.8206e-02,
        -2.8350e-01, -2.4382e-02, -8.7573e-03,  4.1828e-03,  3.5061e-02,
         1.8099e-01, -8.6560e-02, -9.1597e-02, -8.3297e-02, -9.6350e-02,
        -3.0929e-02, -1.8263e-01, -1.2197e-01,  6.6347e-02, -2.6500e-01,
        -1.2819e-01, -1.4886e-01,  1.5344e-02,  2.9629e-02,  3.9443e-02,
        -2.0101e-01,  2.9665e-01, -1.2701e-01,  1.1908e-01,  9.2523e-02,
        -1.1162e-01, -1.9433e-01,  2.0070e-02,  4.4440e-02, -5.3649e-02,
         6.4577e-03,  3.6158e-03, -2.7815e-01, -3.3069e-01, -1.1631e-01,
         1.8867e-01,  1.3103e-01,  6.2358e-02,  5.7047e-02,  8.3440e-02,
        -2.4264e-01,  2.2108e-01,  1.8483e-01, -1.3591e-02,  1.4099e-01,
         1.2060e-01,  1.4782e-01, -3.3231e-01,  1.0026e-01, -4.6992e-02,
        -2.2982e-02, -5.7575

In [43]:
# def retrieve_paragraphs(query: str, k: int = 5):
#     query_embedding = sentenceTransformer.encode(query, convert_to_tensor=True).to("cuda")

#     dot_scores = util.dot_score(query_embedding, embeddings)[0]
    
#     return torch.topk(dot_scores, k)

In [44]:
# def print_resources(query: str, k: int = 5):
#     scores, indices = retrieve_paragraphs(query, k)

#     # print(scores, indices)
    
#     for score, index in zip(scores, indices):
#         print(f"Score: {score:.4f}")
#         print(f"Page number: {paragraphs[index]['page_number']}")
#         print("Text:")
#         print(paragraphs[index]["text"])
#         print("\n\n")

In [30]:
# query = "obligatii ale universitatii"

# resources = retrieve_paragraphs(query)

tensor([ 2.2055e-01, -2.3237e-02, -2.1463e-01, -2.6294e-01,  1.3461e-01,
        -1.8907e-01, -3.6333e-01,  1.7251e-01,  3.3091e-01,  1.6166e-01,
        -8.1408e-02,  1.2561e-01, -3.2631e-02,  5.8751e-02,  1.2068e-01,
        -7.1358e-02, -3.3312e-01, -2.6837e-02, -7.3834e-02,  5.4398e-01,
         2.3019e-01, -3.3727e-01, -1.3698e-02, -4.6845e-02,  7.6915e-02,
         1.3901e-01,  1.1419e-01, -3.4290e-01,  1.6563e-01, -3.6166e-01,
        -1.6155e-02, -1.4223e-01,  1.1707e-01,  1.2199e-01,  1.1384e-01,
         3.2719e-01, -6.9452e-02, -6.7544e-02,  2.5511e-01,  2.6398e-01,
         5.5412e-02,  2.0600e-04,  3.1844e-01,  4.2523e-01,  3.0737e-01,
        -1.1041e-01,  1.9013e-01, -1.1189e-01, -9.0982e-02, -1.7007e-01,
         4.0972e-01,  3.7534e-02, -3.2082e-01,  4.0063e-03, -4.8830e-01,
         2.3478e-01,  3.4153e-01,  3.1695e-01, -1.4431e-01,  7.0493e-02,
        -2.1657e-02,  1.0048e-01, -1.5312e-01,  1.6839e-01, -3.2840e-01,
        -3.6350e-02,  2.9040e-01,  3.0633e-01, -3.5

In [36]:
# print(resources.values)
# print(resources.indices)

tensor([9.1122, 8.4518, 8.0900, 7.6237, 7.3855], device='cuda:0')
tensor([ 6,  7,  9, 75, 65], device='cuda:0')


In [45]:
# print_resources(query)

Score: 9.1122
Page number: 4
Text:
Principiile care guvernează învăţământul superior și pe baza cărora Universitatea își desfășoară activitatea sunt: a. principiul autonomiei universitare; UNIVERSITATEA POLITEHNICA DIN BUCUREȘTI Regulament Intern 5 b. principiul libertăţii academice; c. principiul răspunderii publice; d. principiul asigurării calităţii; e. principiul echităţii; f. principiul eficienţei manageriale şi financiare; g. principiul transparenţei; h. principiul respectării drepturilor şi libertăţilor studenţilor şi ale personalului academic; i. principiul independenţei de ideologii, religii şi doctrine politice; j. principiul libertăţii de mobilitate naţională şi internaţională a studenţilor, a cadrelor didactice şi a cercetătorilor; k. principiul consultării partenerilor sociali în luarea deciziilor; l. principiul centrării educaţiei pe student. Capitolul 1 Reguli privind protecţia, igiena şi securitatea în muncă în cadrul unităţii I. Obligațiile Universității 1.



Score: 8