### OpenSearch document ingestion pipeline

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import json
if "../src" not in sys.path:
    sys.path.append("../src")
import utils
import settings
from opensearch_client import OpenSearchClient
from pdf_processor import PdfProcessor
from pathlib import Path

wrapper = utils.get_text_wrapper()

  from .autonotebook import tqdm as notebook_tqdm


Initialize OpenSearch client

In [3]:
client = OpenSearchClient()

Ingest data into OpenSearch

In [23]:
def read_json_file(filepath):
    contents = {}
    with open(filepath, "r") as file:
        contents = json.load(file)
    return contents

def format_data(data, index_name=settings.INDEX_NAME):
    return [
        {"_index": index_name, "_id": chunk["id"]} | chunk
        for chunk in data
    ]

In [32]:
for dir in os.listdir(settings.DATA_DIR):
    dir_path = os.path.join(settings.DATA_DIR, dir)

    files = [file for file in os.listdir(dir_path) if file[-5:] == ".json"]

    if 'metadata.json' not in files:
        break
    
    files.remove('metadata.json')

    metadata_filepath = os.path.join(dir_path, 'metadata.json')
    metadata = read_json_file(metadata_filepath)

    for entry in metadata:
        filename = Path(entry["path"]).with_suffix(".json")
        filepath = os.path.join(dir_path, filename)

        if not os.path.exists(filepath):
            continue

        contents = read_json_file(filepath)
        contents = [entry for entry in contents if entry["text"]]
        formatted_data = format_data(contents)
        ret = client.ingest_data_bulk(formatted_data)

        print(list(ret))


[(True, {'index': {'_index': 'unstructured-knn-index', '_id': '0ae0422df41569168c6f2d250a46cfc6a77894d12beffdf8a6d13ec4d836dff2-1', '_version': 3, 'result': 'updated', '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 1372, '_primary_term': 18, 'status': 200}})]
[(True, {'index': {'_index': 'unstructured-knn-index', '_id': '491f908948d53e5b16fb1e16d5db1345049605301c9e3d3f7116532b523b9d98-1', '_version': 2, 'result': 'updated', '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 1373, '_primary_term': 18, 'status': 200}}), (True, {'index': {'_index': 'unstructured-knn-index', '_id': '491f908948d53e5b16fb1e16d5db1345049605301c9e3d3f7116532b523b9d98-2', '_version': 2, 'result': 'updated', '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 1374, '_primary_term': 18, 'status': 200}}), (True, {'index': {'_index': 'unstructured-knn-index', '_id': '491f908948d53e5b16fb1e16d5db1345049605301c9e3d3f7116532b523b9d98-3', '_version': 2, 'result': 'updated