In [None]:
from pathlib import Path
import datetime
import math
import numpy
import os
import requests
import stat
import sys
import time

import sycamore
from sycamore.functions import HuggingFaceTokenizer, TextOverlapChunker
from sycamore.llms import OpenAI, OpenAIModels
from sycamore.transforms.embed import SentenceTransformerEmbedder
from sycamore.transforms.extract_entity import OpenAIEntityExtractor
from sycamore.transforms.merge_elements import GreedyTextElementMerger
from sycamore.transforms.partition import UnstructuredPdfPartitioner, HtmlPartitioner

# Name of the OpenSearch index that import_pdf() and import_html() write to
# (passed as index_name=index) and that the final cell tells the reader to query.
index = "demoindex0"

In [None]:
%%bash
# Download the sample PDF and HTML documents that the import cells below ingest.
# Stop at the first failing command so a broken download cannot go unnoticed.
set -euo pipefail
pwd
sudo mkdir -p /app/work/crawl_data/downloads/pdf
sudo mkdir -p /app/work/crawl_data/downloads/html
# --fail: exit non-zero on an HTTP error instead of saving the error page as the document.
# --location: follow redirects so the real document is fetched.
sudo curl --fail --location https://sortbenchmark.org/ELSAR2022.pdf -o /app/work/crawl_data/downloads/pdf/elsar.pdf
sudo curl --fail --location https://sortbenchmark.org -o /app/work/crawl_data/downloads/html/home.html

In [None]:
def get_os_client_args():
    """Build the keyword arguments used to connect to the OpenSearch cluster.

    Returns:
        dict: a freshly built dict on every call, targeting host ``opensearch``
        on port 9200 over SSL with a 120 second timeout.

    NOTE(review): the credentials are the literal ``admin``/``admin`` pair and
    certificate/hostname verification is switched off. That looks intended for
    a local demo container only -- confirm before pointing this at a real cluster.
    """
    return dict(
        hosts=[dict(host="opensearch", port=9200)],
        http_compress=True,
        http_auth=("admin", "admin"),
        use_ssl=True,
        # Verification and the related warning are all disabled together.
        verify_certs=False,
        ssl_assert_hostname=False,
        ssl_show_warn=False,
        timeout=120,
    )

def get_index_settings():
    """Build the settings and mappings used when creating the OpenSearch index.

    The index has k-NN enabled, 5 shards and 1 replica. It maps four text
    fields (``text``, ``title``, ``searchable_text``, ``url``) and two
    384-dimensional ``knn_vector`` fields (``embedding``, ``title_embedding``)
    that use HNSW on the nmslib engine with L2 distance.

    Returns:
        dict: ``{"body": {"settings": ..., "mappings": ...}}``, built fresh on
        every call.
    """

    def text_field():
        return {"type": "text"}

    def knn_vector_field():
        # Built per field so the two vector mappings never share a dict object.
        # NOTE(review): 384 presumably matches the embedding model used by the
        # import functions -- confirm if the model changes.
        return {
            "dimension": 384,
            "method": {"engine": "nmslib", "space_type": "l2", "name": "hnsw", "parameters": {}},
            "type": "knn_vector",
        }

    index_level = {"index.knn": True, "number_of_shards": 5, "number_of_replicas": 1}
    field_mappings = {
        "text": text_field(),
        "embedding": knn_vector_field(),
        "title": text_field(),
        "searchable_text": text_field(),
        "title_embedding": knn_vector_field(),
        "url": text_field(),
    }
    return {"body": {"settings": index_level, "mappings": {"properties": field_mappings}}}

In [None]:
def get_title_context_template():
    """Return the few-shot prompt text used when extracting a document title.

    The text holds two worked examples. Each lists numbered ``ELEMENT n:``
    lines, then a row of ``=`` characters, then the expected title in double
    quotes. The string is returned verbatim, including its leading newline
    and the indentation of every line.
    """
    # ruff: noqa: E501
    few_shot_examples = """
        ELEMENT 1: Jupiter's Moons
        ELEMENT 2: Ganymede 2020
        ELEMENT 3: by Audi Lauper and Serena K. Goldberg. 2011
        ELEMENT 4: From Wikipedia, the free encyclopedia
        ELEMENT 5: Ganymede, or Jupiter III, is the largest and most massive natural satellite of Jupiter as well as in the Solar System, being a planetary-mass moon. It is the largest Solar System object without an atmosphere, despite being the only moon of the Solar System with a magnetic field. Like Titan, it is larger than the planet Mercury, but has somewhat less surface gravity than Mercury, Io or the Moon.
        =========
        "Ganymede 2020"

        ELEMENT 1: FLAVR: Flow-Agnostic Video Representations for Fast Frame Interpolation
        ELEMENT 2: Tarun Kalluri * UCSD
        ELEMENT 3: Deepak Pathak CMU
        ELEMENT 4: Manmohan Chandraker UCSD
        ELEMENT 5: Du Tran Facebook AI
        ELEMENT 6: https://tarun005.github.io/FLAVR/
        ELEMENT 7: 2 2 0 2
        ELEMENT 8: b e F 4 2
        ELEMENT 9: ]
        ELEMENT 10: V C . s c [
        ========
        "FLAVR: Flow-Agnostic Video Representations for Fast Frame Interpolation"

        """
    return few_shot_examples

def get_author_context_template():
    """Return the few-shot prompt text used when extracting a document's authors.

    The text holds two worked examples. Each lists numbered ``ELEMENT n:``
    lines, then a row of ``=`` characters, then the expected comma-separated
    author names. The string is returned verbatim, including its leading
    newline and the indentation of every line.

    The expected answer of the first example spells "Audi Lauper" exactly as
    it appears in that example's ELEMENT 3, so the prompt demonstrates a
    faithful extraction rather than a truncated name.
    """
    # ruff: noqa: E501
    return """
            ELEMENT 1: Jupiter's Moons
            ELEMENT 2: Ganymede 2020
            ELEMENT 3: by Audi Lauper and Serena K. Goldberg. 2011
            ELEMENT 4: From Wikipedia, the free encyclopedia
            ELEMENT 5: Ganymede, or Jupiter III, is the largest and most massive natural satellite of Jupiter as well as in the Solar System, being a planetary-mass moon. It is the largest Solar System object without an atmosphere, despite being the only moon of the Solar System with a magnetic field. Like Titan, it is larger than the planet Mercury, but has somewhat less surface gravity than Mercury, Io or the Moon.
            =========
            Audi Lauper, Serena K. Goldberg

            ELEMENT 1: FLAVR: Flow-Agnostic Video Representations for Fast Frame Interpolation
            ELEMENT 2: Tarun Kalluri * UCSD
            ELEMENT 3: Deepak Pathak CMU
            ELEMENT 4: Manmohan Chandraker UCSD
            ELEMENT 5: Du Tran Facebook AI
            ELEMENT 6: https://tarun005.github.io/FLAVR/
            ELEMENT 7: 2 2 0 2
            ELEMENT 8: b e F 4 2
            ELEMENT 9: ]
            ELEMENT 10: V C . s c [
            ========
            Tarun Kalluri, Deepak Pathak, Manmohan Chandraker, Du Tran

            """

In [None]:
def import_pdf(paths):
    """Ingest PDF documents into the OpenSearch index named by ``index``.

    Steps, in order: read the files as binary PDFs, partition them with
    ``UnstructuredPdfPartitioner``, merge elements with a
    ``GreedyTextElementMerger``, extract ``title`` and ``authors`` with an
    OpenAI LLM using the few-shot templates, spread the ``path`` and ``title``
    properties, explode, embed with a sentence-transformers model, and write
    to OpenSearch.

    Args:
        paths: location(s) of the PDFs, handed to ``ctx.read.binary``. Anything
            with ``len(paths) == 0`` is treated as empty: a warning is printed
            and nothing is imported.

    Returns:
        None.
    """
    if not len(paths):
        print("WARNING: import_pdf called with empty paths")
        return

    # The same model name drives both the merger's tokenizer and the embedder.
    embedding_model = "sentence-transformers/all-MiniLM-L6-v2"

    llm = OpenAI(OpenAIModels.GPT_3_5_TURBO_INSTRUCT.value)
    # NOTE(review): 256 is presumably the merger's token budget -- confirm.
    element_merger = GreedyTextElementMerger(HuggingFaceTokenizer(embedding_model), 256)

    context = sycamore.init()
    docset = context.read.binary(paths, binary_format="pdf", filter_paths_by_extension=False)
    docset = docset.partition(partitioner=UnstructuredPdfPartitioner())
    docset = docset.merge(element_merger)

    title_extractor = OpenAIEntityExtractor("title", llm=llm, prompt_template=get_title_context_template())
    docset = docset.extract_entity(entity_extractor=title_extractor)

    authors_extractor = OpenAIEntityExtractor("authors", llm=llm, prompt_template=get_author_context_template())
    docset = docset.extract_entity(entity_extractor=authors_extractor)

    docset = docset.spread_properties(["path", "title"])
    docset = docset.explode()

    embedder = SentenceTransformerEmbedder(batch_size=100, model_name=embedding_model)
    docset = docset.embed(embedder=embedder)

    docset.write.opensearch(
        os_client_args=get_os_client_args(),
        index_name=index,
        index_settings=get_index_settings(),
    )

In [None]:
def import_html(paths):
    """Ingest HTML documents into the OpenSearch index named by ``index``.

    Steps, in order: read the files as binary HTML, partition them with
    ``HtmlPartitioner`` (table extraction on, overlapping text chunks), spread
    the ``path`` and ``title`` properties, explode, embed with a
    sentence-transformers model, and write to OpenSearch.

    Args:
        paths: location(s) of the HTML files, handed to ``ctx.read.binary``.
            Anything with ``len(paths) == 0`` is treated as empty: a warning is
            printed and nothing is imported.

    Returns:
        None.
    """
    if not len(paths):
        print("WARNING: import_html called with empty paths")
        return

    context = sycamore.init()
    docset = context.read.binary(paths, binary_format="html", filter_paths_by_extension=False)

    # Chunks of 4000 tokens with a 400-token overlap between neighbours.
    chunker = TextOverlapChunker(chunk_token_count=4000, chunk_overlap_token_count=400)
    html_partitioner = HtmlPartitioner(extract_tables=True, text_chunker=chunker)
    docset = docset.partition(partitioner=html_partitioner)

    docset = docset.spread_properties(["path", "title"])
    docset = docset.explode()

    embedder = SentenceTransformerEmbedder(batch_size=100, model_name="sentence-transformers/all-MiniLM-L6-v2")
    docset = docset.embed(embedder=embedder)

    docset.write.opensearch(
        os_client_args=get_os_client_args(),
        index_name=index,
        index_settings=get_index_settings(),
    )

In [None]:
# Run both ingestion pipelines against the download directories created by the
# %%bash cell. Each call passes a single directory path string rather than a
# list; a non-empty string has non-zero length, so the empty-paths guard in
# each function does not trigger.
# NOTE(review): assumes ctx.read.binary accepts a directory path string -- confirm.
import_pdf("/app/work/crawl_data/downloads/pdf")
import_html("/app/work/crawl_data/downloads/html")

In [None]:
# Tell the reader where to query the freshly imported documents. An f-string
# is used so exactly one space surrounds the index name; passing the pieces as
# separate print() arguments added the separator on top of a leading space.
print(f"Visit http://localhost:3000 and use the {index} index to query these results in the UI")