In [51]:

import warnings
warnings.filterwarnings('ignore')

In [52]:
import logging

# Fetch the root logger and raise its threshold so that only CRITICAL
# records are emitted.
logger = logging.getLogger()
logger.setLevel(level=logging.CRITICAL)

In [53]:
import json
from IPython.display import JSON

import requests

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
from unstructured.staging.base import dict_to_elements

import chromadb

In [54]:
from Utils import Utils

# Project helper that supplies the API credentials used below.
utils = Utils()
DLAI_API_KEY, DLAI_API_URL = utils.get_dlai_api_key(), utils.get_dlai_url()

# Client used for every request to the Unstructured API in this notebook.
s = UnstructuredClient(server_url=DLAI_API_URL, api_key_auth=DLAI_API_KEY)

In [55]:
# RUN THE DOCUMENT THROUGH THE UNSTRUCTURED API

filename = "example_file/pytorch.pdf"

# Read the whole PDF into memory; the file handle is closed as soon as the
# bytes have been read.
with open(filename, mode="rb") as pdf_file:
    pdf_bytes = pdf_file.read()

# Wrap the raw bytes and the file name in the SDK's upload model, then build
# the partition request around it.
files = shared.Files(file_name=filename, content=pdf_bytes)
req = shared.PartitionParameters(files=files)


In [56]:
# Send the partition request to the Unstructured API.
try:
    resp = s.general.partition(req)
except SDKError as e:
    # Show the API's own error message, then re-raise. Every later cell reads
    # `resp`, so swallowing the error here would only defer the failure to a
    # NameError that hides the real cause (e.g. a missing API key).
    print(e)
    raise

API error occurred: Status 401
{"detail":"API key is missing, please provide an API key in the header."}


In [None]:
# Preview the first five partitioned elements in the JSON viewer.
# JSON() expects the list/dict itself: handing it a pre-serialized string
# makes IPython warn and parse the string straight back into objects.
JSON(resp.elements[0:5])

In [12]:
# Chapter titles to look for among the document's elements. The strings are
# compared for exact equality against element text, so they must match the
# document verbatim.
chapters = [
    "Tensors", "Autograd", "Modular structure",
    "Visualization Tools like", "Various other functions",
]

In [None]:
# Map the element_id of each chapter heading to its chapter name. Only
# elements typed "Title" whose text is exactly one of `chapters` qualify.
chapter_ids = {}
for element in resp.elements:
    if element["text"] in chapters and element["type"] == "Title":
        chapter_ids[element["element_id"]] = element["text"]

In [None]:
# Reverse lookup: chapter name -> element_id of its Title element.
chapter_to_id = {title: element_id for element_id, title in chapter_ids.items()}

# Elements whose parent is the "Autograd" title; display the first one.
autograd_children = [
    el
    for el in resp.elements
    if el["metadata"].get("parent_id") == chapter_to_id["Autograd"]
]
autograd_children[0]

In [58]:
# Load documents into a vector db
# The on-disk store lives under "chroma_tmp"; resets are enabled in the
# settings so the reset() call below is permitted.
chroma_settings = chromadb.Settings(allow_reset=True)
client = chromadb.PersistentClient(settings=chroma_settings, path="chroma_tmp")
client.reset()

True

In [47]:
# Open the "pytorch" collection (cosine distance), creating it if needed.
# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when a collection with the same name already exists.
collection = client.get_or_create_collection(
    name="pytorch",
    metadata={"hnsw:space": "cosine"},
)

In [None]:
# Store each element's text under its element_id, tagged with the chapter
# whose Title element is its parent ("" when the parent is not a chapter).
for element in resp.elements:
    chapter = chapter_ids.get(element["metadata"].get("parent_id"), "")
    record = {
        "documents": [element["text"]],
        "ids": [element["element_id"]],
        "metadatas": [{"chapter": chapter}],
    }
    collection.add(**record)

In [None]:
# Fetch the two best matches for the question, restricted to documents
# whose "chapter" metadata is "Tensors", and pretty-print the raw result.
result = collection.query(
    where={"chapter": "Tensors"},
    n_results=2,
    query_texts=["Show to me how do load data, devices and CUDA?"],
)
print(json.dumps(result, indent=2))

In [None]:
# Chunking Content
# Turn the API's element dicts into `unstructured` element objects via
# dict_to_elements so they can be handed to chunk_by_title.
elements = dict_to_elements(resp.elements)

In [None]:
# Chunk the elements by title with a 3000-character maximum and a
# 100-character combine threshold.
chunks = chunk_by_title(elements, max_characters=3000, combine_text_under_n_chars=100)


In [None]:
# Inspect the first chunk in the JSON viewer.
# JSON() expects the dict itself: handing it a pre-serialized string makes
# IPython warn and parse the string straight back into objects.
JSON(chunks[0].to_dict())