## Install the Aryn SDK
Make sure the version is at least 0.2.1.

In [None]:
!pip install aryn-sdk

Place your Aryn API key in `~/.aryn/config.yaml` (that is, `.aryn/config.yaml` inside your home directory) in the following format:

aryn_token: < your Aryn API key >

## Set up the client

In [8]:
from aryn_sdk.client.client import Client

# Instantiate a new client. No arguments are passed, so the API key is
# presumably read from the Aryn config file described earlier — TODO confirm.
client = Client()

In [27]:
from aryn_sdk.client.exceptions import ArynSDKException
from aryn_sdk.client.client import Client
from aryn_sdk.types.docset import DocSetMetadata, DocSetUpdate 
from aryn_sdk.types.document import DocumentMetadata, ReplaceOperation, FieldUpdates 
from aryn_sdk.types.schema import SchemaField, Schema 
from aryn_sdk.types.search import SearchRequest, SearchResponse

In [9]:
# Ask the service for the list of DocSets. The result is paged and is walked
# with iter_page() in the following cell.
res = client.list_docsets()

In [None]:
# Print "<docset_id>: <name>" for every DocSet, fetching the listing page by page.
# On a first run with a new Aryn account nothing is printed.
# If you have already parsed documents with DocParse, a DocSet called
# "docparse_storage" may show up.
for docset in (entry for batch in res.iter_page() for entry in batch.value):
    print(f"{docset.docset_id}: {docset.name}")

In [12]:
# If you have a DocSet, try listing the documents that belong to it.
# default_docset_id is an empty placeholder here; presumably you fill in a
# DocSet ID (e.g. one printed by the listing cell) before running this cell.
default_docset_id = ""
docs = client.list_docs(docset_id=default_docset_id)

In [None]:
# Print "<doc_id>: <name>" for every document in the DocSet, page by page.
for doc in (item for batch in docs.iter_page() for item in batch.value):
    print(f"{doc.doc_id}: {doc.name}")

# Placeholder for the ID of the document that the following cells inspect.
doc1_id = ""

In [133]:
# Pick a document and inspect what's in it.
doc1 = client.get_doc(docset_id=default_docset_id, doc_id=doc1_id)

In [None]:
# Print the result of model_dump() for the fetched document.
print(doc1.value.model_dump())

# The full dump can be large enough that Jupyter reports an exceeded data rate.
# In that case the first 1000 characters of the dump give a quick peek.
doc1_preview = str(doc1.value.model_dump())
print(doc1_preview[:1000])

## Add a new document to DocParse Storage using DocParse

In [None]:
from aryn_sdk.partition import partition_file

# Use this cell if the file you want to add to storage is already on your computer.
# local_path is an empty placeholder: set it to that file's path before running.
local_path = ""
partition_file(file=local_path)

In [115]:
# Or you can download some sample PDFs from our public S3 bucket.
import boto3

session = boto3.Session()
s3_client = session.client("s3")
# Empty placeholder: set this to the local file name the PDF should be saved under.
local_path = ""
s3_client.download_file("aryn-public", "ntsb/0.pdf", local_path)

# Parse the downloaded NTSB document (this cell fetches a single file, ntsb/0.pdf).
partition_file(file=local_path)

## Try out the Search API

API documentation: https://docs.aryn.ai/docparse/storage#searching-stored-documents

In [140]:
# Build a vector search for "ntsb" that asks for element-level results,
# then run it against the DocSet identified by default_docset_id.
query = SearchRequest(
    query="ntsb",
    query_type="vector",
    return_type="element",
)

res = client.search(docset_id=default_docset_id, query=query)

In [None]:
# Print the first ten search hits as "<doc_id>: <entity properties>: <text>".
for doc in res.value.results[:10]:
    # Remove the "_original_elements" entry if it is there. Passing a default
    # to pop() makes this a no-op when the key is absent, so no separate
    # (and easily misspelled) membership test is needed and no KeyError is raised.
    doc["properties"].pop("_original_elements", None)
    print(f"{doc['doc_id']}: {doc['properties']['entity']}: {doc['text_representation']}")

In [143]:
# Search for "accident", restricted by a properties filter on the entity title,
# against the DocSet identified by default_docset_id.
query = SearchRequest(
    query="accident",
    properties_filter='(properties.entity.title like "report")',
)

res = client.search(docset_id=default_docset_id, query=query)

In [None]:
# Print the first ten search hits as "<doc_id>: <entity properties>: <text>".
for doc in res.value.results[:10]:
    # Remove the "_original_elements" entry if it is there. Passing a default
    # to pop() makes this a no-op when the key is absent, so no separate
    # (and easily misspelled) membership test is needed and no KeyError is raised.
    doc["properties"].pop("_original_elements", None)
    print(f"{doc['doc_id']}: {doc['properties']['entity']}: {doc['text_representation']}")

## Extract properties

Try extracting properties from the documents you have ingested and perform searches using extracted properties as filters.

In [29]:
# Placeholder for the ID of the DocSet whose documents should get the new property.
default_docset_id = ""

# Schema with a single string field, "title", to extract.
field = SchemaField(
    name="title",
    field_type="string",
    description="title of the document",
)
schema = Schema(fields=[field])

res = client.extract_properties(docset_id=default_docset_id, schema=schema)

In [None]:
# Print "<doc_id>: <properties>" for the first ten entries of res.value.results.
# NOTE(review): elsewhere in this notebook the value returned by
# extract_properties prints as just "exit_status=0", so it may not expose
# `.results` — confirm against the SDK before relying on this cell.
for doc in res.value.results[0:10]:
    print(f"{doc['doc_id']}: {doc['properties']}")

In [129]:
# Schema with a single string field, "aircraft_type", to extract from the
# documents of the DocSet identified by default_docset_id.
field = SchemaField(
    name="aircraft_type",
    field_type="string",
    description="Aircraft type",
)
schema = Schema(fields=[field])

res = client.extract_properties(docset_id=default_docset_id, schema=schema)

In [31]:
# Show the outcome of the extract_properties call.
# Make sure you get 'exit_status' of 0 (success).
print(res.value)

exit_status=0


In [148]:
# Search for "accident", restricted by a properties filter on the extracted
# aircraft_type property, against the DocSet identified by default_docset_id.
query = SearchRequest(
    query="accident",
    properties_filter='(properties.entity.aircraft_type like "VAN\'S")',
)

res = client.search(docset_id=default_docset_id, query=query)

In [None]:
# Print the first ten search hits as "<doc_id>: <entity properties>: <text>".
for doc in res.value.results[:10]:
    # Remove the "_original_elements" entry if it is there. Passing a default
    # to pop() makes this a no-op when the key is absent, so no separate
    # (and easily misspelled) membership test is needed and no KeyError is raised.
    doc["properties"].pop("_original_elements", None)
    print(f"{doc['doc_id']}: {doc['properties']['entity']}: {doc['text_representation']}")