In [1]:
import sys

# Add the parent directory to the module search path (presumably so the
# in-repo `aurelio_sdk` package is importable — confirm against repo layout).
# The membership check keeps the cell idempotent: re-running it will not
# append duplicate ".." entries to sys.path.
if ".." not in sys.path:
    sys.path.append("..")

In [2]:
import os

from aurelio_sdk import AsyncAurelioClient

# Defaults to a local development server. Set the AURELIO_BASE_URL environment
# variable to point the notebook at another deployment without editing this cell.
base_url = os.environ.get("AURELIO_BASE_URL", "http://localhost:8000")

# The API key is read from the environment so it never lands in the notebook;
# a missing AURELIO_API_KEY raises KeyError on this line.
# debug=True presumably enables the SDK's [DEBUG] log lines seen in cell outputs.
client = AsyncAurelioClient(
    api_key=os.environ["AURELIO_API_KEY"],
    base_url=base_url,
    debug=True,
)

# Extract text from PDFs, MP4 videos, and URLs


### From file


In [4]:
from aurelio_sdk import ExtractResponse
import aiofiles

file_path = "data/pdf/adaptive_semantic_search.pdf"

# NOTE: wait=-1 means wait till completion, polling_interval=15 means polling every 15 seconds for status till completion

response_pdf_file: ExtractResponse = await client.extract_file(
    file_path=file_path, quality="high", chunk=True, wait=-1, polling_interval=15
)

response_pdf_file

[36m[AurelioSDK] [DEBUG] -- 2024-11-06 16:20:04 - at client_async.py:173 in extract_file(): Uploading file from path, data/pdf/adaptive_semantic_search.pdf[0m
[36m[AurelioSDK] [DEBUG] -- 2024-11-06 16:20:04 - at client_async.py:184 in extract_file(): Uploading using stream[0m
[36m[AurelioSDK] [DEBUG] -- 2024-11-06 16:20:04 - at client_async.py:491 in _file_stream_generator(): Reading chunk 1, chunk_size: 41943040, total bytes: 1116509[0m
[36m[AurelioSDK] [DEBUG] -- 2024-11-06 16:20:04 - at client_async.py:494 in _file_stream_generator(): Stream finished, total chunks: 1, file size: 1.06 MB[0m
[36m[AurelioSDK] [DEBUG] -- 2024-11-06 16:20:05 - at client_async.py:400 in wait_for(): Starting polling for document completion: doc_ed4bef12-4a28-4bac-bda2-afc9518914e3[0m
[36m[AurelioSDK] [DEBUG] -- 2024-11-06 16:20:20 - at client_async.py:420 in wait_for(): Polling document doc_ed4bef12-4a28-4bac-bda2-afc9518914e3: status=TaskStatus.pending[0m
[36m[AurelioSDK] [DEBUG] -- 2024-11-0

ExtractResponse(status=<TaskStatus.completed: 'completed'>, usage=Usage(tokens=23854, pages=25, seconds=None), message=None, processing_options=ExtractProcessingOptions(chunk=True, quality=<ProcessingQuality.high: 'high'>), document=ResponseDocument(id='doc_ed4bef12-4a28-4bac-bda2-afc9518914e3', content='3 2 0 2\nt c O 8 1 ] G L . s c [\n2 v 5 3 4 9 1 . 5 0 3 2 : v i X r a\nAdANNS: A Framework for Adaptive Semantic Search\nAniket Rege    Aditya Kusupati    Sharan Ranjit S  Alan Fan  Qingqing Cao , Sham Kakade  Prateek Jain  Ali Farhadi   University of Washington,  Google Research,  Harvard University {kusupati,ali}@cs.washington.edu, prajain@google.com\nAbstract\nWeb-scale search systems learn an encoder to embed a given query which is then hooked into an approximate nearest neighbor search (ANNS) pipeline to retrieve similar data points. To accurately capture tail queries and data points, learned representations typically are rigid, high-dimensional vectors that are generally used as-

In [None]:
from aurelio_sdk import ExtractResponse

# From a local file
file_path = "data/video/how_to_overcome_our_mistakes.mp4"

response_video_file: ExtractResponse = await client.extract_file(
    file_path=file_path, quality="low", chunk=True, wait=-1
)

response_video_file

print(response_video_file.document.content)

### From URL


In [None]:
from aurelio_sdk import ExtractResponse

# From URL
url = "https://arxiv.org/pdf/2408.15291"
response_pdf_url: ExtractResponse = await client.extract_url(
    url=url, quality="low", chunk=True, wait=-1, polling_interval=5
)

response_pdf_url

In [None]:
from aurelio_sdk import ExtractResponse

# From URL
url = "https://storage.googleapis.com/gtv-videos-bucket/sample/ForBiggerMeltdowns.mp4"
response_video_url: ExtractResponse = await client.extract_url(
    url=url, quality="low", chunk=True, wait=-1
)

response_video_url

# Get document status and handle timeouts


In [None]:
from aurelio_sdk import ExtractResponse

# From a local file
file_path = "data/pdf/adaptive_semantic_search.pdf"

async with aiofiles.open(file_path, "rb") as f:
    file_content = await f.read()

response_pdf_file: ExtractResponse = await client.extract_file(
    file=file_content, quality="high", chunk=True, wait=10)

# Get document status and document id
print("Status:", response_pdf_file.status)
print("Document ID:", response_pdf_file.document.id)

In [None]:
# Get document status and response
document_response: ExtractResponse = await client.get_document(
    document_id=response_pdf_file.document.id, timeout=1
)

print("Status:", document_response.status)

In [None]:
# Use a pre-built function, which helps to avoid long hanging requests (Recommended)
document_response = await client.wait_for(
    document_id=response_pdf_file.document.id, wait=300
)

In [None]:
# Display the response object currently bound to document_response.
document_response