In [1]:
import sys

# Add the parent directory to the module search path.
# NOTE(review): presumably so the in-repo `aurelio_sdk` package can be imported
# without installing it — confirm against the repository layout.
sys.path.append("..")

In [2]:
from aurelio_sdk import AsyncAurelioClient
import os


# Address of the API server the client talks to (a local instance here).
base_url = "http://localhost:8000"

# The API key is taken from the environment rather than hardcoded in the notebook.
client = AsyncAurelioClient(
    api_key=os.environ["AURELIO_API_KEY"],
    base_url=base_url,
    debug=True,
)

# Extract text from PDF files, MP4 videos, and URLs


### From file


In [None]:
# PDF

from aurelio_sdk import ExtractResponse

file_path = "data/pdf/adaptive_semantic_search.pdf"

# NOTE: wait=-1 means wait till completion, polling_interval=15 means polling every 15 seconds for status till completion

response_pdf_file: ExtractResponse = await client.extract_file(
    file_path=file_path, quality="high", chunk=True, wait=-1, polling_interval=15
)

response_pdf_file

In [None]:
# Video
from aurelio_sdk import ExtractResponse

# From a local file
file_path = "data/video/how_to_overcome_our_mistakes.mp4"

response_video_file: ExtractResponse = await client.extract_file(
    file_path=file_path, quality="low", chunk=True, wait=-1
)

response_video_file

print(response_video_file)

### From URL


In [None]:
# From URL PDF
from aurelio_sdk import ExtractResponse

# From URL
url = "https://arxiv.org/pdf/2408.15291"
response_pdf_url: ExtractResponse = await client.extract_url(
    url=url, quality="low", chunk=True, wait=-1, polling_interval=5
)
response_pdf_url

In [None]:

# Token count reported for the first chunk of the PDF extracted from the URL.
first_pdf_chunk = response_pdf_url.document.chunks[0]
first_pdf_chunk.num_tokens

In [3]:
# From URL Video
from aurelio_sdk import ExtractResponse

# From URL
url = "https://storage.googleapis.com/gtv-videos-bucket/sample/ForBiggerMeltdowns.mp4"
response_video_url: ExtractResponse = await client.extract_url(
    url=url, quality="low", chunk=True, wait=-1
)

response_video_url

ExtractResponse(status=<TaskStatus.completed: 'completed'>, usage=Usage(tokens=10, pages=None, seconds=15), message=None, processing_options=ExtractProcessingOptions(chunk=True, quality=<ProcessingQuality.low: 'low'>), document=ResponseDocument(id='doc_8ac14a76-b73a-49d0-862f-f16652c9678a', content=" I'm a monster! I'm a monster!", source='https://storage.googleapis.com/gtv-videos-bucket/sample/ForBiggerMeltdowns.mp4', source_type=<SourceType.video_mp4: 'video/mp4'>, num_chunks=1, metadata={}, chunks=[ResponseChunk(id='chunk_0b19311a-6d6f-45ca-9592-848fbdb604cf', content="I'm a monster! I'm a monster!", chunk_index=1, num_tokens=10, metadata={'start_time': 0, 'end_time': 7})]))

In [5]:
# Convert the response model to a plain dict, then read the start time stored
# in the metadata of the first chunk.
video_response_dict = response_video_url.model_dump()
first_chunk_metadata = video_response_dict["document"]["chunks"][0]["metadata"]
first_chunk_metadata["start_time"]


0

# Get document status and handle timeouts


In [None]:
from aurelio_sdk import ExtractResponse

# From a local file
file_path = "data/pdf/adaptive_semantic_search.pdf"

async with aiofiles.open(file_path, "rb") as f:
    file_content = await f.read()

response_pdf_file: ExtractResponse = await client.extract_file(
    file=file_content, quality="high", chunk=True, wait=10)

# Get document status and document id
print("Status:", response_pdf_file.status)
print("Document ID:", response_pdf_file.document.id)

In [None]:
# Get document status and response
document_response: ExtractResponse = await client.get_document(
    document_id=response_pdf_file.document.id, timeout=1
)

print("Status:", document_response.status)

In [None]:
# Use a pre-built function, which helps to avoid long hanging requests (Recommended)
document_response = await client.wait_for(
    document_id=response_pdf_file.document.id, wait=300
)

In [None]:
# Display the most recently fetched document response as the cell output.
document_response