In [1]:
import sys

# Make the parent directory importable (presumably so the local SDK checkout is
# picked up instead of an installed copy -- TODO confirm).
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
if ".." not in sys.path:
    sys.path.append("..")

In [2]:
import os

from aurelio_sdk import AsyncAurelioClient

# Address of the API server this notebook talks to (a local instance).
base_url = "http://localhost:8000"

# The API key comes from the environment; a missing AURELIO_API_KEY raises
# KeyError right here rather than failing later on the first request.
client = AsyncAurelioClient(
    api_key=os.environ["AURELIO_API_KEY"],
    base_url=base_url,
    debug=True,
)

# Extract text from PDF and MP4 video files, and from URLs


### From file


In [None]:
# PDF

from aurelio_sdk import ExtractResponse

file_path = "data/pdf/adaptive_semantic_search.pdf"

# NOTE: wait=-1 means wait till completion, polling_interval=15 means polling every 15 seconds for status till completion

response_pdf_file: ExtractResponse = await client.extract_file(
    file_path=file_path, quality="high", chunk=True, wait=-1, polling_interval=15
)

response_pdf_file

In [None]:
# Video
from aurelio_sdk import ExtractResponse

# From a local file
file_path = "data/video/how_to_overcome_our_mistakes.mp4"

response_video_file: ExtractResponse = await client.extract_file(
    file_path=file_path, quality="low", chunk=True, wait=-1
)

response_video_file

print(response_video_file)

### From URL


In [3]:
# From URL PDF
from aurelio_sdk import ExtractResponse

# From URL
url = "https://arxiv.org/pdf/2408.15291"
response_pdf_url: ExtractResponse = await client.extract_url(
    url=url, quality="low", chunk=True, wait=-1, polling_interval=5
)
response_pdf_url

[36m[AurelioSDK] [DEBUG] -- 2024-11-12 13:13:16 - at client_async.py:434 in extract_url(): Retrying due to exception (attempt 1): Cannot connect to host localhost:8001 ssl:default [Multiple exceptions: [Errno 61] Connect call failed ('::1', 8001, 0, 0), [Errno 61] Connect call failed ('127.0.0.1', 8001)][0m
[36m[AurelioSDK] [DEBUG] -- 2024-11-12 13:13:16 - at client_async.py:434 in extract_url(): Retrying due to exception (attempt 2): Cannot connect to host localhost:8001 ssl:default [Multiple exceptions: [Errno 61] Connect call failed ('127.0.0.1', 8001), [Errno 61] Connect call failed ('::1', 8001, 0, 0)][0m


ApiError: [AurelioSDK] API request failed: Failed to get response after 3 retries: Cannot connect to host localhost:8001 ssl:default [Multiple exceptions: [Errno 61] Connect call failed ('::1', 8001, 0, 0), [Errno 61] Connect call failed ('127.0.0.1', 8001)]. Base API URL: http://localhost:8001.

In [None]:

# Token count recorded on the first chunk of the PDF extracted from the URL.
first_pdf_chunk = response_pdf_url.document.chunks[0]
first_pdf_chunk.num_tokens

In [None]:
# From URL Video
from aurelio_sdk import ExtractResponse

# From URL
url = "https://storage.googleapis.com/gtv-videos-bucket/sample/ForBiggerMeltdowns.mp4"
response_video_url: ExtractResponse = await client.extract_url(
    url=url, quality="low", chunk=True, wait=-1
)

response_video_url

In [None]:
# Convert the response model into plain nested dicts/lists, then read the
# start time stored in the first chunk's metadata.
video_response_dict = response_video_url.model_dump()
first_video_chunk_metadata = video_response_dict["document"]["chunks"][0]["metadata"]
first_video_chunk_metadata["start_time"]


# Get document status and handle timeouts


In [None]:
from aurelio_sdk import ExtractResponse

# From a local file
file_path = "data/pdf/adaptive_semantic_search.pdf"

async with aiofiles.open(file_path, "rb") as f:
    file_content = await f.read()

response_pdf_file: ExtractResponse = await client.extract_file(
    file=file_content, quality="high", chunk=True, wait=10)

# Get document status and document id
print("Status:", response_pdf_file.status)
print("Document ID:", response_pdf_file.document.id)

In [None]:
# Get document status and response
document_response: ExtractResponse = await client.get_document(
    document_id=response_pdf_file.document.id, timeout=1
)

print("Status:", document_response.status)

In [None]:
# Use a pre-built function, which helps to avoid long hanging requests (Recommended)
document_response = await client.wait_for(
    document_id=response_pdf_file.document.id, wait=300
)

In [None]:
# Display the response returned by the wait_for call above.
document_response