In [2]:
import sys

# Add the parent directory to the module search path
# (presumably so a local checkout of aurelio_sdk is importable — confirm).
sys.path.append("..")

In [3]:
import os

from aurelio_sdk import AsyncAurelioClient

# Address of the API server used by every call in this notebook.
base_url = "http://localhost:8000"

# Async client: the API key is read from the AURELIO_API_KEY environment
# variable (raises KeyError when it is not set); debug=True is passed through.
client = AsyncAurelioClient(
    base_url=base_url,
    api_key=os.environ["AURELIO_API_KEY"],
    debug=True,
)

# Extract text from PDF and MP4 video files and URLs


### From file


In [None]:
# PDF

from aurelio_sdk import ExtractResponse

file_path = "data/pdf/adaptive_semantic_search.pdf"

# NOTE: wait=-1 means wait till completion, polling_interval=15 means polling every 15 seconds for status till completion

response_pdf_file: ExtractResponse = await client.extract_file(
    file_path=file_path, quality="high", chunk=True, wait=-1, polling_interval=15
)

response_pdf_file

In [10]:
# Video
from aurelio_sdk import ExtractResponse

# From a local file
file_path = "data/video/how_to_overcome_our_mistakes.mp4"

response_video_file: ExtractResponse = await client.extract_file(
    file_path=file_path, quality="low", chunk=True, wait=-1
)

response_video_file

print(response_video_file)

[36m[AurelioSDK] [DEBUG] -- 2024-11-07 12:06:22 - at client_async.py:181 in extract_file(): Uploading file from path, data/video/how_to_overcome_our_mistakes.mp4[0m
[36m[AurelioSDK] [DEBUG] -- 2024-11-07 12:06:22 - at client_async.py:488 in _file_stream_generator(): Reading chunk 1, chunk_size: 41943040, total bytes: 8258456[0m
[36m[AurelioSDK] [DEBUG] -- 2024-11-07 12:06:22 - at client_async.py:491 in _file_stream_generator(): Stream finished, total chunks: 1, file size: 7.88 MB[0m
[36m[AurelioSDK] [DEBUG] -- 2024-11-07 12:06:23 - at client_async.py:397 in wait_for(): Starting polling for document completion: doc_b864cb88-8095-4ce4-85b4-d6280ed800aa[0m
[36m[AurelioSDK] [DEBUG] -- 2024-11-07 12:06:28 - at client_async.py:417 in wait_for(): Polling document doc_b864cb88-8095-4ce4-85b4-d6280ed800aa: status=TaskStatus.pending[0m
[36m[AurelioSDK] [DEBUG] -- 2024-11-07 12:06:33 - at client_async.py:417 in wait_for(): Polling document doc_b864cb88-8095-4ce4-85b4-d6280ed800aa: stat

status=<TaskStatus.completed: 'completed'> usage=Usage(tokens=838, pages=None, seconds=291) message=None processing_options=ExtractProcessingOptions(chunk=True, quality=<ProcessingQuality.low: 'low'>) document=ResponseDocument(id='doc_b864cb88-8095-4ce4-85b4-d6280ed800aa', content=" In a 2019 study, over 400 participants were enlisted to learn a mysterious, invented language. Individuals were asked about three pairs of runes. For example, which of these two characters represents an animal? Then, after a brief break, they were asked about the same roon pairs, with questions flipped. as in which of these two runes represents a non-living object. But this game had a secret.  enrolled in introductory and advanced French courses. These students completed a questionnaire asking what kind of teacher they preferred, one who emphasized their strength and successes, or one who highlighted their mistakes and corrected their weaknesses. In general, responses showed that while beginner students sou

### From URL


In [6]:
# From URL PDF
from aurelio_sdk import ExtractResponse

# From URL
url = "https://arxiv.org/pdf/2408.15291"
response_pdf_url: ExtractResponse = await client.extract_url(
    url=url, quality="low", chunk=True, wait=-1, polling_interval=5
)
response_pdf_url

ExtractResponse(status=<TaskStatus.completed: 'completed'>, usage=Usage(tokens=11875, pages=8, seconds=None), message=None, processing_options=ExtractProcessingOptions(chunk=True, quality=<ProcessingQuality.low: 'low'>), document=ResponseDocument(id='doc_b3c94774-dace-4ab3-8af1-b4a1c8acfb19', content='4 2 0 2\ng u A 6 2\nR S . h p - o r t s a [\n1 v 1 9 2 5 1 . 8 0 4 2 : v i X r a\nA temperature scale of 1 2 eV in the mass-radius relationship of white dwarfs of type DA\nJin Lima, Ji-Yu Kima, Maurice H.P.M. van Puttena,1,\naPhysics and Astronomy, Sejong University, 209 Neungdong-ro, 05006, Seoul, South Korea\nAbstract\nThe mass-radius relationship of white dwarfs (WDs) is one of their defining characteristics, largely derived from electron degen- eracy pressure. We present a model-independent study of the observed mass-radius relationship in WD binaries of Parsons et al. (2017), listing data over a broad temperature range up to about 60,000 K (5 eV). The data show an appreciable tempera

In [8]:

# Token count of the first chunk of the PDF extracted from the URL.
response_pdf_url.document.chunks[0].num_tokens

283

In [9]:
# From URL Video
from aurelio_sdk import ExtractResponse

# From URL
url = "https://storage.googleapis.com/gtv-videos-bucket/sample/ForBiggerMeltdowns.mp4"
response_video_url: ExtractResponse = await client.extract_url(
    url=url, quality="low", chunk=True, wait=-1
)

response_video_url

ExtractResponse(status=<TaskStatus.completed: 'completed'>, usage=Usage(tokens=10, pages=None, seconds=15), message=None, processing_options=ExtractProcessingOptions(chunk=True, quality=<ProcessingQuality.low: 'low'>), document=ResponseDocument(id='doc_ead3c92e-efa7-4355-bdb6-3fc3a4c56ba5', content=" I'm a monster! I'm a monster!", source='https://storage.googleapis.com/gtv-videos-bucket/sample/ForBiggerMeltdowns.mp4', source_type=<SourceType.video_mp4: 'video/mp4'>, num_chunks=1, metadata={}, chunks=[ResponseChunk(id='chunk_c69eea8d-fdbe-41f9-8218-250ec9ee63fe', content="I'm a monster! I'm a monster!", chunk_index=1, num_tokens=10, metadata={'start_time': 0, 'end_time': 7})]))

# Get document status and handle timeouts


In [None]:
from aurelio_sdk import ExtractResponse

# From a local file
file_path = "data/pdf/adaptive_semantic_search.pdf"

async with aiofiles.open(file_path, "rb") as f:
    file_content = await f.read()

response_pdf_file: ExtractResponse = await client.extract_file(
    file=file_content, quality="high", chunk=True, wait=10)

# Get document status and document id
print("Status:", response_pdf_file.status)
print("Document ID:", response_pdf_file.document.id)

In [None]:
# Get document status and response
document_response: ExtractResponse = await client.get_document(
    document_id=response_pdf_file.document.id, timeout=1
)

print("Status:", document_response.status)

In [None]:
# Use a pre-built function, which helps to avoid long hanging requests (Recommended)
document_response = await client.wait_for(
    document_id=response_pdf_file.document.id, wait=300
)

In [None]:
# Display the most recently fetched document response.
document_response