In [1]:
import sys

# Add the parent directory to the module search path
# (presumably so the in-repo `aurelio_sdk` package can be imported — TODO confirm).
sys.path.append("..")

In [2]:
from aurelio_sdk import AsyncAurelioClient
import os

# Shared async client used by every cell below. The API key is read from the
# AURELIO_API_KEY environment variable; a KeyError is raised if it is not set.
client = AsyncAurelioClient(
    api_key=os.environ["AURELIO_API_KEY"],
)

# Extract text from PDF and MP4 video files and URLs

### From file

In [3]:
from aurelio_sdk import ExtractResponse

# From a local file
file_path = "data/pdf/adaptive_semantic_search.pdf"

with open(file_path, "rb") as file:
    # timeout -1 means no timeout, default is 30 seconds
    response_pdf_file: ExtractResponse = await client.extract_file(
        file=file, quality="low", chunk=True, timeout=-1
    )

response_pdf_file

ExtractResponse(status=<TaskStatus.completed: 'completed'>, usage=Usage(tokens=23071, pages=25, seconds=None), message=None, processing_options=ExtractProcessingOptions(chunk=True, quality=<ProcessingQuality.low: 'low'>), document=ResponseDocument(id='doc_65e7b298-0fe1-4f63-b444-05de227e1025', content="3 2 0 2\nt c O 8 1\nG L . s c [\n2 v 5 3 4 9 1 . 5 0 3 2 : v i X r a\nAdANNS: A Framework for Adaptive Semantic Search\nAniket Rege    Aditya Kusupati    Sharan Ranjit S  Alan Fan  Qingqing Cao , Sham Kakade  Prateek Jain  Ali Farhadi   University of Washington,  Google Research,  Harvard University {kusupati,ali}@cs.washington.edu, prajain@google.com\nAbstract\nWeb-scale search systems learn an encoder to embed a given query which is then hooked into an approximate nearest neighbor search (ANNS) pipeline to retrieve similar data points. To accurately capture tail queries and data points, learned representations typically are rigid, high-dimensional vectors that are generally used as-is 

In [4]:
from aurelio_sdk import ExtractResponse

# From a local file
file_path = "data/video/how_to_overcome_our_mistakes.mp4"

with open(file_path, "rb") as file:
    # timeout -1 means no timeout, default is 30 seconds
    response_video_file: ExtractResponse = await client.extract_file(
        file=file, quality="low", chunk=True, timeout=-1
    )

response_video_file

ExtractResponse(status=<TaskStatus.completed: 'completed'>, usage=Usage(tokens=838, pages=None, seconds=291), message=None, processing_options=ExtractProcessingOptions(chunk=True, quality=<ProcessingQuality.low: 'low'>), document=ResponseDocument(id='doc_5f2a5e87-2382-4036-85d6-e7a3aecc3153', content=" In a 2019 study, over 400 participants were enlisted to learn a mysterious, invented language. Individuals were asked about three pairs of runes. For example, which of these two characters represents an animal? Then, after a brief break, they were asked about the same roon pairs, with questions flipped. as in which of these two runes represents a non-living object. But this game had a secret.  The subject's answers in round one determined the rune's meanings in round two. In the first round, participants either had all their answers marked as correct, no matter what, or they were forced to fail every question. This meant that at the break, every participant had the same amount of informa

### From URL

In [5]:
from aurelio_sdk import ExtractResponse

# From URL
url = "https://arxiv.org/pdf/2408.15291"
response_pdf_url: ExtractResponse = await client.extract_url(
    url=url, quality="low", chunk=True, timeout=-1
)

response_pdf_url

ExtractResponse(status=<TaskStatus.completed: 'completed'>, usage=Usage(tokens=11875, pages=8, seconds=None), message=None, processing_options=ExtractProcessingOptions(chunk=True, quality=<ProcessingQuality.low: 'low'>), document=ResponseDocument(id='doc_797e46e3-529f-4c5d-91d0-43a957e9975e', content='4 2 0 2\ng u A 6 2\nR S . h p - o r t s a [\n1 v 1 9 2 5 1 . 8 0 4 2 : v i X r a\nA temperature scale of 1 2 eV in the mass-radius relationship of white dwarfs of type DA\nJin Lima, Ji-Yu Kima, Maurice H.P.M. van Puttena,1,\naPhysics and Astronomy, Sejong University, 209 Neungdong-ro, 05006, Seoul, South Korea\nAbstract\nThe mass-radius relationship of white dwarfs (WDs) is one of their defining characteristics, largely derived from electron degen- eracy pressure. We present a model-independent study of the observed mass-radius relationship in WD binaries of Parsons et al. (2017), listing data over a broad temperature range up to about 60,000 K (5 eV). The data show an appreciable tempera

In [6]:
from aurelio_sdk import ExtractResponse

# From URL
url = "https://storage.googleapis.com/gtv-videos-bucket/sample/ForBiggerMeltdowns.mp4"
response_video_url: ExtractResponse = await client.extract_url(
    url=url, quality="low", chunk=True, timeout=-1
)

response_video_url

ExtractResponse(status=<TaskStatus.completed: 'completed'>, usage=Usage(tokens=10, pages=None, seconds=15), message=None, processing_options=ExtractProcessingOptions(chunk=True, quality=<ProcessingQuality.low: 'low'>), document=ResponseDocument(id='doc_f3fdfb3e-6e74-425f-a28c-3f7b186b6d4b', content=" I'm a monster! I'm a monster!", source='https://storage.googleapis.com/gtv-videos-bucket/sample/ForBiggerMeltdowns.mp4', source_type=<SourceType.video_mp4: 'video/mp4'>, num_chunks=1, metadata={}, chunks=[ResponseChunk(id='chunk_aedc3fd7-0d28-4485-8889-4fb5852de299', content="I'm a monster! I'm a monster!", chunk_index=1, num_tokens=10, metadata={'start_time': 0, 'end_time': 7})]))

# Get document status and handle timeouts

In [8]:
from aurelio_sdk import ExtractResponse

# From a local file
file_path = "data/pdf/adaptive_semantic_search.pdf"

with open(file_path, "rb") as file:
    # Load file high quality and set timeout 10 seconds
    response_pdf_file: ExtractResponse = await client.extract_file(
        file=file, quality="high", chunk=True, timeout=10
    )

# Get document status, message and document id
print("Status:", response_pdf_file.status)
print("Message:", response_pdf_file.message)
print("Document ID:", response_pdf_file.document.id)

Status: TaskStatus.pending
Message: Processing is taking longer than the timeout of 10s. Check the status: GET /extract/document/doc_4842138e-4018-4ffa-b83d-3a317f5838a8
Document ID: doc_4842138e-4018-4ffa-b83d-3a317f5838a8


In [10]:
# Get document status and response
document_response: ExtractResponse = await client.get_document(
    document_id=response_pdf_file.document.id
)

print("Status:", document_response.status)

Status: TaskStatus.pending


In [11]:
# Or use a pre-built function, which helps to avoid long hanging requests (Recommended)
document_response = await client.wait_for_document_completion(
    document_id=response_pdf_file.document.id, timeout=300
)

In [12]:
# Display the response obtained from wait_for_document_completion
document_response

ExtractResponse(status=<TaskStatus.completed: 'completed'>, usage=Usage(tokens=23737, pages=25, seconds=None), message=None, processing_options=ExtractProcessingOptions(chunk=True, quality=<ProcessingQuality.high: 'high'>), document=ResponseDocument(id='doc_4842138e-4018-4ffa-b83d-3a317f5838a8', content='3 2 0 2\nt c O 8 1 ] G L . s c [ 2 v 5 3 4 9 1 . 5 0 3 2 : v i X r a\nAdANNS: A Framework for Adaptive Semantic Search\nAniket Rege    Aditya Kusupati    Sharan Ranjit S  Alan Fan  Qingqing Cao , Sham Kakade  Prateek Jain  Ali Farhadi   University of Washington,  Google Research,  Harvard University {kusupati,ali}@cs.washington.edu, prajain@google.com\nAbstract\nWeb-scale search systems learn an encoder to embed a given query which is then hooked into an approximate nearest neighbor search (ANNS) pipeline to retrieve similar data points. To accurately capture tail queries and data points, learned representations typically are rigid, high-dimensional vectors that are generally used as-i