In [None]:
import sys

# Add the parent directory to the module search path (presumably so a local
# checkout of aurelio_sdk is importable — confirm against the repo layout).
sys.path.append("..")

In [None]:
from aurelio_sdk import AsyncAurelioClient
import os

# Build the async client: the API key comes from the AURELIO_API_KEY
# environment variable (KeyError if unset); requests target localhost:8000
# with debug switched on.
client = AsyncAurelioClient(
    api_key=os.environ["AURELIO_API_KEY"],
    base_url="http://localhost:8000",
    debug=True,
)

# Extract text from PDF files, MP4 video files, and URLs

### From file

In [None]:
from aurelio_sdk import ExtractResponse
import aiofiles


file_path = "data/pdf/adaptive_semantic_search.pdf"

async with aiofiles.open(file_path, "rb") as f:
    file_content = await f.read()

response_pdf_file: ExtractResponse = await client.extract_file(
    file=file_content, quality="low", chunk=True, wait=-1, enable_polling=True
)

response_pdf_file

In [None]:
from aurelio_sdk import ExtractResponse

# From a local file
file_path = "data/video/how_to_overcome_our_mistakes.mp4"

async with aiofiles.open(file_path, "rb") as f:
    file_content = await f.read()

response_video_file: ExtractResponse = await client.extract_file(
    file=file_content, quality="low", chunk=True, wait=-1, enable_polling=True
)

response_video_file

print(response_video_file.document.content)

#### Streaming example for a large file

In [None]:
import json
import aiofiles
from aurelio_sdk import ExtractResponse

# Number of bytes requested per read (1 MB).
CHUNK_SIZE = 1024 * 1024


async def file_stream_generator(file_path, chunk_size=CHUNK_SIZE):
    """Asynchronously yield the contents of a file piece by piece.

    Opens ``file_path`` in binary mode and repeatedly awaits
    ``read(chunk_size)``, yielding every non-empty result. Iteration ends at
    the first empty read.

    Args:
        file_path: Path of the file to stream.
        chunk_size: Size passed to each ``read`` call; defaults to
            ``CHUNK_SIZE``.

    Yields:
        The data returned by each non-empty read.

    Prints "Reading chunk" before every read (including the final, empty one)
    and "Sent chunk" once the consumer resumes the generator after a yield.
    """
    async with aiofiles.open(file_path, "rb") as source:
        exhausted = False
        while not exhausted:
            print("Reading chunk")
            data = await source.read(chunk_size)
            if data:
                yield data
                # Runs only when the consumer asks for the next item.
                print("Sent chunk")
            else:
                exhausted = True

file_path = "data/video/how_to_overcome_our_mistakes.mp4"

# Create the file stream generator
file_stream = file_stream_generator(file_path)

# Call the extract_file method with the file stream
response_video_file: ExtractResponse = await client.extract_file(
    file=file_stream,
    quality="low",
    chunk=True,
    wait=-1,
    enable_polling=True
)

print(json.dumps(response_video_file.document.content, indent=2))

### From URL

In [None]:
from aurelio_sdk import ExtractResponse

# From URL
url = "https://arxiv.org/pdf/2408.15291"
response_pdf_url: ExtractResponse = await client.extract_url(
    url=url, quality="low", chunk=True, wait=-1, enable_polling=True
)

response_pdf_url

In [None]:
from aurelio_sdk import ExtractResponse

# From URL
url = "https://storage.googleapis.com/gtv-videos-bucket/sample/ForBiggerMeltdowns.mp4"
response_video_url: ExtractResponse = await client.extract_url(
    url=url, quality="low", chunk=True, wait=-1, enable_polling=True
)

response_video_url

# Get document status and handle timeouts

In [None]:
from aurelio_sdk import ExtractResponse

# From a local file
file_path = "data/pdf/adaptive_semantic_search.pdf"

async with aiofiles.open(file_path, "rb") as f:
    file_content = await f.read()

response_pdf_file: ExtractResponse = await client.extract_file(
    file=file_content, quality="high", chunk=True, wait=10, enable_polling=True
)

# Get document status and document id
print("Status:", response_pdf_file.status)
print("Document ID:", response_pdf_file.document.id)

In [None]:
# Get document status and response
document_response: ExtractResponse = await client.get_document(
    document_id=response_pdf_file.document.id, timeout=1
)

print("Status:", document_response.status)

In [None]:
# Use a pre-built function, which helps to avoid long hanging requests (Recommended)
document_response = await client.wait_for(
    document_id=response_pdf_file.document.id, wait=300
)

In [None]:
# Render the most recently fetched document response as the cell output.
document_response