In [None]:
import sys

# Put the parent directory on the import path (resolved relative to the
# kernel's working directory).
# NOTE(review): presumably so a local checkout of the SDK is importable — confirm.
# The membership check keeps the cell idempotent: re-running it no longer
# stacks duplicate ".." entries onto sys.path.
if ".." not in sys.path:
    sys.path.append("..")

In [None]:
import os

from aurelio_sdk import AurelioClient

# The API key is read from the environment so it never appears in the notebook.
# Indexing os.environ raises KeyError right here if AURELIO_API_KEY is not set.
client = AurelioClient(
    api_key=os.environ["AURELIO_API_KEY"],
)

# Extract text from PDF and MP4 video files and URLs

### From file

In [None]:
from aurelio_sdk import ExtractResponse

# Extract a PDF stored on local disk
file_path = "data/pdf/adaptive_semantic_search.pdf"

# Open in binary mode; the handle is closed once the `with` block exits.
with open(file_path, "rb") as pdf_handle:
    response_pdf_file: ExtractResponse = client.extract_file(
        file=pdf_handle,
        quality="low",
        chunk=True,
        timeout=-1,  # -1 means no timeout
    )

# Display the extraction response as the cell output
response_pdf_file

In [None]:
from aurelio_sdk import ExtractResponse

# Extract an MP4 video stored on local disk
file_path = "data/video/how_to_overcome_our_mistakes.mp4"

# Open in binary mode; the handle is closed once the `with` block exits.
with open(file_path, "rb") as video_handle:
    response_video_file: ExtractResponse = client.extract_file(
        file=video_handle,
        quality="low",
        chunk=True,
        timeout=-1,  # -1 means no timeout
    )

# Display the extraction response as the cell output
response_video_file

### From URL

In [None]:
from aurelio_sdk import ExtractResponse

# Extract a PDF referenced by URL instead of a local file
url = "https://arxiv.org/pdf/2408.15291"

response_pdf_url: ExtractResponse = client.extract_url(
    url=url,
    quality="low",
    chunk=True,
    timeout=-1,  # same "no timeout" setting as the local-file examples
)

# Display the extraction response as the cell output
response_pdf_url

In [None]:
from aurelio_sdk import ExtractResponse

# Extract an MP4 video referenced by URL instead of a local file
url = "https://storage.googleapis.com/gtv-videos-bucket/sample/ForBiggerMeltdowns.mp4"

response_video_url: ExtractResponse = client.extract_url(
    url=url,
    quality="low",
    chunk=True,
    timeout=-1,  # same "no timeout" setting as the local-file examples
)

# Display the extraction response as the cell output
response_video_url

# Get document status and handle timeouts

In [None]:
from aurelio_sdk import ExtractResponse

# Re-submit the local PDF, this time at high quality with a 10-second timeout
file_path = "data/pdf/adaptive_semantic_search.pdf"

with open(file_path, "rb") as pdf_handle:
    response_pdf_file: ExtractResponse = client.extract_file(
        file=pdf_handle,
        quality="high",
        chunk=True,
        timeout=10,  # seconds
    )

# Report the status, the accompanying message, and the document id;
# the id is what later cells pass to client.get_document().
print("Status:", response_pdf_file.status)
print("Message:", response_pdf_file.message)
print("Document ID:", response_pdf_file.document.id)

In [None]:
# Look up the document submitted in the previous cell by its id
# and show the status the service currently reports for it.
submitted_document_id = response_pdf_file.document.id
document_response: ExtractResponse = client.get_document(
    document_id=submitted_document_id
)

print("Status:", document_response.status)

In [None]:
# Poll the document until it reaches a terminal status or the deadline passes.
# Expects `document_response` from the previous cell as the starting state.
import time

# NOTE(review): "failed" is assumed to be the API's terminal error status —
# confirm against the SDK's status values. If no such status exists the extra
# entry is harmless and the loop behaves as a plain "completed or timeout" poll.
terminal_statuses = ("completed", "failed")

timeout = 300  # 5 minutes
poll_interval = 1  # seconds between status checks

# time.monotonic() cannot jump backwards or forwards with system clock
# adjustments, so the elapsed-time check stays reliable.
start_time = time.monotonic()

while (
    document_response.status not in terminal_statuses
    and time.monotonic() - start_time < timeout
):
    print(f"Status: {document_response.status}")
    time.sleep(poll_interval)
    document_response = client.get_document(
        document_id=response_pdf_file.document.id
    )

# Distinguish the three ways the loop can end instead of reporting every
# non-completed outcome as a timeout.
if document_response.status == "completed":
    final_status = "completed"
elif document_response.status in terminal_statuses:
    final_status = "failed"
else:
    final_status = "timed out"

print(f"Final status: {final_status}")

document_response