##### In this example, we will use the `aryn-sdk` library to make an API call to the Aryn Partitioning Service, save images, and extract headers and text from the response.

##### To access the Aryn Partitioning Service for fast, GPU-powered performance, go to [aryn.ai/sign-up](https://aryn.ai/sign-up) and sign up to get a free API key for the service.


In [None]:
import json
import os
import subprocess
import sys
from IPython.display import Image
from PIL import Image
import base64

from pathlib import Path
from IPython.display import display, display_pdf, Image
from IPython.display import IFrame
from PIL import Image as PImage

from aryn_sdk.partition import partition_file, table_elem_to_dataframe, draw_with_boxes

In [None]:
# Choose where downloaded and generated files go. The /.dockerenv marker file
# is used to detect that we are running in Docker; otherwise files land in
# ./data (i.e. under notebooks/data/).
work_dir = "/app/work/docker_volume" if os.path.exists("/.dockerenv") else "./data"
    

In [None]:
# Download each arXiv paper into work_dir unless a copy is already present.
os.makedirs(work_dir, exist_ok=True)
metadata = {}
for arxiv_id in ["1706.03762"]:
    path = os.path.join(work_dir, arxiv_id + ".pdf")
    # Build the URL by hand: os.path.join uses the OS path separator, which
    # would produce a backslash on Windows.
    url = "https://arxiv.org/pdf/" + arxiv_id
    if not Path(path).is_file():
        print("Downloading {} to {}".format(url, path))
        try:
            # --location follows redirects and --fail turns HTTP errors into a
            # non-zero exit status, so an error page is never saved as the PDF.
            subprocess.run(
                ["curl", "--fail", "--location", "-o", path, url],
                check=True,
            )
        except subprocess.CalledProcessError:
            # Drop any partial file so the is_file() check above does not
            # mistake it for a finished download on the next run.
            if os.path.exists(path):
                os.remove(path)
            raise


In [None]:
# Load the Aryn SDK configuration from the SDK's default config path and fail
# right away, with the path that was searched, if no API key is available.
from aryn_sdk.config import _DEFAULT_PATH, ArynConfig

aryn_config = ArynConfig(aryn_config_path=_DEFAULT_PATH)

assert (
    aryn_config.api_key() != ""
), f"Unable to find aryn API key.  Looked in {_DEFAULT_PATH}"


If the above assertion fails, you can either set the environment variable `ARYN_API_KEY` and restart Jupyter,
or create a YAML file at the path shown in the assertion error that looks like:

```
aryn_token: "YOUR-ARYN-API-KEY"
```

It is unsafe, but if neither of those options works, you can put the key in this notebook with
```
import os
os.environ["ARYN_API_KEY"] = "UNSAFE-ARYN-API-KEY-LOCATION" 
```

but beware that it is easy to accidentally commit the notebook file and have it include your key.

In [None]:
def partition_filepath(filelocation, api_key=None, **options):
    """Partition the file at ``filelocation`` with ``partition_file``.

    Args:
        filelocation: Path of the file to open in binary mode and send.
        api_key: Aryn API key forwarded to ``partition_file``. ``None`` is
            forwarded unchanged (presumably the SDK then falls back to its
            configured key -- confirm against the SDK docs).
        **options: Extra keyword arguments passed straight through to
            ``partition_file`` (e.g. ``threshold``, ``extract_images``).

    Returns:
        Whatever ``partition_file`` returns for the opened file.
    """
    with open(filelocation, "rb") as f:
        # Pass the key by name so the call does not depend on the positional
        # order of partition_file's parameters.
        return partition_file(f, aryn_api_key=api_key, **options)

A cURL command can also be used to call the Aryn API for PDF processing:

```bash
curl --location 'https://api.aryn.cloud/v1/document/partition' \
--header 'Authorization: Bearer YOUR-ARYN-API-KEY' \
--form 'pdf=@"./data/1706.03762.pdf"' \
--form 'options="{\"threshold\":0.4,\"extract_table_structure\":true,\"extract_images\":true}"'
```

In [None]:
! curl --location 'https://api.aryn.cloud/v1/document/partition' --header 'Authorization: Bearer YOUR-ARYN-API-KEY' --form 'pdf=@"./data/1706.03762.pdf"' --form 'options="{\"threshold\":0.4,\"extract_table_structure\":false,\"extract_images\":false}"'

In [None]:
# Partition the downloaded PDF. The options are collected in one dict and
# unpacked into keyword arguments for partition_filepath.
partition_options = {
    "threshold": 0.4,
    "extract_table_structure": True,
    "extract_images": True,
    "use_ocr": True,
}
response_json = partition_filepath(path, **partition_options)

### Extract images from pdf using ArynPartitioner

In [None]:
# Re-import PIL's Image: earlier imports in this notebook leave the name
# `Image` bound to IPython.display.Image, which has no `frombytes`.
from PIL import Image

# Decode every Image element of the response and save it as a numbered JPEG
# in work_dir (the same directory the PDF was downloaded to).
count = 1

for element in response_json.get('elements', []):
    if element['type'] != 'Image':
        continue
    properties = element['properties']
    image_bytes = base64.b64decode(element['binary_representation'])
    img = Image.frombytes(
        # NOTE(review): assumes the service reports the pixel mode under
        # 'image_mode'; falls back to the previously hard-coded 'RGB'.
        mode=properties.get('image_mode') or 'RGB',
        # JSON delivers the size as a list; Pillow documents a 2-tuple.
        size=tuple(properties['image_size']),
        data=image_bytes,
    )
    if img.mode != 'RGB':
        # JPEG cannot store every mode (e.g. RGBA), so normalise first.
        img = img.convert('RGB')
    img.save(os.path.join(work_dir, f'output_image_{count}.jpg'))
    count += 1

### Extract headers in the PDF using ArynPartitioner along with their page numbers

In [None]:
# Not used in this cell; kept so that `Image` keeps referring to PIL's Image
# for any code that runs after it.
from PIL import Image


def sep_line():
    """Print an 80-character horizontal rule that frames the header table."""
    print("-" * 80)


# Table header.
sep_line()
print(f"| {'Topic Name':<60} | {'Page Number':<15}| ")
sep_line()

# One row per section header found in the partitioner response.
for element in response_json.get('elements', []):
    if element['type'] == 'Section-header':
        # Strip trailing whitespace rather than slicing off the last
        # character, which would eat a real letter when the text does not end
        # in a newline.
        title = element['text_representation'].rstrip()
        print(f"| {title:<60} | {element['properties']['page_number']:<15}| ")

sep_line()  # Separator line

### Get Text from a particular Header

In [None]:
def get_text_from_heading(topic_name):
    """Print the text found under section headers containing ``topic_name``.

    Walks ``response_json['elements']`` in order. Every ``Section-header``
    element switches printing on when its text contains ``topic_name`` and
    off otherwise; the matching header itself is printed too. While printing
    is on, each element with a non-empty ``text_representation`` is printed
    followed by a single space.

    Args:
        topic_name: Substring to look for in section-header text.
    """
    in_section = False
    for elem in response_json.get('elements', ""):
        if elem['type'] == 'Section-header':
            # A header either opens/continues the requested section or ends it.
            in_section = topic_name in elem['text_representation']
        if not in_section:
            continue
        text = elem.get('text_representation', False)
        if text:
            print(f"{text} ")
        
    
get_text_from_heading('Conclusion') 

### Visualize the bounding boxes on a page

In [None]:
from IPython.display import display

# Hand the PDF path and the partitioner response to draw_with_boxes, then show
# the entry at index 2 of what it returns.
page_images = draw_with_boxes(path, response_json)
page_index = 2
display(page_images[page_index])

In [None]:
# Collect the elements typed 'table' and pass the one at index 3 to
# table_elem_to_dataframe; as the last expression, its result is the cell output.
tables = [
    elem
    for elem in response_json['elements']
    if elem['type'] == 'table'
]
table_elem_to_dataframe(tables[3])