##### In this example, we will use the `requests` library to call the Aryn Partitioning Service API, then save the images and extract the section headers and text from the response.

##### To access the Aryn Partitioning Service for fast, GPU-powered performance, go to [aryn.ai/sign-up](https://aryn.ai/sign-up) and sign up to get a free API key for the service.


In [None]:
import json
import os
import subprocess
import sys
from IPython.display import Image
from PIL import Image
import base64

from pathlib import Path
from IPython.display import display, display_pdf, Image
from IPython.display import IFrame
from PIL import Image as PImage

import sycamore
from sycamore.data import Document
from sycamore.transforms.embed import SentenceTransformerEmbedder
from sycamore.transforms.extract_entity import OpenAIEntityExtractor
from sycamore.llms import OpenAIModels, OpenAI, LLM
from sycamore.transforms.partition import ArynPartitioner
from sycamore.llms.prompts.default_prompts import TEXT_SUMMARIZER_GUIDANCE_PROMPT_CHAT
from sycamore.transforms.summarize import Summarizer
from sycamore.transforms.extract_table import TextractTableExtractor
from sycamore.functions.document import split_and_convert_to_image, DrawBoxes
from sycamore.tests.config import TEST_DIR
from sycamore.transforms.merge_elements import GreedyTextElementMerger
from sycamore.functions.tokenizer import HuggingFaceTokenizer
from sycamore.connectors.file.file_scan import JsonManifestMetadataProvider

Replace `YOUR-ARYN-API-KEY` with your own Aryn API key.

In [None]:
# Placeholder API key; it is passed to request_Aryn_API, which sends it as a Bearer token.
aryn_api_key = 'YOUR-ARYN-API-KEY'

A cURL command can also be used to call the Aryn API for PDF processing:

`curl --location 'https://api.aryn.cloud/v1/document/partition' \
--header 'Authorization: Bearer YOUR-ARYN-API-KEY' \
--form 'pdf=@"/Users/sheb/Downloads/transformer.pdf"' \
--form 'options="{\"threshold\":0.4,\"extract_table_structure\":true,\"extract_images\":true}"'`

In [None]:
import requests
import mimetypes

def request_Aryn_API(apikey, filelocation, options):
    """Upload a file to the Aryn partitioning endpoint and return the response body.

    Args:
        apikey: API key, sent as a Bearer token in the Authorization header.
        filelocation: Path of the file to upload (str or os.PathLike).
        options: Partitioning options as a JSON string; sent as the
            'options' form field with content type text/plain.

    Returns:
        The response body as text. The HTTP status code is not checked here,
        so an error reply from the service is returned unchanged.
    """
    url = "https://api.aryn.cloud/v1/document/partition"

    # Guess the content type from the file name and fall back to a generic
    # binary type when the extension is unknown.
    guessed_type, _ = mimetypes.guess_type(str(filelocation))
    file_type = guessed_type or 'application/octet-stream'

    with open(filelocation, 'rb') as file:
        files = {
            # os.path.basename copes with Path objects and OS-specific separators.
            'pdf': (os.path.basename(filelocation), file, file_type),
            'options': (None, options, 'text/plain'),
        }

        headers = {
            'Authorization': f'Bearer {apikey}',
        }

        # The request must be sent while the file handle is still open.
        response = requests.post(url, files=files, headers=headers)
        return response.text


In [None]:
# Pick the working directory. The /.dockerenv marker file means we are inside a
# Docker container; otherwise the relative path lands under notebooks/data/.
work_dir = "/app/work/docker_volume" if os.path.exists("/.dockerenv") else "./data"
    

In [None]:
# Download each listed arXiv paper into work_dir unless the file is already there.
os.makedirs(work_dir, exist_ok=True)
metadata = {}
for f in ["1706.03762"]:
    # `path` is left bound after the loop and is read by the request cell below.
    path = os.path.join(work_dir, f + ".pdf")
    # Build the URL by hand: os.path.join is for filesystem paths and would
    # insert a backslash on Windows.
    url = f"https://arxiv.org/pdf/{f}"
    if not Path(path).is_file():
        print("Downloading {} to {}".format(url, path))
        # -L follows redirects and --fail makes curl exit non-zero on an HTTP
        # error; check=True then raises instead of silently continuing with a
        # missing or bogus file that later runs would mistake for the PDF.
        subprocess.run(["curl", "--fail", "-L", "-o", path, url], check=True)


In [None]:
# Partitioning options as a JSON string; request_Aryn_API sends it as the 'options' form field.
options = '{"threshold":0.4,"extract_table_structure":true,"extract_images":true}'
# `path` is the PDF location left over from the download loop; `response` is the raw body text.
response = request_Aryn_API(aryn_api_key, path, options)


In [None]:
# Parse the response body text as JSON.
response_json = json.loads(response)
# Bare expression: as the last line of the cell it makes the notebook display the parsed value.
response_json

### Extract images from the PDF using ArynPartitioner

In [None]:
# Re-import so that `Image` is PIL's module here; the notebook's import cell
# last binds the name to IPython.display.Image.
from PIL import Image

# Decode every 'Image' element of the response and save it as a numbered JPEG
# in work_dir (the same directory the PDF was downloaded to).
count = 1

for element in response_json.get('elements', []):
    if element.get('type') != 'Image':
        continue
    image_bytes = base64.b64decode(element['binary_representation'])
    # The decoded payload is interpreted as raw RGB pixel data, so the
    # dimensions have to come from the element's properties.
    img = Image.frombytes(
        mode='RGB',
        size=tuple(element['properties']['image_size']),
        data=image_bytes,
    )
    img.save(os.path.join(work_dir, f'output_image_{count}.jpg'))
    count += 1

### Extract the section headers in the PDF, along with their page numbers, using ArynPartitioner

In [None]:
from PIL import Image


def sep_line():
    """Print an 80-character dashed rule used to frame the table."""
    print("-" * 80)


# Table heading, framed by a rule above and below.
sep_line()
print(f"| {'Topic Name':<60} | {'Page Number':<15}| ")
sep_line()

# One row per section header found in the response.
for element in response_json.get('elements', []):
    if element.get('type') != 'Section-header':
        continue
    # Strip only trailing newlines so a title without one keeps its last character.
    title = (element.get('text_representation') or '').rstrip('\n')
    print(f"| {title:<60} | {element['properties']['page_number']:<15}| ")

sep_line()  # Closing rule

### Get Text from a particular Header

In [None]:
def get_text_from_heading(topic_name, elements=None):
    """Print the text of every section whose header contains ``topic_name``.

    The elements are scanned in order. A 'Section-header' element whose text
    contains ``topic_name`` starts a section; the next header that does not
    contain it ends the section. The matching header and every element with
    non-empty text inside the section are printed, each followed by a space.

    Args:
        topic_name: Substring to look for in the section header text.
        elements: Optional list of element dicts to scan. When omitted, the
            'elements' list of the global ``response_json`` is used.
    """
    if elements is None:
        elements = response_json.get('elements', [])

    in_section = False
    for element in elements:
        text = element.get('text_representation')
        if element.get('type') == 'Section-header':
            # Every header re-evaluates the match, so a non-matching header
            # closes the section. A header without text never matches.
            in_section = isinstance(text, str) and topic_name in text
        if in_section and text:
            print(f"{text} ")
    
# Example: print the section whose header contains 'Why Self-Attention'.
get_text_from_heading('Why Self-Attention') 