***Using GenAI Foundational Platform Endpoints for Summarization (Using Claude 2)***

The following is a sample that summarizes each individual page of a document and then creates a summary of those summaries.

Before you begin, make sure you create a .env file in the same folder as the notebook that defines the following variables. You can get the values for these variables from your platform admin.

 
 COGNITO_CLIENT_ID='<replace_me>'

 COGNITO_CLIENT_SECRET='<replace_me>'

 COGNITO_USER_POOL_ID='<replace_me>'

 COGNITO_REGION='<replace_me>'

 COGNITO_DOMAIN='<replace_me>'
 
 PLATFORM_API_URL='<replace_me>'




***Note: The .env file is only needed when running a notebook. In a real application deployed to EC2 or a container, you can just create environment variables (for example, using the export command).***

Install the requirements

In [None]:
pip install -r reqs.txt

Load the environment variables

In [None]:
import os
import dotenv
import pprint
# Load the variables defined in the .env file so that the os.getenv lookups
# later in this notebook can read them. This is only necessary if you are
# using a .env file to store your credentials; variables exported in the shell
# are visible to os.getenv without it.
dotenv.load_dotenv()

Initialize values from environment variables

In [None]:
import os
# Cognito app client and user pool settings used to obtain an access token.
# Each value is None when the corresponding environment variable is not set.
APP_CLIENT_ID = os.environ.get('COGNITO_CLIENT_ID')
APP_CLIENT_SECRET = os.environ.get('COGNITO_CLIENT_SECRET')
APP_USER_POOL_ID = os.environ.get('COGNITO_USER_POOL_ID')

# Cognito region and domain.
REGION = os.environ.get('COGNITO_REGION')
DOMAIN = os.environ.get('COGNITO_DOMAIN')

# Base URL that every platform API path is appended to.
BASE_URL = os.environ.get('PLATFORM_API_URL')

We create reusable get and post methods to make API calls to the platform

In [None]:
import requests
import json
import boto3

def get(proxy=None, token=None):
    """Send a GET request to the platform API and return the response.

    Args:
        proxy: Optional path appended to ``BASE_URL`` after a ``/``. When
            omitted, ``BASE_URL`` itself is requested.
        token: Optional bearer token. When given it is sent in the
            ``Authorization`` header; when omitted no auth header is sent.

    Returns:
        The ``requests.Response`` object for the call.

    Raises:
        requests.HTTPError: If the response status is an error status
            (raised by ``raise_for_status``).
    """
    url = BASE_URL
    if proxy:
        url = BASE_URL + '/' + proxy

    # Start from an empty dict so that a call without a token still has a
    # defined `headers` value instead of failing with UnboundLocalError.
    headers = {}
    if token:
        headers['Authorization'] = f'Bearer {token}'

    response = requests.get(url, headers=headers, timeout=60)
    response.raise_for_status()
    return response

def post(data, proxy=None, token=None):
    """Send a POST request with a JSON-encoded body to the platform API.

    Args:
        data: Object serialized with ``json.dumps`` and sent as the request
            body.
        proxy: Optional path appended to ``BASE_URL`` after a ``/``. When
            omitted, ``BASE_URL`` itself is requested.
        token: Optional bearer token. When given it is sent in the
            ``Authorization`` header; when omitted no auth header is sent.

    Returns:
        The ``requests.Response`` object for the call.

    Raises:
        requests.HTTPError: If the response status is an error status
            (raised by ``raise_for_status``).
    """
    url = BASE_URL
    if proxy:
        url = BASE_URL + '/' + proxy

    # Start from an empty dict so that a call without a token still has a
    # defined `headers` value instead of failing with UnboundLocalError.
    headers = {}
    if token:
        headers['Authorization'] = f'Bearer {token}'

    response = requests.post(url, headers=headers, data=json.dumps(data), timeout=60)
    response.raise_for_status()
    return response

Authenticate with Cognito and get the access token. We send this token in the Authorization header of every call to the platform.

In [None]:
from utils import CognitoTokenManager, get_cognito_public_keys
import pprint
# Build the token manager from the Cognito settings read from the environment.
cognito_token_manager = CognitoTokenManager(APP_CLIENT_ID, APP_CLIENT_SECRET, APP_USER_POOL_ID, REGION, DOMAIN)
# NOTE(review): _fetch_token_with_secret is underscore-prefixed, i.e. private by
# convention. Check whether utils.CognitoTokenManager offers a public method
# for obtaining the token and prefer that if it exists.
token = cognito_token_manager._fetch_token_with_secret()

#### Document Extraction

Create Extraction Job

In [None]:
# Create a new extraction job. Later cells read 'extraction_job_id' from this
# response's JSON body.
create_extraction_job_endpoint = 'document/extraction/create_job'
extraction_job = get(token=token, proxy=create_extraction_job_endpoint)
pprint.pprint(extraction_job.json())

Register Files to the Job

In [None]:
register_file_endpoint = 'document/extraction/register_file'

# Path of the document to process, e.g. 'data/your_file.pdf'. The same value is
# opened from local disk by the upload cell.
file_name = '<REPLACE_WITH_YOUR_FILE_PATH>'

# Attach the file to the extraction job created above.
data = dict(
    extraction_job_id=extraction_job.json()['extraction_job_id'],
    file_name=file_name,
)
response = post(data, proxy=register_file_endpoint, token=token)
pprint.pprint(response.json())

Upload the files using presigned urls

In [None]:
## Upload the file to the S3 bucket
import requests

# The register_file response carries the presigned URL the file is PUT to.
pre_signed_url = response.json()['upload_url']

# Keep the upload result in its own variable: reusing `response` would replace
# the register_file response and make this cell fail when it is run again.
with open(file_name, 'rb') as f:
    # The open file object is streamed as the request body. The timeout stops a
    # stalled connection from hanging the kernel indefinitely.
    upload_response = requests.put(pre_signed_url, data=f, timeout=300)

print(upload_response.status_code)
# Stop here on a failed upload instead of letting the extraction job start
# without its input file.
upload_response.raise_for_status()

Start Extraction Job

In [None]:
# Start the extraction job identified by the id returned at job creation.
start_job_endpoint = 'document/extraction/start_job'
data = dict(extraction_job_id=extraction_job.json()['extraction_job_id'])
response = post(data, proxy=start_job_endpoint, token=token)
pprint.pprint(response.json())

Check Extraction Job Status

In [None]:
# /document/extraction/job_status/{extraction_job_id}
import time

job_status_endpoint = f'document/extraction/job_status/{extraction_job.json()["extraction_job_id"]}'

# Statuses at which polling stops.
terminal_statuses = {'COMPLETED', 'FAILED', 'COMPLETED_WITH_ERRORS'}

response = get(proxy=job_status_endpoint, token=token)
status = response.json()['status']
while status not in terminal_statuses:
    # Wait before asking again, so polls are spaced 5 seconds apart and the
    # loop exits as soon as a terminal status is seen (no trailing sleep).
    time.sleep(5)
    response = get(proxy=job_status_endpoint, token=token)
    status = response.json()['status']
    print(status)

# Show the full body of the last status response.
pprint.pprint(response.json())

Get Extracted Text

In [None]:
# POST /document/extraction/file_status
file_status_endpoint = 'document/extraction/file_status'
data = {
    "extraction_job_id": extraction_job.json()['extraction_job_id'],
    "file_name": file_name
}
response = post(proxy=file_status_endpoint, token=token, data=data)
pprint.pprint(response.json())

# The file status body carries the URL the extraction result is fetched from.
result_url = response.json()['result_url']

# Get the result. The timeout keeps a stalled download from hanging the kernel.
response = requests.get(result_url, timeout=60)
print(response.status_code)
# Fail with the HTTP error itself rather than with a JSON decoding error on an
# error body.
response.raise_for_status()
pprint.pprint(response.json())

#### Chunking

Create a chunking job. Chunking by page.

In [None]:
# POST /document/chunking/create_job
create_chunking_job_endpoint = 'document/chunking/create_job'

# Chunking settings. Only chunking_strategy is placed in the request payload
# below; chunk_size and chunk_overlap are defined here but not sent.
chunking_strategy = 'page'
chunk_size = 400
chunk_overlap = 100

data = dict(
    extraction_job_id=extraction_job.json()['extraction_job_id'],
    chunking_strategy=chunking_strategy,
)
chunk_job = post(data, proxy=create_chunking_job_endpoint, token=token)
pprint.pprint(chunk_job.json())

Check Chunking Job Status

In [None]:
# GET /document/chunking/job_status/{job_id}
import time

job_status_endpoint = f'document/chunking/job_status/{chunk_job.json()["chunking_job_id"]}'

# Statuses at which polling stops.
terminal_statuses = {'COMPLETED', 'FAILED', 'COMPLETED_WITH_ERRORS'}

chunk_job_status = get(proxy=job_status_endpoint, token=token)
status = chunk_job_status.json()['status']
while status not in terminal_statuses:
    # Wait before asking again, so polls are spaced 5 seconds apart and the
    # loop exits as soon as a terminal status is seen (no trailing sleep).
    time.sleep(5)
    chunk_job_status = get(proxy=job_status_endpoint, token=token)
    status = chunk_job_status.json()['status']
    print(status)

# Show the full body of the last status response.
pprint.pprint(chunk_job_status.json())

Get Chunks

In [None]:
# POST /document/chunking/chunk_file_url
chunk_file_url_endpoint = 'document/chunking/chunk_file_url'
data = {
    "chunking_job_id": chunk_job.json()['chunking_job_id'],
    "file_name": file_name
}
chunk_file = post(proxy=chunk_file_url_endpoint, token=token, data=data)
pprint.pprint(chunk_file.text)

# The response body carries the URL the chunk file is fetched from.
chunk_file_url = chunk_file.json()['chunk_file_url']

# Get the chunked file. The timeout keeps a stalled download from hanging the
# kernel.
chunk_file_text = requests.get(chunk_file_url, timeout=60)
print(chunk_file_text.status_code)
# Fail with the HTTP error itself rather than with a JSON decoding error on an
# error body.
chunk_file_text.raise_for_status()
pprint.pprint(chunk_file_text.json())

Print All Extracted Pages

In [None]:
# Pull the 'chunk' field out of every entry of the downloaded chunk file.
pages = [entry['chunk'] for entry in chunk_file_text.json()]

print(pages)

Create individual summary for each page

In [None]:
# Prompt template; {text} is replaced with the text of one page.
summary_prompt = "Summarize the following text in one or two paragraphs: {text}"
invoke_model_endpoint = 'model/invoke'
summaries = []

# enumerate supplies the 1-based page number used in the printed heading.
for page_num, page in enumerate(pages, start=1):
    prompt = summary_prompt.format(text=page)
    data = {
        "model_name": "ANTHROPIC_CLAUDE_V2",
        "prompt": prompt,
        "max_tokens": 1000,
        "temperature": 0.7,
        # NOTE(review): "\\n" is the two-character text backslash + n, not a
        # newline character. Confirm which one the endpoint is meant to receive.
        "stop_sequences": ["\\n"]
    }
    response = post(proxy=invoke_model_endpoint, token=token, data=data)

    # Parse the response body once and reuse it for printing and collecting.
    output_text = response.json()['output_text']
    print(f"Page {page_num}")
    print(output_text)
    summaries.append(output_text)

pprint.pprint(summaries)

    

Summary of summaries

In [None]:
# Concatenate the per-page summaries into one text for the final prompt.
all_summary = " ".join(summaries)

prompt = (
    "Following is a list of summaries of each page of a document. Combine the summaries in to one summary of atleast 4 paragraphs. Don't miss details: {text}".format(text=all_summary)
    # Separate the closing cue from the summaries; without the blank line
    # "Summary:" is glued directly onto the end of the last page summary.
    + "\n\nSummary:"
)

# Same model and sampling settings as the per-page calls, but without stop
# sequences.
data = {
    "model_name": "ANTHROPIC_CLAUDE_V2",
    "prompt": prompt,
    "max_tokens": 1000,
    "temperature": 0.7
}
response = post(proxy=invoke_model_endpoint, token=token, data=data)
pprint.pprint(response.json()['output_text'])