## Long Form Summarization using `text-bison-32k` and `DocAI`

#### Imports 

In [1]:
from vertexai.preview.language_models import TextGenerationModel
from google.api_core.client_options import ClientOptions
from concurrent.futures import ThreadPoolExecutor
from google.cloud import documentai
from pypdf import PdfWriter
from pypdf import PdfReader 
from tqdm import tqdm
import tiktoken
import vertexai
import requests
import logging
import json
import os

##### Setup logging 

In [2]:
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())

#### Essentials 

In [44]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = './../credentials/vai-key.json'
access_token = !gcloud auth print-access-token

In [4]:
PROJECT_ID = 'arun-genai-bb'
LOCATION = 'us-central1'
MODEL_NAME = 'text-bison-32k@latest'
ENCODING_NAME = 'cl100k_base'
CONTEXT_LENGTH = 32000  # text-bison-32k
STREAMING_API_URL = f'https://us-central1-aiplatform.googleapis.com/ui/projects/{PROJECT_ID}/locations/us-central1/publishers/google/models/{MODEL_NAME}:serverStreamingPredict'
DOCAI_PROCESSOR_NAME = 'projects/390991481152/locations/us/processors/ad9557a5be49204e'  # copy from notebook 00
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [5]:
client_options = ClientOptions(api_endpoint=f'us-documentai.googleapis.com')
docai_client = documentai.DocumentProcessorServiceClient(client_options=client_options)

In [6]:
encoder = tiktoken.get_encoding(ENCODING_NAME)
logger.info(f'Using encoder=={encoder.name}')

Using encoder==cl100k_base


In [7]:
model = TextGenerationModel.from_pretrained(MODEL_NAME)
logger.info(f'Using model=={model._model_id}')

Using model==text-bison-32k@latest


#### Use Google DocumentAI to process input PDF

##### Break PDF into smaller PDFs for OCR

In [8]:
LOCAL_INPUT_DIR = './DATA/INPUT'
LOCAL_OUTPUT_DIR = './DATA/OUTPUT'
FILE_NAME = 'file-2'

In [9]:
reader = PdfReader(f'{LOCAL_INPUT_DIR}/{FILE_NAME}.pdf') 
pages = {}

for i, page in enumerate(reader.pages):
    pages[i] = page

In [10]:
n = len(reader.pages)
d = 15  # docai has a current constraint of 15 pages per document 
for i in range(0, n, d):
    writer = PdfWriter()
    for j in range(i, i+d):
        if j < n:
            writer.add_page(pages[j])
    os.makedirs(f'{LOCAL_INPUT_DIR}/{FILE_NAME}/PARTS/', exist_ok=True)
    with open(f'{LOCAL_INPUT_DIR}/{FILE_NAME}/PARTS/{FILE_NAME}_{i+1}-{i+d}.pdf', 'wb') as f:
        writer.write(f)

In [11]:
def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """
    Document AI identifies text in different parts of the document by their
    offsets in the entirety of the document's text. This function converts
    offsets to a string.
    """
    # If a text segment spans several lines, it will be stored in different text segments.
    return ''.join(text[int(segment.start_index): int(segment.end_index)] for segment in layout.text_anchor.text_segments)

In [12]:
def get_file_paths(dir_name: str) -> list:
    file_paths = []
    for file_name in os.listdir(dir_name):
        if os.path.isfile(os.path.join(dir_name, file_name)):
            file_path = os.path.join(dir_name, file_name)
            file_paths.append(file_path)
    return file_paths

In [13]:
def ocr_docai(file_path: str) -> dict:
    pages_map = {}

    with open(file_path, 'rb') as f:
        pdf = f.read()
        raw_document = documentai.RawDocument(content=pdf, mime_type='application/pdf')
        request = documentai.ProcessRequest(name=DOCAI_PROCESSOR_NAME, raw_document=raw_document)
        response = docai_client.process_document(request=request)
        text = response.document.text
        file_name = file_path.split('/')[-1]
        page_number = int(file_name.split('.')[0].split('-')[-1])
        for page in response.document.pages:
            page_text = []
            for paragraph in page.paragraphs:
                paragraph_text = layout_to_text(paragraph.layout, text)
                page_text.append(paragraph_text)
            pages_map[page_number] = ''.join(page_text)
            page_number += 1
    return pages_map

In [14]:
%%time 

input_dir = f'./DATA/INPUT/{FILE_NAME}/PARTS/'
file_paths = get_file_paths(input_dir)
    
pages_map_list = []
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:  
    pages_map_list = list(tqdm(executor.map(ocr_docai, file_paths)))

merged_dict = {k: v for d in pages_map_list for k, v in d.items()}   
sorted_pages_map = dict(sorted(merged_dict.items()))

pages = []
for _, page_text in sorted_pages_map.items():
    pages.append(page_text)

73it [00:48,  1.52it/s]

CPU times: user 3.35 s, sys: 4.07 s, total: 7.42 s
Wall time: 48.3 s





Save concatenated pages as txt for later use (if needed)

In [15]:
extracted_pages = ''.join(pages)
os.makedirs(f'{LOCAL_OUTPUT_DIR}/{FILE_NAME}/VAI/', exist_ok=True)
with open(f'{LOCAL_OUTPUT_DIR}/{FILE_NAME}/VAI/{FILE_NAME}.txt', 'w') as out:
    out.write(extracted_pages)

In [16]:
def get_total_tokens(contexts: list) -> int:
    total_tokens = 0
    for context in contexts:
        n_tokens = len(encoder.encode(context))
        total_tokens += n_tokens 
    return total_tokens

In [17]:
total_tokens = get_total_tokens([extracted_pages])
logger.info(f'Total tokens in the input doc = {total_tokens}')

Total tokens in the input doc = 414095


In [18]:
def get_max_tokens_per_page(contexts: list) -> list:
    max_tokens_per_page = 0
    for context in contexts:
        n_tokens = len(encoder.encode(context))
        if n_tokens > max_tokens_per_page:
            max_tokens_per_page = n_tokens
    return max_tokens_per_page

#### Map Reduce 1

In [19]:
def get_summary_via_streaming_api(chunk: str) -> str:
    prompt = f'You are a Financial Regulations & Derivatives Expert. Summarize the following information into five brief sentences in English, capturing the essential details.\n\n{chunk}'
    headers = {
        "Authorization": f"Bearer {access_token[0]}",
        "Content-Type": "application/json; charset=utf-8"
    }
    
    data = {
        "inputs": [
            {
                "struct_val": {
                    "prompt": {
                        "string_val": [prompt]
                    }
                }
            }
        ],
        "parameters": {
            "struct_val": {
                "temperature": {"float_val": 0.0},
                "maxOutputTokens": {"int_val": 256},
                "topK": {"int_val": 40},
                "topP": {"float_val": 0.8}
            }
        }
    }
    response = requests.post(STREAMING_API_URL, headers=headers, json=data)
    content = json.loads(response.content)
    output = []

    for item in content:
        try:
            text = item['outputs'][0]['structVal']['content']['stringVal'][0]
            output.append(text)
        except Exception as e:
            logger.error(f'Content error => {content}')
    output = ''.join(output)
    return output


In [20]:
%%time 

CONTEXTS_PER_CALL = 5  # process 5 pages per API call
MAX_OUTPUT_TOKENS = 256

def reduce(contexts: list) -> list:
    partitions = []
    max_input_tokens = CONTEXT_LENGTH - MAX_OUTPUT_TOKENS
    logger.info(f'Max input tokens allowed per API call = {max_input_tokens}')
    max_tokens_per_page = get_max_tokens_per_page(contexts)
    logger.info(f'Max tokens per page = {max_tokens_per_page}')
    logger.info(f'Processing {CONTEXTS_PER_CALL} pages per API call')
    
    for i in range(0, len(contexts), CONTEXTS_PER_CALL):
        partitions.append(contexts[i: i+CONTEXTS_PER_CALL])

    chunks = []
    for partition in partitions:
        chunks.append('\n'.join(partition))

    reduced_contexts = []

    # max_workers can result in running over quota limits for invocation | current limit for text bison is 60/min
    # for our experiments, we set max_workers=4 cores without any limit breach
    with ThreadPoolExecutor(max_workers=4) as executor:  
        reduced_contexts = list(tqdm(executor.map(get_summary_via_streaming_api, chunks),  total=len(chunks)))
    return reduced_contexts


CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 7.87 µs


In [21]:
logger.info(f'Number of pages to process = {len(pages)}')
summaries = reduce(pages)
logger.info(f'Number of generated summaries = {len(summaries)}')
n_tokens = get_total_tokens(summaries)
logger.info(f'Total number of tokens in generated summaries = {n_tokens}')

Number of pages to process = 1089
Max input tokens allowed per API call = 31744
Max tokens per page = 720
Processing 5 pages per API call
100%|██████████| 218/218 [11:11<00:00,  3.08s/it]
Number of generated summaries = 218
Total number of tokens in generated summaries = 36792


In [22]:
logger.info(summaries[5])
logger.info('-' * 100)
logger.info(summaries[15])
logger.info('-' * 100)
logger.info(summaries[20])

 1. The proposal revises the calculation of the stress capital buffer requirement for large banking organizations. 
2. Both the stress test losses and dividend add-on components would be calculated using the binding common equity tier 1 capital ratio. 
3. The proposal also amends the Board's stress testing and capital plan rules to require banking organizations to project their risk-based capital ratios in their company-run stress tests and capital plans using the calculation approach that results in the binding ratios. 
4. The use of the binding approach aims to conform company-run stress tests and capital plans with the binding risk-based capital ratios in the proposed capital rule and promote simplicity. 
5. The Board invites comment on the appropriate level of risk capture for the risk-weighted assets framework and the stress capital buffer requirement.
----------------------------------------------------------------------------------------------------
 1. The proposed definition o

##### Persist internediate summaries (Map Reduce 1) to local disk

In [23]:
logger.info(f'Total number of summaries = {len(summaries)}')

Total number of summaries = 218


In [24]:
for i, summary in enumerate(summaries):
    os.makedirs(f'{LOCAL_OUTPUT_DIR}/{FILE_NAME}/VAI/MAP_REDUCE_1/', exist_ok=True)
    with open(f'{LOCAL_OUTPUT_DIR}/{FILE_NAME}/VAI/MAP_REDUCE_1/summary-{i}.txt', 'w') as f:
        f.write(summary)

#### Map Reduce 2

In [25]:
def get_summary_via_streaming_api(context: str) -> str:
    prompt = f"""For the context below, create a consolidated refined short summary with the most important pointers only.\n\n{context}\n\nDo not repeat pointers. Breakdown the summary into SECTIONS. Make it crisp and concise."""
    headers = {
        "Authorization": f"Bearer {access_token[0]}",
        "Content-Type": "application/json; charset=utf-8"
    }
    
    data = {
        "inputs": [
            {
                "struct_val": {
                    "prompt": {
                        "string_val": [prompt]
                    }
                }
            }
        ],
        "parameters": {
            "struct_val": {
                "temperature": {"float_val": 0.0},
                "maxOutputTokens": {"int_val": 4096},
                "topK": {"int_val": 40},
                "topP": {"float_val": 0.8}
            }
        }
    }
    response = requests.post(STREAMING_API_URL, headers=headers, json=data)
    content = json.loads(response.content)
    output = []

    for item in content:
        try:
            text = item['outputs'][0]['structVal']['content']['stringVal'][0]
            output.append(text)
        except Exception as e:
            logger.error(f'Content error => {content}')
    output = ''.join(output)
    return output

In [26]:
%%time 

CONTEXTS_PER_CALL = 50  # process 50 summaries per API call

def reduce(contexts: list) -> list:
    partitions = []
    max_input_tokens = CONTEXT_LENGTH - MAX_OUTPUT_TOKENS
    logger.info(f'Max input tokens allowed per API call = {max_input_tokens}')
    max_tokens_per_page = get_max_tokens_per_page(contexts)
    logger.info(f'Max tokens per page = {max_tokens_per_page}')
    logger.info(f'Processing {CONTEXTS_PER_CALL} pages per API call')
    
    for i in range(0, len(contexts), CONTEXTS_PER_CALL):
        partitions.append(contexts[i: i+CONTEXTS_PER_CALL])

    chunks = []
    for partition in partitions:
        chunks.append('\n'.join(partition))
    logger.info(f'Total number of chunks of summaries = {len(chunks)}')

    reduced_contexts = []

    with ThreadPoolExecutor(max_workers=4) as executor:  
        reduced_contexts = list(tqdm(executor.map(get_summary_via_streaming_api, chunks),  total=len(chunks)))
    return reduced_contexts

CPU times: user 9 µs, sys: 2 µs, total: 11 µs
Wall time: 14.1 µs


In [27]:
reduced_summaries = reduce(summaries)

Max input tokens allowed per API call = 31744
Max tokens per page = 278
Processing 50 pages per API call
5
100%|██████████| 5/5 [04:41<00:00, 56.35s/it] 


In [28]:
logger.info(reduced_summaries[0])
logger.info('-' * 100)
logger.info(reduced_summaries[1])
logger.info('-' * 100)
logger.info(reduced_summaries[2])
logger.info('-' * 100)

 **Section 1: Capital Requirements for Large Banking Organizations**

The banking regulators propose to revise the capital requirements for large banking organizations and those with significant trading activity. 

The proposed revisions aim to improve risk-based capital requirements, reduce complexity, enhance consistency, and facilitate effective supervisory and market assessments. 

The proposal includes replacing current requirements that use internal models for credit and operational risk with standardized approaches. 

It also replaces current market risk and credit valuation adjustment risk requirements with revised approaches. 

The proposed revisions generally align with recent changes to international capital standards issued by the Basel Committee on Banking Supervision.

**Section 2: Standardized Approach for Credit Risk**

The proposal introduces a standardized approach for credit risk applicable to large banking organizations. 

It retains many of the same definitions fro

##### Persist internediate summaries (Map Reduce 2) to local disk

In [29]:
logger.info(f'Total number of summaries after map reduce 2 = {len(reduced_summaries)}')

Total number of summaries after map reduce 2 = 5


In [30]:
for i, summary in enumerate(reduced_summaries):
    os.makedirs(f'{LOCAL_OUTPUT_DIR}/{FILE_NAME}/VAI/MAP_REDUCE_2/', exist_ok=True)
    with open(f'{LOCAL_OUTPUT_DIR}/{FILE_NAME}/VAI/MAP_REDUCE_2/summary-{i}.txt', 'w') as f:
        f.write(summary)

In [31]:
consolidated_summaries = '\n'.join(reduced_summaries)
logger.info(get_total_tokens([consolidated_summaries]))

11204


In [32]:
logger.info(consolidated_summaries)

 **Section 1: Capital Requirements for Large Banking Organizations**

The banking regulators propose to revise the capital requirements for large banking organizations and those with significant trading activity. 

The proposed revisions aim to improve risk-based capital requirements, reduce complexity, enhance consistency, and facilitate effective supervisory and market assessments. 

The proposal includes replacing current requirements that use internal models for credit and operational risk with standardized approaches. 

It also replaces current market risk and credit valuation adjustment risk requirements with revised approaches. 

The proposed revisions generally align with recent changes to international capital standards issued by the Basel Committee on Banking Supervision.

**Section 2: Standardized Approach for Credit Risk**

The proposal introduces a standardized approach for credit risk applicable to large banking organizations. 

It retains many of the same definitions fro

#### Final Consolidation

In [37]:
def get_summary_via_streaming_api(context: str) -> str:
    prompt = f"""Given the context below, combine and merge duplicate sections and pointers.\n\n{context}\nAdd SECTIONS and bullets wherever needed. Clean rewrite and re-number sections."""
    headers = {
        "Authorization": f"Bearer {access_token[0]}",
        "Content-Type": "application/json; charset=utf-8"
    }
    
    data = {
        "inputs": [
            {
                "struct_val": {
                    "prompt": {
                        "string_val": [prompt]
                    }
                }
            }
        ],
        "parameters": {
            "struct_val": {
                "temperature": {"float_val": 0.0},
                "maxOutputTokens": {"int_val": 8192},
                "topK": {"int_val": 40},
                "topP": {"float_val": 0.8}
            }
        }
    }
    response = requests.post(STREAMING_API_URL, headers=headers, json=data)
    content = json.loads(response.content)
    output = []

    for item in content:
        try:
            text = item['outputs'][0]['structVal']['content']['stringVal'][0]
            output.append(text)
        except Exception as e:
            logger.error(f'Content error => {content}')
    output = ''.join(output)
    return output

In [38]:
final_summary = get_summary_via_streaming_api(consolidated_summaries)
logger.info(final_summary)

 SECTION 1: GENERAL REQUIREMENTS
- The proposal specifies capital treatment of internal CVA risk transfers.
- Banking organizations must maintain an internal written record of each internal derivative transaction.
- If the internal risk transfer is subject to curvature risk, default risk, or the residual risk add-on, the trading desk would have to execute an external transaction with a third party.
- The proposal sets forth general requirements for the recognition of CVA hedges that would be applicable to both internal transfers of CVA risk and external CVA hedges.
- The agencies propose to introduce the concept of a trading desk and apply the proposed internal models approach at the trading desk level.
- The proposal would define trading desk as a unit of organization of a banking organization that purchases or sells market risk covered positions and satisfies three requirements.
- The proposed trading desk definition is intended to help ensure that a banking organization structures i

In [39]:
MAX_OUTPUT_TOKENS = 8192
max_input_tokens = CONTEXT_LENGTH - MAX_OUTPUT_TOKENS
logger.info(f'Max input tokens allowed per API call = {max_input_tokens}')
logger.info(f'Total tokens in final summary = {get_total_tokens([final_summary])}')

Max input tokens allowed per API call = 23808
Total tokens in final summary = 4046


##### Persist final summary to local disk

In [41]:
with open(f'{LOCAL_OUTPUT_DIR}/{FILE_NAME}/VAI/final-summary.txt', 'w') as f:
    f.write(final_summary)

##### Create a filtered summary with all the proposed changes on the `Processing of Derivative Contracts`

In [56]:
def get_summary_via_streaming_api(context: str) -> str:
    prompt = f""""Given the SUMMARY below, extract and refine all proposed changes related to the processing of derivative contracts into a separate list. Create a detailed summary with clear pointers.\n\n{context}"""
    headers = {
        "Authorization": f"Bearer {access_token[0]}",
        "Content-Type": "application/json; charset=utf-8"
    }
    
    data = {
        "inputs": [
            {
                "struct_val": {
                    "prompt": {
                        "string_val": [prompt]
                    }
                }
            }
        ],
        "parameters": {
            "struct_val": {
                "temperature": {"float_val": 0.0},
                "maxOutputTokens": {"int_val": 8192},
                "topK": {"int_val": 40},
                "topP": {"float_val": 0.8}
            }
        }
    }
    response = requests.post(STREAMING_API_URL, headers=headers, json=data)
    content = json.loads(response.content)
    output = []

    for item in content:
        try:
            text = item['outputs'][0]['structVal']['content']['stringVal'][0]
            output.append(text)
        except Exception as e:
            logger.error(f'Content error => {content}')
    output = ''.join(output)
    return output

In [57]:
proposed_changes_summary = get_summary_via_streaming_api(consolidated_summaries)
logger.info(proposed_changes_summary)

 **Section 1: Capital Requirements for Large Banking Organizations**

The banking regulators propose to revise the capital requirements for large banking organizations and those with significant trading activity. 

The proposed revisions aim to improve risk-based capital requirements, reduce complexity, enhance consistency, and facilitate effective supervisory and market assessments. 

The proposal includes replacing current requirements that use internal models for credit and operational risk with standardized approaches. 

It also replaces current market risk and credit valuation adjustment risk requirements with revised approaches. 

The proposed revisions generally align with recent changes to international capital standards issued by the Basel Committee on Banking Supervision.

**Section 2: Standardized Approach for Credit Risk**

The proposal introduces a standardized approach for credit risk applicable to large banking organizations. 

It retains many of the same definitions fro

In [58]:
MAX_OUTPUT_TOKENS = 8192
max_input_tokens = CONTEXT_LENGTH - MAX_OUTPUT_TOKENS
logger.info(f'Max input tokens allowed per API call = {max_input_tokens}')
logger.info(f'Total tokens in final summary = {get_total_tokens([proposed_changes_summary])}')

Max input tokens allowed per API call = 23808
Total tokens in final summary = 864


In [59]:
with open(f'{LOCAL_OUTPUT_DIR}/{FILE_NAME}/VAI/proposed-changes-summary.txt', 'w') as f:
    f.write(proposed_changes_summary)