# Long Form Text Summarization using JumpStart Foundation Model  

In [3]:
%%capture 

!pip install cohere_sagemaker
!pip install pandas
!pip install tqdm

In [None]:
from sagemaker import get_execution_role
from cohere_sagemaker import CohereError
from cohere_sagemaker import Client
from sagemaker import ModelPackage
import cohere_sagemaker
from tqdm import tqdm
import pandas as pd
import numpy as np
import sagemaker
import logging
import boto3
import time
import re
import json

## Part - I: Hosting the foundation model for real-time inference 

#### I. Imports 

#### Setup Logging 

In [5]:
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [6]:
logger.info(f'[Using SageMaker version: {sagemaker.__version__}]')
logger.info(f'[Using Boto3 version: {boto3.__version__}]')

[Using SageMaker version: 2.145.0]
[Using Boto3 version: 1.26.111]


#### II. Setup essentials 

Mapping for Model Packages (initially only us-east-1 and eu-west-1 is supported)

In [7]:
model_package_map = {
    'us-east-1': 'arn:aws:sagemaker:us-east-1:865070037744:model-package/cohere-gpt-medium-v1-4-825b877abfd53d7ca65fd7b4b262c421',
    'eu-west-1': 'arn:aws:sagemaker:eu-west-1:985815980388:model-package/cohere-gpt-medium-v1-4-825b877abfd53d7ca65fd7b4b262c421'
}

In [8]:
region = boto3.Session().region_name
logger.info(f'Region = {region}')

Region = us-east-1


In [9]:
if region not in model_package_map.keys():
    raise Exception(f'Unsupported region = {region}')

In [10]:
MODEL_PACKAGE_ARN = model_package_map[region]
logger.info(f'Model package ARN = {MODEL_PACKAGE_ARN}')

Model package ARN = arn:aws:sagemaker:us-east-1:865070037744:model-package/cohere-gpt-medium-v1-4-825b877abfd53d7ca65fd7b4b262c421


In [11]:
ROLE = get_execution_role()
session = sagemaker.Session()
logger.info(f'Role = {ROLE}')

Role = arn:aws:iam::571744842822:role/service-role/AmazonSageMaker-ExecutionRole-20230311T222368


In [12]:
timestamp = int(time.time())
MODEL_NAME = f'cohere-medium-{timestamp}'

#### III. Create a SageMaker endpoint for real-time inference 

In [13]:
model = ModelPackage(role=ROLE, 
                     model_package_arn=MODEL_PACKAGE_ARN, 
                     sagemaker_session=session, 
                     name=MODEL_NAME)
model.__dict__

{'model_data': None,
 'image_uri': None,
 'predictor_cls': None,
 'env': {},
 'name': 'cohere-medium-1682210141',
 '_base_name': None,
 'sagemaker_session': <sagemaker.session.Session at 0x7f651f94b750>,
 'role': 'arn:aws:iam::571744842822:role/service-role/AmazonSageMaker-ExecutionRole-20230311T222368',
 'vpc_config': None,
 'endpoint_name': None,
 '_is_compiled_model': False,
 '_compilation_job_name': None,
 '_is_edge_packaged_model': False,
 'inference_recommender_job_results': None,
 'inference_recommendations': None,
 '_enable_network_isolation': False,
 'model_kms_key': None,
 'image_config': None,
 'entry_point': None,
 'source_dir': None,
 'dependencies': [],
 'git_config': None,
 'container_log_level': 20,
 'bucket': None,
 'key_prefix': None,
 'uploaded_code': None,
 'repacked_model_data': None,
 'algorithm_arn': None,
 'model_package_arn': 'arn:aws:sagemaker:us-east-1:865070037744:model-package/cohere-gpt-medium-v1-4-825b877abfd53d7ca65fd7b4b262c421',
 '_created_model_packag

In [14]:
NUM_INSTANCES = 1
INSTANCE_TYPE = 'ml.g5.xlarge'

In [15]:
%%time

model.deploy(NUM_INSTANCES, 
             INSTANCE_TYPE, 
             endpoint_name=MODEL_NAME)

Creating model with name: cohere-medium-1682210141
CreateModel request: {
    "ModelName": "cohere-medium-1682210141",
    "ExecutionRoleArn": "arn:aws:iam::571744842822:role/service-role/AmazonSageMaker-ExecutionRole-20230311T222368",
    "Containers": [
        {
            "ModelPackageName": "arn:aws:sagemaker:us-east-1:865070037744:model-package/cohere-gpt-medium-v1-4-825b877abfd53d7ca65fd7b4b262c421"
        }
    ],
    "EnableNetworkIsolation": true
}
Creating endpoint-config with name cohere-medium-1682210141
Creating endpoint with name cohere-medium-1682210141


----------!CPU times: user 191 ms, sys: 9.7 ms, total: 201 ms
Wall time: 5min 33s


## Part 2: Long-form abstractive text summarization of legal judgement docs

#### I. Read, parse and chunk docs 

In [16]:
doc_name = '5.txt'

In [17]:
cleaned_lines = []
with open(f'./docs/{doc_name}', encoding='iso-8859-1') as doc:
    for line in doc.readlines():
        line = line.strip()
        line = re.sub(' +', ' ', line)
        line = line.replace('\n', '')
        line = line.replace('\t', '')
        line = line.replace('  ', ' ')
        if len(line) > 0:
            cleaned_lines.append(line)

In [18]:
doc = ' '.join(cleaned_lines)
doc = doc.split()
len(doc)

4168

In [19]:
chunk_size = 768
chunks = [' '.join(doc[i:i+chunk_size]) for i in range(0, len(doc), chunk_size)]

In [20]:
len(chunks)

6

In [21]:
chunks[0]

'Civil Appeal No. 8 of 1951. Appeal from the judgment and decree dated 12th October, 1944, of the High Court of Judicature at Allahabad (Allsop and Malik JJ.)in First Appeal No. 374 of 1941 arising out of a Decree dated 31st July, 1941, of the Court of the Civil Judge, Moradabad, in Original Suit No. 9 of 1941. Bakshi Tek Chand (section K. Kapoor, with him) for the appel lant. Achhru Ram (Jwala Prasad, with him) for the respondent. February 22. The judgment of the Court was deliv ered by BoSE J. This is a litigation between two branches of a family whose common ancestor was one Megh Raj Singh The family tree is as follows: Megh Raj Singh Jawahar Singh Madan Singh Shankar Lal(d 1884) Brijlal (d. 1889 or (1890) Daughter: Met. Mohan Dei (d. Oct 1929) Kishan Lal Mahabir Prasad Husband: Narain Das (d. 21 5 1940) (d. 1921) Shri Kishan Das Mst. Deoki Jugal Kishore Amar Nath (d.march 1929) (d. 1894) Plff. 1 Plff.2. Dhiyan Singh Jai Bhagwan Singh Deft. 1 Deft. 2 Ghas Ram Onkar Prasad The disput

#### II. Short-form Abstractive Text Summarization

In [22]:
ENDPOINT_NAME = 'cohere-medium-1682210141'
# ENDPOINT_NAME = MODEL_NAME

In [26]:
client = Client(endpoint_name=ENDPOINT_NAME)

In [27]:
summaries_by_chunks = []

In [28]:
%%time


for chunk in tqdm(chunks):
    prompt = f'Context = {chunk}\nSummarize the above context.'
    response = client.generate(prompt=prompt, 
                           max_tokens=256, 
                           temperature=0.2, 
                           return_likelihoods='GENERATION')
    generated_text = response.generations[0].text
    summaries_by_chunks.append(generated_text)


100%|██████████| 6/6 [00:43<00:00,  7.33s/it]

CPU times: user 38.3 ms, sys: 11 ms, total: 49.3 ms
Wall time: 44 s





In [29]:
cleaned_summaries = []
STOP_SEQ = '. '

In [30]:
def clean_summary(summary):
    valid_sents = []
    sents = summary.split(STOP_SEQ)
    last_sent = sents[-1]
    if not last_sent.endswith('.'):
        sents = sents[0:-2]
    return ' '.join(sents)

In [31]:
for summary in summaries_by_chunks:
    summary = summary.replace('\n', '')
    summary = summary.replace('  ', ' ')
    summary = summary.replace('\'', '')
    summary = summary.strip()
    cleaned_summary = clean_summary(summary)
    if not cleaned_summary.endswith('.'):
        cleaned_summary = cleaned_summary + '.'
    if len(cleaned_summary) >= 64:  # atleast 64 chars
        cleaned_summaries.append(cleaned_summary)

In [32]:
logger.info(f'Total number of short summaries generated = {len(cleaned_summaries)}')

Total number of short summaries generated = 6


In [33]:
cleaned_summaries

['The context is a civil appeal filed by two branches of a family whose common ancestor was Megh Raj Singh The family tree is as follows: Megh Raj Singh Jawahar Singh Madan Singh Shankar Lal(d 1884) Brijlal (d 1889 or (1890) Daughter: Met Mohan Dei (d Oct 1929) Kishan Lal Mahabir Prasad Husband: Narain Das (d 21 5 1940) (d 1921) Shri Kishan Das Mst Deoki Jugal Kishore Amar Nath (d.march 1929) (d 1894) Plff 1 Plff.2 Dhiyan Singh Jai Bhagwan Singh Deft 1 Deft 2 Ghas Ram Onkar Prasad The dispute is about property which, according to the plaintiffs, formed part of Shanker Lal s estate The plain tiffs state that the two branches of the family were separate at all material times; that on 480 Shanker Lal s death in 1884 his daughter Mst.',
 'The context is a legal case where the parties are Brijlal, Shanker Lal, Kishan Lal, and Mohan Dei Brijlal claimed that he was joint with Shanker Lal and so, on Shanker Lal s death, he became entitled to a share of the property Shanker Lals death occurred 

#### III. Question Generation

In [34]:
questions_map = {}
total_questions_generated = 0

In [35]:
detect_words = ['why', 'how', 'what', 'who', 'where', 'is', 'when', 'which', 'whose', 'are', 'do', 'does', 'can', 'could', 'should', 'will', 'have', 'has']

def is_a_question(question):
    first_word = question.split()[0]
    if first_word.lower() in detect_words:
        return True
    return False

In [36]:
%%time

for summary in tqdm(cleaned_summaries):
        prompt = f"""EXTRACT QUESTIONS
        Context: 
        {summary}
        Questions:
        """
        try:
            response = client.generate(prompt=prompt, 
                                   max_tokens=512, 
                                   temperature=0, 
                                   return_likelihoods='GENERATION')
            generated_text = response.generations[0].text
            questions = generated_text.split('\n')
            cleaned_questions = set()
            for question in questions:
                if len(question) > 5:
                    question = re.sub(r'\d+\.', '', question)
                    question = question.replace('Q:', '')
                    question = question.strip()
                    if is_a_question(question) is True:
                        cleaned_questions.add(question)
            total_questions_generated += len(cleaned_questions)
            questions_map[summary] = cleaned_questions
        except Exception:
            pass

100%|██████████| 6/6 [01:22<00:00, 13.70s/it]

CPU times: user 33.2 ms, sys: 4.66 ms, total: 37.9 ms
Wall time: 1min 22s





In [37]:
logger.info(f'Total questions generated = {total_questions_generated}')

Total questions generated = 14


#### 4. Abstractive Question & Answering

In [38]:
%%time

qa_pairs = []

for context, questions in tqdm(questions_map.items()):
    for question in questions:
        prompt = f"""Context = {context}
        Question = {question}
        Answer = 
        """
        try:
            response = client.generate(prompt=prompt, 
                               max_tokens=128, 
                               temperature=0, 
                               return_likelihoods='GENERATION')

            generated_text = response.generations[0].text
            answer = generated_text.strip()
            qa_pairs.append((doc_name, context, question, answer))
        except Exception:
            pass

100%|██████████| 6/6 [00:48<00:00,  8.04s/it]

CPU times: user 62.2 ms, sys: 792 µs, total: 63 ms
Wall time: 48.3 s





#### 5. Combine short summaries into a long form summary

In [39]:
long_form_summary = []
for short_summary in cleaned_summaries:
    short_summary = short_summary.replace('\'', '')
    long_form_summary.append(short_summary)
long_form_summary = '\n\n'.join(long_form_summary)

#### 6. Write long form summary and QA pairs to disk

In [40]:
with open(f'./summaries/summary_5.txt', 'w') as out:
    out.write(long_form_summary)

#### 7. Write QA pairs to disk

In [41]:
df = pd.DataFrame(qa_pairs, columns=['doc_name', 'short_summary', 'question', 'answer'])
df.head()

Unnamed: 0,doc_name,short_summary,question,answer
0,5.txt,The context is a civil appeal filed by two bra...,What is the legal status of a civil appeal fil...,A civil appeal filed by two branches of a fami...
1,5.txt,The context is a civil appeal filed by two bra...,What is the legal status of a civil appeal fil...,A civil appeal filed by two branches of a fami...
2,5.txt,The context is a civil appeal filed by two bra...,What is the legal status of a civil appeal?,A civil appeal is a type of lawsuit in which t...
3,5.txt,The context is a legal case where the parties ...,What was the outcome?,"Shanker Lal left the property to his son, Kish..."
4,5.txt,The context is a legal case where the parties ...,Who were the parties?,"Brijlal, Shanker Lal, Kishan Lal, and Mohan De..."


In [42]:
df.to_csv('./qa_pairs/qa_pairs_5.csv', index=False)