In [None]:

# Import required libraries for IBM Watson Machine Learning and document processing
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from ibm_watson_machine_learning.foundation_models.extensions.langchain import WatsonxLLM


import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader
# Load environment variables
load_dotenv()

In [None]:
# Pull the watsonx.ai connection settings out of the process environment
# (load_dotenv() was called in the import cell). Unset variables come back as None.
api_key, ibm_cloud_url, project_id, model_id = (
    os.environ.get(setting_name)
    for setting_name in ("API_KEY", "IBM_CLOUD_URL", "PROJECT_ID", "MODEL_ID")
)

# Bare last expression so the notebook echoes which model id was picked up.
model_id

In [None]:


# Validate that every setting needed to build the model is present.
# MODEL_ID is checked as well: it is passed straight to Model() below, so a
# missing value should fail here with a clear message instead of inside the SDK.
required_settings = {
    "API_KEY": api_key,
    "IBM_CLOUD_URL": ibm_cloud_url,
    "PROJECT_ID": project_id,
    "MODEL_ID": model_id,
}
missing_settings = [name for name, value in required_settings.items() if not value]
if missing_settings:
    raise ValueError(
        "Ensure the env variables API_KEY, IBM_CLOUD_URL, PROJECT_ID, and MODEL_ID "
        f"are populated correctly. Missing or empty: {', '.join(missing_settings)}"
    )

# Credentials dictionary handed to the Model constructor.
creds = {
    "url": ibm_cloud_url,
    "apikey": api_key,
}

# Text-generation parameters shared by every generate_text() call in this notebook.
# NOTE(review): with "greedy" decoding, TEMPERATURE and RANDOM_SEED presumably
# have no effect -- confirm against the watsonx.ai parameter documentation.
params = {
    GenParams.DECODING_METHOD: "greedy",
    GenParams.MIN_NEW_TOKENS: 30,
    GenParams.MAX_NEW_TOKENS: 3500,
    GenParams.TEMPERATURE: 0.0,
    GenParams.REPETITION_PENALTY: 1.05,
    GenParams.RANDOM_SEED: 8888,
}

model = Model(model_id=model_id, params=params, credentials=creds, project_id=project_id)


In [None]:

# Function to create a summarization prompt for legal documents
def summarize_prompt(content):
    """Build the instruction prompt asking the model to summarize a legal document.

    The instruction block is opened with ``[INST]`` and closed with ``[/INST]``
    so that it is delimited the same way as the prompt built by
    ``compare_prompt``; the original left the block unclosed.

    Args:
        content: Full text of the document to summarize.

    Returns:
        The prompt string, ending with ``Summary: [/INST]``.
    """
    prompt = f"""[INST] You are a lawyer representing a global company and you are needed to read a document focusing on legal terms. \
Summarize the following document into point form while retaining needed information. Split it into sections and subpoints. Do not provide false information.

Example of summary format:
1. 'Section 1'
 - subpoint
 - subpoint

2. 'Section 2'
 - subpoint
 - subpoint
 
Document:
{content}

Summary: [/INST]"""
    return prompt


In [None]:
# Extract the text of the new and old versions of the document to be compared.
# Both versions go through the same helper so they are extracted identically:
# previously only the new document had a newline appended after each page,
# while the old one ran the last word of a page into the first word of the next.
def _pages_to_text(reader):
    """Concatenate the extracted text of every page in `reader`, one newline after each page."""
    return "".join(page.extract_text() + "\n" for page in reader.pages)


new_loader = PdfReader("./data/pdf/5831_new.pdf")
content_new = _pages_to_text(new_loader)

old_loader = PdfReader("./data/pdf/5831_old.pdf")
content_old = _pages_to_text(old_loader)

In [None]:

# Summarize the old (reference) document: build the prompt, send it to the
# model, and show the generated summary.
q1 = summarize_prompt(content=content_old)
summary_old = model.generate_text(prompt=q1)
print(summary_old)

In [None]:
# Summarize the new document: build the prompt, send it to the model, and
# show the generated summary.
q2 = summarize_prompt(content=content_new)
summary_new = model.generate_text(prompt=q2)
print(summary_new)

In [None]:
# Define a template to compare two summaries
def compare_prompt(summary_1, summary_2):
    """Build the instruction prompt asking the model to diff two document summaries.

    Args:
        summary_1: Summary placed under the "New Document:" heading.
        summary_2: Summary placed under the "Old Document:" heading.

    Returns:
        The prompt string, wrapped in ``[INST] ... [/INST]`` and ending with
        ``Differences: [/INST]``.
    """
    # Fixed instruction text; adjacent literals are concatenated into one paragraph.
    instructions = (
        "[INST] You are a lawyer representing a global company and you are needed to read a document focusing on legal terms. "
        "Given the following summaries of a New and Old document, compare the two summaries. "
        "Lets think step by step and list out the sections that are different and be specific in stating what are the differences. "
        "Do not provide false information. "
        "Output in the same format with comparison with the old document."
    )
    # Each labelled section is separated from the previous one by a blank line.
    sections = [
        instructions,
        f"New Document:\n{summary_1}",
        f"Old Document:\n{summary_2}",
        "Differences: [/INST]",
    ]
    return "\n\n".join(sections)

In [None]:
# Ask the model to list the differences between the two summaries
# (new document first, old document second) and show its answer.
q = compare_prompt(summary_1=summary_new, summary_2=summary_old)
differences = model.generate_text(prompt=q)
print(differences)

## Generating an Excel file for comparison

In [None]:
import pandas as pd

# Summarize every document and export the raw text plus generated summary
# to a spreadsheet, one row per document.
docs = ["3301_new", "3301_old", "5831_new", "5831_old"]

contents = []
generated_summ = []
for doc in docs:
    loader = PdfReader(f"./data/pdf/{doc}.pdf")
    content = "".join(page.extract_text() for page in loader.pages)
    contents.append(content)

    q = summarize_prompt(content)
    summary = model.generate_text(prompt=q)
    generated_summ.append(summary)

# The "content" column must come from `contents` (the per-document list).
# Assigning the loop variable `content` would broadcast the last document's
# text to every row.
df = pd.DataFrame(
    {
        "document": docs,
        "content": contents,
        "summary": generated_summ,
    }
)

# Make sure the output folder exists so the export does not fail on a fresh checkout.
os.makedirs("./results", exist_ok=True)
df.to_excel("./results/results.xlsx")

In [None]:
# Spot-check: summarize a single document from the list (index 2 -> "5831_new").
docs = ["3301_new", "3301_old", "5831_new", "5831_old"]

# Relative path, matching every other cell; the original "/data/pdf/..." pointed
# at the filesystem root instead of the project's data folder.
loader = PdfReader(f"./data/pdf/{docs[2]}.pdf")
content = "".join(page.extract_text() for page in loader.pages)

q = summarize_prompt(content)
summary = model.generate_text(prompt=q)
print(summary)