In [1]:

# Import required libraries for IBM Watson Machine Learning and document processing
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from ibm_watson_machine_learning.foundation_models.extensions.langchain import WatsonxLLM

import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader

# Load environment variables
load_dotenv()


True

In [2]:

# Read the watsonx.ai credentials and the model identifier from environment
# variables (populated from a .env file by load_dotenv()).
api_key = os.getenv("API_KEY")
ibm_cloud_url = os.getenv("IBM_CLOUD_URL")
project_id = os.getenv("PROJECT_ID")
model_id = os.getenv("MODEL_ID")

# Validate every setting that is passed to Model() below. MODEL_ID is checked
# as well: without it Model() would be constructed with model_id=None.
required_env = {
    "API_KEY": api_key,
    "IBM_CLOUD_URL": ibm_cloud_url,
    "PROJECT_ID": project_id,
    "MODEL_ID": model_id,
}
missing_env = [name for name, value in required_env.items() if not value]
if missing_env:
    raise ValueError(
        "Ensure the env variables API_KEY, IBM_CLOUD_URL, PROJECT_ID, and MODEL_ID "
        f"are populated correctly. Missing or empty: {', '.join(missing_env)}"
    )

# Credentials dictionary handed to the foundation-model client.
creds = {
    "url": ibm_cloud_url,
    "apikey": api_key,
}

# Text-generation parameters.
# NOTE(review): with "greedy" decoding, TEMPERATURE and RANDOM_SEED presumably
# have no effect (they look like sampling-only settings) - confirm against the
# watsonx.ai parameter documentation.
params = {
    GenParams.DECODING_METHOD: "greedy",
    GenParams.MIN_NEW_TOKENS: 30,
    GenParams.MAX_NEW_TOKENS: 3500,
    GenParams.TEMPERATURE: 0.0,
    GenParams.REPETITION_PENALTY: 1.05,
    GenParams.RANDOM_SEED: 8888,
}

# LLM client used later to compare old and new paragraphs.
model = Model(model_id=model_id, params=params, credentials=creds, project_id=project_id)


## Milvus vector database

In [3]:
import numpy as np
from sentence_transformers import SentenceTransformer
import json
from pymilvus import (
    connections,
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection,
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Milvus connection settings, read from the environment in one pass.
# Each value is None when its variable is not set.
(
    milvus_host,
    milvus_port,
    milvus_server_pem_path,
    milvus_server_name,
    milvus_user,
    milvus_password,
) = map(
    os.environ.get,
    (
        "MILVUS_HOST",
        "MILVUS_PORT",
        "MILVUS_SERVER_PEM_PATH",
        "MILVUS_SERVER_NAME",
        "MILVUS_USER",
        "MILVUS_PASSWORD",
    ),
)


In [5]:


# Open the TLS-secured connection to Milvus under the "default" alias.
connections.connect(
    "default",
    host=milvus_host,
    port=milvus_port,
    secure=True,
    server_pem_path=milvus_server_pem_path,
    server_name=milvus_server_name,
    user=milvus_user,
    password=milvus_password,
)

COLLECTION_NAME = 'docs_new'

# When True (the previous, implicit behaviour) an existing collection is
# dropped and rebuilt from scratch, which DELETES all stored entities.
# Set to False to reuse an existing collection; previously that reuse branch
# could never run because the drop was unconditional.
RESET_COLLECTION = True

if RESET_COLLECTION and utility.has_collection(COLLECTION_NAME):
    utility.drop_collection(COLLECTION_NAME)

if utility.has_collection(COLLECTION_NAME):
    # Reuse the existing collection and make it searchable.
    collection = Collection(COLLECTION_NAME)
    collection.load()
else:
    fields = [
        # Primary key is supplied by the caller (auto_id=False), so inserted
        # rows must carry unique p_key values.
        FieldSchema(name="p_key", dtype=DataType.INT64, is_primary=True, auto_id=False),
        FieldSchema(name="doc_title", dtype=DataType.VARCHAR, max_length=128),
        FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=128),
        FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=25000),
        # dim must equal the length of the vectors produced by text_encoder
        # (384 here - confirm if the embedding model is changed).
        FieldSchema(name="text_emb", dtype=DataType.FLOAT_VECTOR, dim=384),
        FieldSchema(name="metadatas", dtype=DataType.JSON)
    ]

    schema = CollectionSchema(fields, 'docs_compare', enable_dynamic_field=True)

    collection = Collection(COLLECTION_NAME, schema, consistency_level="Strong")

    # Vector index on the embedding field; searches below use the same L2 metric.
    index_params = {
        "index_type": "IVF_FLAT",
        "metric_type": "L2",
        "params": {"nlist": 1024}
    }
    collection.create_index(field_name="text_emb", index_params=index_params)

In [6]:
# Sanity check: confirm the collection exists and report how many entities it holds.
has = utility.has_collection(COLLECTION_NAME)
print(
    f"Does collection exist in Milvus: {has}",
    f'Number of entities in collection: {collection.num_entities}',
    sep="\n",
)

Does collection exist in Milvus: True
Number of entities in collection: 0


In [7]:
# SentenceTransformer instances that have already been loaded, keyed by model
# name, so the model is loaded once instead of on every call.
_ENCODER_CACHE = {}


def _get_encoder(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it on first use only."""
    if model_name not in _ENCODER_CACHE:
        _ENCODER_CACHE[model_name] = SentenceTransformer(model_name)
    return _ENCODER_CACHE[model_name]


def text_encoder(text):
    """
    Transform a piece of text into its embedding.

    The 'all-MiniLM-L6-v2' model is loaded the first time this function is
    called and reused afterwards; previously it was re-instantiated for every
    paragraph and every query.

    args:
        text: the text to embed (passed unchanged to the model's ``encode``)
    return:
        the result of ``encode(text)`` converted with ``.tolist()``
    """
    encoder = _get_encoder('all-MiniLM-L6-v2')
    return encoder.encode(text).tolist()

In [8]:
def data_processing(file_names, data_dir="data/json", encoder=None):
    """
    Build the rows to insert into the Milvus collection from JSON files.

    Each file ``{data_dir}/{name}.json`` is expected to hold a list of dicts.
    For every dict one row is produced with:
      - "p_key": a running index that is unique across ALL files
      - "doc_title": the file name (without extension) the row came from
      - "title" / "text": copied from the dict when present
      - "text_emb": the embedding of "text"
      - "metadatas": every other key of the dict

    The primary key used to restart at 0 for each file, so rows from different
    documents shared the same p_key even though the schema declares it as the
    primary key. Keys are now unique across the whole batch.

    args:
        file_names: iterable of JSON file names without the ".json" suffix
        data_dir: directory containing the JSON files (default "data/json")
        encoder: callable mapping text to an embedding; defaults to text_encoder
    return:
        list of row dicts ready for ``collection.insert``
    """
    if encoder is None:
        encoder = text_encoder

    not_metadata = ("title", "text")
    data_rows = []
    for fn in file_names:
        # Context manager closes the file handle (it was previously leaked).
        with open(f"{data_dir}/{fn}.json", encoding="utf-8") as file:
            data = json.load(file)

        for dic in data:
            heading = {
                # Position in the combined output, hence unique across files.
                "p_key": len(data_rows),
                "doc_title": fn,
                "metadatas": {},
            }
            for k, value in dic.items():
                if k not in not_metadata:
                    heading["metadatas"][k] = value
                else:
                    heading[k] = value
                    if k == "text":
                        heading["text_emb"] = encoder(value)

            data_rows.append(heading)

    return data_rows

In [9]:
# Documents to index; each name maps to a JSON file read by data_processing.
files_names = [
    "new-3301-title",
    "new-5831-split-by-title",
]

# Turn the documents into rows (text, embedding, metadata).
data_rows = data_processing(files_names)

# Load the collection, insert the rows, then flush before counting entities.
collection.load()
collection.insert(data_rows)
collection.flush()

print(f'Number of entities in collection: {collection.num_entities}')

Number of entities in collection: 41


In [10]:
# Re-open the Milvus connection and attach to the already-populated collection.
connections.connect(
    "default",
    host=milvus_host,
    port=milvus_port,
    secure=True,
    server_pem_path=milvus_server_pem_path,
    server_name=milvus_server_name,
    user=milvus_user,
    password=milvus_password,
)

COLLECTION_NAME = 'docs_new'
collection = Collection(COLLECTION_NAME)
collection.load()  # make the collection searchable

print(f'Number of entities in collection: {collection.num_entities}')


Number of entities in collection: 41


In [22]:
# Ad-hoc similarity search: embed one sample paragraph and retrieve the closest
# stored paragraph, restricted by the doc_title filter below.

# Alternative query (warranty clause):
# query = "IBM warrants an IBM Program as specified in its license agreement, Services using reasonable care and skill, and an Appliance component for its specified operating environment."

# Active query (termination clause):
query = '''The Client Originating Company may terminate this Agreement without cause on one month's notice to
the IBM Originating Company, and the IBM Originating Company may terminate this Agreement on three
months' notice to the Client Originating Company. Once terminated, no further EPs may be acquired by
any participating Client Site under the Agreement.
If Client acquired or renewed IBM Software Subscription and Support, Selected Support, or Cloud
Services, or if Client acquired or renewed a Program's license prior to the notice of termination, IBM may
either continue to provide such services or allow Client to use the Program for the remainder of the
current term(s), or give Client a prorated refund.
The Client Originating Company will be considered to have terminated this Agreement if neither it nor any
of its participating Enterprise companies have placed orders for EPs for 24 consecutive months nor have
Software Subscription and Support or Selected Support in effect.
Either of us may terminate this Agreement if the other does not comply with any of its terms, provided the
one not complying is given written notice and reasonable time to comply.
Client agrees to promptly discontinue use of and destroy all of Client's copies of a Program upon
termination of a license grant.
Any terms that by their nature extend beyond termination remain in effect until fulfilled, and apply to
respective successors and assignees.'''

# Alternative query (order acceptance clause):
# query = """An EP is subject to this Agreement when IBM accepts Client's order by i) sending an invoice or a PoE
# including the level of authorized use, ii) making the Program or Cloud Service available, iii) shipping the
# Appliance, or iv) providing the support, service, or solution."""

# Nearest-neighbour search (L2 metric) for the single best match.
query_embedding = text_encoder(query)
result = collection.search(
    data=[query_embedding],
    anns_field="text_emb",
    param={"metric_type": "L2"},
    limit=1,
    output_fields=["doc_title", "title", "text", "metadatas"],
    expr="doc_title == 'new-5831-split-by-title'",
)

# Label printed before each returned field, in display order.
labelled_fields = (
    ("doc_title: ", "doc_title"),
    ("\ntitle: ", "title"),
    ("\ntext: ", "text"),
    ("\nMetadatas", "metadatas"),
)

for hits in result:
    print("Matched IDs: ", hits.ids)
    print("Distance to the query vector: ", hits.distances)
    print("Matched articles: ")
    for hit in hits:
        entity = hit.entity
        pieces = []
        for label, field in labelled_fields:
            pieces.extend((label, entity.get(field)))
        print(*pieces)

Matched IDs:  [2]
Distance to the query vector:  [0.9174755811691284]
Matched articles: 
doc_title:  new-3301-title 
title:  Termination 
text:  a. IBM may terminate Licensee's license to use a Program if Licensee fails to comply with the IPLA, TDs or acquisition agreements, such as the International Passport Advantage Agreement (IPAA). Licensee will promptly destroy all copies of the Program after license termination. Any terms that by their nature extend beyond the termination remain in effect until fulfilled and apply to successors and assignees. 
Metadatas {'page_num': 3}


In [None]:
# Scratch note - presumably L2 distance scores from earlier search runs (TODO confirm):
# 0.40083086490631104, 0.40911149978637695, 0.41468530893325806

In [23]:
def para_compare_prompt(new_text, old_text):
    """
    Build the instruction prompt asking the LLM to compare two paragraphs.

    The prompt is wrapped in [INST] ... [/INST] tags, embeds both paragraphs
    under "New Document:" and "Old Document:", and ends with "Differences:"
    so the answer follows the requested Differences/Similarities layout.

    args:
        new_text: paragraph taken from the new document
        old_text: paragraph taken from the old document
    return:
        the complete prompt string
    """
    # The trailing backslash on the first line joins it with the next one, so
    # the first two sentences form a single line in the prompt.
    prompt = f"""[INST] You are a lawyer representing a global company and you are needed to read a document focusing on legal terms. \
Given the following paragraphs of a New and Old document, compare the two paragraphs. Do not provide false information.

State the similarities and differences in point form in the following output format:
Differences:
Similarities:

New Document:
{new_text}

Old Document:
{old_text}

Differences: [/INST]"""

    return prompt

In [None]:
# For every paragraph of the old document: find the most similar paragraph of
# the new document in Milvus, then ask the LLM to compare the two.

filter_on = 'new-5831-split-by-title'

# NOTE(review): this path starts at "json/" while data_processing reads from
# "data/json/" - confirm which location is intended.
with open("json/old-5831-split-by-title.json") as file:
    data = json.load(file)

# Column-oriented results; every key receives exactly one entry per paragraph.
data_dict = {
    "old_text": [],
    "new_text": [],
    "distance_score": [],
    "title_from_new": [],
    "title_from_old": [],
    "llm_result": []
}

for i, dic in enumerate(data):
    old_text = dic["text"]
    old_title = dic["title"]

    result = collection.search(
        data=[text_encoder(old_text)],
        anns_field="text_emb",
        param={"metric_type": "L2"},
        limit=1,
        output_fields=["doc_title", "title", "text", "metadatas"],
        expr=f"doc_title == '{filter_on}'",
    )

    if len(result) > 0 and len(result[0]) > 0:
        top_hit = result[0][0]
        new_text = top_hit.entity.get("text")
        new_title = top_hit.entity.get("title")
        distance_score = result[0].distances
        prompt = para_compare_prompt(new_text, old_text)
        differences = model.generate_text(prompt=prompt)
    else:
        # No candidate paragraph came back (previously an IndexError): keep the
        # old paragraph in the report with empty match columns and skip the LLM.
        new_text = new_title = distance_score = differences = None
        print(f"Data point {i}: no matching paragraph found, comparison skipped.")

    # storing the data
    data_dict["old_text"].append(old_text)
    data_dict["new_text"].append(new_text)
    data_dict["distance_score"].append(distance_score)
    data_dict["title_from_new"].append(new_title)
    data_dict["title_from_old"].append(old_title)
    data_dict["llm_result"].append(differences)

    print(f"Data point {i} done!")

In [None]:
# pandas was never imported in this notebook, so `pd` raised NameError on a
# fresh kernel; import it here where it is first needed.
import pandas as pd

# One column per key of data_dict, one row per compared paragraph.
df = pd.DataFrame(data_dict)

In [None]:
# Make sure the output folder exists first; writing the workbook fails when
# the target directory is missing.
os.makedirs("./results", exist_ok=True)
df.to_excel("./results/3301_approach2_v2.xlsx")

In [30]:
# old_text = ''' The Client Originating Company (identified as the Originating Site in the IPAA Enrollment Form) and each of its participating Enterprise companies (identified as an Additional Site in the IPAA Enrollment Form) (together, the Client) accept this IPAA by submitting an IPAA Enrollment Form to IBM or Client's chosen IBM Business Partner.
# b. The IBM Originating Company that accepts the Client Originating Company's orders and the Client Originating Company agree to coordinate the activities of their own Enterprise under this IPAA. The Client Originating Company is responsible for compliance with the terms by all Client Sites assigned a Passport Advantage Site Number (each, a Site) under this IPAA.
# c. Enterprise means the set of legal entities that, by more than 500%, owns, are owned by, or are under common ownership with the Client Originating Company.
# d. This IPAA is effective on the date IBM accepts the initial order under this IPAA and remains in effect until the Client Originating Company or the IBM Originating Company terminates it as described in this IPAA. IBM accepts Client's orders by: i) sending a TD that includes the level of authorized use; ii) making a Program or Cloud Service available; iii) shipping an Appliance; or iv) providing the Services.
# e. IBM warrants that the Program, when used in its specified operating environment, will conform to its specifications.'''
# old_title = "Agreement Termination"
# filter_on = 'new-5831-split-by-title'

# result = collection.search(
#     data=[text_encoder(old_text)],
#     anns_field="text_emb",
#     param = {"metric_type": "L2"},
#     limit=1,
#     output_fields=["doc_title", "title", "text", "metadatas"],
#     expr=f"doc_title == '{filter_on}'",
# )
# print(f"Distance score: {result[0].distances}")
# # print(f"Title/Heading: {result[0][0].entity.get("title")}")
# new_text = result[0][0].entity.get("text")
# prompt = para_compare_prompt(new_text, old_text)
# differences = model.generate_text(prompt=prompt)
# print(differences)

# # storing the data
# # data_dict["old_text"].append(old_text)
# # data_dict["new_text"].append(new_text)
# # data_dict["distance_score"].append(result[0].distances)
# # data_dict["title_from_new"].append(result[0][0].entity.get("title"))
# # data_dict["title_from_old"].append(old_title)
# # data_dict["llm_result"].append(differences)

 - The "New Document" does not include the warranty provision for the Program that was included in the "Old Document" (point e).
- In the "New Document", the definition of Enterprise now states that it includes the set of legal entities that own, are owned by, or are under common ownership with the Client Originating Company by more than 50% (previously it was more than 500%).

Similarities:
- Both documents define the Client and how they accept the IPAA.
- Both define the role of IBM and the Client Originating Company in coordinating activities.
- Both define the duration of the IPAA.
- Both list the ways IBM accepts client orders.
- Both define the responsibility of the Client Originating Company for compliance with the terms by all Client Sites.


In [None]:
#  Differences:

# * The new document contains detailed provisions about the use of Enterprise Products (EPs) and Non-IBM EPs, while the old document focuses on the termination of the Agreement and post-termination obligations.
# * The new document includes provisions regarding the responsibilities of the parties, confidentiality, business conduct guidelines, and use of business contact and account usage information. These topics are not addressed in the old document.
# * The new document provides for specific notice periods for withdrawal of certain offerings and termination of the Agreement, while the old document only mentions the notice periods for termination without cause and for termination due to non-compliance.
# * The new document includes provisions regarding the use of third-party services and the transfer of Content, including personally identifiable information, across country borders. These topics are not addressed in the old document.

# Similarities:

# * Both documents allow the Client to terminate the Agreement without cause on notice and provide for the termination of the Agreement due to non-compliance.
# * Both documents provide for the continuation of certain services or use of the Program for the remainder of the current term(s) in case of termination of Software Subscription and Support or Selected Support.
# * Both documents provide for the discontinuation of use of and destruction of Programs upon termination of a license grant.
# * Both documents provide that any terms that by their nature extend beyond termination remain in effect until fulfilled.

In [None]:

# Placeholder for converting comparison results to DataFrame and displaying them

# Example:
# import pandas as pd
# results_data = {
#     "Old Text": [old_text],
#     "New Text": [new_text],
#     "Comparison Result": [comparison_results]
# }
# df = pd.DataFrame(results_data)
# df


In [None]:

# Placeholder for exporting results to Excel

# Example:
# df.to_excel("comparison_results.xlsx")

# Remember to replace the placeholder with your actual code to export the DataFrame.
