In [None]:
# Install pdfplumber and faiss-cpu into the runtime. Neither is imported directly
# in this notebook; presumably new_preprocess_vectordb / main.py need them — TODO confirm.
!pip install pdfplumber faiss-cpu

In [None]:
# Run the standalone main.py script on a single sample PDF.
# NOTE(review): main.py is not shown in this notebook; its behavior cannot be confirmed from here.
!python main.py "dataset__0__5.pdf"

2025-01-14 02:10:21.828010: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-14 02:10:21.852794: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-14 02:10:21.860009: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
from new_preprocess_vectordb import extract_data2, chunk_text, create_vector_db, search_vector_db

In [6]:
from openai import OpenAI
import os

# Read the API key from the OPENAI_API_KEY environment variable so a real key
# never has to be pasted into (and saved with) the notebook. The original
# placeholder is kept only as a fallback for backward compatibility.
openAI_key = os.environ.get("OPENAI_API_KEY", "your-openai-key")
client = OpenAI(api_key=openAI_key)

In [35]:
def create_prompt(context, field):
    """Build the extraction prompt for one contract field.

    Args:
        context (str): Contract text to embed in the prompt.
        field (str): "Effective Date" or "Expiration Date" for date extraction;
            any other value is treated as a request for the contracting parties.

    Returns:
        str: The prompt text. It ends with the expected answer format:
        ``dd/mm/yyyy`` for the two date fields, a two-element list for parties.
    """
    if field == "Effective Date":
        prompt_field = "Effective Date or Start Date"
    elif field == "Expiration Date":
        prompt_field = "Expiration Date or End Date"
    else:
        # Fallback: every other field name is handled as a parties query.
        prompt_field = "Companies or Organisations or Corporations"

    prompt = f"""
    You are an expert contract analyst. Extract the {prompt_field} from the following contract text.
    Contract Text:
    {context}

    Respond with ONLY and ONLY {prompt_field} with the format specified below and nothing else.
    The output should contain only the given format. If the {prompt_field} is not found reply with N/A.
    """
    if field == "Expiration Date":
        # The trailing newline + indent keeps the format line that follows on
        # its own line instead of glued to the end of this hint sentence.
        prompt += (
            'Sometimes Expiration Date or End Date may not be explicitly given but it can be written for e.g '
            '"The contract expires 2 years from the effective date", which would mean Expiration Date is equal '
            'to Effective date plus 2 years which you would have to figure out.'
            "\n    "
        )

    if field in ("Effective Date", "Expiration Date"):
        prompt += "Format: dd/mm/yyyy"
    else:
        prompt += 'Format:["Party 1", "Party 2"]'
    return prompt

In [40]:
def get_llm_response(prompt):
    """Send ``prompt`` as a single user message to gpt-4-turbo.

    Uses the module-level ``client`` and returns the message content of the
    first choice in the completion.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [12]:
import json
import os
import re
from openai import OpenAI


def extract_fields(pdf_path, pdf_dir="/content/pdfs/", db_dir="/content/vectorDB/", max_retries=5):
    """Extract the effective date, expiration date and parties from one PDF.

    The PDF is read with ``extract_data2``, chunked with ``chunk_text`` and
    indexed with ``create_vector_db``. For each field the index is queried,
    the retrieved chunk texts are joined into a prompt, and the LLM is asked
    up to ``max_retries`` times until it returns something other than "N/A".

    Args:
        pdf_path (str): File name of the PDF, relative to ``pdf_dir``.
        pdf_dir (str): Directory containing the PDF.
        db_dir (str): Directory under which the per-document vector DB is stored.
        max_retries (int): Maximum number of LLM attempts per field.

    Returns:
        dict: ``{"Effective Date": {"response": ...}, "Expiration Date": {...},
        "Parties": {...}}``. A field that never yields an answer gets "-".
    """
    llm_fields = {"Effective Date": None, "Expiration Date": None, "Parties": None}

    data = extract_data2(os.path.join(pdf_dir, pdf_path))
    chunks = chunk_text(data)
    # splitext drops only the final extension, so names that contain dots
    # (e.g. "a.b.pdf") keep their full stem and do not collide with "a.pdf".
    db_path = os.path.join(db_dir, os.path.splitext(pdf_path)[0])
    create_vector_db(chunks, db_path)

    # Retrieval queries, positionally paired with the keys of llm_fields.
    db_fields = ["Effective Date Start Date from", "Expire Date End Date to", "Company"]
    for db_field, llm_field in zip(db_fields, llm_fields.keys()):
        best_response = "-"
        for _ in range(max_retries):
            print(f"Searching for '{db_field}' in the document...")
            results = search_vector_db(db_path, db_field)
            combined_context = "\n".join(chunk["text"] for chunk in results)
            prompt = create_prompt(combined_context, llm_field)
            response = get_llm_response(prompt)
            print(response)
            # Normalise before comparing: the model may pad the answer with
            # whitespace, and the API can return None as message content.
            answer = (response or "").strip()
            # Inspect the answer before giving up, so the response from the
            # final attempt is used rather than thrown away.
            if answer and answer != "N/A":
                best_response = answer
                break
        llm_fields[llm_field] = {"response": best_response}
    return llm_fields



In [10]:
import os
import time
# Directory whose entries are listed below to obtain the documents to process.
folder_path = "/content/pdfs/"

def natural_key(f):
    """Return a sort key that orders embedded numbers numerically.

    The name is split into alternating text and digit runs; digit runs become
    ints and text runs are lower-cased, so "file2" sorts before "file10".

    Args:
        f (str): File name (or any string) to build a key for.

    Returns:
        list: Alternating ``str`` / ``int`` components.
    """
    # isdecimal() matches exactly the characters that \d captures and int()
    # accepts. isdigit() also accepts characters such as superscript digits,
    # which int() rejects with ValueError.
    return [int(text) if text.isdecimal() else text.lower()
            for text in re.split(r'(\d+)', f)]

# Keep only PDF files: any other directory entry (sub-folders, stray files)
# would otherwise be handed to extract_fields as if it were a PDF.
file_names = sorted(
    (name for name in os.listdir(folder_path) if name.lower().endswith(".pdf")),
    key=natural_key,
)


In [41]:
import time
# Run field extraction over every document, timing each one.
json_file = []                 # one {pdf_name: extracted_fields} dict per document
time_taken_per_document = []   # wall-clock seconds per document
for pdf_name in file_names:
    start_time = time.time()
    extracted_data = extract_fields(pdf_name)
    end_time = time.time()
    time_taken = end_time - start_time
    time_taken_per_document.append(time_taken)

    json_file.append({pdf_name: extracted_data})
    print(f"Time taken for {pdf_name}: {time_taken:.2f} seconds")

# Guard the average: with no input documents the division would raise
# ZeroDivisionError before the (empty) results file is written.
if time_taken_per_document:
    average_time = sum(time_taken_per_document) / len(time_taken_per_document)
    print(f"\nAverage time taken per document: {average_time:.2f} seconds")
else:
    average_time = 0.0
    print("\nNo documents were processed.")

# Merge the per-document dicts into a single mapping keyed by PDF file name.
output_json = {}
for item in json_file:
    output_json.update(item)

# Persist the predictions to data_pred.json in the current working directory.
with open("data_pred.json", "w") as outfile:
    json.dump(output_json, outfile, indent=4)

Searching for 'Effective Date Start Date from' in the document...
05/07/2024
Searching for 'Expire Date End Date to' in the document...
30/06/2025
Searching for 'Company' in the document...
["Borden & Remington Corp.", "Barnstable County"]
Time taken for dataset__0__2.pdf: 13.26 seconds
Searching for 'Effective Date Start Date from' in the document...
20/05/2024
Searching for 'Expire Date End Date to' in the document...
N/A
Searching for 'Expire Date End Date to' in the document...
20/05/2029
Searching for 'Company' in the document...
["City of Joplin, Missouri", "TAMKO Building Products LLC"]
Time taken for dataset__0__3.pdf: 6.21 seconds
Searching for 'Effective Date Start Date from' in the document...
01/03/2023
Searching for 'Expire Date End Date to' in the document...
31/01/2024
Searching for 'Company' in the document...
["Board of County Commissioners", "Fishbeck"]
Time taken for dataset__0__4.pdf: 10.04 seconds
Searching for 'Effective Date Start Date from' in the document...
01

In [42]:
def _load_json(path):
    """Open ``path`` for reading and return the parsed JSON content."""
    with open(path, "r") as handle:
        return json.load(handle)


# Predictions and ground-truth annotations used by the scoring cells below.
pred = _load_json("/content/data_pred.json")
gt = _load_json("/content/master.json")

In [11]:
import ast
import nest_asyncio
# NOTE(review): presumably enables nested asyncio event loops inside the notebook
# kernel; nothing visible in this notebook uses asyncio directly — confirm it is still needed.
nest_asyncio.apply()

In [43]:
# Exact-match scoring of the two date fields against the ground truth.
# Every other ground-truth key ("rects", "Parties", anything unexpected) is
# skipped explicitly, so no field is ever scored with a stale prediction.
DATE_FIELDS = ("Effective Date", "Expiration Date")
# Free-text reply that is counted as equivalent to a ground truth of "-".
NO_EXPIRY_REPLY = "The contract text does not provide a specific Expiration Date or End Date."

total_count = 0
exact_matches = 0
for pdf_id, gt_data in gt.items():
    # Predictions are keyed by file name, ground truth by the bare id.
    pred_data = pred.get(pdf_id + ".pdf", {})
    for field, gt_value in gt_data.items():
        if field not in DATE_FIELDS:
            continue

        gt_ans = gt_value["response"]
        # A missing document or field yields None, which counts as a mismatch
        # instead of raising when the absent entry is indexed.
        pred_value = pred_data.get(field) or {}
        pred_ans = pred_value.get("response")

        print(f"""gt_ans: {gt_ans} | pred_value = {pred_ans}""")
        total_count += 1
        if gt_ans == pred_ans or (gt_ans == '-' and pred_ans == NO_EXPIRY_REPLY):
            exact_matches += 1

gt_ans: 01/07/2024 | pred_value = 05/07/2024
gt_ans: 30/06/2025 | pred_value = 30/06/2025
gt_ans: 20/05/2024 | pred_value = 20/05/2024
gt_ans: 30/06/2029 | pred_value = 20/05/2029
gt_ans: 01/03/2023 | pred_value = 01/03/2023
gt_ans: 31/01/2024 | pred_value = 31/01/2024
gt_ans: 01/07/2021 | pred_value = 01/07/2021
gt_ans: 30/06/2022 | pred_value = 30/06/2022
gt_ans: 29/03/2024 | pred_value = 29/03/2024
gt_ans: 31/12/2024 | pred_value = 31/12/2024
gt_ans: 28/03/2022 | pred_value = 19/05/2021
gt_ans: 31/03/2023 | pred_value = 31/03/2023
gt_ans: 01/12/2022 | pred_value = 11/01/2022
gt_ans: 30/11/2025 | pred_value = 30/11/2025
gt_ans: 01/11/2021 | pred_value = 01/11/2021
gt_ans: 31/10/2024 | pred_value = 31/10/2024
gt_ans: 24/01/2022 | pred_value = 24/01/2022
gt_ans: 23/01/2024 | pred_value = 23/01/2024
gt_ans: 16/08/2010 | pred_value = 16/08/2010
gt_ans: - | pred_value = 3/13/2011
gt_ans: 22/02/2013 | pred_value = 22/02/2013
gt_ans: - | pred_value = -
gt_ans: 12/05/2012 | pred_value = 12/0

In [44]:
# Number of scored date fields whose prediction matched the ground truth.
exact_matches

35

In [53]:
# Accuracy over the date fields that were actually compared. A hard-coded
# denominator silently goes stale whenever the dataset size changes.
exact_matches / total_count if total_count else 0.0

0.7

In [25]:
def evaluate_with_llm(input, original_text, generated_text, criteria, evaluation_steps):
    """
    Ask an LLM to score a generated text against the expected text.

    Args:
        input (str): The task input prompt. (The name shadows the builtin but is
            kept because callers may pass it by keyword.)
        original_text (str): The original expected text.
        generated_text (str): The actual generated text to be evaluated.
        criteria (dict): Evaluation criteria; only ``criteria['correctness']`` is used.
        evaluation_steps (list): Evaluation steps, rendered as a numbered list.
            Any number of steps is accepted.

    Returns:
        str: The raw message content returned by the model. The prompt asks for
        a JSON object with "score" and "justification", but the reply is NOT
        parsed here; the caller is responsible for decoding it.
    """
    # Number the steps dynamically rather than indexing a fixed three entries.
    numbered_steps = "\n".join(
        f"{number}. {step}" for number, step in enumerate(evaluation_steps, start=1)
    )

    # Construct the evaluation prompt
    prompt = f"""
You are an expert evaluator tasked with scoring the correctness of a generated text.
Follow these instructions:

### Input Task
{input}

### Criteria
{criteria['correctness']}

### Evaluation Steps
{numbered_steps}

### Texts to Evaluate
Original Text: "{original_text}"
Generated Text: "{generated_text}"

### Task
1. Assign a score between 0 and 1, where 1 means perfect correctness and 0 means completely incorrect. You can go upto 5 decimal places for increased precision
2. Provide a detailed justification for the score based on the evaluation steps.

Respond with a JSON object like this:
{{
    "score": <score>,
    "justification": "<detailed justification>"
}}
    """

    # Use the module-level OpenAI client to get the evaluation result
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are an expert evaluator for text-based tasks."},
            {"role": "user", "content": prompt},
        ],
        temperature=0  # request the least random sampling
    )

    # Return the model's reply text unparsed
    result = response.choices[0].message.content
    return result


In [45]:
import ast
def _parse_party_list(raw):
    """Turn a predicted parties string into a list of names.

    The prompt asks for a Python-style list literal, but the stored prediction
    can also be "-" or free text; those are returned as a one-element list
    instead of crashing the whole evaluation run.
    """
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        return [str(raw)]
    if isinstance(parsed, (list, tuple)):
        return [str(party) for party in parsed]
    return [str(parsed)]


def _load_json_object(raw):
    """Decode the JSON object in an LLM reply, ignoring text around the braces."""
    start, end = raw.find("{"), raw.rfind("}")
    if start != -1 and end > start:
        raw = raw[start:end + 1]
    return json.loads(raw)


# These prompt pieces are identical for every document, so build them once.
input_prompt = "What is ONLY the name of the party on the legal contract"
criteria = {
    "correctness": "Determine whether the actual output contains the same entities as the expected output."
}
evaluation_steps = [
    "Check whether the facts in 'actual output' has the same party as in the 'expected output'.",
    "If there is an omission of address of the company or short form of the company is written, it is OK.",
    "Penalize heavily if the 'actual output' is completely not the same as 'expected output'."
]

evaluation_results = []
for pdf_id, gt_data in gt.items():
    # Extract the ground truth and predicted parties. A missing document or
    # field falls back to an empty list / "-" instead of raising on indexing.
    gt_parties = (gt_data.get("Parties") or {}).get("response") or []
    print(gt_parties)
    pred_parties = (pred.get(pdf_id + ".pdf", {}).get("Parties") or {}).get("response", "-")
    print(pred_parties)

    # A bare string would otherwise be joined character by character.
    gt_party_names = [gt_parties] if isinstance(gt_parties, str) else gt_parties
    original_text = " and ".join(gt_party_names)
    generated_text = " and ".join(_parse_party_list(pred_parties))
    print(generated_text)

    evaluation_result = _load_json_object(
        evaluate_with_llm(input_prompt, original_text, generated_text, criteria, evaluation_steps)
    )

    evaluation_results.append({
        "pdf_id": pdf_id,
        "ground_truth_parties": gt_parties,
        "extracted_parties": pred_parties,
        "G-Eval Score": evaluation_result["score"],
        "Justification": evaluation_result["justification"]
    })

['Barnstable County', 'Borden & Remington Corporation']
["Borden & Remington Corp.", "Barnstable County"]
Borden & Remington Corp. and Barnstable County
['City Of Joplin, Missouri', 'Tamko Building Products Limited Liability Company']
["City of Joplin, Missouri", "TAMKO Building Products LLC"]
City of Joplin, Missouri and TAMKO Building Products LLC
['Board Of County Commissioners, Lucas County, Ohio', 'Fishbeck']
["Board of County Commissioners", "Fishbeck"]
Board of County Commissioners and Fishbeck
['City Of Half Moon Bay', 'Impec Group Corporation']
["City of Half Moon Bay", "Impec Group, Inc."]
City of Half Moon Bay and Impec Group, Inc.
['Manpowergroup Us Corporation']
["ManpowerGroup", "Client"]
ManpowerGroup and Client
['Board Of County Commissioners Of Nassau County, Florida', 'Government Services Group Corporation']
["Board of County Commissioners of Nassau County, Florida", "Government Services Group, Inc."]
Board of County Commissioners of Nassau County, Florida and Governm

In [46]:
# Sum of the per-document G-Eval scores collected in evaluation_results.
total_score = sum(entry["G-Eval Score"] for entry in evaluation_results)

In [47]:
# Mean G-Eval score over the documents that were actually evaluated, so the
# denominator cannot drift from the dataset size.
total_score / len(evaluation_results) if evaluation_results else 0.0

0.69