# Data Preprocessing Notebook 2

This notebook aggregates a portion of the scripts used to preprocess the CUAD dataset for NER token classification with LLMs.  

### Script Section - Imports:

In [1]:
import numpy as np
import pandas as pd
import json
import pickle
import os

import pybmoore
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

### Data Loading

Loads the data saved by the previous preprocessing notebook.

In [3]:
# Reorganized per-document clause data saved by the previous preprocessing
# notebook (the cell below sketches its schema).
with open("./data/rft_clauses.json", "r") as clause_file:
    rft_clauses = json.load(clause_file)

# Abbreviated clause type names, stored as a pickled list.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open("./data/clause_tag_names.ob", "rb") as tag_file:
    ctn = pickle.load(tag_file)

In [2]:
# Documentation-only cell: the string literal below is never assigned or used
# by any code; it sketches the nested layout of the `rft_clauses` dictionary.
# NOTE(review): 'clause_list' / 'clause_Loc_dict' are placeholders for the
# per-clause-type values; their exact contents are not shown here - confirm
# against the code that produces them.
"""
JSON SCHEMA
rft_clauses:
    document_index:
        doc_filename:
        clauses:
            clause_type1: clause_list
            clause_type2: clause_list
                .
                .
                .
            clause_type41: clause_list
        locations:
            clause_type1: clause_Loc_dict
            clause_type2: clause_Loc_dict
                .
                .
                .
            clause_type41: clause_Loc_dict
"""
# Emits a single blank line. Presumably placed here so the string above is not
# the cell's final expression and therefore is not echoed as cell output.
print()




### Document Clause Position Search

In [2]:
# Number of documents in the loaded clause dictionary.
N = len(rft_clauses)

# Root folder of the CUAD v1 download. Set the CUAD_DATASET_DIR environment
# variable to point somewhere else; the fallback is the original local path.
dataset_filepath = os.environ.get(
    'CUAD_DATASET_DIR', 'C:\\Users\\john\\LLM_NER_DATA\\CUAD_v1'
)
text_doc_folder = 'full_contract_txt'
text_doc_filepath = os.path.join(dataset_filepath, text_doc_folder)

# Files that could not be found or read correctly, with the error type.
# Contains tuples -> (filename, error_type).
# Documents listed here are skipped and receive no 'locations' entry.
file_error_type = []

print("Starting File Retrieval")
for i in range(N):
    # File Retrieval Section:
    # Documents are looked up by their position converted to a string key.
    index = str(i)
    filename = rft_clauses[index]["doc_filename"]

    # Handles file-not-found and read/decode errors only; anything else is
    # unexpected and is allowed to surface.
    try:
        # Decode explicitly instead of relying on the platform default
        # (e.g. cp1252 on Windows), which can fail on or mis-decode non-ASCII
        # characters so that clause strings no longer match the text.
        # NOTE(review): assumes the contract text files are UTF-8 - confirm.
        with open(os.path.join(text_doc_filepath, filename), 'r', encoding='utf-8') as file:
            # Swapping '\n' for ' ' is one-for-one, so character offsets in
            # doc_text are the same as in the text that was read.
            doc_text = file.read().replace('\n', ' ')
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; ValueError covers
        # UnicodeDecodeError and malformed paths.
        file_error_type.append((filename, type(e)))
        continue

    # Text File Clause Search Section:
    # Document clause dictionary, indexed via clause tag types.
    doc_clauses = rft_clauses[index]['clauses']

    # Clause location dictionary for this document: tag type -> search results.
    doc_clauses_loc = {}
    for tag_type in ctn:
        # List of clause samples for this tag type; a tag that is missing
        # from the document is treated the same as an empty list.
        tag_type_samples = doc_clauses.get(tag_type)
        if not tag_type_samples:
            continue

        # Boyer-Moore search for every occurrence of each clause sample.
        # Per the original author's note this returns a dictionary of
        # (start, end) index tuples for each sample.
        # NOTE(review): the executor *class* is passed, so a process pool is
        # presumably created on every call - confirm against the pybmoore
        # docs and consider ThreadPoolExecutor if start-up cost dominates.
        results = pybmoore.search_m(tag_type_samples, doc_text, ProcessPoolExecutor)

        # Add location of clause samples in text document to dictionary
        doc_clauses_loc[tag_type] = results

    # Add clause location dictionary to the document dictionary
    rft_clauses[index]['locations'] = doc_clauses_loc

    # Progress indicator: indices of successfully processed documents.
    print(index, end=",")

# Surface the skipped documents instead of leaving them silently recorded.
print()
print("Skipped", len(file_error_type), "of", N, "documents")
for skipped_name, error_type in file_error_type:
    print("  ", skipped_name, "->", error_type.__name__)

Starting File Retrieval
0,1,2,3,5,6,7,8,9,10,11,12,14,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,38,39,40,42,43,44,45,46,48,49,50,51,52,53,54,55,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,117,118,119,120,121,122,124,125,126,127,130,132,133,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,154,155,156,157,158,159,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,28

### Saving of Clause Positions to JSON File

In [3]:
# Render the whole clause dictionary as pretty-printed JSON text first,
# so the output file is only opened once serialization has succeeded.
json_object_final = json.dumps(rft_clauses, indent=4)

# Persist the result to its own file: 'rft_clauses_final.json'
with open("./data/rft_clauses_final.json", "w") as final_json_file:
    final_json_file.write(json_object_final)

# Blank separator line followed by the completion banner.
print()
print("Clause Tagging Prep Pt. 2 Completed!")


Clause Tagging Prep Pt. 2 Completed!
