# Data Preprocessing Notebook 1

This notebook aggregates several of the scripts used to preprocess the CUAD dataset for NER token classification with LLMs.

### Script Section 1 - Imports:

In [1]:
import numpy as np
import pandas as pd
import pickle

### Clause Type Selection

In [2]:
# Clause type selection.
# Reads the master clause csv, removes the clause-existence (Yes/No) columns and
# every remaining "Answer" column, and writes the result to cleaned_clauses.csv.
# `data` is left holding the cleaned DataFrame for the cells that follow.

# Read in clause data from the master clause csv file:
data = pd.read_csv("./data/master_clauses.csv")

# Determine the positional indexes of the clause existence columns, i.e. the
# columns whose value in the FIRST row is "Yes" or "No". Only the first row is
# inspected. The column count is taken from the frame itself rather than being
# hard-coded, and the row is read once with explicit positional (.iloc) access.
first_row = data.iloc[0]
yes_no_column_indexes = [
    i for i in range(data.shape[1])
    if first_row.iloc[i] in ("Yes", "No")
]

# Create Dataframe of all clause existence columns and pull their names as a list
invalid_clause_tags = data.iloc[:, yes_no_column_indexes]
columns_to_drop = invalid_clause_tags.columns.to_list()

# Drop clause existence columns from original Dataframe
data = data.drop(columns=columns_to_drop)

# Find all remaining Answer columns (columns containing "Answer" in the title;
# per the original author these contain altered text from the documents)
answer_columns = [col for col in data.columns if "Answer" in col]

# Drop Answer columns from original Dataframe
data = data.drop(columns=answer_columns)

# Save cleaned Dataframe to csv file (row index is not written).
data.to_csv('./data/cleaned_clauses.csv', index=False)

print("General Clause Preprocessing Complete.")

General Clause Preprocessing Complete.


### Clause Type Name and Abbreviated Tags Processing

In [3]:
# Clause type names and abbreviated tags.
# Builds the list of abbreviated tag names and a tag -> column-name map, then
# pickles both under ./data for later cells to load.

# Clause types in use: every column header of `data` except the first one
# (the filename header).
clause_tags = data.columns[1:].tolist()

# Abbreviated clause tag names, read from csv and flattened from the
# "clause tag name" column into a plain list.
clause_tag_names = pd.read_csv("./data/clause_tag_names.csv")
ctn = clause_tag_names["clause tag name"].tolist()

# Pair each abbreviated tag (key) with the clause column name at the same
# position (value). Pairing is purely positional and stops at the shorter list.
clause_tag_map = {tag: column for tag, column in zip(ctn, clause_tags)}

# Persist both objects as binary pickle files: first the list of abbreviated
# tag names, then the tag -> clause type name map.
for target_path, payload in (
    ("./data/clause_tag_names.ob", ctn),
    ("./data/clause_tag_map.ob", clause_tag_map),
):
    with open(target_path, 'wb') as sink:
        pickle.dump(payload, sink)

print("Completed Clause Type Name and Abbreviated Tags Processing.")

Completed Clause Type Name and Abbreviated Tags Processing.


In [4]:
# Clears every user-defined name from the notebook's interactive namespace; the -f
# flag skips the confirmation prompt. Done to prevent potential mishaps when bringing
# together scripts that were originally run in isolation. Nothing defined above this
# line survives, so the cells below re-import their modules and reload inputs from disk.
%reset -f

### Script Section 2 - Imports:

In [5]:
import numpy as np
import pandas as pd
import pickle

import ast
import json

In [6]:
# NER clause-tagging preparation.
# Converts cleaned_clauses.csv into a nested dictionary
#   {doc_id: {"doc_filename": <name>.txt, "clauses": {tag_name: [clause, ...]}}}
# and writes it to ./data/rft_clauses.json.

# Load in "cleaned" clauses csv
clauses = pd.read_csv("./data/cleaned_clauses.csv")

# Load in shortened clause type tag names.
# NOTE: pickle.load can execute arbitrary code; this file is expected to be the one
# written earlier by this notebook. Do not point it at an untrusted file.
with open("./data/clause_tag_names.ob", "rb") as fp:
    clause_tag_names = pickle.load(fp)

# Instantiation of the ready-for-tagging document-clause dictionary:
rft_clauses = {}

# Initialization of Constants:
# N -> Number of document rows
N = len(clauses)
# k -> Number of columns per row: the Filename column plus one column per clause
#      type. Taken from the frame's shape so an empty frame does not raise.
k = clauses.shape[1]

# Iterates over every document row in the cleaned_clauses csv:
for i in range(N):
    # Load the document row once; every cell below is read from this Series
    document_row = clauses.iloc[i]

    # Replace the last four characters of the filename (the ".pdf" file type
    # tag) with ".txt"
    doc_filename = document_row.loc["Filename"][:-4] + ".txt"

    # Creation of individual clause object: tag name -> list of clauses
    doc_clauses = {}

    # Iterates over every clause type column (position 0 is the Filename column):
    for j in range(1, k):
        # Each entry is a list written out in string form;
        # ast.literal_eval converts it back to an actual list.
        # Explicit positional access (.iloc) is used because the row is
        # labelled by column name, not by integer.
        clause_list = ast.literal_eval(document_row.iloc[j])

        # Key is the shortened clause type name from clause_tag_names.
        # Clause types start at position 1 in the document row, while
        # clause_tag_names starts at index 0, hence j - 1.
        doc_clauses[clause_tag_names[j - 1]] = clause_list

    # Creation of individual document object: altered filename plus all clauses
    document = {"doc_filename": doc_filename, "clauses": doc_clauses}

    # Add document dictionary to rft_clauses dictionary, keyed by row position
    doc_id = i
    rft_clauses[doc_id] = document

# Serializing json (integer doc ids become string keys in the JSON output)
json_object = json.dumps(rft_clauses, indent=4)

# Write to 'rft_clauses.json'
with open("./data/rft_clauses.json", "w") as outfile:
    outfile.write(json_object)

print("NER clause-tagging preperation complete!")

NER clause-tagging preperation complete!
