# Workflow for predicting contract data.
### You can place contract PDF files in the Test_Files folder and receive each contract's name, parties, effective date, and agreement date. A test set of contracts has been placed in the directory for ease of use.
## Make sure all of the needed packages are installed and run the whole notebook.

In [1]:
#install required packages
!pip install PyMuPDF
!pip install datasets
!pip install spacy 













In [17]:
#import needed libraries
import os, re, math, random, json, string, csv

import pandas as pd
import numpy as np
from tqdm import tqdm
from IPython.display import display, HTML

import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification, PreTrainedModel, RobertaTokenizerFast

from datasets import load_dataset, ClassLabel, Sequence 

import fitz

import spacy
from spacy.lang.en import English

from collections import defaultdict

#resolve any conflicting libraries
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [2]:
# RANDOM SEED FOR REPRODUCIBILITY
# Passed as `seed` to TrainingArguments further down.
RANDOM_SEED = 42

# BATCH SIZE
# IDEALLY USE SAME BATCH SIZE FOR INFERENCE AS WAS USED FOR TRAINING
# Used for both per_device_train_batch_size and per_device_eval_batch_size.
BATCH_SIZES = 2

# WHICH PRE-TRAINED TRANSFORMER TO FINE-TUNE?
# The checkpoint name is also interpolated into SAVED_MODEL.
MODEL_CHECKPOINT = "roberta-base"

In [3]:
# JSON file holding the label list; read into `label_list` further down.
FEATURE_CLASS_LABELS = "feature_class_labels.json"
# Output directory handed to TrainingArguments.
TEMP_MODEL_OUTPUT_DIR = 'temp_model_output_dir'
# NOTE(review): not referenced by the cells visible here - confirm whether it is still needed.
SAVED_MODEL = f"C964v2-NER-Fine-Tune-Transformer-Final-{MODEL_CHECKPOINT}" # Change for notebook version
# Folder that is walked for the agreements to process; it is concatenated directly
# with file names, so it has to end with a path separator.
TEST_FILE_PATH = "Test_Files/"
# JSON archive of the tokens, written and then re-loaded as a dataset.
TEST_DATA_FILE = 'test_data_file.json'
# NOTE(review): not referenced by the cells visible here - confirm whether it is still needed.
CSV_DATA_FILE = 'legal_agreement_data_file.csv'

In [4]:
#walk through the test folder and collect the agreement files to process
#names are stored relative to TEST_FILE_PATH so that TEST_FILE_PATH + name opens the file,
#including files that sit in sub-folders (a bare file name would not resolve for those)
pdf_files = []
for dirpath, dirnames, filenames in os.walk(TEST_FILE_PATH):
    #prune hidden folders (e.g. .ipynb_checkpoints) in place so os.walk does not descend into them
    dirnames[:] = [d for d in dirnames if not d.startswith(".")]
    for filename in filenames:
        #skip hidden files (e.g. .DS_Store); filtering while collecting avoids
        #removing items from a list that is being iterated, which skips neighbours
        if filename.startswith("."):
            continue
        pdf_files.append(os.path.relpath(os.path.join(dirpath, filename), TEST_FILE_PATH))
print(f"Uploaded {len(pdf_files)} legal agreements from {TEST_FILE_PATH} folder: ", pdf_files)

Uploaded 5 legal agreements from Test_Files/ folder:  ['CreditcardscomInc_20070810_S-1_EX-10.33_362297_EX-10.33_Affiliate Agreement.pdf', 'CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605784_EX-10.27_Affiliate Agreement.pdf', 'DigitalCinemaDestinationsCorp_20111220_S-1_EX-10.10_7346719_EX-10.10_Affiliate Agreement.pdf', 'LinkPlusCorp_20050802_8-K_EX-10_3240252_EX-10_Affiliate Agreement.pdf', 'SouthernStarEnergyInc_20051202_SB-2A_EX-9_801890_EX-9_Affiliate Agreement.pdf']


In [5]:
#text cleaning function for standard PDF parsing workflow
def pre_process_doc_common(text):
    """Normalise raw text extracted from a PDF into one clean line.

    Steps, applied in this order:
      1. newlines, non-breaking spaces and form feeds become plain spaces
      2. a dot surrounded by single spaces (" . ") collapses to "."
      3. underscores become spaces
      4. runs of two or more dashes become one space
      5. runs of stars collapse to a single "*"
      6. runs of spaces collapse to a single space
      7. leading/trailing whitespace is stripped

    Args:
        text: the raw text of one document.

    Returns:
        The cleaned text as a single string.
    """
    #layout characters left behind by the PDF extractor
    for layout_char in ("\n", "\xa0", "\x0c"):
        text = text.replace(layout_char, " ")

    #(pattern, replacement) pairs; order matters because the whitespace collapse
    #has to run after the steps that insert spaces.
    #raw strings keep the backslashes intact without invalid-escape warnings
    substitutions = (
        (r" \. ", "."),  #spaced-out dots
        (r"_", " "),     #underscores (signature / fill-in lines)
        (r"--+", " "),   #multiple dashes
        (r"\*+", "*"),   #multiple stars
        (r" +", " "),    #multiple spaces
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip()

In [6]:
#function takes in file list, reads each file, cleans the text and returns all agreements in a list
def text_data(test_dir, pdf_files, print_text=False, clean_text=True, max_len=3000):
    """Read every agreement file and return its extracted text.

    Args:
        test_dir: folder prefix; it is concatenated directly with each file
            name, so it has to end with a path separator.
        pdf_files: iterable of file names located under test_dir.
        print_text: when True, print the text before and after cleaning.
        clean_text: when True, run the text through pre_process_doc_common.
        max_len: number of leading characters kept in the short text.

    Returns:
        A list with one entry per file:
        [filename, full_text, short_text, length_of_short_text].
    """
    text_list = []
    for filename in tqdm(pdf_files):
        #the context manager closes the document (and its file handle) even when
        #text extraction fails part-way through
        with fitz.open(test_dir+filename) as agreement:
            #pages are concatenated without any separator
            full_text = "".join(page.get_text('text') for page in agreement)
        if print_text:
            print("Text before cleaning: \n", full_text)

        #run text through cleansing function
        if clean_text:
            full_text = pre_process_doc_common(full_text)
        short_text = full_text[:max_len]
        len_text = len(short_text)

        if print_text:
            print("Text after cleaning: \n", short_text)

        text_list.append([filename, full_text, short_text, len_text])

    return text_list

In [7]:
#call cleaning functions on list of PDF files in testing folder
test_dir = TEST_FILE_PATH
#max_len=1000 overrides the function default: only the first 1000 cleaned
#characters of each agreement end up in Short_Text, which is what gets tokenized
data = text_data(test_dir, pdf_files, print_text=False, clean_text=True, max_len=1000)

#create dataframe with text
#column order has to match the order of the per-file lists returned by text_data
columns = ['File_Name','Full_Text', 'Short_Text', 'Length_Of_Short_Text']
text_df = pd.DataFrame(data=data, columns=columns)

100%|██████████| 5/5 [00:00<00:00, 22.02it/s]


In [8]:
#tokenize the short text of every agreement with spaCy before it goes to the transformer model
nlp = English()
text_df['tokens'] = text_df['Short_Text'].apply(nlp)

#plain-string version of the spaCy tokens, one list per agreement
text_df['split_tokens'] = text_df['tokens'].apply(lambda doc: [token.text for token in doc])

#placeholder NER tags (all zero, one per token) so the label-alignment step has something to align
text_df['dummy_ner_tags'] = text_df['tokens'].apply(lambda doc: [0] * len(doc))

#archive tokens and placeholder tags as JSON; the placeholder column is only
#needed in the archive, so it is dropped from text_df afterwards
export_columns = ['split_tokens', 'dummy_ner_tags']
export_df = text_df[export_columns]
export_df.to_json(TEST_DATA_FILE, orient="table", index=False)
text_df = text_df.drop(columns=['dummy_ner_tags'])

#read the JSON archive back in as a dataset in transformer format
data_files = TEST_DATA_FILE
datasets = load_dataset('json', data_files=data_files, field='data')
print(datasets)

Downloading and preparing dataset json/default to C:/Users/adria/.cache/huggingface/datasets/json/default-7582f4fb73183989/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to C:/Users/adria/.cache/huggingface/datasets/json/default-7582f4fb73183989/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['dummy_ner_tags', 'split_tokens'],
        num_rows: 5
    })
})


In [9]:
#open label list that was created in pre-processing
#JSON is UTF-8 by specification; naming the encoding keeps the read independent
#of the platform's default text encoding
with open(FEATURE_CLASS_LABELS, 'r', encoding='utf-8') as f:
    label_list = json.load(f)

In [10]:
#instantiate tokenizer for the checkpoint configured in MODEL_CHECKPOINT
#(previously a duplicate "roberta-base" literal that could drift from the constant)
#add_prefix_space=True is needed because the input arrives already split into words
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_CHECKPOINT, add_prefix_space=True)

In [11]:
#function that executes roBERTa tokenizer and aligns the tokens with the labels
def tokenize_and_align_labels(examples, label_all_tokens=False):
    """Tokenize a batch of pre-split words and align the word labels to the sub-tokens.

    Uses the module-level `tokenizer`.

    Args:
        examples: batch mapping with "split_tokens" (one list of words per
            example) and "dummy_ner_tags" (one label per word).
        label_all_tokens: when True every sub-token of a word carries the
            word's label; when False only the first sub-token does and the
            remaining ones get -100.

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        label ids per example.
    """
    ignore_id = -100  #label value given to positions that should be ignored
    encoded = tokenizer(examples["split_tokens"],
                        truncation=True,
                        is_split_into_words=True)

    def align(word_ids, word_labels):
        #pair each position with the word id of the position before it
        aligned = []
        for prev_id, word_id in zip([None] + list(word_ids[:-1]), word_ids):
            if word_id is None:
                #special tokens do not belong to any word
                aligned.append(ignore_id)
            elif word_id != prev_id:
                #first sub-token of a word always carries the word's label
                aligned.append(word_labels[word_id])
            elif label_all_tokens:
                aligned.append(word_labels[word_id])
            else:
                aligned.append(ignore_id)
        return aligned

    encoded["labels"] = [
        align(encoded.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["dummy_ner_tags"])
    ]
    return encoded

In [12]:
#use the map method of the dataset object to apply tokenize_and_align_labels, in batches, to every split it contains

tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True, load_from_cache_file=True)

#dataset produces warning when using cached files, pass load_from_cache_file=False to preprocess again

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [14]:
#Load the model and instantiate
#NOTE(review): this path literal differs from the SAVED_MODEL constant defined earlier
#(which contains "-Final-") - confirm which saved model is meant to be loaded
loaded_model = AutoModelForTokenClassification.from_pretrained('C964v2-NER-Fine-Tune-Transformer-roberta-base')

#define model arguments
#the Trainer is only used for prediction in this notebook; the output directory,
#batch sizes and seed all come from the configuration constants
args = TrainingArguments(output_dir = TEMP_MODEL_OUTPUT_DIR,
                         per_device_train_batch_size=BATCH_SIZES,
                         per_device_eval_batch_size=BATCH_SIZES,
                         seed=RANDOM_SEED
                        )

#data collator builds batches of samples and makes them all the same size
data_collator = DataCollatorForTokenClassification(tokenizer)

#instantiate predictor
pred_trainer = Trainer(
    loaded_model,
    args,
    data_collator=data_collator,
    tokenizer=tokenizer)

In [15]:
#extract predictions
#the dataset only has a "train" split, which here holds the contracts to predict on;
#the third element returned by predict() is not used
predictions, labels, _ = pred_trainer.predict(tokenized_datasets["train"])
#pick the highest-scoring class index along the last axis
#(assumes predictions is examples x tokens x classes - TODO confirm)
predictions = np.argmax(predictions, axis=2)
text_df['predictions'] = list(predictions)

#remove special tokens
#positions labelled -100 are dropped: tokenize_and_align_labels assigns -100 to special
#tokens and to the non-first sub-tokens of a word, so one prediction per word remains;
#the class index is translated to its label name through label_list
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
text_df['true_predictions'] = true_predictions

#consolidate information into DataFrame
def data_extract(tuple_list):
    """Return, in their original order, the tuples whose second item is not the 'O' label."""
    return [pair for pair in tuple_list if pair[1] != 'O']

#pair every token with its predicted label, then keep only the pairs that carry an entity label
text_df['check_pred'] = [
    list(zip(tokens, tags))
    for tokens, tags in zip(text_df['split_tokens'], text_df['true_predictions'])
]
text_df['data_tuples'] = text_df['check_pred'].apply(data_extract)

The following columns in the test set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: dummy_ner_tags, split_tokens. If dummy_ner_tags, split_tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 5
  Batch size = 2
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


### Running the code below will allow the user to see the values that the model predicted.

In [16]:
#functions to extract each important data point based on the model's labeling of tokens
def extract_agreement_date(tuple_list):
    """Build the agreement date from (token, label) pairs.

    A "B-AGMT_DATE" token starts a new date (replacing any date collected so
    far, so the last one in the list wins) and each "I-AGMT_DATE" token is
    appended to it, separated by a space. An "I-AGMT_DATE" token that arrives
    before any "B-AGMT_DATE" token starts the date itself instead of raising
    a TypeError.

    Args:
        tuple_list: sequence of (token, label) pairs.

    Returns:
        The date tokens joined by spaces, or 'N/A' when no token carries an
        agreement-date label.
    """
    date_tokens = None
    for item in tuple_list:
        token, tag = item[0], item[1]
        if tag == "B-AGMT_DATE":
            date_tokens = [token]
        elif tag == "I-AGMT_DATE":
            if date_tokens is None:
                #continuation tag without a preceding begin tag
                date_tokens = [token]
            else:
                date_tokens.append(token)
    if date_tokens is None:
        return 'N/A'
    return " ".join(date_tokens)

#one agreement-date value per contract, derived from that row's (token, label) pairs
text_df['agmt_date'] = text_df['data_tuples'].apply(extract_agreement_date)

def extract_effective_date(tuple_list):
    """Build the effective date from (token, label) pairs.

    A "B-EFF_DATE" token starts a new date (replacing any date collected so
    far, so the last one in the list wins) and each "I-EFF_DATE" token is
    appended to it, separated by a space. An "I-EFF_DATE" token that arrives
    before any "B-EFF_DATE" token starts the date itself instead of raising
    a TypeError.

    Args:
        tuple_list: sequence of (token, label) pairs.

    Returns:
        The date tokens joined by spaces, or 'N/A' when no token carries an
        effective-date label.
    """
    date_tokens = None
    for item in tuple_list:
        token, tag = item[0], item[1]
        if tag == "B-EFF_DATE":
            date_tokens = [token]
        elif tag == "I-EFF_DATE":
            if date_tokens is None:
                #continuation tag without a preceding begin tag
                date_tokens = [token]
            else:
                date_tokens.append(token)
    if date_tokens is None:
        return 'N/A'
    return " ".join(date_tokens)

#one effective-date value per contract, derived from that row's (token, label) pairs
text_df['eff_date'] = text_df['data_tuples'].apply(extract_effective_date)

def extract_agreement_name(tuple_list):
    """Build the agreement name from (token, label) pairs.

    A "B-DOC_NAME" token starts a new name (replacing any name collected so
    far, so the last one in the list wins) and each "I-DOC_NAME" token is
    appended to it, separated by a space. An "I-DOC_NAME" token without a
    preceding "B-DOC_NAME" token starts the name itself.

    Args:
        tuple_list: sequence of (token, label) pairs.

    Returns:
        The name tokens joined by spaces, or 'N/A' when no token carries a
        document-name label (the same placeholder the date extractors use).
        Without this fallback a contract with no predicted name raised
        UnboundLocalError and aborted the whole column computation.
    """
    name_tokens = None
    for item in tuple_list:
        token, tag = item[0], item[1]
        if tag == "B-DOC_NAME":
            name_tokens = [token]
        elif tag == "I-DOC_NAME":
            if name_tokens is None:
                #continuation tag without a preceding begin tag
                name_tokens = [token]
            else:
                name_tokens.append(token)
    if name_tokens is None:
        return 'N/A'
    return " ".join(name_tokens)

#one agreement-name value per contract, derived from that row's (token, label) pairs
text_df['agmt_name'] = text_df['data_tuples'].apply(extract_agreement_name)

def extract_agreement_parties(tuple_list):
    """Collect the distinct party names from (token, label) pairs.

    A "B-PARTY" token opens a party name and the "I-PARTY" tokens that
    directly follow it are appended, separated by spaces. The name is closed
    by the next "B-PARTY" token, by any token with a different label, or by
    the end of the list.

    An "I-PARTY" token with no open name (first in the list, or following a
    token with a different label) opens a new name. Previously the first case
    raised UnboundLocalError and the second re-extended a name that had
    already been emitted.

    Args:
        tuple_list: sequence of (token, label) pairs.

    Returns:
        List of party names in order of first appearance, without duplicates.
    """
    parties = []
    current = None  #party name being assembled, or None when no name is open
    for item in tuple_list:
        token, tag = item[0], item[1]
        if tag == "B-PARTY":
            if current is not None:
                parties.append(current)
            current = token
        elif tag == "I-PARTY":
            current = token if current is None else current + " " + token
        elif current is not None:
            #any other label ends the name that was being assembled
            parties.append(current)
            current = None
    if current is not None:
        parties.append(current)

    #dict.fromkeys removes duplicates while keeping first-seen order
    return list(dict.fromkeys(parties))

#one list of party names per contract, derived from that row's (token, label) pairs
text_df['agmt_parties'] = text_df['data_tuples'].apply(extract_agreement_parties)

#keep only the columns needed for the report (as an independent copy) and order the rows by file name
export_df = (
    text_df[['File_Name', 'agmt_name','agmt_parties','agmt_date','eff_date', 'Full_Text']]
    .copy()
    .sort_values('File_Name', axis=0)
)

#show the first rows of the predictions table
export_df.head()

Unnamed: 0,File_Name,agmt_name,agmt_parties,agmt_date,eff_date,Full_Text
0,CreditcardscomInc_20070810_S-1_EX-10.33_362297...,CHASE AFFILIATE AGREEMENT,"[Chase Bank USA , N.A., Chase, an, “, Affiliat...","April 6 , 2007",,"Exhibit 10.33 Last Updated: April 6, 2007 CHAS..."
1,CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605...,Marketing Affiliate Agreement,"[Birch First Global Investments Inc., Mount Kn...",8th day of May 2014,,Exhibit 10.27 MARKETING AFFILIATE AGREEMENT Be...
2,DigitalCinemaDestinationsCorp_20111220_S-1_EX-...,NETWORK AFFILIATE AGREEMENT,"[DIGITAL CINEMA DESTINATIONS CORP ., National ...","14th day of March , 2011",,DIGITAL CINEMA DESTINATIONS CORP. NETWORK AFFI...
3,LinkPlusCorp_20050802_8-K_EX-10_3240252_EX-10_...,AFFILIATE AGREEMENT,"[Link Plus Corporation, Axiometric , LLC, Axio...","JULY 15 , 2005",,"EXHIBIT 10.1 AFFLIATE AGREEMENT DATED JULY 15,..."
4,SouthernStarEnergyInc_20051202_SB-2A_EX-9_8018...,Affiliate Program Management and Conditions,[],,,Exhibit 10.8 Affiliate Program / Premium Affil...


### Formatted display of first contract in dataframe.

In [17]:
#display first sample
sample=0
#fetch the row once and read its fields by column name; integer keys on a
#label-indexed row are deprecated in pandas and easy to mix up
sample_row = export_df.iloc[sample]
print("File Name: \t\t",sample_row['File_Name'])
print("Agreement Name: \t",sample_row['agmt_name'])
print("Agreement Date: \t",sample_row['agmt_date'])
#this line shows the effective date; it was previously mislabelled as a second "Agreement Date"
print("Effective Date: \t",sample_row['eff_date'])
print("Agreement Parties:")
for p in sample_row['agmt_parties']:
    print("\t\t\t", p)

File Name: 		 CreditcardscomInc_20070810_S-1_EX-10.33_362297_EX-10.33_Affiliate Agreement.pdf
Agreement Name: 	 CHASE AFFILIATE AGREEMENT
Agreement Date: 	 April 6 , 2007
Agreement Date: 	 N/A
Agreement Parties:
			 Chase Bank USA , N.A.
			 Chase
			 an
			 “
			 Affiliate
			 Chase.com
