# NER Tagging Notebook 

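This notebook takes the preprocessed legal clause data, splits each document into whitespace-delimited tokens, and generates BILOC tags for every token: Beginning, Inside, Last, and Unit tags for every clause type, plus a single Outside tag for tokens that belong to no clause.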

### Script Section - Imports:

In [1]:
import heapq
import json
import pickle
import os
import string

import numpy as np
import pandas as pd

### Loading Preprocessed Data

In [2]:
# Clause inventory (JSON): all clauses per document and clause type, with their location(s)
with open("./data/rft_clauses_final.json", "r") as clauses_file:
    rft_clauses_final = json.loads(clauses_file.read())

# NOTE: unpickling can execute arbitrary code; only load .ob files produced by this project.

# Pickled list of abbreviated clause type names
with open("./data/clause_tag_names.ob", "rb") as names_file:
    ctn = pickle.loads(names_file.read())

# Pickled list of legal documents in string format
with open("./data/docs_in_str.ob", 'rb') as docs_file:
    docs_in_str = pickle.loads(docs_file.read())

### Abbreviated Clause Name (Tag) and Index Dictionaries Creation

In [3]:
# Tag vocabulary: the outside tag comes first (index 0), followed by one
# B-/I-/L-/U- tag for every abbreviated clause type name, in ctn order.
BILOU = "BILU"
tags = ["OUTSIDE"]
tags += [prefix + '-' + clause_name for clause_name in ctn for prefix in BILOU]

# index -> tag lookup, and its inverse tag -> index lookup
ner_index_tag = dict(enumerate(tags))
ner_tag_index = {label: position for position, label in ner_index_tag.items()}

# Persist both lookups as pickles (tag -> index first, then index -> tag)
for lookup, target in (
    (ner_tag_index, "./data/ner_tag_index.ob"),
    (ner_index_tag, "./data/ner_index_tag.ob"),
):
    with open(target, 'wb') as fp:
        pickle.dump(lookup, fp)

### Document Split Tokenization and NER Tagging

In [4]:
def _collect_clause_spans(doc_clauses, doc_clause_locations, clause_types):
    """Gather every clause occurrence of one document, ordered by position.

    Args:
        doc_clauses: mapping of clause type -> clauses found in the document.
        doc_clause_locations: mapping of clause type -> clause -> list of
            locations; loc[0] is used as the start index and loc[1] as the
            (exclusive) end index into the document string.
        clause_types: clause type names to look up, in order.

    Returns:
        A list of (start, end, seq, has_ws, clause_type) tuples sorted by
        start, then end, then seq. seq is a unique running counter, so the
        remaining fields never take part in the ordering. has_ws is True
        when the clause text contains a space character.
    """
    spans = []
    for clause_type in clause_types:
        for clause in doc_clauses[clause_type]:
            has_ws = ' ' in clause
            for loc in doc_clause_locations[clause_type][clause]:
                spans.append((loc[0], loc[1], len(spans), has_ws, clause_type))
    return sorted(spans)


def _clause_tags(n_tokens, clause_type, tag_index):
    """Return exactly n_tokens tag ids for a clause span of n_tokens tokens.

    One token gets the U- tag; two or more get B-, then I- for the middle
    tokens, then L-. Zero tokens get no tags. Sizing the tags from the actual
    token count keeps tokens and tags aligned even when the span splits into
    a different number of tokens than the clause text suggested.
    """
    if n_tokens == 0:
        return []
    if n_tokens == 1:
        return [tag_index["U-" + clause_type]]
    return (
        [tag_index["B-" + clause_type]]
        + [tag_index["I-" + clause_type]] * (n_tokens - 2)
        + [tag_index["L-" + clause_type]]
    )


def _tag_document(text_document, spans, tag_index, outside_id=0):
    """Split a document on whitespace and tag every token.

    Args:
        text_document: the document as a single string.
        spans: position-sorted tuples from _collect_clause_spans.
        tag_index: mapping of tag name -> integer tag id.
        outside_id: tag id given to tokens outside every clause.

    Returns:
        (tokens, tag_ids): two lists of equal length.
    """
    tokens = []
    tag_ids = []
    cursor = 0  # index of the first character not yet tokenized
    doc_len = len(text_document)

    for start, end, _seq, has_ws, clause_type in spans:
        if not has_ws:
            # A clause without spaces may have been matched inside a longer
            # word. Accept it only when it is bounded by a space or by the
            # document edge on both sides.
            # NOTE(review): only ' ' counts as a boundary, so a match followed
            # by a newline or punctuation is rejected -- confirm this is intended.
            left_ok = start == 0 or text_document[start - 1] == ' '
            right_ok = end >= doc_len or text_document[end] == ' '
            if not (left_ok and right_ok):
                # Rejected match: emit nothing and leave the cursor alone so
                # the surrounding text is tokenized exactly once, as outside
                # text, by a later span or by the final outside pass.
                continue

        # Outside tagging for the text between the cursor and this clause.
        # NOTE(review): when spans overlap (start < cursor) this slice is
        # empty and the clause text below is tokenized a second time;
        # overlapping clauses are not resolved here.
        outside_tokens = text_document[cursor:start].split()
        tokens.extend(outside_tokens)
        tag_ids.extend([outside_id] * len(outside_tokens))

        # Clause tagging
        clause_tokens = text_document[start:end].split()
        tokens.extend(clause_tokens)
        tag_ids.extend(_clause_tags(len(clause_tokens), clause_type, tag_index))

        cursor = end

    # Final outside tagging for everything after the last clause
    tail_tokens = text_document[cursor:].split()
    tokens.extend(tail_tokens)
    tag_ids.extend([outside_id] * len(tail_tokens))

    return tokens, tag_ids


# Dataframe to hold ner_tags and split_tokens from documents
tagged_data_df = pd.DataFrame(columns=['id', 'ner_tags', 'split_tokens'])

# Number of Documents to Parse and Tag
N = len(rft_clauses_final)

for i in range(N):
    # Get string form of text document from list
    text_document = docs_in_str[i]

    # Skip document if content == "ERROR"
    # File could not be found or read previously
    if text_document == "ERROR":
        continue

    # Clauses found in this document and their locations, both indexed by clause type
    doc_entry = rft_clauses_final[str(i)]
    spans = _collect_clause_spans(doc_entry["clauses"], doc_entry["locations"], ctn)

    tokenized_doc, ner_tags = _tag_document(text_document, spans, ner_tag_index)

    # Print warning if the # of tokens does not match the # of tags:
    if len(tokenized_doc) != len(ner_tags):
        print("Warning Token Tag Number Mismatch. Document:", i)

    # Add tokenized document and respective NER tags to dataframe
    tagged_data_df.loc[len(tagged_data_df.index)] = [i, ner_tags, tokenized_doc]

### Saving of Tokenized and Tagged Documents:

In [5]:
# Write the tagged documents to disk as table-oriented JSON, without the index
tagged_data_df.to_json(
    path_or_buf="./data/biloc_tagged_clauses.json",
    orient="table",
    index=False,
)

In [6]:
# Write the tag names (the keys of ner_tag_index, in insertion order) to a JSON file
feature_class_labels = list(ner_tag_index)
with open("./data/feature_class_labels.json", 'w') as labels_file:
    labels_file.write(json.dumps(feature_class_labels, indent=2))