In [1]:
import pandas as pd
import numpy as np
import json
from glob import glob
import os
import re
# import matplotlib.pyplot as plt
from tqdm import tqdm
import os

# Preprocess abbreviated data to get clause categories

**all CUAD data can be downloaded here: https://www.atticusprojectai.org/cuad**

**The abbreviated clauses are available here: https://github.com/TheAtticusProject/cuad, under `data.zip`**

In [2]:
def cuad_dataset_to_dataframe(_path):
    _dataset = json.loads(open(_path,'r').read().strip())
    _alldataframes = []
    for k,row in pd.DataFrame(_dataset['data']).iterrows():
        paras = row.paragraphs[0]
        dt = pd.DataFrame(paras['qas'])
        dt['file']=row.title                 
        _alldataframes.append(dt)
    _full_data = pd.concat(_alldataframes)

    _dctpd = _full_data.answers.apply(lambda x: x[0] if x else {})
    _expanded = _dctpd.apply(pd.Series)
    all_data =  pd.concat([_full_data,_expanded],axis=1).drop(columns='answers')
    all_data = all_data.dropna(subset='text')[['question','text']].reset_index(drop=True)
    # infer clause_type from questions
    all_data['original_clause_type'] = all_data.question.apply(lambda x: re.findall('"([^"]*)"', x)[0].lower())
    # infer clause_type from questions
    all_data['clause_type'] = all_data.original_clause_type.str.replace('-',' ')
    # rename query to question
    all_data['query'] = all_data['question']
    return all_data[['clause_type','original_clause_type','query','text']]

**Did some data directory reshuffling**

(markdown because cell output takes too long)
cuad_train_data = cuad_dataset_to_dataframe('./data/train_separate_questions.json')
cuad_test_data = cuad_dataset_to_dataframe('./data/test.json')

In [4]:
len(cuad_train_data)

11180

In [5]:
len(cuad_test_data)

1244

Concatenate both train and test together

In [6]:
full_cuad_data = pd.concat([cuad_train_data, cuad_test_data])
len(full_cuad_data)

12424

In [7]:
full_cuad_data.to_csv("./data/full_abbreviated_cuad_data.csv")

In [8]:
full_cuad_data.head()

Unnamed: 0,clause_type,original_clause_type,query,text
0,document name,document name,Highlight the parts (if any) of this contract ...,DISTRIBUTOR AGREEMENT
1,parties,parties,Highlight the parts (if any) of this contract ...,Distributor
2,parties,parties,Highlight the parts (if any) of this contract ...,Electric City of Illinois L.L.C.
3,parties,parties,Highlight the parts (if any) of this contract ...,Electric City of Illinois LLC
4,parties,parties,Highlight the parts (if any) of this contract ...,Company


In [9]:
all_clause_categories = list(set(full_cuad_data["original_clause_type"]))

In [10]:
all_distinct_queries = list(set(full_cuad_data["query"]))
all_distinct_queries = [el.lower().replace("\xa0", "") for el in all_distinct_queries]

In [11]:
clause_type_to_query = {}
for clause_type in all_clause_categories:
    for query in all_distinct_queries:
        if f'"{clause_type}"' in query:
            clause_type_to_query[clause_type] = query

In [12]:
clause_type_to_query["revenue/profit sharing"] = '''
highlight the parts (if any) of this contract related to "revenue/profit sharing" that should be reviewed by a lawyer. details: is one party required to share revenue or profit with the counterparty for any technology, goods, or services?
'''.replace('\n', '')

In [48]:
len(clause_type_to_query)

41

In [13]:
import pprint

pprint.pprint(clause_type_to_query)

{'affiliate license-licensee': 'highlight the parts (if any) of this contract '
                               'related to "affiliate license-licensee" that '
                               'should be reviewed by a lawyer. details: does '
                               'the contract contain a license grant to a '
                               'licensee (incl. sublicensor) and the '
                               'affiliates of such licensee/sublicensor?',
 'affiliate license-licensor': 'highlight the parts (if any) of this contract '
                               'related to "affiliate license-licensor" that '
                               'should be reviewed by a lawyer. details: does '
                               'the contract contain a license grant by '
                               'affiliates of the licensor or that includes '
                               'intellectual property of affiliates of the '
                               'licensor?',
 'agreement date': 'highlig

# Get the *full* clauses, not the abbreviated ones -- already provided in `./data` folder

**Move CUAD_v1 to your local directory**

In [None]:
# Read in from the excel spreadsheets
clause_spreadsheet_dir = "./CUAD_v1/label_group_xlsx"

In [15]:
clause_spreadsheets = os.listdir(clause_spreadsheet_dir)
clause_spreadsheets

['Label Report - ROFR-ROFO-ROFN.xlsx',
 'Label Report - Covenant not to Sue_ Release of Claims.xlsx',
 'Label Report - Revenue-Profit Sharing.xlsx',
 'Label Report - Audit Rights.xlsx',
 'Label Report - Non-Disparagement.xlsx',
 'Label Report - Minimum Commitment.xlsx',
 'Label Report - No-Solicit of Employees.xlsx',
 'Label Report - Governing Law.xlsx',
 'Label Report - Source Code Escrow.xlsx',
 'Label Report - Price Restrictions.xlsx',
 'Label Report - Third Party Beneficiary.xlsx',
 'Label Report - Liquidated Damages.xlsx',
 'Label Report - Dates (Group 1).xlsx',
 'Label Report - Unlimited-All-You-Can-Eat License.xlsx',
 'Label Report - Anti-assignment, CIC (Group 3).xlsx',
 'Label Report - IP Ownership Assignment.xlsx',
 'Label Report - Termination for Convenience.xlsx',
 'Label Report - Volume Restriction.xlsx',
 'Label Report - Post-Termination Services.xlsx',
 'Label Report - Parties.xlsx',
 'Label Report - Most Favored Nation.xlsx',
 'Label Report - Joint IP Ownership.xlsx',
 

In [16]:
# Concatenate excel spreadsheets into one df
clause_dfs = []
all_column_names = []
counter = 0
for clause_spreadsheet in tqdm(clause_spreadsheets):
    
    # few exceptions because the names are formatted differently
    if clause_spreadsheet == "Label Report - ROFR-ROFO-ROFN.xlsx":
        
        df = pd.read_excel(os.path.join(clause_spreadsheet_dir, clause_spreadsheet), index_col=0)
        df = df.rename(columns={"ROFR-ROFO-ROFN": "rofr/rofo/rofn"})
        column_name = "rofr/rofo/rofn"
        # queries = [clause_type_to_query[column_name]]*len(df)
        
        clause_df = pd.DataFrame(
            {
                "clause_type": column_name.lower(),
                "query": clause_type_to_query[column_name],
                "raw_text": list(df[column_name])
            }
        )
        
        clause_dfs.append(clause_df)
        
    elif clause_spreadsheet == "Label Report - Unlimited-All-You-Can-Eat License.xlsx":
        
        df = pd.read_excel(os.path.join(clause_spreadsheet_dir, clause_spreadsheet), index_col=0)
        df = df.rename(columns={"Unlimited/All-You-Can-Eat License": "unlimited/all-you-can-eat-license"})
        column_name = "unlimited/all-you-can-eat-license"
        # queries = [clause_type_to_query[column_name]]*len(df)
        
        clause_df = pd.DataFrame(
            {
                "clause_type": column_name.lower(),
                "query": clause_type_to_query[column_name],
                "raw_text": list(df[column_name])
            }
        )
        
        clause_dfs.append(clause_df)
    
    elif clause_spreadsheet == "Label Report - Revenue-Profit Sharing.xlsx":
        
        df = pd.read_excel(os.path.join(clause_spreadsheet_dir, clause_spreadsheet), index_col=0)
        df = df.rename(columns={"Revenue-Profit Sharing": "revenue/profit sharing"})
        column_name = "revenue/profit sharing"
        
        clause_df = pd.DataFrame(
            {
                "clause_type": column_name.lower(),
                "query": clause_type_to_query[column_name],
                "raw_text": list(df[column_name])
            }
        )
        
        clause_dfs.append(clause_df)
    
    else:
        print(f"reading {os.path.join(clause_spreadsheet_dir, clause_spreadsheet)}")
        df = pd.read_excel(os.path.join(clause_spreadsheet_dir, clause_spreadsheet), index_col=0)
        print(len(df))
        counter += len(df)
        column_names = list(df.columns)
        # column_names = [el.lower() for el in list(df.columns)]

        for column_name in column_names:
            if column_name.lower() in all_clause_categories:
                queries = [clause_type_to_query[column_name.lower()]]*len(df)

                clause_df = pd.DataFrame(
                    {
                        "clause_type": column_name.lower(),
                        "query": clause_type_to_query[column_name.lower()],
                        "raw_text": list(df[column_name])
                    }
                )
                clause_dfs.append(clause_df)
                all_column_names.append(column_name.lower())
        #

    

 25%|███████▎                     | 7/28 [00:00<00:00, 33.46it/s]

reading ./CUAD_v1/label_group_xlsx/Label Report - Covenant not to Sue_ Release of Claims.xlsx
100
reading ./CUAD_v1/label_group_xlsx/Label Report - Audit Rights.xlsx
271
reading ./CUAD_v1/label_group_xlsx/Label Report - Non-Disparagement.xlsx
38
reading ./CUAD_v1/label_group_xlsx/Label Report - Minimum Commitment.xlsx
165
reading ./CUAD_v1/label_group_xlsx/Label Report - No-Solicit of Employees.xlsx
59
reading ./CUAD_v1/label_group_xlsx/Label Report - Governing Law.xlsx
510
reading ./CUAD_v1/label_group_xlsx/Label Report - Source Code Escrow.xlsx
13
reading ./CUAD_v1/label_group_xlsx/Label Report - Price Restrictions.xlsx
15
reading ./CUAD_v1/label_group_xlsx/Label Report - Third Party Beneficiary.xlsx
33
reading ./CUAD_v1/label_group_xlsx/Label Report - Liquidated Damages.xlsx
61
reading ./CUAD_v1/label_group_xlsx/Label Report - Dates (Group 1).xlsx


 64%|██████████████████          | 18/28 [00:00<00:00, 40.92it/s]

501
reading ./CUAD_v1/label_group_xlsx/Label Report - Anti-assignment, CIC (Group 3).xlsx
376
reading ./CUAD_v1/label_group_xlsx/Label Report - IP Ownership Assignment.xlsx
124
reading ./CUAD_v1/label_group_xlsx/Label Report - Termination for Convenience.xlsx
183
reading ./CUAD_v1/label_group_xlsx/Label Report - Volume Restriction.xlsx
82
reading ./CUAD_v1/label_group_xlsx/Label Report - Post-Termination Services.xlsx
271
reading ./CUAD_v1/label_group_xlsx/Label Report - Parties.xlsx
510
reading ./CUAD_v1/label_group_xlsx/Label Report - Most Favored Nation.xlsx
28
reading ./CUAD_v1/label_group_xlsx/Label Report - Joint IP Ownership.xlsx
46
reading ./CUAD_v1/label_group_xlsx/Label Report - Warranty Duration.xlsx


100%|████████████████████████████| 28/28 [00:00<00:00, 37.37it/s]

75
reading ./CUAD_v1/label_group_xlsx/Label Report - Document Name.xlsx
510
reading ./CUAD_v1/label_group_xlsx/Label Report - Insurance.xlsx
167
reading ./CUAD_v1/label_group_xlsx/Label Report - Licenses (Group 4).xlsx
261
reading ./CUAD_v1/label_group_xlsx/Label Report - Non-Compete, Exclusivity, No-Solicit of Customers (Group 2).xlsx
241
reading ./CUAD_v1/label_group_xlsx/Label Report - Uncapped Liability (Group 5).xlsx
275





In [17]:
len(clause_dfs)

41

In [18]:
all_cuad_data = pd.concat(clause_dfs).reset_index()
len(all_cuad_data)

9605

In [20]:
all_cuad_data = all_cuad_data.dropna()
len(all_cuad_data)

6702

In [21]:
all_cuad_data.head()

Unnamed: 0,index,clause_type,query,raw_text
0,0,rofr/rofo/rofn,highlight the parts (if any) of this contract ...,"If, during the Term, Licensor develops or obta..."
1,1,rofr/rofo/rofn,highlight the parts (if any) of this contract ...,"During the Term, Women.com agrees to give eDie..."
2,2,rofr/rofo/rofn,highlight the parts (if any) of this contract ...,"During the Term (including any renewal Term, i..."
3,3,rofr/rofo/rofn,highlight the parts (if any) of this contract ...,"Notwithstanding the foregoing, in the event th..."
4,4,rofr/rofo/rofn,highlight the parts (if any) of this contract ...,"Notwithstanding the foregoing, in the event Ex..."


In [22]:
all_cuad_data["text"] = all_cuad_data["raw_text"].apply(lambda row: ' '.join(row.replace('\n',' ').split()).lower())


In [23]:
all_cuad_data.head()

Unnamed: 0,index,clause_type,query,raw_text,text
0,0,rofr/rofo/rofn,highlight the parts (if any) of this contract ...,"If, during the Term, Licensor develops or obta...","if, during the term, licensor develops or obta..."
1,1,rofr/rofo/rofn,highlight the parts (if any) of this contract ...,"During the Term, Women.com agrees to give eDie...","during the term, women.com agrees to give edie..."
2,2,rofr/rofo/rofn,highlight the parts (if any) of this contract ...,"During the Term (including any renewal Term, i...","during the term (including any renewal term, i..."
3,3,rofr/rofo/rofn,highlight the parts (if any) of this contract ...,"Notwithstanding the foregoing, in the event th...","notwithstanding the foregoing, in the event th..."
4,4,rofr/rofo/rofn,highlight the parts (if any) of this contract ...,"Notwithstanding the foregoing, in the event Ex...","notwithstanding the foregoing, in the event ex..."


In [24]:
all_cuad_data.to_csv("./data/original_cuad_data/cuad_data_full_clauses.csv")

In [38]:
all_cuad_data = pd.read_csv("./data/original_cuad_data/cuad_data_full_clauses.csv")

In [39]:
len(all_cuad_data)

6702

# Make a train/test split -- different clause categories in train/test

In [None]:
all_cuad_data = pd.read_csv("./data/original_cuad_data/cuad_data_full_clauses.csv")

In [52]:
all_clause_types = list(all_cuad_data.groupby(["clause_type"]).count().index)
clause_type_to_label_dict = {clause_type: i for clause_type, i in zip(all_clause_types, list(range(len(all_clause_types))))}
all_cuad_data["label"] = all_cuad_data["clause_type"].map(clause_type_to_label_dict)

In [56]:
sorted_grouped_data = all_cuad_data.groupby(["clause_type"]).count().sort_values(by='label', ascending=False)
print(len(sorted_grouped_data))
sorted_grouped_data

41


Unnamed: 0_level_0,Unnamed: 0,index,query,raw_text,text,label
clause_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
document name,510,510,510,510,510,510
parties,509,509,509,509,509,509
agreement date,470,470,470,470,470,470
governing law,437,437,437,437,437,437
expiration date,413,413,413,413,413,413
effective date,390,390,390,390,390,390
anti-assignment,374,374,374,374,374,374
cap on liability,275,275,275,275,275,275
license grant,255,255,255,255,255,255
audit rights,214,214,214,214,214,214


In [59]:
test_set_categories = list(sorted_grouped_data.index)[3::4]
len(test_set_categories)

10

In [60]:
train_set_categories = list(set(list(sorted_grouped_data.index)) - set(test_set_categories))
len(train_set_categories)

31

In [62]:
train_cuad_data = all_cuad_data[all_cuad_data["clause_type"].isin(train_set_categories)]
test_cuad_data = all_cuad_data[all_cuad_data["clause_type"].isin(test_set_categories)]


In [66]:
len(train_cuad_data)

5240

In [67]:
len(test_cuad_data)

1462

In [68]:
assert len(train_cuad_data) + len(test_cuad_data) == len(all_cuad_data)
print("train/test split passed")

train/test split passed


In [101]:
train_cuad_data = train_cuad_data[["clause_type","query", "raw_text", "text", "label"]]
test_cuad_data = test_cuad_data[["clause_type","query", "raw_text", "text", "label"]]

In [None]:
if not os.path.exists('./data/cuad_data_different_clauses'):
    os.makedirs('./data/cuad_data_different_clauses')

In [102]:
train_cuad_data.to_csv("./data/cuad_data_different_clauses/train_cuad_data.csv")

In [103]:
test_cuad_data.to_csv("./data/cuad_data_different_clauses/test_cuad_data.csv")
test_cuad_data.to_csv("./data/test_sets/test_cuad_data_different_clauses.csv")

In [94]:
train_cuad_data.head()

Unnamed: 0,clause_type,query,raw_text,text,label
0,agreement date,highlight the parts (if any) of this contract ...,"October 1, 1999 (Page 1)","october 1, 1999 (page 1)",2
1,agreement date,highlight the parts (if any) of this contract ...,"7th day of September, 1999. (Page 1)","7th day of september, 1999. (page 1)",2
2,document name,highlight the parts (if any) of this contract ...,SUPPLY AGREEMENT (Page 1),supply agreement (page 1),9
3,anti-assignment,highlight the parts (if any) of this contract ...,Developer may not assign or transfer this Agre...,developer may not assign or transfer this agre...,3
4,agreement date,highlight the parts (if any) of this contract ...,"March 12,1999 (Page 1)","march 12,1999 (page 1)",2


In [95]:
test_cuad_data.head()

Unnamed: 0,clause_type,query,raw_text,text,label
0,change of control,highlight the parts (if any) of this contract ...,The BSP may after giving due consideration to ...,the bsp may after giving due consideration to ...,6
1,governing law,highlight the parts (if any) of this contract ...,This Agreement shall be governed by and interp...,this agreement shall be governed by and interp...,13
2,governing law,highlight the parts (if any) of this contract ...,"This Agreement is governed by, and will be co...","this agreement is governed by, and will be con...",13
3,post-termination services,highlight the parts (if any) of this contract ...,In the event of termination of this Agreement ...,in the event of termination of this agreement ...,29
4,covenant not to sue,highlight the parts (if any) of this contract ...,Company agrees that it will not at any time co...,company agrees that it will not at any time co...,8


## Read in train/test data to make sure everything checks out

In [108]:
train_cuad_data = pd.read_csv("./data/cuad_data_different_clauses/train_cuad_data.csv")
print(len(train_cuad_data))
train_cuad_data.head()

5240


Unnamed: 0.1,Unnamed: 0,clause_type,query,raw_text,text,label
0,0,agreement date,highlight the parts (if any) of this contract ...,"October 1, 1999 (Page 1)","october 1, 1999 (page 1)",2
1,1,agreement date,highlight the parts (if any) of this contract ...,"7th day of September, 1999. (Page 1)","7th day of september, 1999. (page 1)",2
2,2,document name,highlight the parts (if any) of this contract ...,SUPPLY AGREEMENT (Page 1),supply agreement (page 1),9
3,3,anti-assignment,highlight the parts (if any) of this contract ...,Developer may not assign or transfer this Agre...,developer may not assign or transfer this agre...,3
4,4,agreement date,highlight the parts (if any) of this contract ...,"March 12,1999 (Page 1)","march 12,1999 (page 1)",2


In [109]:
test_cuad_data = pd.read_csv("data/cuad_data_different_clauses/test_cuad_data.csv")
print(len(test_cuad_data))
test_cuad_data.head()

1462


Unnamed: 0.1,Unnamed: 0,clause_type,query,raw_text,text,label
0,0,change of control,highlight the parts (if any) of this contract ...,The BSP may after giving due consideration to ...,the bsp may after giving due consideration to ...,6
1,1,governing law,highlight the parts (if any) of this contract ...,This Agreement shall be governed by and interp...,this agreement shall be governed by and interp...,13
2,2,governing law,highlight the parts (if any) of this contract ...,"This Agreement is governed by, and will be co...","this agreement is governed by, and will be con...",13
3,3,post-termination services,highlight the parts (if any) of this contract ...,In the event of termination of this Agreement ...,in the event of termination of this agreement ...,29
4,4,covenant not to sue,highlight the parts (if any) of this contract ...,Company agrees that it will not at any time co...,company agrees that it will not at any time co...,8


In [113]:
print(len(set(train_cuad_data["clause_type"])))
set(train_cuad_data["clause_type"])

31


{'affiliate license-licensor',
 'agreement date',
 'anti-assignment',
 'audit rights',
 'competitive restriction exception',
 'document name',
 'effective date',
 'exclusivity',
 'expiration date',
 'ip ownership assignment',
 'irrevocable or perpetual license',
 'joint ip ownership',
 'license grant',
 'liquidated damages',
 'minimum commitment',
 'most favored nation',
 'no-solicit of customers',
 'no-solicit of employees',
 'non-compete',
 'non-disparagement',
 'non-transferable license',
 'notice period to terminate renewal',
 'parties',
 'renewal term',
 'revenue/profit sharing',
 'rofr/rofo/rofn',
 'source code escrow',
 'termination for convenience',
 'uncapped liability',
 'unlimited/all-you-can-eat-license',
 'volume restriction'}

In [114]:
print(len(set(test_cuad_data["clause_type"])))
set(test_cuad_data["clause_type"])

10


{'affiliate license-licensee',
 'cap on liability',
 'change of control',
 'covenant not to sue',
 'governing law',
 'insurance',
 'post-termination services',
 'price restrictions',
 'third party beneficiary',
 'warranty duration'}

In [112]:
set(test_cuad_data["clause_type"]).intersection(set(train_cuad_data["clause_type"]))

set()

## Format train datasets for cross/dual encoders

Cross encoder is trained with `[anchor, positive]` binary cross entropy loss while the dual encoder is trained `[anchor, positive, negative]` with a triplet cross entropy loss.

In [3]:
train_cuad_data = pd.read_csv("./data/cuad_data_different_clauses/train_cuad_data.csv")

In [4]:
test_cuad_data = pd.read_csv("data/cuad_data_different_clauses/test_cuad_data.csv")


Create query, positive, negative triplets from the dataframe -- the below code cells for creating the data for the dual/cross encoders is repeated many times. Sample three negatives for each positive.

In [10]:
num_negatives = 3

anchors_train = []
positives_train = []
negatives_train = []

positive_categories_train = []
negative_categories_train = []

for row in tqdm(range(len(train_cuad_data))):
    
    category = train_cuad_data.iloc[row]["clause_type"]
    
    query = train_cuad_data.iloc[row]["query"].replace("\xa0", "").lower()
    anchors_train.append(query)
    
    # clean up the text
    positive_sample = train_cuad_data.iloc[row]["text"].lower()
    positive_sample = ' '.join(positive_sample.replace('\n',' ').split())
    
    positives_train.append(positive_sample)
    
    # get the negative sample and category
    potential_negative_samples = train_cuad_data[train_cuad_data["clause_type"] != category].sample(num_negatives)
    
    negative_samples = [el.lower() for el in list(potential_negative_samples["text"])]
    negative_samples = [' '.join(negative_sample.replace('\n',' ').split()) for negative_sample in negative_samples]
    
    negative_categories = list(potential_negative_samples["clause_type"])
    
    negatives_train.append(negative_samples)
    
    # get the categories of the positive and negative samples
    positive_categories_train.append(category)
    negative_categories_train.append(negative_categories)

100%|████████████████████████████████████████████████████████████████████████████| 5240/5240 [00:10<00:00, 487.41it/s]


In [11]:
triplet_train_dataframe = pd.DataFrame(list(zip(anchors_train, positives_train, negatives_train)),
                                    columns=["anchor", "positive", "negatives"])

print(len(triplet_train_dataframe))
triplet_train_dataframe.head()

5240


Unnamed: 0,anchor,positive,negatives
0,highlight the parts (if any) of this contract ...,"october 1, 1999 (page 1)","[distributor agreement (page 1), subject to th..."
1,highlight the parts (if any) of this contract ...,"7th day of september, 1999. (page 1)",[neither this agreement nor any right created ...
2,highlight the parts (if any) of this contract ...,supply agreement (page 1),[the parties hereby acknowledge and agree that...
3,highlight the parts (if any) of this contract ...,developer may not assign or transfer this agre...,[upon [ * ] days advance notice or such shorte...
4,highlight the parts (if any) of this contract ...,"march 12,1999 (page 1)","[joint filing agreement (page 1), all rights t..."


### Dual encoder

Create `[anchor, positive, negative triplets]` for the dual encoder

In [12]:
triplet_train_exploded_df = (
    triplet_train_dataframe
    .explode('negatives')
    .reset_index()[["anchor", "positive", "negatives"]]
    .rename({"positive" : "positive_match", "negatives": "negative_match"}, axis=1)
)  

len(triplet_train_exploded_df)


15720

In [13]:
triplet_train_exploded_df.head()

Unnamed: 0,anchor,positive_match,negative_match
0,highlight the parts (if any) of this contract ...,"october 1, 1999 (page 1)",distributor agreement (page 1)
1,highlight the parts (if any) of this contract ...,"october 1, 1999 (page 1)",subject to the terms and conditions of this ag...
2,highlight the parts (if any) of this contract ...,"october 1, 1999 (page 1)",ppg shanghai shall have the right to terminate...
3,highlight the parts (if any) of this contract ...,"7th day of september, 1999. (page 1)",neither this agreement nor any right created b...
4,highlight the parts (if any) of this contract ...,"7th day of september, 1999. (page 1)","neither party may assign this agreement, in wh..."


In [14]:
triplet_train_exploded_df.to_csv("./data/train_cuad_dual_encoder.csv")


### Cross encoder

Create `[query, result, label]` examples for cross encoder

In [15]:
anchor_positive = triplet_train_dataframe[["anchor", "positive"]]
anchor_negatives = triplet_train_dataframe[["anchor", "negatives"]]

In [16]:
anchor_positive = anchor_positive.rename(columns={"anchor": "query", "positive": "text"})
anchor_positive["label"] = 1

In [17]:
anchor_negatives_exploded = (
    anchor_negatives
    .explode('negatives')
    .reset_index()[["anchor", "negatives"]]
    .rename({"anchor" : "query", "negatives": "text"}, axis=1)
)    

anchor_negatives_exploded['label'] = 0

In [18]:
cross_encoder_train_data = pd.concat([anchor_positive, anchor_negatives_exploded]).reset_index()


In [19]:
len(cross_encoder_train_data)

20960

In [20]:
cross_encoder_train_data.head()

Unnamed: 0,index,query,text,label
0,0,highlight the parts (if any) of this contract ...,"october 1, 1999 (page 1)",1
1,1,highlight the parts (if any) of this contract ...,"7th day of september, 1999. (page 1)",1
2,2,highlight the parts (if any) of this contract ...,supply agreement (page 1),1
3,3,highlight the parts (if any) of this contract ...,developer may not assign or transfer this agre...,1
4,4,highlight the parts (if any) of this contract ...,"march 12,1999 (page 1)",1


In [21]:
cross_encoder_train_data[5239:].head()

Unnamed: 0,index,query,text,label
5239,5239,highlight the parts (if any) of this contract ...,"thereafter, bellicum shall have consecutive se...",1
5240,0,highlight the parts (if any) of this contract ...,distributor agreement (page 1),0
5241,1,highlight the parts (if any) of this contract ...,subject to the terms and conditions of this ag...,0
5242,2,highlight the parts (if any) of this contract ...,ppg shanghai shall have the right to terminate...,0
5243,3,highlight the parts (if any) of this contract ...,neither this agreement nor any right created b...,0


In [22]:
cross_encoder_train_data.to_csv("./data/train_cuad_cross_encoder.csv")

Final check

In [23]:
train_cuad_cross_encoder = pd.read_csv("./data/train_cuad_cross_encoder.csv")

In [24]:
train_cuad_dual_encoder = pd.read_csv("./data/train_cuad_dual_encoder.csv")

In [25]:
len(train_cuad_cross_encoder)

20960

In [26]:
len(train_cuad_dual_encoder)

15720

# Make a train/test split -- same clause categories in train/test

In [5]:
from sklearn.model_selection import train_test_split

In [19]:
all_cuad_data = pd.read_csv("./data/original_cuad_data/cuad_data_full_clauses.csv")
len(all_cuad_data)

6702

In [20]:
all_cuad_data.head()

Unnamed: 0.1,Unnamed: 0,index,clause_type,query,raw_text,text
0,0,0,rofr/rofo/rofn,highlight the parts (if any) of this contract ...,"If, during the Term, Licensor develops or obta...","if, during the term, licensor develops or obta..."
1,1,1,rofr/rofo/rofn,highlight the parts (if any) of this contract ...,"During the Term, Women.com agrees to give eDie...","during the term, women.com agrees to give edie..."
2,2,2,rofr/rofo/rofn,highlight the parts (if any) of this contract ...,"During the Term (including any renewal Term, i...","during the term (including any renewal term, i..."
3,3,3,rofr/rofo/rofn,highlight the parts (if any) of this contract ...,"Notwithstanding the foregoing, in the event th...","notwithstanding the foregoing, in the event th..."
4,4,4,rofr/rofo/rofn,highlight the parts (if any) of this contract ...,"Notwithstanding the foregoing, in the event Ex...","notwithstanding the foregoing, in the event ex..."


In [21]:
train, test = train_test_split(all_cuad_data, test_size=0.21814383766)

In [24]:
len(train)

5240

In [25]:
len(test)

1462

In [26]:
if not os.path.exists("./data/cuad_data_same_clauses"):
    os.makedirs("./data/cuad_data_same_clauses")

In [27]:
len(list(set(train["clause_type"])))

41

In [28]:
len(list(set(test["clause_type"])))

41

In [29]:
train.to_csv("./data/cuad_data_same_clauses/train_cuad_data.csv")
test.to_csv("./data/cuad_data_same_clauses/test_cuad_data.csv")
test.to_csv("./data/test_sets/test_cuad_data_same_clauses.csv")


## Read in train/test data to make sure everything checks out

In [44]:
train_cuad_data = pd.read_csv("./data/cuad_data_same_clauses/train_cuad_data.csv")
print(len(train_cuad_data))
train_cuad_data.head()

5240


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,clause_type,query,raw_text,text
0,1800,1959,443,agreement date,highlight the parts (if any) of this contract ...,"March 30, 2011 (Page 1)","march 30, 2011 (page 1)"
1,1059,1149,265,governing law,highlight the parts (if any) of this contract ...,Effective legislation of the Republic of Kazak...,effective legislation of the republic of kazak...
2,3080,4407,369,change of control,highlight the parts (if any) of this contract ...,If there is a change in Control of the first p...,if there is a change in control of the first p...
3,2436,2731,213,expiration date,highlight the parts (if any) of this contract ...,This Agreement shall automatically terminate a...,this agreement shall automatically terminate a...
4,4118,5540,90,parties,highlight the parts (if any) of this contract ...,"VerticalNet, Inc. (Page 1)\n\n""VerticalNet"" (P...","verticalnet, inc. (page 1) ""verticalnet"" (page..."


In [45]:
train_cuad_data = pd.read_csv("./data/cuad_data_same_clauses/test_cuad_data.csv")
print(len(train_cuad_data))
train_cuad_data.head()

1462


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,clause_type,query,raw_text,text
0,4773,6196,87,document name,highlight the parts (if any) of this contract ...,CO-BRANDING AGREEMENT (Page 1),co-branding agreement (page 1)
1,1828,1989,473,agreement date,highlight the parts (if any) of this contract ...,"April 14, 2000 (Page 1)","april 14, 2000 (page 1)"
2,1527,1671,155,agreement date,highlight the parts (if any) of this contract ...,"March 15, 2019 (Page 1)","march 15, 2019 (page 1)"
3,2783,3409,390,renewal term,highlight the parts (if any) of this contract ...,"As provided for in this Section 1, the term of...","as provided for in this section 1, the term of..."
4,1412,1545,29,agreement date,highlight the parts (if any) of this contract ...,"September 27, 2018 (Page 5)","september 27, 2018 (page 5)"


In [50]:
print(len(set(train_cuad_data["clause_type"])))
set(train_cuad_data["clause_type"])

41


{'affiliate license-licensee',
 'affiliate license-licensor',
 'agreement date',
 'anti-assignment',
 'audit rights',
 'cap on liability',
 'change of control',
 'competitive restriction exception',
 'covenant not to sue',
 'document name',
 'effective date',
 'exclusivity',
 'expiration date',
 'governing law',
 'insurance',
 'ip ownership assignment',
 'irrevocable or perpetual license',
 'joint ip ownership',
 'license grant',
 'liquidated damages',
 'minimum commitment',
 'most favored nation',
 'no-solicit of customers',
 'no-solicit of employees',
 'non-compete',
 'non-disparagement',
 'non-transferable license',
 'notice period to terminate renewal',
 'parties',
 'post-termination services',
 'price restrictions',
 'renewal term',
 'revenue/profit sharing',
 'rofr/rofo/rofn',
 'source code escrow',
 'termination for convenience',
 'third party beneficiary',
 'uncapped liability',
 'unlimited/all-you-can-eat-license',
 'volume restriction',
 'warranty duration'}

In [51]:
test_cuad_data = pd.read_csv("./data/cuad_data_same_clauses/test_cuad_data.csv")
print(len(train_cuad_data))
train_cuad_data.head()

1462


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,clause_type,query,raw_text,text
0,4773,6196,87,document name,highlight the parts (if any) of this contract ...,CO-BRANDING AGREEMENT (Page 1),co-branding agreement (page 1)
1,1828,1989,473,agreement date,highlight the parts (if any) of this contract ...,"April 14, 2000 (Page 1)","april 14, 2000 (page 1)"
2,1527,1671,155,agreement date,highlight the parts (if any) of this contract ...,"March 15, 2019 (Page 1)","march 15, 2019 (page 1)"
3,2783,3409,390,renewal term,highlight the parts (if any) of this contract ...,"As provided for in this Section 1, the term of...","as provided for in this section 1, the term of..."
4,1412,1545,29,agreement date,highlight the parts (if any) of this contract ...,"September 27, 2018 (Page 5)","september 27, 2018 (page 5)"


In [52]:
print(len(set(test_cuad_data["clause_type"])))
set(test_cuad_data["clause_type"])

41


{'affiliate license-licensee',
 'affiliate license-licensor',
 'agreement date',
 'anti-assignment',
 'audit rights',
 'cap on liability',
 'change of control',
 'competitive restriction exception',
 'covenant not to sue',
 'document name',
 'effective date',
 'exclusivity',
 'expiration date',
 'governing law',
 'insurance',
 'ip ownership assignment',
 'irrevocable or perpetual license',
 'joint ip ownership',
 'license grant',
 'liquidated damages',
 'minimum commitment',
 'most favored nation',
 'no-solicit of customers',
 'no-solicit of employees',
 'non-compete',
 'non-disparagement',
 'non-transferable license',
 'notice period to terminate renewal',
 'parties',
 'post-termination services',
 'price restrictions',
 'renewal term',
 'revenue/profit sharing',
 'rofr/rofo/rofn',
 'source code escrow',
 'termination for convenience',
 'third party beneficiary',
 'uncapped liability',
 'unlimited/all-you-can-eat-license',
 'volume restriction',
 'warranty duration'}

## Format train datasets for cross/dual encoders

Cross encoder is trained with `[anchor, positive]` binary cross entropy loss while the dual encoder is trained `[anchor, positive, negative]` with a triplet cross entropy loss.

In [56]:
train_cuad_data = pd.read_csv("./data/cuad_data_same_clauses/train_cuad_data.csv")
test_cuad_data = pd.read_csv("./data/cuad_data_same_clauses/test_cuad_data.csv")


Sample three negatives for each positive

In [57]:
num_negatives = 3

anchors_train = []
positives_train = []
negatives_train = []

positive_categories_train = []
negative_categories_train = []

for row in tqdm(range(len(train_cuad_data))):
    
    category = train_cuad_data.iloc[row]["clause_type"]
    
    query = train_cuad_data.iloc[row]["query"].replace("\xa0", "").lower()
    anchors_train.append(query)
    
    # clean up the text
    positive_sample = train_cuad_data.iloc[row]["text"].lower()
    positive_sample = ' '.join(positive_sample.replace('\n',' ').split())
    
    positives_train.append(positive_sample)
    
    # get the negative sample and category
    potential_negative_samples = train_cuad_data[train_cuad_data["clause_type"] != category].sample(num_negatives)
    
    negative_samples = [el.lower() for el in list(potential_negative_samples["text"])]
    negative_samples = [' '.join(negative_sample.replace('\n',' ').split()) for negative_sample in negative_samples]
    
    negative_categories = list(potential_negative_samples["clause_type"])
    
    negatives_train.append(negative_samples)
    
    # get the categories of the positive and negative samples
    positive_categories_train.append(category)
    negative_categories_train.append(negative_categories)

100%|████████████████████████████████████████████████████████████████| 5240/5240 [00:04<00:00, 1153.09it/s]


In [58]:
triplet_train_dataframe = pd.DataFrame(list(zip(anchors_train, positives_train, negatives_train)),
                                    columns=["anchor", "positive", "negatives"])

print(len(triplet_train_dataframe))
triplet_train_dataframe.head()

5240


Unnamed: 0,anchor,positive,negatives
0,highlight the parts (if any) of this contract ...,"march 30, 2011 (page 1)",[the celebrity will be limited to six speaking...
1,highlight the parts (if any) of this contract ...,effective legislation of the republic of kazak...,[utk agrees to provide the following distinct ...
2,highlight the parts (if any) of this contract ...,if there is a change in control of the first p...,[the liability cap set forth under section 6.5...
3,highlight the parts (if any) of this contract ...,this agreement shall automatically terminate a...,[this agreement shall be governed by and const...
4,highlight the parts (if any) of this contract ...,"verticalnet, inc. (page 1) ""verticalnet"" (page...",[each party shall obtain and maintain commerci...


## Dual encoder

In [59]:
triplet_train_exploded_df = (
    triplet_train_dataframe
    .explode('negatives')
    .reset_index()[["anchor", "positive", "negatives"]]
    .rename({"positive" : "positive_match", "negatives": "negative_match"}, axis=1)
)  

len(triplet_train_exploded_df)

15720

In [60]:
triplet_train_exploded_df.head()

Unnamed: 0,anchor,positive_match,negative_match
0,highlight the parts (if any) of this contract ...,"march 30, 2011 (page 1)",the celebrity will be limited to six speaking ...
1,highlight the parts (if any) of this contract ...,"march 30, 2011 (page 1)","february 21, 2011 (page 1)"
2,highlight the parts (if any) of this contract ...,"march 30, 2011 (page 1)",airspan warrants that during the term of this ...
3,highlight the parts (if any) of this contract ...,effective legislation of the republic of kazak...,utk agrees to provide the following distinct s...
4,highlight the parts (if any) of this contract ...,effective legislation of the republic of kazak...,"throughout the term of this agreement, the par..."


In [61]:
triplet_train_exploded_df.to_csv("./data/cuad_data_same_clauses/train_cuad_dual_encoder.csv")


## Cross encoder

In [62]:
anchor_positive = triplet_train_dataframe[["anchor", "positive"]]
anchor_negatives = triplet_train_dataframe[["anchor", "negatives"]]

In [63]:
anchor_positive = anchor_positive.rename(columns={"anchor": "query", "positive": "text"})
anchor_positive["label"] = 1

In [64]:
anchor_negatives_exploded = (
    anchor_negatives
    .explode('negatives')
    .reset_index()[["anchor", "negatives"]]
    .rename({"anchor" : "query", "negatives": "text"}, axis=1)
)    

anchor_negatives_exploded['label'] = 0

In [65]:
cross_encoder_train_data = pd.concat([anchor_positive, anchor_negatives_exploded]).reset_index()


In [66]:
len(cross_encoder_train_data)

20960

In [67]:
cross_encoder_train_data.head()

Unnamed: 0,index,query,text,label
0,0,highlight the parts (if any) of this contract ...,"march 30, 2011 (page 1)",1
1,1,highlight the parts (if any) of this contract ...,effective legislation of the republic of kazak...,1
2,2,highlight the parts (if any) of this contract ...,if there is a change in control of the first p...,1
3,3,highlight the parts (if any) of this contract ...,this agreement shall automatically terminate a...,1
4,4,highlight the parts (if any) of this contract ...,"verticalnet, inc. (page 1) ""verticalnet"" (page...",1


In [68]:
cross_encoder_train_data[5239:].head()

Unnamed: 0,index,query,text,label
5239,5239,highlight the parts (if any) of this contract ...,"furthermore, without derogating from nice' und...",1
5240,0,highlight the parts (if any) of this contract ...,the celebrity will be limited to six speaking ...,0
5241,1,highlight the parts (if any) of this contract ...,"february 21, 2011 (page 1)",0
5242,2,highlight the parts (if any) of this contract ...,airspan warrants that during the term of this ...,0
5243,3,highlight the parts (if any) of this contract ...,utk agrees to provide the following distinct s...,0


In [69]:
cross_encoder_train_data.to_csv("./data/cuad_data_same_clauses/train_cuad_cross_encoder.csv")


Final check

In [70]:
train_cuad_cross_encoder = pd.read_csv("./data/cuad_data_same_clauses/train_cuad_cross_encoder.csv")


In [75]:
train_cuad_dual_encoder = pd.read_csv("./data/cuad_data_same_clauses/train_cuad_dual_encoder.csv")


In [76]:
len(train_cuad_cross_encoder)

20960

In [77]:
len(train_cuad_dual_encoder)

15720

## ApplicaAI CUAD training set

Remove the overlapping categories between `CUAD` and `ApplicaAI` datasets

In [2]:
data_path = "./data/original_cuad_data/cuad_data_applica_ai_excluded_clauses.csv"

In [3]:
df = pd.read_csv(data_path)
len(df)

5661

In [4]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,clause_type,query,raw_text,text
0,0,0,0,rofr/rofo/rofn,highlight the parts (if any) of this contract ...,"If, during the Term, Licensor develops or obta...","if, during the term, licensor develops or obta..."
1,1,1,1,rofr/rofo/rofn,highlight the parts (if any) of this contract ...,"During the Term, Women.com agrees to give eDie...","during the term, women.com agrees to give edie..."
2,2,2,2,rofr/rofo/rofn,highlight the parts (if any) of this contract ...,"During the Term (including any renewal Term, i...","during the term (including any renewal term, i..."
3,3,3,3,rofr/rofo/rofn,highlight the parts (if any) of this contract ...,"Notwithstanding the foregoing, in the event th...","notwithstanding the foregoing, in the event th..."
4,4,4,4,rofr/rofo/rofn,highlight the parts (if any) of this contract ...,"Notwithstanding the foregoing, in the event Ex...","notwithstanding the foregoing, in the event ex..."


In [10]:
df.iloc[560]["query"]

'highlight the parts (if any) of this contract related to "audit rights" that should be reviewed by a lawyer. details: does a party have the right to audit the books, records, or physical locations of the counterparty to ensure compliance with the contract?'

In [12]:
num_negatives = 3

anchors_train = []
positives_train = []
negatives_train = []

positive_categories_train = []
negative_categories_train = []

for row in tqdm(range(len(df))):
    
    category = df.iloc[row]["clause_type"]
    
    query = df.iloc[row]["query"].replace("\xa0", "").strip().lower()
    anchors_train.append(query)
    
    # clean up the text
    positive_sample = df.iloc[row]["text"]
    postive_sample = ' '.join(positive_sample.replace('\n',' ').split())
    positive_sample = positive_sample.lower()

    positives_train.append(positive_sample)
    
    # get the negative sample and category
    potential_negative_samples = df[df["text"] != category].sample(num_negatives)
    
    negative_samples = [el.lower() for el in list(potential_negative_samples["text"])]
    negative_samples = [' '.join(negative_sample.replace('\n',' ').split()) for negative_sample in negative_samples]
    
    negative_categories = list(potential_negative_samples["clause_type"])
    
    negatives_train.append(negative_samples)
    
    # get the categories of the positive and negative samples
    positive_categories_train.append(category)
    negative_categories_train.append(negative_categories)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 5661/5661 [00:08<00:00, 679.53it/s]


In [13]:
triplet_train_dataframe = pd.DataFrame(list(zip(anchors_train, positives_train, negatives_train)),
                                    columns=["anchor", "positive", "negatives"])

print(len(triplet_train_dataframe))
triplet_train_dataframe.head()

5661


Unnamed: 0,anchor,positive,negatives
0,highlight the parts (if any) of this contract ...,"if, during the term, licensor develops or obta...","[during the term of this agreement, and for a ..."
1,highlight the parts (if any) of this contract ...,"during the term, women.com agrees to give edie...",[without the prior written consent of the othe...
2,highlight the parts (if any) of this contract ...,"during the term (including any renewal term, i...","[startengine crowdfunding, inc. (page 1) “comp..."
3,highlight the parts (if any) of this contract ...,"notwithstanding the foregoing, in the event th...","[subject to the terms of this agreement, ppi h..."
4,highlight the parts (if any) of this contract ...,"notwithstanding the foregoing, in the event ex...","[the parties will ensure that marketing, promo..."


### Dual encoder

In [14]:
triplet_train_exploded_df = (
    triplet_train_dataframe
    .explode('negatives')
    .reset_index()[["anchor", "positive", "negatives"]]
    .rename({"positive" : "positive_match", "negatives": "negative_match"}, axis=1)
)  

len(triplet_train_exploded_df)

16983

In [15]:
triplet_train_exploded_df.head()

Unnamed: 0,anchor,positive_match,negative_match
0,highlight the parts (if any) of this contract ...,"if, during the term, licensor develops or obta...","during the term of this agreement, and for a p..."
1,highlight the parts (if any) of this contract ...,"if, during the term, licensor develops or obta...",notwithstanding anything herein to the contrar...
2,highlight the parts (if any) of this contract ...,"if, during the term, licensor develops or obta...","r. c. boyd enterprises, llc (page 1) ""company""..."
3,highlight the parts (if any) of this contract ...,"during the term, women.com agrees to give edie...",without the prior written consent of the other...
4,highlight the parts (if any) of this contract ...,"during the term, women.com agrees to give edie...","during the term, each party hereby provides a ..."


In [16]:
triplet_train_exploded_df.to_csv("./data/original_cuad_data/train_cuad_data_applica_ai_dual_encoder.csv")


### Cross encoder

In [17]:
anchor_positive = triplet_train_dataframe[["anchor", "positive"]]
anchor_negatives = triplet_train_dataframe[["anchor", "negatives"]]

In [18]:
anchor_positive = anchor_positive.rename(columns={"anchor": "query", "positive": "text"})
anchor_positive["label"] = 1

In [19]:
anchor_negatives_exploded = (
    anchor_negatives
    .explode('negatives')
    .reset_index()[["anchor", "negatives"]]
    .rename({"anchor" : "query", "negatives": "text"}, axis=1)
)    

anchor_negatives_exploded["label"] = 0

In [20]:
cross_encoder_train_data = pd.concat([anchor_positive, anchor_negatives_exploded]).reset_index()

In [21]:
cross_encoder_train_data[5660:].head()

Unnamed: 0,index,query,text,label
5660,5660,highlight the parts (if any) of this contract ...,except with respect to the parties' liability ...,1
5661,0,highlight the parts (if any) of this contract ...,"during the term of this agreement, and for a p...",0
5662,1,highlight the parts (if any) of this contract ...,notwithstanding anything herein to the contrar...,0
5663,2,highlight the parts (if any) of this contract ...,"r. c. boyd enterprises, llc (page 1) ""company""...",0
5664,3,highlight the parts (if any) of this contract ...,without the prior written consent of the other...,0


In [22]:
 len(cross_encoder_train_data)

22644

In [23]:
cross_encoder_train_data.to_csv("./data/original_cuad_data/train_cuad_data_applica_ai_cross_encoder.csv")

final check

In [24]:
df1 = pd.read_csv("./data/original_cuad_data/train_cuad_data_applica_ai_dual_encoder.csv")

In [25]:
df2 = pd.read_csv("./data/original_cuad_data/train_cuad_data_applica_ai_cross_encoder.csv")

In [26]:
len(df1)

16983

In [27]:
len(df2)

22644

In [28]:
df1.columns

Index(['Unnamed: 0', 'anchor', 'positive_match', 'negative_match'], dtype='object')

In [29]:
df2.columns

Index(['Unnamed: 0', 'index', 'query', 'text', 'label'], dtype='object')

In [30]:
len(set(df1["anchor"]))

36

In [32]:
len(set(df2["query"]))

36