In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import json
import jsonlines
from glob import glob
import os
import re
from tqdm import tqdm
import os
import ipdb

# Synthetic CUAD

## Read in `jsonl`

In [32]:
data_path = "./data/cuad_data_synthetic/synthetic_contracts_cuad.jsonl"

In [33]:
all_query_clauses = []
with open(data_path, "r") as reader:
    for obj in reader:
        all_query_clauses.append(obj)

all_query_clauses = [json.loads(el) for el in all_query_clauses]

In [34]:
len(all_query_clauses)

34661

In [35]:
df = pd.DataFrame.from_dict(all_query_clauses)
df

Unnamed: 0,question,clause
0,highlight the parts (if any) of this contract ...,(e) The Company shall furnish to the Inve...
1,highlight the parts (if any) of this contract ...,"(b) The worth, at the time of the award, of th..."
2,highlight the parts (if any) of this contract ...,The Operator shall elect pursuant to Section 2...
3,highlight the parts (if any) of this contract ...,Except as set forth on the attached UCC Filing...
4,highlight the parts (if any) of this contract ...,"sales proceeds and all other income, issues, p..."
...,...,...
34656,highlight the parts (if any) of this contract ...,7. Notices. Any notices or other communication...
34657,highlight the parts (if any) of this contract ...,The invalidity or unenforceability of any prov...
34658,highlight the parts (if any) of this contract ...,"“Obligations” means any costs, expenses or lia..."
34659,highlight the parts (if any) of this contract ...,"(a) It is duly organized, validly existing, an..."


In [41]:
df["clause_type"] = df["question"].apply(lambda row: re.findall('"([^"]*)"', row)[0])

In [42]:
# Create smaller training sets to test effects of training size
df_5k = df.sample(5000)
df_10k = df.sample(10000)
df_20k = df.sample(20000)

In [43]:
df_20k

Unnamed: 0,question,clause,clause_type
2813,highlight the parts (if any) of this contract ...,"Except as provided in Sections 4 and 5 above, ...",survival of prior agreement
9369,highlight the parts (if any) of this contract ...,c. The cleaning solution should not be sprayed...,maintenance instructions
29462,highlight the parts (if any) of this contract ...,"Award Recipient’s estate, personal representat...",tax withholding
10026,highlight the parts (if any) of this contract ...,"On September 30, 2005, Purchaser and Seller en...",asset purchase
5550,highlight the parts (if any) of this contract ...,1. Tenant Improvements. Reference herein to “...,tenant improvements
...,...,...,...
32369,highlight the parts (if any) of this contract ...,"(c) Comply in all respects with all statutes, ...",statutory compliance
2301,highlight the parts (if any) of this contract ...,"(b)recoup from the Participant if, in the Comm...",clawback provision
16721,highlight the parts (if any) of this contract ...,(c) If the Borrower shall be required to...,tax gross-up
33143,highlight the parts (if any) of this contract ...,"WHEREAS, Assignor, on August 15, 2001, entered...",incorporation by reference


In [44]:
df[df["clause_type"] == "indemnification"].iloc[50]["question"]

'highlight the parts (if any) of this contract related to "indemnification" that should be reviewed by a lawyer. details: is there an indemnification provision that requires one party to indemnify the counterparty for certain losses or liabilities?'

In [45]:
df.iloc[0]["clause_type"]

'supply of documents'

In [88]:
df_20k.groupby(['clause_type']).count().sort_values(by='clause', ascending=False)

Unnamed: 0_level_0,question,clause
clause_type,Unnamed: 1_level_1,Unnamed: 2_level_1
stock options,483,483
severability,267,267
termination for cause,240,240
indemnification,212,212
termination of employment,208,208
...,...,...
stock options adjustment,1,1
stock options after death,1,1
stock options and RSUs,1,1
stock options and employment,1,1


## Create dual/cross encoder training sets

## orig

Create query, positive, negative triplets from the dataframe -- the below code cells for creating the data for the dual/cross encoders is repeated many times. Sample three negatives for each positive.

In [14]:
num_negatives = 3

anchors_train = []
positives_train = []
negatives_train = []

positive_categories_train = []
negative_categories_train = []

for row in tqdm(range(len(df))):
    
    category = df.iloc[row]["clause_type"]
    
    query = df.iloc[row]["question"].replace("\xa0", "").lower()
    anchors_train.append(query)
    
    # clean up the text
    positive_sample = df.iloc[row]["clause"]
    postive_sample = ' '.join(positive_sample.replace('\n',' ').split()).lower()
    
    positives_train.append(positive_sample)
    
    # get the negative sample and category
    potential_negative_samples = df[df["clause_type"] != category].sample(num_negatives)
    
    negative_samples = [el.lower() for el in list(potential_negative_samples["clause"])]
    negative_samples = [' '.join(negative_sample.replace('\n',' ').split()) for negative_sample in negative_samples]
    
    negative_categories = list(potential_negative_samples["clause_type"])
    
    negatives_train.append(negative_samples)
    
    # get the categories of the positive and negative samples
    positive_categories_train.append(category)
    negative_categories_train.append(negative_categories)

100%|██████████████████████████████████████████████████████████████| 34661/34661 [02:55<00:00, 197.93it/s]


In [15]:
triplet_train_dataframe = pd.DataFrame(list(zip(anchors_train, positives_train, negatives_train)),
                                    columns=["anchor", "positive", "negatives"])

print(len(triplet_train_dataframe))
triplet_train_dataframe.head()

34661


Unnamed: 0,anchor,positive,negatives
0,highlight the parts (if any) of this contract ...,(e) The Company shall furnish to the Inve...,[this lock-up and leak-out and release agreeme...
1,highlight the parts (if any) of this contract ...,"(b) The worth, at the time of the award, of th...",[5. rights of grantee during restricted period...
2,highlight the parts (if any) of this contract ...,The Operator shall elect pursuant to Section 2...,[(iv) use of funds in accordance with operatin...
3,highlight the parts (if any) of this contract ...,Except as set forth on the attached UCC Filing...,[l. breach of representations and warranties b...
4,highlight the parts (if any) of this contract ...,"sales proceeds and all other income, issues, p...",[to landlord: landlord understands that collie...


### Dual encoder

Create `[anchor, positive, negative triplets]` for the dual encoder

In [16]:
triplet_train_exploded_df = (
    triplet_train_dataframe
    .explode('negatives')
    .reset_index()[["anchor", "positive", "negatives"]]
    .rename({"positive" : "positive_match", "negatives": "negative_match"}, axis=1)
)  

len(triplet_train_exploded_df)


103983

In [17]:
triplet_train_exploded_df.head()

Unnamed: 0,anchor,positive_match,negative_match
0,highlight the parts (if any) of this contract ...,(e) The Company shall furnish to the Inve...,this lock-up and leak-out and release agreemen...
1,highlight the parts (if any) of this contract ...,(e) The Company shall furnish to the Inve...,7.6 headings. the headings and captions used i...
2,highlight the parts (if any) of this contract ...,(e) The Company shall furnish to the Inve...,he has seven (7) days following the date of hi...
3,highlight the parts (if any) of this contract ...,"(b) The worth, at the time of the award, of th...",5. rights of grantee during restricted period....
4,highlight the parts (if any) of this contract ...,"(b) The worth, at the time of the award, of th...",23. no exculpatory provisions which may be con...


In [19]:
triplet_train_exploded_df["clause_type"] = triplet_train_exploded_df["anchor"].apply(lambda row: re.findall('"([^"]*)"', row)[0])

In [70]:
triplet_train_exploded_df.to_csv("./data/train_synthetic_cuad_dual_encoder.csv")

### Cross encoder

Create `[query, result, label]` examples for cross encoder

In [72]:
anchor_positive = triplet_train_dataframe[["anchor", "positive"]]
anchor_negatives = triplet_train_dataframe[["anchor", "negatives"]]

In [73]:
anchor_positive = anchor_positive.rename(columns={"anchor": "query", "positive": "text"})
anchor_positive["label"] = 1

In [76]:
anchor_negatives_exploded = (
    anchor_negatives
    .explode('negatives')
    .reset_index()[["anchor", "negatives"]]
    .rename({"anchor" : "query", "negatives": "text"}, axis=1)
)    

anchor_negatives_exploded["label"] = 0

In [77]:
len(anchor_negatives_exploded)

103983

In [78]:
cross_encoder_train_data = pd.concat([anchor_positive, anchor_negatives_exploded]).reset_index()

In [79]:
cross_encoder_train_data[34659:].head()

Unnamed: 0,index,query,text,label
34659,34659,highlight the parts (if any) of this contract ...,"(a) It is duly organized, validly existing, an...",1
34660,34660,highlight the parts (if any) of this contract ...,(e) Independent Lender Obligations. The failur...,1
34661,0,highlight the parts (if any) of this contract ...,"12. all demands, notices and communications re...",0
34662,1,highlight the parts (if any) of this contract ...,4.1.1 the term “basic rent” for any month duri...,0
34663,2,highlight the parts (if any) of this contract ...,"(b) the execution, delivery and performance of...",0


In [80]:
cross_encoder_train_data.to_csv("./data/train_synthetic_cuad_cross_encoder.csv")

Final check

In [81]:
train_cuad_cross_encoder = pd.read_csv("./data/train_synthetic_cuad_cross_encoder.csv")

In [2]:
train_cuad_dual_encoder = pd.read_csv("./data/train_synthetic_cuad_dual_encoder.csv")

In [83]:
len(train_cuad_cross_encoder)

138644

In [3]:
len(train_cuad_dual_encoder)

103983

## 5k training set

In [50]:
num_negatives = 3

anchors_train = []
positives_train = []
negatives_train = []

positive_categories_train = []
negative_categories_train = []

for row in tqdm(range(len(df_5k))):
    
    category = df_5k.iloc[row]["clause_type"]
    
    query = df_5k.iloc[row]["question"].replace("\xa0", "").lower()
    anchors_train.append(query)
    
    # clean up the text
    positive_sample = df_5k.iloc[row]["clause"]
    postive_sample = ' '.join(positive_sample.replace('\n',' ').split()).lower()
    
    positives_train.append(positive_sample)
    
    # get the negative sample and category
    potential_negative_samples = df_5k[df_5k["clause_type"] != category].sample(num_negatives)
    
    negative_samples = [el.lower() for el in list(potential_negative_samples["clause"])]
    negative_samples = [' '.join(negative_sample.replace('\n',' ').split()) for negative_sample in negative_samples]
    
    negative_categories = list(potential_negative_samples["clause_type"])
    
    negatives_train.append(negative_samples)
    
    # get the categories of the positive and negative samples
    positive_categories_train.append(category)
    negative_categories_train.append(negative_categories)

100%|███████████████████████████████████████████████| 5000/5000 [00:11<00:00, 448.91it/s]


In [51]:
triplet_train_dataframe = pd.DataFrame(list(zip(anchors_train, positives_train, negatives_train)),
                                    columns=["anchor", "positive", "negatives"])

print(len(triplet_train_dataframe))
triplet_train_dataframe.head()

5000


Unnamed: 0,anchor,positive,negatives
0,highlight the parts (if any) of this contract ...,3. Tax Rates — For purposes of determining the...,[2.3.1 price and invoicing. provider will invo...
1,highlight the parts (if any) of this contract ...,(c) the financial statements of the Company ...,[3 section 2.1 grant of security interest. 3 s...
2,highlight the parts (if any) of this contract ...,(B) prior to the vote contemplated by section ...,[principal life insurance company by: /s/ chri...
3,highlight the parts (if any) of this contract ...,"3. Conflicting Employment. I agree that, durin...",[(3) “good reason” means a material breach by ...
4,highlight the parts (if any) of this contract ...,(2) Lack of Good Faith. To indemnify Ind...,[ii) delivery of common stock upon conversion....


### Dual encoder

In [52]:
triplet_train_exploded_df = (
    triplet_train_dataframe
    .explode('negatives')
    .reset_index()[["anchor", "positive", "negatives"]]
    .rename({"positive" : "positive_match", "negatives": "negative_match"}, axis=1)
)  

len(triplet_train_exploded_df)

15000

In [53]:
triplet_train_exploded_df["clause_type"] = triplet_train_exploded_df["anchor"].apply(lambda row: re.findall('"([^"]*)"', row)[0])

In [58]:
triplet_train_exploded_df.groupby(["clause_type"]).count().sort_values(by="anchor", ascending=False)

Unnamed: 0_level_0,anchor,positive_match,negative_match
clause_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
stock options,366,366,366
severability,231,231,231
indemnification,168,168,168
termination for cause,168,168,168
termination of employment,162,162,162
...,...,...,...
salary review,3,3,3
salary and stock options,3,3,3
salary and bonus,3,3,3
salary adjustment,3,3,3


In [59]:
triplet_train_exploded_df.to_csv("./data/cuad_data_synthetic/train_cuad_dual_encoder_5k.csv")

### Cross encoder

In [60]:
anchor_positive = triplet_train_dataframe[["anchor", "positive"]]
anchor_negatives = triplet_train_dataframe[["anchor", "negatives"]]

In [61]:
anchor_positive = anchor_positive.rename(columns={"anchor": "query", "positive": "text"})
anchor_positive["label"] = 1

In [62]:
anchor_negatives_exploded = (
    anchor_negatives
    .explode('negatives')
    .reset_index()[["anchor", "negatives"]]
    .rename({"anchor" : "query", "negatives": "text"}, axis=1)
)    

anchor_negatives_exploded["label"] = 0


In [63]:
len(anchor_negatives_exploded)

15000

In [64]:
cross_encoder_train_data = pd.concat([anchor_positive, anchor_negatives_exploded]).reset_index()

In [65]:
cross_encoder_train_data[4999:].head()

Unnamed: 0,index,query,text,label
4999,4999,highlight the parts (if any) of this contract ...,". To the extent permitted by law, (i) all of ...",1
5000,0,highlight the parts (if any) of this contract ...,2.3.1 price and invoicing. provider will invoi...,0
5001,1,highlight the parts (if any) of this contract ...,(b) full legal name of registered holder (if n...,0
5002,2,highlight the parts (if any) of this contract ...,(a) all corporate and other actions required t...,0
5003,3,highlight the parts (if any) of this contract ...,3 section 2.1 grant of security interest. 3 se...,0


In [66]:
cross_encoder_train_data.to_csv("./data/cuad_data_synthetic/train_cuad_cross_encoder_5k.csv")

## 10k training set

In [67]:
num_negatives = 3

anchors_train = []
positives_train = []
negatives_train = []

positive_categories_train = []
negative_categories_train = []

for row in tqdm(range(len(df_10k))):
    
    category = df_10k.iloc[row]["clause_type"]
    
    query = df_10k.iloc[row]["question"].replace("\xa0", "").lower()
    anchors_train.append(query)
    
    # clean up the text
    positive_sample = df_10k.iloc[row]["clause"]
    postive_sample = ' '.join(positive_sample.replace('\n',' ').split()).lower()
    
    positives_train.append(positive_sample)
    
    # get the negative sample and category
    potential_negative_samples = df_10k[df_10k["clause_type"] != category].sample(num_negatives)
    
    negative_samples = [el.lower() for el in list(potential_negative_samples["clause"])]
    negative_samples = [' '.join(negative_sample.replace('\n',' ').split()) for negative_sample in negative_samples]
    
    negative_categories = list(potential_negative_samples["clause_type"])
    
    negatives_train.append(negative_samples)
    
    # get the categories of the positive and negative samples
    positive_categories_train.append(category)
    negative_categories_train.append(negative_categories)

100%|█████████████████████████████████████████████| 10000/10000 [00:23<00:00, 421.18it/s]


In [68]:
triplet_train_dataframe = pd.DataFrame(list(zip(anchors_train, positives_train, negatives_train)),
                                    columns=["anchor", "positive", "negatives"])

print(len(triplet_train_dataframe))
triplet_train_dataframe.head()

10000


Unnamed: 0,anchor,positive,negatives
0,highlight the parts (if any) of this contract ...,(2) An “eligible retirement plan” is an indivi...,[(e) reimbursement of business expenses. the c...
1,highlight the parts (if any) of this contract ...,"d. No Seller or the Acquired Company, or ...","[now, therefore, as a material inducement to l..."
2,highlight the parts (if any) of this contract ...,provided that the Company shall not be require...,[subsidiary jurisdiction of organization perce...
3,highlight the parts (if any) of this contract ...,This Award and the rights and privileges confe...,[at no time can employees use internet bulleti...
4,highlight the parts (if any) of this contract ...,"3. The Benefit shall be earned, a...",[(f) governing law. this agreement shall be go...


### Dual encoder

In [69]:
triplet_train_exploded_df = (
    triplet_train_dataframe
    .explode('negatives')
    .reset_index()[["anchor", "positive", "negatives"]]
    .rename({"positive" : "positive_match", "negatives": "negative_match"}, axis=1)
)  

len(triplet_train_exploded_df)

30000

In [70]:
triplet_train_exploded_df["clause_type"] = triplet_train_exploded_df["anchor"].apply(lambda row: re.findall('"([^"]*)"', row)[0])

In [73]:
triplet_train_exploded_df.groupby(["clause_type"]).count().sort_values(by="anchor", ascending=False)

Unnamed: 0_level_0,anchor,positive_match,negative_match
clause_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
stock options,756,756,756
severability,420,420,420
termination for cause,348,348,348
termination of employment,336,336,336
indemnification,327,327,327
...,...,...,...
revolving loan,3,3,3
right of entry,3,3,3
right of future funding,3,3,3
right of way and access,3,3,3


In [74]:
triplet_train_exploded_df.to_csv("./data/cuad_data_synthetic/train_cuad_dual_encoder_10k.csv")


### Cross encoder

In [75]:
anchor_positive = triplet_train_dataframe[["anchor", "positive"]]
anchor_negatives = triplet_train_dataframe[["anchor", "negatives"]]

In [76]:
anchor_positive = anchor_positive.rename(columns={"anchor": "query", "positive": "text"})
anchor_positive["label"] = 1

In [77]:
anchor_negatives_exploded = (
    anchor_negatives
    .explode('negatives')
    .reset_index()[["anchor", "negatives"]]
    .rename({"anchor" : "query", "negatives": "text"}, axis=1)
)    

anchor_negatives_exploded["label"] = 0

In [78]:
len(anchor_negatives_exploded)

30000

In [79]:
cross_encoder_train_data = pd.concat([anchor_positive, anchor_negatives_exploded]).reset_index()

In [80]:
cross_encoder_train_data[9999:].head()

Unnamed: 0,index,query,text,label
9999,9999,highlight the parts (if any) of this contract ...,(b) The Plan shall inure to the benefit of and...,1
10000,0,highlight the parts (if any) of this contract ...,(e) reimbursement of business expenses. the co...,0
10001,1,highlight the parts (if any) of this contract ...,amended and restated participation a-1 certifi...,0
10002,2,highlight the parts (if any) of this contract ...,11.effect of amendment. this amendment is effe...,0
10003,3,highlight the parts (if any) of this contract ...,"now, therefore, as a material inducement to le...",0


In [81]:
cross_encoder_train_data.to_csv("./data/cuad_data_synthetic/train_cuad_cross_encoder_10k.csv")

## 20k training set

In [82]:
num_negatives = 3

anchors_train = []
positives_train = []
negatives_train = []

positive_categories_train = []
negative_categories_train = []

for row in tqdm(range(len(df_20k))):
    
    category = df_20k.iloc[row]["clause_type"]
    
    query = df_20k.iloc[row]["question"].replace("\xa0", "").lower()
    anchors_train.append(query)
    
    # clean up the text
    positive_sample = df_20k.iloc[row]["clause"]
    postive_sample = ' '.join(positive_sample.replace('\n',' ').split()).lower()
    
    positives_train.append(positive_sample)
    
    # get the negative sample and category
    potential_negative_samples = df_20k[df_20k["clause_type"] != category].sample(num_negatives)
    
    negative_samples = [el.lower() for el in list(potential_negative_samples["clause"])]
    negative_samples = [' '.join(negative_sample.replace('\n',' ').split()) for negative_sample in negative_samples]
    
    negative_categories = list(potential_negative_samples["clause_type"])
    
    negatives_train.append(negative_samples)
    
    # get the categories of the positive and negative samples
    positive_categories_train.append(category)
    negative_categories_train.append(negative_categories)

100%|█████████████████████████████████████████████| 20000/20000 [01:07<00:00, 297.33it/s]


In [83]:
triplet_train_dataframe = pd.DataFrame(list(zip(anchors_train, positives_train, negatives_train)),
                                    columns=["anchor", "positive", "negatives"])

print(len(triplet_train_dataframe))
triplet_train_dataframe.head()

20000


Unnamed: 0,anchor,positive,negatives
0,highlight the parts (if any) of this contract ...,"Except as provided in Sections 4 and 5 above, ...",[(p) the undersigned is aware that no federal ...
1,highlight the parts (if any) of this contract ...,c. The cleaning solution should not be sprayed...,"[(vi) the participant’s conviction of, or entr..."
2,highlight the parts (if any) of this contract ...,"Award Recipient’s estate, personal representat...",[8. non-solicitation; non-interference. employ...
3,highlight the parts (if any) of this contract ...,"On September 30, 2005, Purchaser and Seller en...","[“phase iii clinical trials” shall mean, with ..."
4,highlight the parts (if any) of this contract ...,1. Tenant Improvements. Reference herein to “...,[2.6 the closing. subject to the fulfillment o...


### Dual encoder

In [84]:
triplet_train_exploded_df = (
    triplet_train_dataframe
    .explode('negatives')
    .reset_index()[["anchor", "positive", "negatives"]]
    .rename({"positive" : "positive_match", "negatives": "negative_match"}, axis=1)
)  

len(triplet_train_exploded_df)

60000

In [85]:
triplet_train_exploded_df.head()

Unnamed: 0,anchor,positive_match,negative_match
0,highlight the parts (if any) of this contract ...,"Except as provided in Sections 4 and 5 above, ...",(p) the undersigned is aware that no federal o...
1,highlight the parts (if any) of this contract ...,"Except as provided in Sections 4 and 5 above, ...",“securities act” means the securities act of 1...
2,highlight the parts (if any) of this contract ...,"Except as provided in Sections 4 and 5 above, ...",section 4.05 entire agreement; amendments. thi...
3,highlight the parts (if any) of this contract ...,c. The cleaning solution should not be sprayed...,"(vi) the participant’s conviction of, or entry..."
4,highlight the parts (if any) of this contract ...,c. The cleaning solution should not be sprayed...,"ee. “person” means any individual, corporation..."


In [86]:
triplet_train_exploded_df["clause_type"] = triplet_train_exploded_df["anchor"].apply(lambda row: re.findall('"([^"]*)"', row)[0])

In [89]:
triplet_train_exploded_df.groupby(["clause_type"]).count().sort_values(by="anchor", ascending=False)

Unnamed: 0_level_0,anchor,positive_match,negative_match
clause_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
stock options,1449,1449,1449
severability,801,801,801
termination for cause,720,720,720
indemnification,636,636,636
termination of employment,624,624,624
...,...,...,...
stock plan administration,3,3,3
stock plan amendment,3,3,3
financial information disclosure,3,3,3
stock power,3,3,3


In [90]:
triplet_train_exploded_df.to_csv("./data/cuad_data_synthetic/train_cuad_dual_encoder_20k.csv")


### Cross encoder

In [91]:
anchor_positive = triplet_train_dataframe[["anchor", "positive"]]
anchor_negatives = triplet_train_dataframe[["anchor", "negatives"]]

In [92]:
anchor_positive = anchor_positive.rename(columns={"anchor": "query", "positive": "text"})
anchor_positive["label"] = 1

In [93]:
anchor_negatives_exploded = (
    anchor_negatives
    .explode('negatives')
    .reset_index()[["anchor", "negatives"]]
    .rename({"anchor" : "query", "negatives": "text"}, axis=1)
)    

anchor_negatives_exploded["label"] = 0


In [94]:
len(anchor_negatives_exploded)

60000

In [95]:
cross_encoder_train_data = pd.concat([anchor_positive, anchor_negatives_exploded]).reset_index()

In [96]:
cross_encoder_train_data[19999:].head()

Unnamed: 0,index,query,text,label
19999,19999,highlight the parts (if any) of this contract ...,23.Right to Terminate Employment. Nothing in ...,1
20000,0,highlight the parts (if any) of this contract ...,(p) the undersigned is aware that no federal o...,0
20001,1,highlight the parts (if any) of this contract ...,“securities act” means the securities act of 1...,0
20002,2,highlight the parts (if any) of this contract ...,section 4.05 entire agreement; amendments. thi...,0
20003,3,highlight the parts (if any) of this contract ...,"(vi) the participant’s conviction of, or entry...",0


In [97]:
cross_encoder_train_data.to_csv("./data/cuad_data_synthetic/train_cuad_cross_encoder_20k.csv")


# Synthetic discriminative contracts

In [2]:
data_path = "./data/cuad_data_synthetic_discriminative/synthetic_contracts_discriminative.jsonl"

In [3]:
all_query_clauses = []
with open(data_path, "r") as reader:
    for obj in reader:
        all_query_clauses.append(obj)

all_query_clauses = [json.loads(el) for el in all_query_clauses]

In [4]:
len(all_query_clauses)

19058

In [5]:
df = pd.DataFrame.from_dict(all_query_clauses)
df

Unnamed: 0,question,clause,clause_type
0,highlight the parts (if any) of this contract...,The Plan as set forth herein constitutes an am...,effective date reference
1,highlight the parts (if any) of this contract...,"(b) individuals who, as of the Effective Da...",change of control
2,highlight the parts (if any) of this contract...,(c) consummation by the Company of a reorga...,merger restrictions
3,highlight the parts (if any) of this contract...,"Notwithstanding the foregoing, with respect to...",change of control
4,highlight the parts (if any) of this contract...,"Subject to Section 5.1(b), Section 5.1(d) and ...",effective date reference
...,...,...,...
19053,highlight the parts (if any) of this contract...,(m) Indebtedness consisting of (i) the financi...,insurance
19054,highlight the parts (if any) of this contract...,"7.05 Fundamental Changes. Merge, dissolve, liq...",merger restrictions
19055,highlight the parts (if any) of this contract...,(a) any Restricted Subsidiary may merge with (...,merger restrictions
19056,highlight the parts (if any) of this contract...,(d) any Restricted Subsidiary may merge with a...,merger restrictions


In [6]:
len(set(df["clause_type"]))

27

In [7]:
set(df["clause_type"])

{'affiliate license-licensee',
 'auditor opinion',
 'cap on liability',
 'change of control',
 'confidential information form',
 'confidential period',
 'covenant not to sue',
 'cross default',
 'dispute resolution',
 'effective date main',
 'effective date reference',
 'financial statements',
 'governing document',
 'governing law',
 'income summary',
 'insurance',
 'litigation default',
 'main objective',
 'merger restrictions',
 'no solicitation',
 'post-termination services',
 'price restrictions',
 'reserves policy',
 'tax changes call',
 'third party beneficiary',
 'trustee appointment',
 'warranty duration'}

In [8]:
len(set(df["question"]))

10966

In [9]:
df2 = df[df["clause_type"] == "merger restrictions"]

In [10]:
df2.iloc[6]["question"]

' highlight the parts (if any) of this contract related to "successor liability" that should be reviewed by a lawyer. details: does the contract contain a clause that requires a successor entity to assume the obligations of the original contracting party in the event of a merger, acquisition, or other reorganization?'

In [11]:
' '.join(df2.iloc[8]["question"].split())

'highlight the parts (if any) of this contract related to "transfer of assets" that should be reviewed by a lawyer. details: is consent or notice required of a party if it transfers all or substantially all of its assets to a third party?'

In [34]:
' '.join(sample.replace('\n',' ').split()).lower()

'the plan as set forth herein constitutes an amendment and restatement of the company’s 2012 omnibus incentive compensation plan as in effect immediately prior to the effective date (the “prior plan”). the prior plan replaced the company’s 2008 omnibus incentive compensation plan effective as of may 15, 2012, and no further awards have or will be made under such 2008 omnibus incentive compensation plan from and after such date. the plan shall supersede and replace in its entirety the prior plan; provided, however, that, notwithstanding any provisions herein to the contrary, each award granted under the prior plan prior to the effective date shall be subject to the terms and provisions applicable to such award under the prior plan as in effect immediately prior to the effective date.'

In [25]:
query = df.iloc[33]["question"].strip()
query


'highlight the parts (if any) of this contract related to "term modification" that should be reviewed by a lawyer. details: does the contract contain a modification or amendment to a term or condition from a prior agreement?'

In [36]:
num_negatives = 3

anchors_train = []
positives_train = []
negatives_train = []

positive_categories_train = []
negative_categories_train = []

for row in tqdm(range(len(df))):
    
    category = df.iloc[row]["clause_type"]
    
    query = df.iloc[row]["question"].replace("\xa0", "").strip().lower()
    anchors_train.append(query)
    
    # clean up the text
    positive_sample = df.iloc[row]["clause"]
    postive_sample = ' '.join(positive_sample.replace('\n',' ').split())
    positive_sample = positive_sample.lower()

    positives_train.append(positive_sample)
    
    # get the negative sample and category
    potential_negative_samples = df[df["clause_type"] != category].sample(num_negatives)
    
    negative_samples = [el.lower() for el in list(potential_negative_samples["clause"])]
    negative_samples = [' '.join(negative_sample.replace('\n',' ').split()) for negative_sample in negative_samples]
    
    negative_categories = list(potential_negative_samples["clause_type"])
    
    negatives_train.append(negative_samples)
    
    # get the categories of the positive and negative samples
    positive_categories_train.append(category)
    negative_categories_train.append(negative_categories)

100%|███████████████████████████████████████████████████████████████| 19058/19058 [00:54<00:00, 352.73it/s]


In [37]:
triplet_train_dataframe = pd.DataFrame(list(zip(anchors_train, positives_train, negatives_train)),
                                    columns=["anchor", "positive", "negatives"])

print(len(triplet_train_dataframe))
triplet_train_dataframe.head()

19058


Unnamed: 0,anchor,positive,negatives
0,highlight the parts (if any) of this contract ...,the plan as set forth herein constitutes an am...,[no amendment to or modification or waiver of ...
1,highlight the parts (if any) of this contract ...,"(b) individuals who, as of the effective da...",[“total assets” means the total assets of the ...
2,highlight the parts (if any) of this contract ...,(c) consummation by the company of a reorga...,[section 6.05. notice of adverse change. promp...
3,highlight the parts (if any) of this contract ...,"notwithstanding the foregoing, with respect to...",[whereas the issuer is issuing the class a-1 1...
4,highlight the parts (if any) of this contract ...,"subject to section 5.1(b), section 5.1(d) and ...",[(e) business interruption insurance caused by...


In [28]:
triplet_train_dataframe.iloc[0]["anchor"]

'highlight the parts (if any) of this contract related to "superseding agreement" that should be reviewed by a lawyer. details: does this contract supersede or replace a prior agreement between the parties?'

## Generate queries using the original clause types

In [80]:
def create_query(clause):
    
    return f'highlight the parts (if any) of this contract related to "{clause}" that should be reviewed by a lawyer.'


In [68]:
create_query('test')

'highlight the parts (if any) of this contract related to "test" that should be reviewed by a lawyer.'

In [78]:
print(len(df))
df[:10]

19058


Unnamed: 0,question,clause,clause_type,query
0,highlight the parts (if any) of this contract...,The Plan as set forth herein constitutes an am...,effective date reference,highlight the parts (if any) of this contract ...
1,highlight the parts (if any) of this contract...,"(b) individuals who, as of the Effective Da...",change of control,highlight the parts (if any) of this contract ...
2,highlight the parts (if any) of this contract...,(c) consummation by the Company of a reorga...,merger restrictions,highlight the parts (if any) of this contract ...
3,highlight the parts (if any) of this contract...,"Notwithstanding the foregoing, with respect to...",change of control,highlight the parts (if any) of this contract ...
4,highlight the parts (if any) of this contract...,"Subject to Section 5.1(b), Section 5.1(d) and ...",effective date reference,highlight the parts (if any) of this contract ...
5,highlight the parts (if any) of this contract...,Each share of Common Stock subject to a Full V...,effective date reference,highlight the parts (if any) of this contract ...
6,highlight the parts (if any) of this contract...,Subject to adjustment as provided in Section 5...,effective date reference,highlight the parts (if any) of this contract ...
7,highlight the parts (if any) of this contract...,of Common Stock repurchased on the open market...,effective date reference,highlight the parts (if any) of this contract ...
8,highlight the parts (if any) of this contract...,Any shares of Common Stock that again become a...,effective date reference,highlight the parts (if any) of this contract ...
9,highlight the parts (if any) of this contract...,No Option or Stock Appreciation Right granted ...,change of control,highlight the parts (if any) of this contract ...


In [62]:
df.iloc[0]["question"]

' highlight the parts (if any) of this contract related to "superseding agreement" that should be reviewed by a lawyer. details: does this contract supersede or replace a prior agreement between the parties?'

In [69]:
df["query"] = df["clause_type"].apply(lambda row: create_query(row))

In [81]:
df.iloc[2]["query"]

'highlight the parts (if any) of this contract related to "merger restrictions" that should be reviewed by a lawyer.'

In [83]:
num_negatives = 3

anchors_train = []
positives_train = []
negatives_train = []

positive_categories_train = []
negative_categories_train = []

for row in tqdm(range(len(df))):
    
    category = df.iloc[row]["clause_type"]
    
    query = df.iloc[row]["query"].replace("\xa0", "").strip().lower()
    anchors_train.append(query)
    
    # clean up the text
    positive_sample = df.iloc[row]["clause"]
    postive_sample = ' '.join(positive_sample.replace('\n',' ').split())
    positive_sample = positive_sample.lower()

    positives_train.append(positive_sample)
    
    # get the negative sample and category
    potential_negative_samples = df[df["clause_type"] != category].sample(num_negatives)
    
    negative_samples = [el.lower() for el in list(potential_negative_samples["clause"])]
    negative_samples = [' '.join(negative_sample.replace('\n',' ').split()) for negative_sample in negative_samples]
    
    negative_categories = list(potential_negative_samples["clause_type"])
    
    negatives_train.append(negative_samples)
    
    # get the categories of the positive and negative samples
    positive_categories_train.append(category)
    negative_categories_train.append(negative_categories)

100%|██████████████████████████████████████████████████████████████| 19058/19058 [01:02<00:00, 305.27it/s]


In [85]:
triplet_train_dataframe = pd.DataFrame(list(zip(anchors_train, positives_train, negatives_train)),
                                    columns=["anchor", "positive", "negatives"])

print(len(triplet_train_dataframe))
triplet_train_dataframe.head()

19058


Unnamed: 0,anchor,positive,negatives
0,highlight the parts (if any) of this contract ...,the plan as set forth herein constitutes an am...,[the entering into and the performance of this...
1,highlight the parts (if any) of this contract ...,"(b) individuals who, as of the effective da...",[this amended endorsement split dollar agreeme...
2,highlight the parts (if any) of this contract ...,(c) consummation by the company of a reorga...,"[term. this agreement shall terminate, except ..."
3,highlight the parts (if any) of this contract ...,"notwithstanding the foregoing, with respect to...",[5.4 financial statements. the financial state...
4,highlight the parts (if any) of this contract ...,"subject to section 5.1(b), section 5.1(d) and ...",[benefits as a full-time employee you will be ...


In [88]:
triplet_train_dataframe["clause_type"] = triplet_train_dataframe["anchor"].apply(lambda row: re.findall('"([^"]*)"', row)[0])


In [90]:
triplet_train_dataframe.iloc[0]["anchor"]

'highlight the parts (if any) of this contract related to "effective date reference" that should be reviewed by a lawyer.'

In [132]:
triplet_train_dataframe.head()

Unnamed: 0,anchor,positive,negatives,clause_type
0,highlight the parts (if any) of this contract ...,the plan as set forth herein constitutes an am...,[the entering into and the performance of this...,effective date reference
1,highlight the parts (if any) of this contract ...,"(b) individuals who, as of the effective da...",[this amended endorsement split dollar agreeme...,change of control
2,highlight the parts (if any) of this contract ...,(c) consummation by the company of a reorga...,"[term. this agreement shall terminate, except ...",merger restrictions
3,highlight the parts (if any) of this contract ...,"notwithstanding the foregoing, with respect to...",[5.4 financial statements. the financial state...,change of control
4,highlight the parts (if any) of this contract ...,"subject to section 5.1(b), section 5.1(d) and ...",[benefits as a full-time employee you will be ...,effective date reference


### Dual encoder

In [91]:
triplet_train_exploded_df = (
    triplet_train_dataframe
    .explode('negatives')
    .reset_index()[["anchor", "positive", "negatives"]]
    .rename({"positive" : "positive_match", "negatives": "negative_match"}, axis=1)
)  

len(triplet_train_exploded_df)

57174

In [92]:
triplet_train_exploded_df.head()

Unnamed: 0,anchor,positive_match,negative_match
0,highlight the parts (if any) of this contract ...,the plan as set forth herein constitutes an am...,the entering into and the performance of this ...
1,highlight the parts (if any) of this contract ...,the plan as set forth herein constitutes an am...,"(c) individuals who, as of the date hereof, co..."
2,highlight the parts (if any) of this contract ...,the plan as set forth herein constitutes an am...,"for purposes of this plan, ""change in control ..."
3,highlight the parts (if any) of this contract ...,"(b) individuals who, as of the effective da...",this amended endorsement split dollar agreemen...
4,highlight the parts (if any) of this contract ...,"(b) individuals who, as of the effective da...",a. the parties agree to modify their ongoing b...


In [93]:
triplet_train_exploded_df.to_csv("./data/cuad_data_synthetic_discriminative/train_synthetic_discriminative_mod_query_orig_clause_cat_cuad_dual_encoder.csv")


### Cross encoder

In [94]:
anchor_positive = triplet_train_dataframe[["anchor", "positive"]]
anchor_negatives = triplet_train_dataframe[["anchor", "negatives"]]

In [95]:
anchor_positive = anchor_positive.rename(columns={"anchor": "query", "positive": "text"})
anchor_positive["label"] = 1

In [96]:
anchor_negatives_exploded = (
    anchor_negatives
    .explode('negatives')
    .reset_index()[["anchor", "negatives"]]
    .rename({"anchor" : "query", "negatives": "text"}, axis=1)
)    

anchor_negatives_exploded["label"] = 0

In [97]:
len(anchor_negatives_exploded)

57174

In [98]:
cross_encoder_train_data = pd.concat([anchor_positive, anchor_negatives_exploded]).reset_index()

In [99]:
cross_encoder_train_data[19057:].head()

Unnamed: 0,index,query,text,label
19057,19057,highlight the parts (if any) of this contract ...,(e) each of the borrower and any of its restri...,1
19058,0,highlight the parts (if any) of this contract ...,the entering into and the performance of this ...,0
19059,1,highlight the parts (if any) of this contract ...,"(c) individuals who, as of the date hereof, co...",0
19060,2,highlight the parts (if any) of this contract ...,"for purposes of this plan, ""change in control ...",0
19061,3,highlight the parts (if any) of this contract ...,this amended endorsement split dollar agreemen...,0


In [109]:
len(cross_encoder_train_data)

76232

In [110]:
cross_encoder_train_data.to_csv("./data/cuad_data_synthetic_discriminative/train_synthetic_discriminative_mod_query_orig_clause_cat_cuad_cross_encoder.csv")


final check

In [115]:
df1 = pd.read_csv("./data/cuad_data_synthetic_targeted/train_synthetic_discriminative_mod_query_orig_clause_cat_cuad_dual_encoder.csv")

df2 = pd.read_csv("./data/cuad_data_synthetic_targeted/train_synthetic_discriminative_mod_query_orig_clause_cat_cuad_cross_encoder.csv")

In [116]:
len(df1)

57174

In [117]:
len(df2)

76232

In [130]:
df1.iloc[5]["anchor"]

'highlight the parts (if any) of this contract related to "change of control" that should be reviewed by a lawyer.'

In [131]:
df2.iloc[3]["query"]

'highlight the parts (if any) of this contract related to "change of control" that should be reviewed by a lawyer.'

# Squad

In [2]:
data_path = "./data/squad_data/synthetic_contracts_squad.jsonl"

In [3]:
all_query_clauses = []
with open(data_path, "r") as reader:
    for obj in reader:
        all_query_clauses.append(obj)

all_query_clauses = [json.loads(el) for el in all_query_clauses]

In [4]:
len(all_query_clauses)

34661

In [5]:
df = pd.DataFrame.from_dict(all_query_clauses)
df

Unnamed: 0,question,clause
0,Will the Company provide the Investor Represen...,(e) The Company shall furnish to the Inve...
1,What is the worth of the award based on in thi...,"(b) The worth, at the time of the award, of th..."
2,Who is responsible for collecting and remittin...,The Operator shall elect pursuant to Section 2...
3,What is the total balance owing for the liens ...,Except as set forth on the attached UCC Filing...
4,What is the Assigned Property in this clause?,"sales proceeds and all other income, issues, p..."
...,...,...
34656,How should notices or communications be delive...,7. Notices. Any notices or other communication...
34657,What happens if a provision in this offer lett...,The invalidity or unenforceability of any prov...
34658,"What does the term ""Obligations"" mean in the c...","“Obligations” means any costs, expenses or lia..."
34659,Does the company have the authority to execute...,"(a) It is duly organized, validly existing, an..."


In [7]:
df.iloc[0]['question']

'Will the Company provide the Investor Representative with a copy of the Registration Statement and its amendments?'

In [9]:
num_negatives = 3

anchors_train = []
positives_train = []
negatives_train = []


for row in tqdm(range(len(df))):
        
    query = df.iloc[row]["question"].replace("\xa0", "").strip().lower()
    anchors_train.append(query)
    
    # clean up the text
    positive_sample = df.iloc[row]["clause"]
    postive_sample = ' '.join(positive_sample.replace('\n',' ').split())
    positive_sample = positive_sample.lower()

    positives_train.append(positive_sample)
    
    # get the negative sample and category
    potential_negative_samples = df[df["clause"] != positive_sample].sample(num_negatives)
    
    negative_samples = [el.lower() for el in list(potential_negative_samples["clause"])]
    negative_samples = [' '.join(negative_sample.replace('\n',' ').split()) for negative_sample in negative_samples]
        
    negatives_train.append(negative_samples)
    


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 34661/34661 [02:40<00:00, 215.74it/s]


In [10]:
triplet_train_dataframe = pd.DataFrame(list(zip(anchors_train, positives_train, negatives_train)),
                                    columns=["anchor", "positive", "negatives"])

print(len(triplet_train_dataframe))
triplet_train_dataframe.head()

34661


Unnamed: 0,anchor,positive,negatives
0,will the company provide the investor represen...,(e) the company shall furnish to the inve...,[* accrued obligations paid within thirty (30)...
1,what is the worth of the award based on in thi...,"(b) the worth, at the time of the award, of th...",[(f) perquisites. in addition to the other ben...
2,who is responsible for collecting and remittin...,the operator shall elect pursuant to section 2...,[in consideration of the agreement by harman i...
3,what is the total balance owing for the liens ...,except as set forth on the attached ucc filing...,[section 25.1 independent contractor. this agr...
4,what is the assigned property in this clause?,"sales proceeds and all other income, issues, p...",[(v) if the executive states in the executive ...


### Dual encoder

In [11]:
triplet_train_exploded_df = (
    triplet_train_dataframe
    .explode('negatives')
    .reset_index()[["anchor", "positive", "negatives"]]
    .rename({"positive" : "positive_match", "negatives": "negative_match"}, axis=1)
)  

len(triplet_train_exploded_df)

103983

In [12]:
triplet_train_exploded_df.head()

Unnamed: 0,anchor,positive_match,negative_match
0,will the company provide the investor represen...,(e) the company shall furnish to the inve...,* accrued obligations paid within thirty (30) ...
1,will the company provide the investor represen...,(e) the company shall furnish to the inve...,7.10 use of proceeds. the borrower shall use t...
2,will the company provide the investor represen...,(e) the company shall furnish to the inve...,"(b) stock units. the committee may, in its dis..."
3,what is the worth of the award based on in thi...,"(b) the worth, at the time of the award, of th...",(f) perquisites. in addition to the other bene...
4,what is the worth of the award based on in thi...,"(b) the worth, at the time of the award, of th...","14. amendment. the committee may amend, modify..."


In [13]:
triplet_train_exploded_df.to_csv("./data/squad_data/train_squad_dual_encoder.csv")


### Cross encoder

In [14]:
anchor_positive = triplet_train_dataframe[["anchor", "positive"]]
anchor_negatives = triplet_train_dataframe[["anchor", "negatives"]]

In [15]:
anchor_positive = anchor_positive.rename(columns={"anchor": "query", "positive": "text"})
anchor_positive["label"] = 1

In [16]:
anchor_negatives_exploded = (
    anchor_negatives
    .explode('negatives')
    .reset_index()[["anchor", "negatives"]]
    .rename({"anchor" : "query", "negatives": "text"}, axis=1)
)    

anchor_negatives_exploded["label"] = 0

In [17]:
len(anchor_negatives_exploded)

103983

In [18]:
cross_encoder_train_data = pd.concat([anchor_positive, anchor_negatives_exploded]).reset_index()

In [20]:
cross_encoder_train_data[34660:].head()

Unnamed: 0,index,query,text,label
34660,34660,is any lender responsible for the failure of a...,(e) independent lender obligations. the failur...,1
34661,0,will the company provide the investor represen...,* accrued obligations paid within thirty (30) ...,0
34662,1,will the company provide the investor represen...,7.10 use of proceeds. the borrower shall use t...,0
34663,2,will the company provide the investor represen...,"(b) stock units. the committee may, in its dis...",0
34664,3,what is the worth of the award based on in thi...,(f) perquisites. in addition to the other bene...,0


In [21]:
len(cross_encoder_train_data)

138644

In [22]:
cross_encoder_train_data.to_csv("./data/squad_data/train_squad_cross_encoder.csv")


final check

In [3]:
df1 = pd.read_csv("./data/squad_data/train_squad_cross_encoder.csv")

In [4]:
df2 = pd.read_csv("./data/squad_data/train_squad_dual_encoder.csv")

In [5]:
len(df1)

138644

In [6]:
len(df2)

103983

In [7]:
df1.head()

Unnamed: 0.1,Unnamed: 0,index,query,text,label
0,0,0,will the company provide the investor represen...,(e) the company shall furnish to the inve...,1
1,1,1,what is the worth of the award based on in thi...,"(b) the worth, at the time of the award, of th...",1
2,2,2,who is responsible for collecting and remittin...,the operator shall elect pursuant to section 2...,1
3,3,3,what is the total balance owing for the liens ...,except as set forth on the attached ucc filing...,1
4,4,4,what is the assigned property in this clause?,"sales proceeds and all other income, issues, p...",1


In [8]:
df2.head()

Unnamed: 0.1,Unnamed: 0,anchor,positive_match,negative_match
0,0,will the company provide the investor represen...,(e) the company shall furnish to the inve...,* accrued obligations paid within thirty (30) ...
1,1,will the company provide the investor represen...,(e) the company shall furnish to the inve...,7.10 use of proceeds. the borrower shall use t...
2,2,will the company provide the investor represen...,(e) the company shall furnish to the inve...,"(b) stock units. the committee may, in its dis..."
3,3,what is the worth of the award based on in thi...,"(b) the worth, at the time of the award, of th...",(f) perquisites. in addition to the other bene...
4,4,what is the worth of the award based on in thi...,"(b) the worth, at the time of the award, of th...","14. amendment. the committee may amend, modify..."


In [9]:
df1[34660:]

Unnamed: 0.1,Unnamed: 0,index,query,text,label
34660,34660,34660,is any lender responsible for the failure of a...,(e) independent lender obligations. the failur...,1
34661,34661,0,will the company provide the investor represen...,* accrued obligations paid within thirty (30) ...,0
34662,34662,1,will the company provide the investor represen...,7.10 use of proceeds. the borrower shall use t...,0
34663,34663,2,will the company provide the investor represen...,"(b) stock units. the committee may, in its dis...",0
34664,34664,3,what is the worth of the award based on in thi...,(f) perquisites. in addition to the other bene...,0
...,...,...,...,...,...
138639,138639,103978,does the company have the authority to execute...,"name of lender: the bank of east asia, limited...",0
138640,138640,103979,does the company have the authority to execute...,"(b) the provisions of this agreement, and any ...",0
138641,138641,103980,is any lender responsible for the failure of a...,(i) the sum payable shall be increased as may ...,0
138642,138642,103981,is any lender responsible for the failure of a...,section 14. governing law; jurisdiction. this ...,0


# Synthetic chatgpt

In [36]:
data_path = "./data/cuad_data_synthetic_chatgpt/synthetic_contracts_cuad_chatgptturbo.jsonl"

In [37]:
def obtain_clause_type(row):
    try:
        return re.findall('"([^"]*)"', row)[0]
    
    except IndexError:
        return ""

In [38]:
all_query_clauses = []
with open(data_path, "r") as reader:
    for obj in reader:
        all_query_clauses.append(obj)

all_query_clauses = [json.loads(el) for el in all_query_clauses]

In [39]:
len(all_query_clauses)

34648

In [40]:
df = pd.DataFrame.from_dict(all_query_clauses)
df

Unnamed: 0,question,clause
0,highlight the parts (if any) of this contract ...,(e) The Company shall furnish to the Inve...
1,highlight the parts (if any) of this contract ...,"(b) The worth, at the time of the award, of th..."
2,highlight the parts (if any) of this contract ...,The Operator shall elect pursuant to Section 2...
3,highlight the parts (if any) of this contract ...,Except as set forth on the attached UCC Filing...
4,highlight the parts (if any) of this contract ...,"sales proceeds and all other income, issues, p..."
...,...,...
34643,highlight the parts (if any) of this contract ...,7. Notices. Any notices or other communication...
34644,highlight the parts (if any) of this contract ...,The invalidity or unenforceability of any prov...
34645,highlight the parts (if any) of this contract ...,"“Obligations” means any costs, expenses or lia..."
34646,highlight the parts (if any) of this contract ...,"(a) It is duly organized, validly existing, an..."


In [42]:
df = pd.DataFrame.from_dict(all_query_clauses)
print(len(df))
df.head()

34648


Unnamed: 0,question,clause
0,highlight the parts (if any) of this contract ...,(e) The Company shall furnish to the Inve...
1,highlight the parts (if any) of this contract ...,"(b) The worth, at the time of the award, of th..."
2,highlight the parts (if any) of this contract ...,The Operator shall elect pursuant to Section 2...
3,highlight the parts (if any) of this contract ...,Except as set forth on the attached UCC Filing...
4,highlight the parts (if any) of this contract ...,"sales proceeds and all other income, issues, p..."


In [43]:
df.iloc[0]["question"]

'highlight the parts (if any) of this contract related to "registration rights" that should be reviewed by a lawyer. details: does one party have the right to require the other party to register securities with the SEC or other regulatory body?'

In [44]:
df["clause_type"] = df["question"].apply(lambda row: obtain_clause_type(row))

In [45]:
df.head()

Unnamed: 0,question,clause,clause_type
0,highlight the parts (if any) of this contract ...,(e) The Company shall furnish to the Inve...,registration rights
1,highlight the parts (if any) of this contract ...,"(b) The worth, at the time of the award, of th...",damages for breach of lease
2,highlight the parts (if any) of this contract ...,The Operator shall elect pursuant to Section 2...,tax administration
3,highlight the parts (if any) of this contract ...,Except as set forth on the attached UCC Filing...,liens
4,highlight the parts (if any) of this contract ...,"sales proceeds and all other income, issues, p...",assignment of property rights


In [47]:
# remove the empty clause types
df = df[df["clause_type"] != ""]

In [48]:
df.groupby(['clause_type']).count().sort_values(by='clause', ascending=False)

Unnamed: 0_level_0,question,clause
clause_type,Unnamed: 1_level_1,Unnamed: 2_level_1
stock options,1376,1376
indemnification,647,647
change of control,519,519
severability,460,460
notice provisions,412,412
...,...,...
franchise agreement,1,1
founder shares,1,1
forward purchase agreements,1,1
forms and attachments,1,1


## Create dual/cross encoder training sets

In [51]:
num_negatives = 3

anchors_train = []
positives_train = []
negatives_train = []

positive_categories_train = []
negative_categories_train = []

for row in tqdm(range(len(df))):
    
    category = df.iloc[row]["clause_type"]
    
    query = df.iloc[row]["question"].replace("\xa0", "").lower()
    anchors_train.append(query)
    
    # clean up the text
    positive_sample = df.iloc[row]["clause"]
    postive_sample = ' '.join(positive_sample.replace('\n',' ').split())
    positive_sample = positive_sample.lower()
    
    positives_train.append(positive_sample)
    
    # get the negative sample and category
    potential_negative_samples = df[df["clause_type"] != category].sample(num_negatives)
    
    negative_samples = [el.lower() for el in list(potential_negative_samples["clause"])]
    negative_samples = [' '.join(negative_sample.replace('\n',' ').split()) for negative_sample in negative_samples]
    
    negative_categories = list(potential_negative_samples["clause_type"])
    
    negatives_train.append(negative_samples)
    
    # get the categories of the positive and negative samples
    positive_categories_train.append(category)
    negative_categories_train.append(negative_categories)

100%|█████████████████████████████████████████████| 34413/34413 [02:49<00:00, 202.94it/s]


In [52]:
triplet_train_dataframe = pd.DataFrame(list(zip(anchors_train, positives_train, negatives_train)),
                                    columns=["anchor", "positive", "negatives"])

print(len(triplet_train_dataframe))
triplet_train_dataframe.head()

34413


Unnamed: 0,anchor,positive,negatives
0,highlight the parts (if any) of this contract ...,(e) the company shall furnish to the inve...,[5.transferability. this agreement is personal...
1,highlight the parts (if any) of this contract ...,"(b) the worth, at the time of the award, of th...",[(e) survival of indemnity and contribution pr...
2,highlight the parts (if any) of this contract ...,the operator shall elect pursuant to section 2...,[(q) additional swap counterparties. to the ex...
3,highlight the parts (if any) of this contract ...,except as set forth on the attached ucc filing...,[any copies or excerpts thereof shall remain t...
4,highlight the parts (if any) of this contract ...,"sales proceeds and all other income, issues, p...",[(k) attorneys’ fees. the parties agree that i...


In [53]:
triplet_train_dataframe = pd.DataFrame(list(zip(anchors_train, positives_train, negatives_train)),
                                    columns=["anchor", "positive", "negatives"])

print(len(triplet_train_dataframe))
triplet_train_dataframe.head()

34413


Unnamed: 0,anchor,positive,negatives
0,highlight the parts (if any) of this contract ...,(e) the company shall furnish to the inve...,[5.transferability. this agreement is personal...
1,highlight the parts (if any) of this contract ...,"(b) the worth, at the time of the award, of th...",[(e) survival of indemnity and contribution pr...
2,highlight the parts (if any) of this contract ...,the operator shall elect pursuant to section 2...,[(q) additional swap counterparties. to the ex...
3,highlight the parts (if any) of this contract ...,except as set forth on the attached ucc filing...,[any copies or excerpts thereof shall remain t...
4,highlight the parts (if any) of this contract ...,"sales proceeds and all other income, issues, p...",[(k) attorneys’ fees. the parties agree that i...


## Dual encoder

In [54]:
triplet_train_exploded_df = (
    triplet_train_dataframe
    .explode('negatives')
    .reset_index()[["anchor", "positive", "negatives"]]
    .rename({"positive" : "positive_match", "negatives": "negative_match"}, axis=1)
)  

len(triplet_train_exploded_df)

103239

In [55]:
triplet_train_exploded_df.head()

Unnamed: 0,anchor,positive_match,negative_match
0,highlight the parts (if any) of this contract ...,(e) the company shall furnish to the inve...,5.transferability. this agreement is personal ...
1,highlight the parts (if any) of this contract ...,(e) the company shall furnish to the inve...,2. original set of documents shall be provided...
2,highlight the parts (if any) of this contract ...,(e) the company shall furnish to the inve...,in the event that any provision of these stand...
3,highlight the parts (if any) of this contract ...,"(b) the worth, at the time of the award, of th...",(e) survival of indemnity and contribution pro...
4,highlight the parts (if any) of this contract ...,"(b) the worth, at the time of the award, of th...",notwithstanding any other provision in this ag...


In [56]:
triplet_train_exploded_df["clause_type"] = triplet_train_exploded_df["anchor"].apply(lambda row: re.findall('"([^"]*)"', row)[0])


In [64]:
triplet_train_exploded_df.groupby(['clause_type']).count().sort_values(by='anchor', ascending=False)

Unnamed: 0_level_0,anchor,positive_match,negative_match
clause_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
stock options,4128,4128,4128
indemnification,1941,1941,1941
change of control,1557,1557,1557
severability,1380,1380,1380
notice provisions,1236,1236,1236
...,...,...,...
guarantee assignment,3,3,3
guarantee and security interest,3,3,3
guarantee and security,3,3,3
guarantee agreements,3,3,3


In [65]:
triplet_train_exploded_df.to_csv("./data/cuad_data_synthetic_chatgpt/train_cuad_dual_encoder.csv")

## Cross encoder

In [66]:
anchor_positive = triplet_train_dataframe[["anchor", "positive"]]
anchor_negatives = triplet_train_dataframe[["anchor", "negatives"]]

In [67]:
anchor_positive = anchor_positive.rename(columns={"anchor": "query", "positive": "text"})
anchor_positive["label"] = 1

In [68]:
anchor_negatives_exploded = (
    anchor_negatives
    .explode('negatives')
    .reset_index()[["anchor", "negatives"]]
    .rename({"anchor" : "query", "negatives": "text"}, axis=1)
)    

anchor_negatives_exploded["label"] = 0

In [69]:
len(anchor_negatives_exploded)

103239

In [70]:
cross_encoder_train_data = pd.concat([anchor_positive, anchor_negatives_exploded]).reset_index()

In [71]:
cross_encoder_train_data[34411:].head()

Unnamed: 0,index,query,text,label
34411,34411,highlight the parts (if any) of this contract ...,"(a) it is duly organized, validly existing, an...",1
34412,34412,highlight the parts (if any) of this contract ...,(e) independent lender obligations. the failur...,1
34413,0,highlight the parts (if any) of this contract ...,5.transferability. this agreement is personal ...,0
34414,1,highlight the parts (if any) of this contract ...,2. original set of documents shall be provided...,0
34415,2,highlight the parts (if any) of this contract ...,in the event that any provision of these stand...,0


In [72]:
cross_encoder_train_data.to_csv("./data/cuad_data_synthetic_chatgpt/train_cuad_cross_encoder.csv")

Final check

In [77]:
df1 = pd.read_csv("./data/cuad_data_synthetic_chatgpt/train_cuad_cross_encoder.csv")

In [78]:
df2 = pd.read_csv("./data/cuad_data_synthetic_chatgpt/train_cuad_dual_encoder.csv")

In [79]:
len(df1)

137652

In [80]:
len(df2)

103239