In [None]:
from datasets import load_dataset
import unicodedata

# Load the "atticus_contracts" configuration of the Pile of Law dataset.
# The validation split is used here; pass split='train' instead to load the training split.
atticus = load_dataset(
    path="pile-of-law/pile-of-law",
    name="atticus_contracts",
    split='validation',
)
print(atticus)

In [None]:
import pandas as pd
import numpy as np
import json
import openai
import os
from tqdm.auto import tqdm
import re
import pickle
import time
import copy
from openai import ChatCompletion

# Seed NumPy's global RNG so the random sampling done later in the notebook is repeatable.
np.random.seed(1234)

# Read the OpenAI API key and organization id from a local JSON file
# (the file must provide the keys 'key' and 'organization').
with open('credentials.json', 'r') as creds_file:
    credentials = json.loads(creds_file.read())

openai.api_key = credentials['key']
openai.organization = credentials['organization']

def get_openai_response(prompt, model='gpt-4', max_tokens=1000, temperature=0):
    """Send `prompt` as a single user message and return the text of the first reply.

    Args:
        prompt: Text placed in the `content` of the one user message.
        model: Model name forwarded to `ChatCompletion.create`.
        max_tokens: Forwarded unchanged as `max_tokens`.
        temperature: Forwarded unchanged as `temperature`.

    Returns:
        The `content` of the `message` of the first entry in the response's `choices`.
    """
    user_message = {"role": "user", "content": prompt}
    request = {
        'model': model,
        'messages': [user_message],
        'max_tokens': max_tokens,
        'temperature': temperature,
    }
    completion = ChatCompletion.create(**request)
    first_choice = completion['choices'][0]
    return first_choice['message']['content']

# Smoke test: one round trip to the API with the default model and settings.
response = get_openai_response(prompt='hello world!')
print(response)

In [None]:
def classify_clause(clause):
    """Build the topic-classification prompt for one contract clause.

    The prompt wraps `clause` in <CLAUSE> tags, lists the candidate topics as
    bullet points, asks for a single-line answer of the form
    `Topic:$TOPIC|Explanation:$EXPLANATION`, and ends with an open `Topic:`
    line after the `<|endofprompt|>` marker.

    Args:
        clause: The clause text to embed in the prompt.

    Returns:
        The complete prompt string.
    """
    topic_names = (
        'effective date reference',
        'financial statements',
        'cross default',
        'governing document',
        'bondholders default',
        'effective date main',
        'reserves policy',
        'litigation default',
        'no solicitation',
        'trustee appointment',
        'income summary',
        'merger restrictions',
        'governing law',
        'confidential period',
        'confidential information form',
        'auditor opinion',
        'tax changes call',
        'main objective',
        'dispute resolution',
        'change of control',
        'post-termination services',
        'cap on liability',
        'insurance',
        'covenant not to sue',
        'warranty duration',
        'affiliate license-licensee',
        'third party beneficiary',
        'price restrictions',
    )
    bullet_list = '\n'.join('- ' + name for name in topic_names)

    # The leading empty entry reproduces the newline the prompt starts with.
    prompt_lines = [
        '',
        'Consider the following contract clause:',
        '<CLAUSE>',
        f'{clause}',
        '</CLAUSE>',
        '',
        'Does the clause describe or mention any of the following topics:',
        bullet_list,
        '',
        'Respond with the following information:',
        '$TOPIC - the topic contained in the clause, or None if no topics',
        '$EXPLANATION - an explanation of your answer',
        'formatted as follows on a single line:',
        'Topic:$TOPIC|Explanation:$EXPLANATION',
        '<|endofprompt|>',
        'Topic:',
    ]
    return '\n'.join(prompt_lines)


def generate_prompt_squad(clause):
    """Return a few-shot prompt asking for a search query answerable from `clause`.

    The prompt consists of an instruction, a description of the expected
    Clause/Query format, nine worked Clause/Query examples built from
    general-knowledge passages (presumably SQuAD-style, per the function name),
    and finally `clause` itself on a `Clause:` line followed directly by the
    `<|endofprompt|>` marker and an open `Query:` line.

    Args:
        clause: Text interpolated into the final `Clause:` line of the prompt.

    Returns:
        The complete prompt string.
    """
    # The doubled braces below are f-string escapes; they render as single braces.
    return f'''

Write a search query that can be answered by the content in the given contract clause.

---

Follow the following format.

Clause: ${{the given contract clause}}
Query: ${{a search query that can be answered by the information in the contract clause}}

---

Clause: One of its earliest massive implementations was brought about by Egyptians against the British occupation in the 1919 Revolution. Civil disobedience is one of the many ways people have rebelled against what they deem to be unfair laws. It has been used in many nonviolent resistance movements in India (Gandhi's campaigns for independence from the British Empire), in Czechoslovakia's Velvet Revolution and in East Germany to oust their communist governments, In South Africa in the fight against apartheid, in the American Civil Rights Movement, in the Singing Revolution to bring independence to the Baltic countries from the Soviet Union, recently with the 2003 Rose Revolution in Georgia and the 2004 Orange Revolution in Ukraine, among other various movements worldwide.
Query: What was the the movement called that brought Baltic countries independence from the Soviet Union?

---

Clause: The Intergovernmental Panel on Climate Change (IPCC) is a scientific intergovernmental body under the auspices of the United Nations, set up at the request of member governments. It was first established in 1988 by two United Nations organizations, the World Meteorological Organization (WMO) and the United Nations Environment Programme (UNEP), and later endorsed by the United Nations General Assembly through Resolution 43/53. Membership of the IPCC is open to all members of the WMO and UNEP. The IPCC produces reports that support the United Nations Framework Convention on Climate Change (UNFCCC), which is the main international treaty on climate change. The ultimate objective of the UNFCCC is to "stabilize greenhouse gas concentrations in the atmosphere at a level that would prevent dangerous anthropogenic [i.e., human-induced] interference with the climate system". IPCC reports cover "the scientific, technical and socio-economic information relevant to understanding the scientific basis of risk of human-induced climate change, its potential impacts and options for adaptation and mitigation."
Query: What UN organizations established the IPCC?

---

Clause: The Amazon rainforest (Portuguese: Floresta Amazônica or Amazônia; Spanish: Selva Amazónica, Amazonía or usually Amazonia; French: Forêt amazonienne; Dutch: Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle, is a moist broadleaf forest that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest. This region includes territory belonging to nine nations. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana. States or departments in four nations contain "Amazonas" in their names. The Amazon represents over half of the planet's remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual trees divided into 16,000 species.
Query: What percentage does the Amazon represents in rainforests on the planet?

---

Clause: The 1973 oil crisis began in October 1973 when the members of the Organization of Arab Petroleum Exporting Countries (OAPEC, consisting of the Arab members of OPEC plus Egypt and Syria) proclaimed an oil embargo. By the end of the embargo in March 1974, the price of oil had risen from US$3 per barrel to nearly $12 globally; US prices were significantly higher. The embargo caused an oil crisis, or "shock", with many short- and long-term effects on global politics and the global economy. It was later called the "first oil shock", followed by the 1979 oil crisis, termed the "second oil shock."
Query: What was the price of oil in March of 1974?

---

Clause: The French and Indian War (1754–1763) was the North American theater of the worldwide Seven Years' War. The war was fought between the colonies of British America and New France, with both sides supported by military units from their parent countries of Great Britain and France, as well as Native American allies. At the start of the war, the French North American colonies had a population of roughly 60,000 European settlers, compared with 2 million in the British North American colonies. The outnumbered French particularly depended on the Indians. Long in conflict, the metropole nations declared war on each other in 1756, escalating the war from a regional affair into an intercontinental conflict.
Query: Who fought in the French and Indian war?

---

Clause: Oxygen is a chemical element with symbol O and atomic number 8. It is a member of the chalcogen group on the periodic table and is a highly reactive nonmetal and oxidizing agent that readily forms compounds (notably oxides) with most elements. By mass, oxygen is the third-most abundant element in the universe, after hydrogen and helium. At standard temperature and pressure, two atoms of the element bind to form dioxygen, a colorless and odorless diatomic gas with the formula O2. Diatomic oxygen gas constitutes 20.8% of the Earth's atmosphere. However, monitoring of atmospheric oxygen levels show a global downward trend, because of fossil-fuel burning. Oxygen is the most abundant element by mass in the Earth's crust as part of oxide compounds such as silicon dioxide, making up almost half of the crust's mass.
Query: What is the second most abundant element?

---

Clause: Starting in the late 1950s, American computer scientist Paul Baran developed the concept Distributed Adaptive Message Block Switching with the goal to provide a fault-tolerant, efficient routing method for telecommunication messages as part of a research program at the RAND Corporation, funded by the US Department of Defense. This concept contrasted and contradicted the theretofore established principles of pre-allocation of network bandwidth, largely fortified by the development of telecommunications in the Bell System. The new concept found little resonance among network implementers until the independent work of Donald Davies at the National Physical Laboratory (United Kingdom) (NPL) in the late 1960s. Davies is credited with coining the modern name packet switching and inspiring numerous packet switching networks in Europe in the decade following, including the incorporation of the concept in the early ARPANET in the United States.
Query: What is Donald Davies credited with?

---

Clause: Ctenophora (/tᵻˈnɒfərə/; singular ctenophore, /ˈtɛnəfɔːr/ or /ˈtiːnəfɔːr/; from the Greek κτείς kteis 'comb' and φέρω pherō 'carry'; commonly known as comb jellies) is a phylum of animals that live in marine waters worldwide. Their most distinctive feature is the ‘combs’ – groups of cilia which they use for swimming – they are the largest animals that swim by means of cilia. Adults of various species range from a few millimeters to 1.5 m (4 ft 11 in) in size. Like cnidarians, their bodies consist of a mass of jelly, with one layer of cells on the outside and another lining the internal cavity. In ctenophores, these layers are two cells deep, while those in cnidarians are only one cell deep. Some authors combined ctenophores and cnidarians in one phylum, Coelenterata, as both groups rely on water flow through the body cavity for both digestion and respiration. Increasing awareness of the differences persuaded more recent authors to classify them as separate phyla.
Query: How large can ctenophora grow?

---

Clause: The University of Chicago (UChicago, Chicago, or U of C) is a private research university in Chicago. The university, established in 1890, consists of The College, various graduate programs, interdisciplinary committees organized into four academic research divisions and seven professional schools. Beyond the arts and sciences, Chicago is also well known for its professional schools, which include the Pritzker School of Medicine, the University of Chicago Booth School of Business, the Law School, the School of Social Service Administration, the Harris School of Public Policy Studies, the Graham School of Continuing Liberal and Professional Studies and the Divinity School. The university currently enrolls approximately 5,000 students in the College and around 15,000 students overall.
Query: How many professional schools does the University of Chicago have?

---

Clause: {clause}<|endofprompt|>
Query:'''


def generate_prompt_cuad(clause):
    """Return a few-shot prompt asking for a CUAD-style review query for `clause`.

    The prompt consists of an instruction, a description of the expected
    Clause/Query format, thirteen worked examples that pair contract excerpts
    with queries of the form 'highlight the parts (if any) of this contract
    related to "<category>" ...', and finally `clause` itself on a `Clause:`
    line followed directly by the `<|endofprompt|>` marker and an open
    `Query:` line.

    Args:
        clause: Text interpolated into the final `Clause:` line of the prompt.

    Returns:
        The complete prompt string.
    """
    # The doubled braces below are f-string escapes; they render as single braces.
    # The "revenue/profit sharing" example previously carried mis-decoded curly
    # quotes (mojibake) around "Shares" and "Common Stock"; they are restored to
    # proper typographic quotes so the model is not shown corrupted text.
    return f'''

Write a search query that can be answered by the content in the given contract clause.

---

Follow the following format.

Clause: ${{the given contract clause}}
Query: ${{a search query that can be answered by the information in the contract clause}}

---

Clause: “Customer Property” means all Intellectual Property, together with all materials, data, writings and other property in any form whatsoever, which is (a) owned or controlled by Customer or its Affiliates as of and following the Effective Date and (b) provided to Manufacturer by or on behalf of Customer or its Personnel under this Agreement. (Page 8)  Customer hereby grants to Manufacturer a non-exclusive license during the Term to use any Customer Property and Customer-Owned Improvements and Developments solely in connection with Manufacturer performing its obligations under this Agreement or the Facility Addendum in accordance with the terms hereof or thereof, as applicable (Page 59)
Query: highlight the parts (if any) of this contract related to "affiliate license-licensor" that should be reviewed by a lawyer. details: does the contract contain a license grant by affiliates of the licensor or that includes intellectual property of affiliates of the licensor?

---

Clause: Developer may not assign or transfer this Agreement, nor its rights and obligations hereunder, by operation of law or otherwise, to any third party without the prior express written approval of DSS. Any purported assignment without the consent of DSS shall be void. (Page 3)  The rights of Developer under this Agreement shall immediately cease and be terminated upon the sale or transfer of all or substantially all of the assets of Developer unless an assignment of such rights pursuant to such sale or transfer has been previously approved in writing by DSS. (Page 3)  The rights of Developer under this Agreement shall immediately cease and be terminated upon the sale or transfer of no less than a majority of, or a controlling interest in or over, the voting capital or ownership capital of Developer unless an assignment of such rights pursuant to such sale or transfer has been previously approved in writing by DSS. (Page 3)
Query: highlight the parts (if any) of this contract related to "anti-assignment" that should be reviewed by a lawyer. details: is consent or notice required of a party if the contract is assigned to a third party?

---

Clause: The cost of the audit will be borne by Licensor unless a discrepancy of more than five-percent (5%) is discovered, in which  case the cost of the audit shall be borne by Licensee. (Page 9)  Licensee agrees to allow an independent Certified Public Accountant or other Audit Professional, (selected by mutual  agreement) to audit and analyze appropriate accounting records to ensure compliance with all terms of this Agreement. (Page 9)  Any  such audit shall be permitted by Licensee within 30 days of Licensee’s receipt of a written request of Licensor. (Page 9)
Query: highlight the parts (if any) of this contract related to "audit rights" that should be reviewed by a lawyer. details: does a party have the right to audit the books, records, or physical locations of the counterparty to ensure compliance with the contract?

---

Clause: provided however, that Forty Niners SC shall be permitted to enter into a sponsorship agreement with any party that enters into a naming rights agreement with SCSA for the Stadium, provided that if SCSA enters into a naming rights agreement for the Stadium with a party that is in the Products and Services Category, Sponsor may immediately terminate this Agreement and receive a pro rated refund of any amounts paid by Sponsor for the unexpired Contract Year in which the termination occurs. (Page 4)  Sponsor acknowledges and agrees that, notwithstanding the grant of exclusivity set forth in this Section 4, Team shall have the right to solicit and enter into sponsorships with other parties that are not known primarily or exclusively as suppliers or providers of any product or service within the Product and Services Category. (Page 4)
Query: highlight the parts (if any) of this contract related to "competitive restriction exception" that should be reviewed by a lawyer. details: this category includes the exceptions or carveouts to non-compete, exclusivity and no-solicit of customers above.

---

Clause: The only reservation service or system you may use for outgoing reservations referred by or from the Hotel to other Network Hotels will be the Reservation Service or other reservation services we designate; (Page 13)  You must: <omitted> 5.1.13 not engage, directly or indirectly, in any cross-marketing or cross-promotion of the Hotel with any Other Hotel or related business, without our prior written consent. You agree to refer guests and customers, wherever reasonably possible, only to System Hotels or Network Hotels. (Page 13)  You must display all material, including brochures and promotional material we provide for System Hotels and Network Hotels, and allow advertising and promotion only of System Hotels and Network Hotels on the Hotel Site, unless we specifically direct you to include advertising or promotion of Other Hotels; (Page 13)
Query: highlight the parts (if any) of this contract related to "exclusivity" that should be reviewed by a lawyer. details: is there an exclusive dealing commitment with the counterparty? this includes a commitment to procure all “requirements” from one party of certain technology, goods, or services or a prohibition on licensing or selling technology, goods or services to third parties, or a prohibition on collaborating or working with other parties), whether during the contract or after the contract ends (or both).

---

Clause: The term of this Agreement starts on the Effective Date and, unless this Agreement is earlier terminated in accordance with its provisions, will expire ten (10) years from the Effective Date. (Page 5)
Query: highlight the parts (if any) of this contract related to "expiration date" that should be reviewed by a lawyer. details: on what date will the contract's initial term expire?

---

Clause: At least three (3) persons actively involved in the management and operation of the Franchised Restaurant must successfully complete the training program. (Page 5)  You will, on an annual basis, participate in a minimum of fifty percent (50%) of the promotional programs introduced by us from time to time. (Page 8)  At the time of opening you must have a minimum of Fifty Thousand Dollars ($50,000) in immediately accessible working capital funds to be used solely to defray the costs of operating the Restaurant for the initial several months. (Page 11)
Query: highlight the parts (if any) of this contract related to "minimum commitment" that should be reviewed by a lawyer. details: is there a minimum order size or minimum amount or units per-time period that one party must buy from the counterparty under the contract?

---

Clause: In consideration for such training, trade secrets and confidential information, you and your principals agree that during the term of this Agreement, and for a continuous uninterrupted period commencing upon expiration or termination of this Agreement, regardless of the cause for termination, and continuing for a period of three (3) years thereafter, neither you nor your principals shall, directly or indirectly, for themselves, or through, on behalf of, or in conjunction with any person, persons, partnership, limited liability company or corporation:   7.1.1 Divert or attempt to divert any business or customer of the Franchised Business or any Unit Franchisee anywhere, by direct or indirect inducement or otherwise, or do or perform, directly or indirectly, any other act injurious or prejudicial to the goodwill associated with our Proprietary Marks or the System. (Page 25)
Query: highlight the parts (if any) of this contract related to "no-solicit of customers" that should be reviewed by a lawyer. details: is a party restricted from contracting or soliciting customers or partners of the counterparty, whether during the contract or after the contract ends (or both)?

---

Clause: However, conduct which Company, in its discretion, deems detrimental to Company's image or reputation, shall be grounds for termination of this Agreement, upon reasonable notice and the failure to cure such behavior by Distributor. (Page 8)  Termination by Company under this Section 12.2 shall be effective sixty (60) days following Company's giving of notice to Distributor if the occurrence giving rise to the right of termination has not been cured, or immediately in the event of a breach of Section 6 regarding Non-Disclosure of Confidential Information or Section 7.1 regarding conduct injurious to Company's reputation. (Page 11)
Query: highlight the parts (if any) of this contract related to "non-disparagement" that should be reviewed by a lawyer. details: is there a requirement on a party not to disparage the counterparty?

---

Clause: Thereafter, this agreement will renew automatically from year to year unless cancelled in writing by either Party giving the other written notice of such cancellation a minimum of 60 days before the end of the then current term. (Page 6)
Query: highlight the parts (if any) of this contract related to "notice period to terminate renewal" that should be reviewed by a lawyer. details: what is the notice period required to terminate renewal?

---

Clause: Upon the expiration of the initial term of this  Agreement,  Franchisee shall  have the one time  right to obtain a  successor  franchise  to  operate a Pretzel  Time Unit at the Site (a  "Successor  Franchise")  for a single term of five (5) years  immediately  following the expiration of the initial term of the Franchise upon giving Pretzel Time six (6) months notice prior to the expiration of the then current term (Page 10)
Query: highlight the parts (if any) of this contract related to "renewal term" that should be reviewed by a lawyer. details: what is the renewal term after the initial term expires? this includes automatic extensions and unilateral extensions with prior notice.

---

Clause: In connection with the execution of this Agreement, Consultant and Company shall enter into a Restricted Stock Agreement. Subject to approval of the Board of Directors of the Company, the Company shall issue and sell to the Consultant, and the Consultant shall purchase from the Company, subject to the terms and conditions set forth in this Agreement and the Restricted Stock Agreement, 1,990,000 shares (the “Shares”) of common stock, $0.0001 par value, of the Company (“Common Stock”), at a purchase price of $0.0001 per share, for an aggregate purchase price of $190. (Page 1)
Query: highlight the parts (if any) of this contract related to "revenue/profit sharing" that should be reviewed by a lawyer. details: is one party required to share revenue or profit with the counterparty for any technology, goods, or services?

---

Clause: The following events shall constitute "Source Code Access Conditions": (i) D2's insolvency, general assignment for the benefit of creditors, or ceasing to do business, or (ii) D2's failure or inability to meet its warranty, maintenance and support obligations under Article 6, or its warranty obligations under Article 8.3, within fifteen days after written notice by LICENSEE to D2 of D2's failure to meet such obligations, or (iii) termination of this Agreement by LICENSEE pursuant to Articles 9.3 and 9.4, or (iv) as needed by LICENSEE for fault isolation. (Page 4)
Query: highlight the parts (if any) of this contract related to "source code escrow" that should be reviewed by a lawyer. details: is one party required to deposit its source code into escrow with a third party, which can be released to the counterparty upon the occurrence of certain events (bankruptcy, insolvency, etc.)?

---

Clause: {clause}<|endofprompt|>
Query:'''

In [None]:
# Generate one SQuAD-style query per sampled contract and save the
# (question, clause) pairs as JSON lines.
synthetic = []
max_n = len(atticus)

# Sample distinct contract indices. The sample is capped at the dataset size
# because np.random.choice(..., replace=False) raises ValueError when `size`
# exceeds the population.
sample_size = min(30000, max_n)
idx = [int(i) for i in np.random.choice(np.arange(max_n), size=sample_size, replace=False)]

for i in tqdm(idx):
    # Split the contract on blank lines, keep clauses of 40-150 whitespace-separated
    # words, then NFKD-normalize them and flatten internal newlines.
    contract = [unicodedata.normalize("NFKD", clause).replace('\n', ' ').strip()
                for clause in atticus[i]['text'].split('\n\n')
                if 40 <= len(clause.split()) <= 150]

    # Contracts with fewer than 10 usable clauses are skipped entirely.
    if len(contract) < 10:
        continue

    # np.random.choice returns a NumPy string scalar; store a plain str instead.
    clause = str(np.random.choice(contract))
    prompt = generate_prompt_squad(clause)

    # Retry until the API call succeeds. Catch Exception rather than using a bare
    # `except:` so that a kernel interrupt (KeyboardInterrupt) still stops the
    # loop, and report the error so persistent failures are visible.
    while True:
        try:
            response = get_openai_response(prompt)
            break
        except Exception as err:
            print(f'failed ({err!r}), trying again')
            time.sleep(10)

    synthetic.append({'question': response, 'clause': clause})

with open("synthetic_contracts_squad.jsonl", "w") as f:
    for entry in synthetic:
        json.dump(entry, f)
        f.write('\n')

In [None]:
# Re-use the clauses saved by the SQuAD-style run and generate a CUAD-style
# query for each of them, saving the (question, clause) pairs as JSON lines.
synthetic = []

# Load every record up front: the file is not held open during the API calls,
# blank lines are ignored, and the progress bar gets the real record count
# (the file holds fewer records than contracts sampled, since some are skipped).
with open("synthetic_contracts_squad.jsonl", "r") as f:
    squad_entries = [json.loads(line) for line in f if line.strip()]

for entry in tqdm(squad_entries):
    clause = entry['clause']
    prompt = generate_prompt_cuad(clause)

    # Retry until the API call succeeds. Catch Exception rather than using a bare
    # `except:` so that a kernel interrupt (KeyboardInterrupt) still stops the
    # loop, and report the error so persistent failures are visible.
    while True:
        try:
            response = get_openai_response(prompt)
            break
        except Exception as err:
            print(f'failed ({err!r}), trying again')
            time.sleep(10)

    synthetic.append({'question': response, 'clause': clause})

with open("synthetic_contracts_cuad.jsonl", "w") as f:
    for entry in synthetic:
        json.dump(entry, f)
        f.write('\n')

In [None]:
# Classify every 30-150 word clause of every contract; for clauses whose topic
# is one of the targeted topics, generate a CUAD-style query and save
# (question, clause, clause_type) records as JSON lines.
synthetic = []
total = 0   # clauses that matched a targeted topic
count = 0   # clauses classified so far

# NOTE(review): the original cell referenced `topics` without defining it anywhere
# in the visible notebook. It is reconstructed here from the topic list embedded in
# the classify_clause prompt; confirm this is the intended set.
topics = {
    'effective date reference',
    'financial statements',
    'cross default',
    'governing document',
    'bondholders default',
    'effective date main',
    'reserves policy',
    'litigation default',
    'no solicitation',
    'trustee appointment',
    'income summary',
    'merger restrictions',
    'governing law',
    'confidential period',
    'confidential information form',
    'auditor opinion',
    'tax changes call',
    'main objective',
    'dispute resolution',
    'change of control',
    'post-termination services',
    'cap on liability',
    'insurance',
    'covenant not to sue',
    'warranty duration',
    'affiliate license-licensee',
    'third party beneficiary',
    'price restrictions',
}

for entry in tqdm(atticus):

    # Split the contract on blank lines, keep clauses of 30-150 whitespace-separated
    # words, then NFKD-normalize them and flatten internal newlines.
    contract = [unicodedata.normalize("NFKD", clause).replace('\n', ' ').strip()
                for clause in entry['text'].split('\n\n')
                if 30 <= len(clause.split()) <= 150]

    for clause in contract:

        count += 1
        prompt_classify = classify_clause(clause)

        # NOTE(review): the original called `get_openai_response_short`, which is not
        # defined in the visible notebook; the resulting NameError was swallowed by a
        # bare `except:` and the loop retried forever. The defined helper is used
        # instead; pass a smaller `max_tokens` here if a shorter reply was intended.
        # Only the API call is retried, and only on Exception, so a kernel interrupt
        # still stops the loop.
        while True:
            try:
                response_classify = get_openai_response(prompt_classify)
                break
            except Exception as err:
                print(f'failed ({err!r}), trying again')
                time.sleep(10)

        # Expected reply shape is "<topic>|Explanation:<text>". Normalise the topic
        # (whitespace, optional echoed "Topic:" prefix, letter case) before matching.
        topic = response_classify.split('|')[0].strip()
        if topic.lower().startswith('topic:'):
            topic = topic[len('topic:'):].strip()
        topic = topic.lower()

        if topic in topics:
            prompt = generate_prompt_cuad(clause)

            while True:
                try:
                    response = get_openai_response(prompt)
                    break
                except Exception as err:
                    print(f'failed ({err!r}), trying again')
                    time.sleep(10)

            synthetic.append({'question': response, 'clause': clause, 'clause_type': topic})
            total += 1

        # Periodic progress line: clauses classified vs. clauses kept.
        if count % 200 == 0:
            print(count, total)

with open("synthetic_contracts_targeted.jsonl", "w") as f:
    for entry in synthetic:
        json.dump(entry, f)
        f.write('\n')