In [6]:
# install pandas using pip
!pip install pandas

Collecting pandas
  Downloading pandas-3.0.1-cp314-cp314-macosx_11_0_arm64.whl.metadata (79 kB)
Collecting numpy>=2.3.3 (from pandas)
  Downloading numpy-2.4.2-cp314-cp314-macosx_14_0_arm64.whl.metadata (6.6 kB)
Downloading pandas-3.0.1-cp314-cp314-macosx_11_0_arm64.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m574.6 kB/s[0m  [33m0:00:17[0mm0:00:01[0m00:01[0m
[?25hDownloading numpy-2.4.2-cp314-cp314-macosx_14_0_arm64.whl (5.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m1.1 MB/s[0m  [33m0:00:04[0m eta [36m0:00:01[0m0m
[?25hInstalling collected packages: numpy, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [pandas]2m1/2[0m [pandas]
[1A[2KSuccessfully installed numpy-2.4.2 pandas-3.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[

In [7]:
import json
import os
import pandas as pd
import re # used in text cleaning and preprocessing.

**Risk Mapping Section**

This is the core design decision of our project.

We map the 41 CUAD clause types onto 3 risk categories: High, Medium, and Low.

In [8]:
# High Risk: liability exposure, termination/control, restrictive covenants, IP ownership
HIGH_RISK = {
    "Cap On Liability",
    "Uncapped Liability",
    "Liquidated Damages",
    "Termination For Convenience",
    "Change Of Control",
    "Non-Compete",
    "Exclusivity",
    "No-Solicit Of Customers",
    "No-Solicit Of Employees",
    "Ip Ownership Assignment",
    "Joint Ip Ownership",
    # Kept as a precaution; this label was not seen when the dataset was inspected.
    "Indemnification",
}

# Medium Risk: operational constraints, financial terms, assignments
MEDIUM_RISK = {
    "Anti-Assignment",
    "Revenue/Profit Sharing",
    "Price Restrictions",
    "Minimum Commitment",
    "Audit Rights",
    "Insurance",
    "Most Favored Nation",
    "Competitive Restriction Exception",
    "Non-Disparagement",
    "Non-Transferable License",
    "Affiliate License-Licensor",
    "Affiliate License-Licensee",
    "Source Code Escrow",
    "Post-Termination Services",
    "Warranty Duration",
    "Third Party Beneficiary",
    "Covenant Not To Sue",
}

# Low Risk: standard boilerplate, definitions, dates
LOW_RISK = {
    "Governing Law",
    "Agreement Date",
    "Effective Date",
    "Expiration Date",
    "Renewal Term",
    "Notice Period To Terminate Renewal",
    "Parties",
    "Document Name",
    "Unlimited/All-You-Can-Eat-License",
    "Irrevocable Or Perpetual License",
    "Volume Restriction",
}

In [9]:
def get_risk_level(clause_type):
    """Map a clause type name to its risk label.

    Returns 'High', 'Medium' or 'Low' according to which of HIGH_RISK,
    MEDIUM_RISK or LOW_RISK contains ``clause_type`` (checked in that order),
    and 'Unknown' when none of them does.
    """
    if clause_type in HIGH_RISK:
        return 'High'
    if clause_type in MEDIUM_RISK:
        return 'Medium'
    # Anything outside the three sets is reported as 'Unknown' for the caller to handle.
    return 'Low' if clause_type in LOW_RISK else 'Unknown'

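The `clean_text` function below prepares raw legal clause text for machine learning: it lowercases the text, drops unsupported symbols, and normalises whitespace.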

In [10]:
def clean_text(text) -> str:
    """Normalise a clause string for downstream text processing.

    Steps: lowercase, drop every character that is not a lowercase ASCII
    letter, digit, whitespace or one of ``. , ; : - ' " ( ) /``, then collapse
    whitespace runs to a single space and trim both ends.

    Args:
        text: The raw clause text. Any non-``str`` value is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    # Lowercase first so the character whitelist below only needs a-z.
    text = text.lower()
    # Remove special symbols, keeping alphanumerics and basic punctuation for context.
    text = re.sub(r'[^a-z0-9\s.,;:\-\'\"()/]', '', text)
    # Collapse whitespace last: deleting a symbol that sat between two spaces
    # (or at either end) would otherwise leave a double or dangling space.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [13]:
def process_cuad(input_file, output_file):
    """Flatten a CUAD-style JSON file into one CSV row per annotated clause.

    The input is read as ``data['data']`` -> documents (``title``,
    ``paragraphs``) -> paragraphs (``qas``) -> questions (``id``,
    ``is_impossible``, ``answers``) -> answers (``text``). The clause type is
    the part of the question ``id`` after the last ``'__'``.

    Questions are skipped when their id has no ``'__'``, when they are flagged
    ``is_impossible``, or when the clause type maps to the 'Unknown' risk
    level (so only High / Medium / Low rows are kept). Answers with empty
    text are skipped as well.

    Args:
        input_file: Path to the JSON file to read.
        output_file: Path of the CSV file to write. Its parent directory is
            created if it does not exist.

    Returns:
        None. Prints the number of extracted clauses and the risk-level
        distribution, and writes the CSV to ``output_file``.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    columns = ['clause_text', 'clause_type', 'risk_level', 'cleaned_text', 'source_doc']
    rows = []  # one dict per training example

    for doc in data['data']:
        title = doc.get('title', 'Unknown')
        for para in doc.get('paragraphs', []):
            for qa in para.get('qas', []):
                qa_id = qa.get('id', '')

                # Ids without the '__' separator carry no clause type.
                if '__' not in qa_id:
                    continue

                # Questions flagged as having no valid answer are skipped.
                if qa.get('is_impossible', False):
                    continue

                clause_type = qa_id.split('__')[-1]

                # The risk level depends only on the clause type, so resolve it
                # once per question rather than once per answer.
                risk = get_risk_level(clause_type)
                if risk == 'Unknown':  # dropped to enforce 3-class classification
                    continue

                for ans in qa.get('answers', []):
                    text = ans.get('text', '')
                    if text:
                        rows.append({
                            'clause_text': text,
                            'clause_type': clause_type,
                            'risk_level': risk,
                            'cleaned_text': clean_text(text),
                            'source_doc': title
                        })

    # Passing the columns explicitly keeps the frame well-formed (and the
    # 'risk_level' lookup below valid) even when no rows were extracted.
    df = pd.DataFrame(rows, columns=columns)
    print(f"Extracted {len(df)} clauses.")
    print("Risk Distribution:")
    print(df['risk_level'].value_counts())

    # Save. A bare filename has an empty dirname, which os.makedirs rejects,
    # so only create the directory when there is one to create.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df.to_csv(output_file, index=False)
    print(f"Saved to {output_file}")



if __name__ == "__main__":
    # Resolve the base directory: two levels above this file when __file__ is
    # defined, otherwise the parent of the current working directory.
    try:
        this_file = os.path.abspath(__file__)
    except NameError:
        # __file__ is not defined here (e.g. when run from a notebook kernel).
        base_dir = os.path.dirname(os.getcwd())
    else:
        base_dir = os.path.dirname(os.path.dirname(this_file))

    input_path = os.path.join(base_dir, "CUAD_Dataset", "CUAD_v1.json")
    output_path = os.path.join(
        base_dir, "data", "processed", "processed_contracts.csv"
    )

    process_cuad(input_path, output_path)

Extracted 12679 clauses.
Risk Distribution:
risk_level
Low       5629
Medium    4340
High      2710
Name: count, dtype: int64
Saved to /Users/ash/CascadeProjects/projects..../contrack_risk_analyser/data/processed/processed_contracts.csv
