In [1]:
!pip install datasets




In [16]:
# Step 1: Install pyarrow if not already installed
!pip install pyarrow

# Step 2: Upload the parquet file
from google.colab import files
uploaded = files.upload()

# Step 3: Load parquet file to DataFrame
import pandas as pd

df = pd.read_parquet("claudette.parquet")  # Ensure this name matches what you uploaded
print(df.head())
print(df['label'].value_counts())  # Check class balance

# Step 4: Save as CSV
df.to_csv("claudette.csv", index=False)

# Step 5: Downloadable CSV link
files.download("claudette.csv")




Saving claudette.parquet to claudette (1).parquet
                                                text  label
0  thanks for sending us good vibes by using the ...      0
1  you may be surprised , but we will refer to al...      0
2  the terms of use -lrb- or , the `` terms '' -r...      0
3  the language of the terms will seem legal -lrb...      0
4  when you use our services , in addition to enj...      1
label
0    8382
1    1032
Name: count, dtype: int64


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [19]:
df.head()

Unnamed: 0,clause_text,risk_flag
0,thanks for sending us good vibes by using the ...,0
1,"you may be surprised , but we will refer to al...",0
2,"the terms of use -lrb- or , the `` terms '' -r...",0
3,the language of the terms will seem legal -lrb...,0
4,"when you use our services , in addition to enj...",1


In [20]:
risky_keywords_extended = [
    # ✂️ Termination
    "may terminate your account at any time",
    "termination without prior notice",
    "termination at sole discretion",

    # 🔐 Waivers & Liability
    "disclaims all warranties",
    "not responsible for any damages",
    "waives the right to sue",
    "you agree to hold us harmless",
    "indemnify us against any claims",

    # 💸 Payment & Fees
    "non-refundable fee",
    "you will be charged automatically",
    "early termination fee applies",
    "fees may change without notice",

    # ⚖️ Arbitration & Jurisdiction
    "binding arbitration is required",
    "you waive your right to a jury trial",
    "all disputes will be resolved by arbitration",
    "jurisdiction shall be in",
    "governing law is",

    # 🔁 Auto-renewal / Hidden Continuity
    "automatic renewal applies",
    "renews unless canceled",
    "recurring charges will apply",

    # 📜 IP Rights / Content Ownership
    "you grant us a worldwide license",
    "we own all user content",
    "you transfer all rights to us",
    "you waive moral rights",

    # 🕵️ Data Usage / Surveillance
    "we may collect your data without notice",
    "you consent to data sharing with partners",
    "we may monitor your activity",

    # 🧱 One-sided Changes
    "we may modify these terms at any time",
    "terms may be updated without notice",
    "subject to change without your approval",

    # 🕓 Perpetual Terms
    "this license is irrevocable",
    "agreement lasts indefinitely",
]


In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load CSV
df = pd.read_csv("claudette.csv")

# Optional: Rename if needed
df.rename(columns={"text": "clause_text", "label": "risk_flag"}, inplace=True)

# Check distribution
print(df['risk_flag'].value_counts())

# Train/test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['clause_text'], df['risk_flag'], test_size=0.2, stratify=df['risk_flag'], random_state=42
)


risk_flag
0    8382
1    1032
Name: count, dtype: int64


In [22]:
!pip install transformers

from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Tokenize
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)




In [23]:
import torch

class ClauseDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])  # 'labels' key is required
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ClauseDataset(train_encodings, train_labels.tolist())
test_dataset = ClauseDataset(test_encodings, test_labels.tolist())


In [24]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=50,
    weight_decay=0.01,
    logging_steps=10,
)

# Optional: define compute_metrics function
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    logits, labels = pred
    preds = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}


In [27]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mahinaganguly05[0m ([33mahinaganguly05-nit-durgapur[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,0.7089
20,0.4933
30,0.3473
40,0.2948
50,0.3349
60,0.3617
70,0.2429
80,0.441
90,0.3251
100,0.2886


TrainOutput(global_step=2355, training_loss=0.09417433798147046, metrics={'train_runtime': 2009.6629, 'train_samples_per_second': 18.737, 'train_steps_per_second': 1.172, 'total_flos': 4988059896391680.0, 'train_loss': 0.09417433798147046, 'epoch': 5.0})

In [28]:
trainer.evaluate()


{'eval_loss': 0.24625876545906067,
 'eval_accuracy': 0.9559214020180563,
 'eval_f1': 0.7798408488063661,
 'eval_precision': 0.8596491228070176,
 'eval_recall': 0.7135922330097088,
 'eval_runtime': 26.3066,
 'eval_samples_per_second': 71.579,
 'eval_steps_per_second': 2.243,
 'epoch': 5.0}

In [30]:
import torch
from transformers import DistilBertTokenizerFast

# Extreme sentences
test_sentences = [
    # 🚨 Clearly RISKY clause
    "We may terminate your account at any time without notice and are not responsible for any damages caused.",

    # ✅ Clearly SAFE clause
    "You may update your email preferences at any time from your account settings."
]

# Load tokenizer and prepare device
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenize
inputs = tokenizer(test_sentences, truncation=True, padding=True, return_tensors="pt")
inputs = {key: val.to(device) for key, val in inputs.items()}

# Predict
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1)

# Show results
for sentence, label in zip(test_sentences, predictions):
    print(f"➡️ '{sentence}'\n   🔎 Prediction: {'RISKY' if label.item() == 1 else 'SAFE'}\n")


➡️ 'We may terminate your account at any time without notice and are not responsible for any damages caused.'
   🔎 Prediction: RISKY

➡️ 'You may update your email preferences at any time from your account settings.'
   🔎 Prediction: SAFE



In [31]:
# Save the model and tokenizer to a directory
model.save_pretrained("bert_risk_model")
tokenizer.save_pretrained("bert_risk_model")


('bert_risk_model/tokenizer_config.json',
 'bert_risk_model/special_tokens_map.json',
 'bert_risk_model/vocab.txt',
 'bert_risk_model/added_tokens.json',
 'bert_risk_model/tokenizer.json')

In [32]:
!zip -r bert_risk_model.zip bert_risk_model


  adding: bert_risk_model/ (stored 0%)
  adding: bert_risk_model/tokenizer_config.json (deflated 75%)
  adding: bert_risk_model/config.json (deflated 45%)
  adding: bert_risk_model/vocab.txt (deflated 53%)
  adding: bert_risk_model/tokenizer.json (deflated 71%)
  adding: bert_risk_model/special_tokens_map.json (deflated 42%)
  adding: bert_risk_model/model.safetensors (deflated 8%)


In [34]:
from google.colab import files
files.download("bert_risk_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>