In [None]:
!pip install --upgrade transformers

In [None]:
# !rm -rf /kaggle/working/*

In [None]:
import json
import random

import pandas as pd
from datasets import Dataset, DatasetDict

# Read the CUAD annotation file from the Kaggle input directory.
with open('/kaggle/input/cuad-contract-understanding-atticus-dataset/CUAD_v1/CUAD_v1/CUAD_v1.json', 'r') as f:
    data = json.load(f)

# Flatten the contract -> paragraph -> question nesting into one flat record
# per question; every record carries the full text of its paragraph as context.
examples = [
    {
        'id': qa['id'],
        'title': contract['title'],
        'context': para['context'],
        'question': qa['question'],
        'answers': qa['answers']
    }
    for contract in data['data']
    for para in contract['paragraphs']
    for qa in para['qas']
]

# Build the Hugging Face Dataset by way of a pandas DataFrame.
dataset = Dataset.from_pandas(pd.DataFrame(examples))

# Train-Validation Split (80-20 by contracts)
# Splitting on the contract title keeps every question about one contract on
# the same side of the split, so no contract text leaks into validation.
#
# The titles are sorted and then shuffled with a fixed seed. Taking them
# straight out of a set would follow string-hash order, which Python
# randomises per process, so every kernel restart would produce a different
# train/validation partition.
SPLIT_SEED = 42
titles = sorted({ex['title'] for ex in examples})
random.Random(SPLIT_SEED).shuffle(titles)
split_idx = int(0.8 * len(titles))
train_titles = set(titles[:split_idx])

def split_by_title(example):
    """Return True when the example's contract was assigned to the training split."""
    return example['title'] in train_titles

train_dataset = dataset.filter(split_by_title)
val_dataset = dataset.filter(lambda x: not split_by_title(x))

# The two filters are complementary, so together they must cover every example.
assert len(train_dataset) + len(val_dataset) == len(dataset)

In [None]:
# 'id' and 'title' were only needed to build the split; drop them from both
# splits so only context, question and answers remain.
train_dataset, val_dataset = (
    split.remove_columns(['id', 'title'])
    for split in (train_dataset, val_dataset)
)

In [None]:
import re
from bs4 import BeautifulSoup

def clean_context(text):
    """Return `text` with markup removed and whitespace normalised.

    The text is parsed with BeautifulSoup's "html.parser" and reduced to its
    text content, then every run of whitespace is collapsed to a single space
    and leading/trailing whitespace is stripped.

    Note: the result generally has different character positions than the
    input, so character offsets measured against the input do not carry over.
    """
    plain_text = BeautifulSoup(text, "html.parser").get_text()
    return re.sub(r'\s+', ' ', plain_text).strip()

def clean_question(text):
    """Normalise a question: lowercase, drop punctuation, squeeze whitespace.

    Every character that is neither a word character nor whitespace is
    removed (not replaced by a space), so "anti-assignment" becomes
    "antiassignment". Runs of whitespace collapse to one space and the result
    is stripped at both ends.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return re.sub(r'\s+', ' ', without_punctuation).strip()

# Apply cleaning
def _realign_answers(raw_context, cleaned_context, answers):
    """Re-locate answer spans after the context has been cleaned.

    `answer_start` values are character offsets into the *raw* context.
    Cleaning removes markup and collapses whitespace, which shifts every
    character after the first change, so the offsets (and the answer text
    itself) have to be recomputed against the cleaned context.

    Args:
        raw_context: The context string before cleaning.
        cleaned_context: `clean_context(raw_context)`.
        answers: List of answer dicts with "text" and "answer_start" keys.

    Returns:
        A new list of answer dicts whose "text" is the cleaned answer and
        whose "answer_start" indexes into `cleaned_context`. Answers that are
        empty after cleaning, or that cannot be found in the cleaned context,
        are left out.
    """
    realigned = []
    for answer in answers:
        cleaned_text = clean_context(answer["text"])
        if not cleaned_text:
            continue

        # Everything before the answer shrinks to this many characters, so the
        # answer cannot start earlier than that in the cleaned context. This
        # also picks the right occurrence when the answer text repeats.
        earliest_start = len(clean_context(raw_context[:answer["answer_start"]]))
        new_start = cleaned_context.find(cleaned_text, earliest_start)
        if new_start == -1:
            # The prefix estimate can overshoot if the cut landed inside
            # markup; fall back to the first occurrence anywhere.
            new_start = cleaned_context.find(cleaned_text)
        if new_start == -1:
            continue

        realigned.append({**answer, "text": cleaned_text, "answer_start": new_start})
    return realigned


def preprocess_dataset(dataset):
    """Clean contexts and questions and keep the answer spans aligned.

    Args:
        dataset: A dataset with "context", "question" and "answers" columns.

    Returns:
        The mapped dataset: "context" cleaned with `clean_context`, "question"
        cleaned with `clean_question`, and "answers" re-expressed as offsets
        into the cleaned context.
    """
    def _clean_example(example):
        cleaned_context = clean_context(example["context"])
        return {
            "context": cleaned_context,
            "question": clean_question(example["question"]),
            "answers": _realign_answers(
                example["context"], cleaned_context, example["answers"]
            ),
        }

    return dataset.map(_clean_example)

train_dataset = preprocess_dataset(train_dataset)
val_dataset = preprocess_dataset(val_dataset)


In [None]:
# Display the first training example as a quick check of the cleaned columns.
train_dataset[0]

In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier shared by the tokenizer here and the model loaded later.
model_name = "nlpaueb/legal-bert-base-uncased"
# Load the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def _find_answer_token_span(offsets, sequence_ids, answers, cls_index):
    """Return (start_token, end_token) for the first answer fully inside one window.

    Args:
        offsets: Per-token (char_start, char_end) pairs for the window.
        sequence_ids: Per-token sequence id for the window; context tokens
            are marked 1, everything else (question, special tokens, padding)
            is 0 or None.
        answers: List of answer dicts with "answer_start" and "text".
        cls_index: Token index used for both labels when no answer fits.

    Returns:
        A (start_token_index, end_token_index) tuple. Both equal `cls_index`
        when the window has no context tokens or no answer lies entirely
        within the window's context.
    """
    if 1 not in sequence_ids:
        return cls_index, cls_index

    # First and last context token of this window.
    context_start = sequence_ids.index(1)
    context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)
    window_first_char = offsets[context_start][0]
    window_last_char = offsets[context_end][1]

    for answer in answers:
        start_char = answer["answer_start"]
        end_char = start_char + len(answer["text"])

        if end_char <= start_char:
            # Empty answer text: nothing to point at.
            continue
        if start_char < window_first_char or end_char > window_last_char:
            # Not (entirely) inside this window; a later answer might be.
            continue

        # Last context token that begins at or before the answer start. The
        # scan stops at the end of the context: the special/padding tokens
        # after it carry (0, 0) offsets and would otherwise keep the loop
        # running to the end of the sequence.
        token_start_index = context_start
        while token_start_index <= context_end and offsets[token_start_index][0] <= start_char:
            token_start_index += 1

        # First context token that ends at or after the answer end, with the
        # scan bounded below by the start of the context.
        token_end_index = context_end
        while token_end_index >= context_start and offsets[token_end_index][1] >= end_char:
            token_end_index -= 1

        return token_start_index - 1, token_end_index + 1

    return cls_index, cls_index


def prepare_features(examples, max_length=512, stride=128):
    """Tokenize a batch of QA examples into windows and label the answer span.

    Each question/context pair is split into overlapping windows of at most
    `max_length` tokens (only the context is truncated; consecutive windows
    share `stride` tokens) and padded to `max_length`. Every window gets a
    `start_positions` / `end_positions` label: the token span of the first
    answer that lies completely inside that window, or the [CLS] position
    when none does. A question can have several answer spans, so all of them
    are considered rather than only the first.

    Args:
        examples: Batch dict with "question", "context" and "answers" columns;
            each "answers" entry is a list of dicts with "answer_start"
            (character offset into the context) and "text".
        max_length: Maximum number of tokens per window.
        stride: Number of overlapping tokens between consecutive windows.

    Returns:
        The tokenizer output for all windows with "start_positions" and
        "end_positions" added; "offset_mapping" and
        "overflow_to_sample_mapping" are removed.
    """
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    # One example can yield several windows; this maps each window back to
    # the example it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        # "No answer in this window" is encoded by pointing both labels at [CLS].
        cls_index = tokenized_examples["input_ids"][i].index(tokenizer.cls_token_id)

        start_token, end_token = _find_answer_token_span(
            offsets,
            tokenized_examples.sequence_ids(i),
            examples["answers"][sample_mapping[i]],
            cls_index,
        )
        start_positions.append(start_token)
        end_positions.append(end_token)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples


# Tokenize both splits into model-ready features.
#
# The results are stored under `encoded_train_data` / `encoded_val_data`,
# the names the Trainer cell reads, instead of overwriting `train_dataset` /
# `val_dataset`. This defines the variables the Trainer needs and keeps the
# cell re-runnable: its inputs still have their text columns afterwards.
encoded_train_data = train_dataset.map(
    prepare_features,
    batched=True,
    remove_columns=train_dataset.column_names
)

encoded_val_data = val_dataset.map(
    prepare_features,
    batched=True,
    remove_columns=val_dataset.column_names
)


In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Load the checkpoint with a question-answering head.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

training_args = TrainingArguments(
    # Output locations
    output_dir='./result',
    logging_dir='./logs',
    # Optimisation schedule
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch; the matching strategies are
    # what allow the best checkpoint to be reloaded when training ends.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # Progress reporting
    logging_steps=10,
    disable_tqdm=False,
    report_to="none"
)

# Wire the model, the arguments and the tokenized splits together.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=encoded_val_data,
    train_dataset=encoded_train_data
)

In [None]:
# Start fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Write the model and the tokenizer to the same directory so both can be
# loaded again from "fine-tuned-legal-bert".
model.save_pretrained("fine-tuned-legal-bert")
tokenizer.save_pretrained("fine-tuned-legal-bert")