In [86]:
import pandas as pd
from sklearn.model_selection import train_test_split

csv_path = "ILDC_multi/ILDC_multi.csv"

# Load the CSV file, including the 'split' column
dtype = {'text': str, 'label': int, 'split': str}
df = pd.read_csv(csv_path, usecols=['text', 'label', 'split', 'name'], low_memory=False, dtype=dtype)
6
# Use only a random subset (e.g., 1000 rows)
df = df.sample(n=400, random_state=42)

# Filter for the train split
train_df = df[df['split'] == 'train']
test_df = df[df['split'] == 'test']


# Split train_df into train and dev (e.g., 80% train, 20% dev)
train_split_df, dev_split_df = train_test_split(
    train_df,
    test_size=0.2,      # 20% for dev
    random_state=42,    # for reproducibility
    stratify=train_df['label']  # stratify to keep label distribution
)

# Extract texts and labels for training
train_texts = train_split_df['text'].tolist()
train_labels = train_split_df['label'].tolist()
train_labels = torch.tensor(train_labels, dtype=torch.long)

# Extract texts and labels for dev
dev_texts = dev_split_df['text'].tolist()
dev_labels = dev_split_df['label'].tolist()
dev_labels = torch.tensor(dev_labels, dtype=torch.long)

# Extract texts and labels for testing
test_texts = test_df['text'].tolist()
test_labels = test_df['label'].tolist()
test_labels = torch.tensor(test_labels, dtype=torch.long)

In [87]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load InLegalBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("law-ai/InLegalBERT")
model = AutoModel.from_pretrained("law-ai/InLegalBERT")
model.eval()  # Set to evaluation mode

print("loading done")

loading done


In [88]:
def chunk(text, max_length=512, stride=256):
    """
    Split a long document into chunks of max_length tokens using InLegalBERT tokenizer with overlap.
    Args:
        text: Input text to be chunked.
        max_length: Maximum number of tokens per chunk.
        stride: Number of tokens to overlap between chunks.
    Returns:
        List of token ID lists, each representing a chunk.
    """
    tokens = tokenizer.encode(text, add_special_tokens=True)
    chunks = []
    for i in range(0, len(tokens), max_length - stride):
        chunk = tokens[i: i + max_length]
        chunks.append(chunk)
    return chunks



In [89]:
def get_vector(text):
    """
    Generate [CLS] embeddings for each chunk of the input text using InLegalBERT.
    Args:
        text: Input text to generate embeddings for.
    Returns:
        Tensor of shape (num_chunks, 768) containing [CLS] embeddings for each chunk.
    """
    chunks = chunk(text)
    cls_vectors = []

    for chunk_ids in chunks:
        print(f"chunk_ids length: {len(chunk_ids)}")
        input_ids = torch.tensor([chunk_ids])  # shape: [1, seq_len]
        attention_mask = torch.ones_like(input_ids)  # shape: [1, seq_len]

        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask
        }

        with torch.no_grad():
            outputs = model(**inputs)
            cls_vector = outputs.last_hidden_state[:, 0, :]  # shape: [1, 768]
            cls_vectors.append(cls_vector.squeeze(0))  # shape: [768]

    # Stack all [CLS] vectors into a single tensor
    doc_tensor = torch.stack(cls_vectors)  # shape: [num_chunks, 768]
    print("Document tensor shape:", doc_tensor.shape)
    return doc_tensor


In [90]:
train_vectors = []
for item in train_texts:
    doc_tensor = get_vector(item)        # shape = [num_chunks, 768]
    pooled     = doc_tensor.mean(0)      # shape = [768]  ← NEW
    train_vectors.append(pooled)

# Stack vectors: shape (num_docs, num_chunks, 768)
X_train = torch.stack(train_vectors)  # shape: [4, num_chunks, 768]

# Save to workspace
file_path = "data_train.pt"
torch.save({'X': X_train, 'y': train_labels}, file_path)
print("Saved training data:", file_path)


test_vectors = []
for item in test_texts:
    doc_tensor = get_vector(item)        # shape = [num_chunks, 768]
    pooled     = doc_tensor.mean(0)      # shape = [768]  ← NEW
    test_vectors.append(pooled)

# Stack vectors: shape (num_docs, num_chunks, 768)
X_test = torch.stack(test_vectors)  # shape: [3, num_chunks, 768]

# Save to workspace
file_path = "data_test.pt"
torch.save({'X': X_train, 'y': train_labels}, file_path)
print("Saved testing data:", file_path)

dev_vectors = []
for item in dev_texts:
    doc_tensor = get_vector(item)        # shape = [num_chunks, 768]
    pooled     = doc_tensor.mean(0)      # shape = [768]  ← NEW
    dev_vectors.append(pooled)

# Stack vectors: shape (num_docs, num_chunks, 768)
X_dev = torch.stack(dev_vectors)  # shape: [3, num_chunks, 768]

# Save to workspace
file_path = "data_dev.pt"
torch.save({'X': X_dev, 'y': dev_labels}, file_path)
print("Saved dev data:", file_path)

Token indices sequence length is longer than the specified maximum sequence length for this model (4142 > 512). Running this sequence through the model will result in indexing errors


chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 302
chunk_ids length: 46
Document tensor shape: torch.Size([17, 768])
chunk_ids length: 312
chunk_ids length: 56
Document tensor shape: torch.Size([2, 768])
chunk_ids length: 317
chunk_ids length: 61
Document tensor shape: torch.Size([2, 768])
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 512
chunk_ids length: 333
chunk_ids length: 77
Document tensor shape: torch.Size([