In [3]:
import torch

print(f"PyTorch version: {torch.version}")
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

PyTorch version: <module 'torch.version' from '/usr/local/lib/python3.10/dist-packages/torch/version.py'>
CUDA available: True
Using device: cuda


In [1]:
import torch
import torch.nn as nn
import transformers
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from datasets import load_dataset  # Example: HuggingFace Datasets
import numpy as np


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 1053, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.p

In [14]:
from transformers import AutoModel, AutoTokenizer  # Updated imports
import torch.nn as nn

# Tokenizer and Model
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
legal_bert = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")

# Model Definition
class LegalBERT_BiGRU(nn.Module):
    def __init__(self, num_classes=2):
        super().__init__()
        self.legal_bert = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")  # Updated
        self.bigru = nn.GRU(
            input_size=768,
            hidden_size=128,
            bidirectional=True,
            batch_first=True
        )
        self.classifier = nn.Linear(128 * 2, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.legal_bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state
        gru_out, _ = self.bigru(last_hidden_state)
        logits = self.classifier(gru_out[:, -1, :])
        return logits

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LegalBERT_BiGRU(num_classes=2).to(device)

In [38]:
sample_text = """
Police covertly installed CCTV cameras in a suspect's home for 6 months without judicial approval. The applicant 
argued this breached Article 8 (right to privacy). The government admitted the surveillance lacked proper 
authorization but claimed it was necessary to combat organized crime.
"""
sample_label = 0  # 1 = "violation", 0 = "no violation"

In [39]:
inputs = tokenizer(
    sample_text, 
    padding="max_length", 
    truncation=True, 
    max_length=512, 
    return_tensors="pt"
).to(device)  # Move to GPU if available

In [40]:
model.eval()
with torch.no_grad():
    logits = model(inputs["input_ids"], inputs["attention_mask"])
    pred = torch.argmax(logits).item()

print("Predicted:", "Violation" if pred == 1 else "No Violation")

Predicted: No Violation


In [22]:
# Tokenize the texts
def tokenize(texts, max_len=512):
    return tokenizer(
        texts, 
        padding="max_length", 
        truncation=True, 
        max_length=max_len, 
        return_tensors="pt"
    )


In [None]:
from sklearn.model_selection import train_test_split

# Split data into train/test
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)


train_encodings = tokenize(train_texts)
test_encodings = tokenize(test_texts)

NameError: name 'texts' is not defined

In [None]:
from torch.utils.data import Dataset, DataLoader

class LegalDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "label": torch.tensor(self.labels[idx], dtype=torch.long)
        }

    def __len__(self):
        return len(self.labels)

train_dataset = LegalDataset(train_encodings, train_labels)
test_dataset = LegalDataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

In [None]:
import torch.optim as optim

optimizer = optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss()

def train(model, dataloader, optimizer, loss_fn, epochs=3):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            logits = model(input_ids, attention_mask)
            loss = loss_fn(logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}")

train(model, train_loader, optimizer, loss_fn, epochs=3)

NameError: name 'model' is not defined

In [None]:
def evaluate(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            logits = model(input_ids, attention_mask)
            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    print(f"Accuracy: {accuracy:.4f}")

evaluate(model, test_loader)

In [None]:
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        logits = model(**inputs)
    pred = torch.argmax(logits).item()
    return "Violation" if pred == 1 else "No Violation"

# Example
case_text = "The applicant alleged a violation of Article 6 of the Convention..."
print(predict(case_text))