In [None]:
pip install transformers



In [None]:

import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

# Fine-tune a BERT sequence classifier on the `sentence` / `label` columns of a
# CSV file and report accuracy on a held-out 20% split after every epoch.

# Load the CSV file and preprocess the data
csv_file_path = "/content/overruling.csv"
df = pd.read_csv(csv_file_path)

text_data = df['sentence'].values
labels = df['label'].values

# Use one seed for both the data split and torch, so that the freshly
# initialised classifier head and the batch shuffling are repeatable too.
seed = 42
torch.manual_seed(seed)

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Split the dataset into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    text_data, labels, test_size=0.2, random_state=seed
)

# Load Legal-BERT tokenizer
model_name = "/content/drive/MyDrive/legal-bert"
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

# Tokenize and encode the training data
train_encodings = tokenizer(list(train_texts),
                            padding=True,
                            truncation=True,
                            max_length=128,
                            add_special_tokens=True,
                            return_tensors='pt')

# Tokenize and encode the test data
test_encodings = tokenizer(list(test_texts),
                           padding=True,
                           truncation=True,
                           max_length=128,
                           add_special_tokens=True,
                           return_tensors='pt')

# Convert labels to tensors; class indices must be int64 for cross-entropy
train_labels = torch.tensor(train_labels, dtype=torch.long)
test_labels = torch.tensor(test_labels, dtype=torch.long)

# Load the pre-trained Legal-BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

# Create DataLoaders for training and evaluation
batch_size = 64
train_dataset = TensorDataset(train_encodings['input_ids'],
                              train_encodings['attention_mask'],
                              train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# The test loader is not shuffled, so concatenated batch predictions stay
# aligned with `test_labels`.
test_dataset = TensorDataset(test_encodings['input_ids'],
                             test_encodings['attention_mask'])
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
# NOTE: `loss_function` is not used by the loop below; the loop takes the loss
# the model returns when `labels` is passed. It is kept so the name stays defined.
loss_function = torch.nn.CrossEntropyLoss()

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for i, batch in enumerate(train_loader):
        # Move the batch to the same device as the model
        input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Print progress every 10 batches
        if (i + 1) % 10 == 0:  # You can adjust the frequency of printing
            print(f"Epoch [{epoch+1}/{epochs}], Iteration [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}")

    print(f"Epoch {epoch+1}, Average Loss: {total_loss / len(train_loader)}")

    # Evaluation after each epoch, done batch by batch so peak memory does not
    # grow with the size of the test set.
    model.eval()
    batch_predictions = []
    with torch.no_grad():
        for test_input_ids, test_attention_mask in test_loader:
            test_logits = model(input_ids=test_input_ids.to(device),
                                attention_mask=test_attention_mask.to(device)).logits
            # Bring predictions back to the CPU, where `test_labels` lives
            batch_predictions.append(torch.argmax(test_logits, dim=1).cpu())
    predicted_labels = torch.cat(batch_predictions)
    accuracy = (predicted_labels == test_labels).sum().item() / len(test_labels)

    print(f"Epoch {epoch+1}, Accuracy: {accuracy:.4f}")

print("Training complete.")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/bert-double and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
