In [1]:
from datasets import load_dataset as lds
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
from dataset.loader import ContractNLIExample
import json

In [12]:
# Load dataset and model
dataset = ContractNLIExample.load(json.load(open('../dataset/contract-nli/dev.json','r')))
model_name = "textattack/bert-base-uncased-snli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)



OSError: zlucia/roberta-large-contract-nli is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [3]:
# Preprocess the data
def preprocess_function(examples):
    texts = (examples['premise'], examples['hypothesis'])
    return tokenizer(*texts, truncation=True, padding='max_length', max_length=128)
ds = lds("snli")
encoded_dataset = ds.map(preprocess_function, batched=True)


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/550152 [00:00<?, ? examples/s]

In [4]:
def filter_no_label(example):
    return example['label'] != -1

encoded_dataset = encoded_dataset.filter(filter_no_label)


Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/550152 [00:00<?, ? examples/s]

In [7]:
# tokenized_ds = ds.map(tokenize_function, batched=True)
# tokenized_ds.set_format("torch", columns=['input_ids', 'attention_mask', 'token_type_ids', 'labels'])


NameError: name 'tokenize_function' is not defined

In [9]:
# Create a DataLoader for batching
batch_size = 32
val_dataloader = DataLoader(encoded_dataset['validation'], batch_size=batch_size)


In [10]:
# Initialize containers
all_labels = []
all_preds = []

# Move model to device (GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Evaluate
model.eval()
with torch.no_grad():
    for batch in tqdm(val_dataloader, desc="Evaluating"):
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Get predictions
        preds = torch.argmax(logits, dim=-1)

        # Collect predictions and labels
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate accuracy and F1 score
accuracy = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds, average='weighted')

print(f'Validation Accuracy: {accuracy:.4f}')
print(f'Validation F1 Score: {f1:.4f}')


Evaluating:   0%|          | 0/308 [00:00<?, ?it/s]

Validation Accuracy: 0.2232
Validation F1 Score: 0.2040


In [10]:
def evaluate(model, dataloader, original_dataset):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    all_predictions = []
    all_labels = []
    
    label_map = {0: "entailment", 1: "neutral", 2: "contradiction"}
    
    progress_bar = tqdm(enumerate(dataloader), total=len(dataloader), desc="Evaluating")
    
    with torch.no_grad():
        for batch_idx, batch in progress_bar:
            inputs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device),
                'token_type_ids': batch['token_type_ids'].to(device),
            }
            labels = batch['labels'].to(device)
            
            outputs = model(**inputs)
            logits = outputs.logits
            
            batch_preds = torch.argmax(logits, dim=-1)
            
            all_predictions.extend(batch_preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            
            # Calculate live metrics
            current_accuracy = accuracy_score(all_labels, all_predictions)
            current_f1 = f1_score(all_labels, all_predictions, average='weighted')
            
            progress_bar.set_postfix({'Accuracy': f'{current_accuracy:.4f}', 'F1': f'{current_f1:.4f}'})
            
            # Print examples for the first batch
            if batch_idx == 0:
                for j in range(min(10, len(batch['input_ids']))):
                    idx = batch_idx * dataloader.batch_size + j
                    if idx < len(original_dataset):
                        premise = original_dataset[idx]['premise']
                        hypothesis = original_dataset[idx]['hypothesis']
                        true_label = label_map.get(original_dataset[idx]['label'], "unknown")
                        pred_label = label_map.get(batch_preds[j].item(), "unknown")
                        
                        print(f"\nExample {idx + 1}:")
                        print(f"Premise: {premise}")
                        print(f"Hypothesis: {hypothesis}")
                        print(f"True label: {true_label}")
                        print(f"Predicted label: {pred_label}")
                        print(f"Logits: {logits[j].cpu().tolist()}")
                        print("-" * 50)
    
    # Calculate final metrics
    accuracy = accuracy_score(all_labels, all_predictions)
    f1_macro = f1_score(all_labels, all_predictions, average='macro')
    f1_weighted = f1_score(all_labels, all_predictions, average='weighted')
    
    print(f"\nFinal Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score (Macro): {f1_macro:.4f}")
    print(f"F1 Score (Weighted): {f1_weighted:.4f}")
    
    return all_predictions, all_labels


In [11]:
predictions, true_labels = evaluate(model, test_dataloader, ds["test"])


Evaluating:   1%|          | 4/614 [00:00<00:40, 15.15it/s, Accuracy=0.0312, F1=0.0317]


Example 1:
Premise: This church choir sings to the masses as they sing joyous songs from the book at a church.
Hypothesis: The church has cracks in the ceiling.
True label: neutral
Predicted label: entailment
Logits: [2.316099166870117, -4.920962810516357, 1.8545359373092651]
--------------------------------------------------

Example 2:
Premise: This church choir sings to the masses as they sing joyous songs from the book at a church.
Hypothesis: The church is filled with song.
True label: entailment
Predicted label: neutral
Logits: [-4.227248191833496, 3.418980836868286, -0.033301327377557755]
--------------------------------------------------

Example 3:
Premise: This church choir sings to the masses as they sing joyous songs from the book at a church.
Hypothesis: A choir singing at a baseball game.
True label: contradiction
Predicted label: entailment
Logits: [5.989924430847168, -5.183979511260986, -1.0185179710388184]
--------------------------------------------------

Example 4:

Evaluating:  10%|█         | 62/614 [00:03<00:34, 16.13it/s, Accuracy=0.0514, F1=0.0508]


RuntimeError: stack expects each tensor to be equal size, but got [76] at entry 0 and [69] at entry 8

In [24]:
# save model
# model.save_pretrained("debertv3")

In [None]:
print(model.config)
