### Evidence Identification Model

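Inputs:  
`(any) span` + `[sep]` + `hypothesis` — any span of the contract (evidence or not) paired with the hypothesis text.  

Output: binary label, `0` = evidence, `1` = not evidence.  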


In [3]:
import torch
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
import wandb 
import csv
import json
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# from transformers import DistilBertTokenizer
import numpy as np
import torch.nn as nn
# import torch.optim as optim
# from tabulate import tabulate
# from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

In [4]:
from loader import ContractNLIExample

In [5]:
# Binary evidence classes: ENTAILMENT and CONTRADICTION both collapse to 0
# (evidence), while NOT_MENTIONED is 1 (no evidence).
ENTAILMENT = CONTRADICTION = 0
NOT_MENTIONED = 1

In [6]:
def get_label(label):
    """Map an NLI label to its binary evidence class.

    Args:
        label: Any object whose ``str()`` is one of ``"NLILabel.ENTAILMENT"``,
            ``"NLILabel.CONTRADICTION"`` or ``"NLILabel.NOT_MENTIONED"``.

    Returns:
        The module-level ``ENTAILMENT``, ``CONTRADICTION`` or ``NOT_MENTIONED``
        constant matching ``label``.

    Raises:
        ValueError: If ``label`` is not one of the three recognized values.
            Failing here prevents a silent ``None`` from being written to the
            CSV as an empty cell.
    """
    class_by_name = {
        "NLILabel.ENTAILMENT": ENTAILMENT,
        "NLILabel.CONTRADICTION": CONTRADICTION,
        "NLILabel.NOT_MENTIONED": NOT_MENTIONED,
    }
    label_name = str(label)
    if label_name not in class_by_name:
        raise ValueError(f"Unrecognized NLI label: {label_name!r}")
    return class_by_name[label_name]

In [7]:
def json_to_csv(json_file, csv_file):
    """Flatten a ContractNLI JSON file into a span-level CSV for evidence identification.

    One row is written per (span, hypothesis) pair with the columns
    ``span``, ``hypothesis`` and ``evidence_label``:

    * spans that are NOT annotated for the hypothesis get ``NOT_MENTIONED``
      (the "no evidence" class);
    * annotated spans get the hypothesis-level label mapped by ``get_label``
      (the "evidence" class for entailment/contradiction).

    NOTE: CSV files produced before non-annotated spans were labelled as
    "no evidence" carry the hypothesis-level label on every row and must be
    regenerated.

    Args:
        json_file: Path of the ContractNLI JSON file to read.
        csv_file: Path of the CSV file to create (overwritten if it exists).
    """
    # Read and close the JSON file before building examples.
    with open(json_file, 'r', encoding='utf-8') as f:
        examples = ContractNLIExample.load(json.load(f))

    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['span', 'hypothesis', 'evidence_label'])   # Write the header row

        # Iterate over each (document, hypothesis) example
        for example in examples:
            label = get_label(example.label)
            hypothesis = example.hypothesis_text
            context_text = example.context_text

            # Text of every span in the document, in span order
            all_spans_text = [context_text[start:end] for start, end in example.spans]
            # Set for O(1) membership tests while filtering
            evidence_indices = set(example.annotated_spans)

            # write the regular (non-evidence) spans
            for idx, span in enumerate(all_spans_text):
                if idx not in evidence_indices:
                    writer.writerow([span, hypothesis, NOT_MENTIONED])

            # write the evidence spans
            for idx in example.annotated_spans:
                writer.writerow([all_spans_text[idx], hypothesis, label])

    print(f"Conversion complete. CSV file '{csv_file}' created.")

In [8]:
# Folder with the ContractNLI JSON splits read by json_to_csv
json_folder = "../dataset/contract-nli"
# Folder with the span-level evidence-identification (EI) CSV files
csv_folder = "../dataset/csv/EI"

In [9]:
# One-off conversion from JSON to CSV; uncomment to regenerate the CSV files.
# json_to_csv(f'{json_folder}/train.json', f'{csv_folder}/EI_train.csv')
# json_to_csv(f'{json_folder}/test.json', f'{csv_folder}/EI_test.csv')
# json_to_csv(f'{json_folder}/dev.json', f'{csv_folder}/EI_validation.csv')

# Load the three span-level splits in train / validation / test order
all_train_df, all_val_df, all_test_df = (
    pd.read_csv(f'{csv_folder}/EI_{split_name}.csv')
    for split_name in ('train', 'validation', 'test')
)

In [10]:
def stratified_sample(
    df,
    evidence_fraction=0.01,
    not_evidence_fraction=0.01,
    label_column="evidence_label",
    random_state=42,
):
    """Down-sample each class of ``df`` independently.

    Rows whose ``label_column`` equals 0 are sampled with ``evidence_fraction``
    and rows equal to 1 with ``not_evidence_fraction``, both using the same
    ``random_state``. The result lists the class-0 sample first, then the
    class-1 sample, with a fresh 0..n-1 index.
    """
    fraction_per_class = ((0, evidence_fraction), (1, not_evidence_fraction))

    sampled_parts = []
    for class_value, fraction in fraction_per_class:
        class_rows = df[df[label_column] == class_value]
        sampled_parts.append(class_rows.sample(frac=fraction, random_state=random_state))

    return pd.concat(sampled_parts).reset_index(drop=True)

In [11]:
# Fraction of each class kept for the training subset
evidence_fraction = not_evidence_fraction = 0.01

train_df = stratified_sample(
    all_train_df,
    evidence_fraction=evidence_fraction,
    not_evidence_fraction=not_evidence_fraction,
)
# Validation and test subsets use stratified_sample's default fractions
val_df, test_df = (
    stratified_sample(split_df) for split_df in (all_val_df, all_test_df)
)

In [12]:
# get the stats for 'labels' in the training set
train_df['evidence_label'].value_counts()
# val_df['evidence_label'].value_counts().plot(kind='bar')
# test_df['evidence_label'].value_counts().plot(kind='bar')

evidence_label
0    3614
1    1978
Name: count, dtype: int64

In [30]:
# Pretrained checkpoint to fine-tune
model_name = "FacebookAI/roberta-base"

# Sequence-classification model with two output labels, plus the tokenizer
# loaded from the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# print all trainable parameters
for name, param in model.named_parameters():
    if param.requires_grad and not ("classifier" in name):
        param.requires_grad = False

In [15]:
# check context len
# model_name2.config.max_position_embeddings
max_length = tokenizer.model_max_length
print(max_length)

512


In [16]:
# Label column of each sampled split, in train / validation / test order
train_labels, valid_labels, test_labels = (
    split_df['evidence_label'] for split_df in (train_df, val_df, test_df)
)

In [17]:
def tokenize_data(data):
    """Tokenize the ``span`` / ``hypothesis`` columns of ``data`` as text pairs.

    Uses the notebook-level ``tokenizer``; every pair is truncated and padded
    to the notebook-level ``max_length``.
    """
    spans = data['span'].tolist()
    hypotheses = data['hypothesis'].tolist()
    return tokenizer(
        text=spans,
        text_pair=hypotheses,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

In [18]:
class ContractNLIDataset(torch.utils.data.Dataset):
    """Labelled dataset pairing tokenizer encodings with integer class labels.

    Args:
        embeddings: Mapping from field name (e.g. ``"input_ids"``) to a
            per-example sequence; ``embeddings[key][idx]`` is the value for
            example ``idx``.
        labels: Per-example labels. May be a list, array or pandas Series;
            a Series is always read by position, so a non-default index
            (e.g. after filtering without ``reset_index``) is safe.
    """

    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def _label_at(self, idx):
        """Return the label at position ``idx`` regardless of container type."""
        # pandas objects expose .iloc for positional access; plain [] on a
        # Series is label-based and breaks when the index is not 0..n-1.
        positional = getattr(self.labels, "iloc", None)
        if positional is not None:
            return positional[idx]
        return self.labels[idx]

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including ``"labels"``."""
        item = {
            key: torch.tensor(value[idx]) for key, value in self.embeddings.items()
        }
        item["labels"] = torch.tensor(int(self._label_at(idx)))
        return item

In [19]:
# Tokenize each sampled split (span paired with hypothesis)
train_encodings = tokenize_data(train_df)
valid_encodings = tokenize_data(val_df)
# NOTE(review): test_encodings is computed again in the cell that builds test_dataset
test_encodings = tokenize_data(test_df)

# Labelled datasets consumed by the Trainer for training and validation
train_dataset = ContractNLIDataset(train_encodings, train_labels)
valid_dataset = ContractNLIDataset(valid_encodings, valid_labels)

In [20]:
def metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for a prediction object.

    ``p.predictions`` holds per-class scores (arg-max over axis 1 gives the
    predicted class) and ``p.label_ids`` the reference labels. Precision,
    recall and F1 are support-weighted averages, with ``zero_division=1``
    used where a score would otherwise be undefined.
    """
    gold = p.label_ids
    predicted = np.argmax(p.predictions, axis=1)

    report = {"accuracy": accuracy_score(gold, predicted)}
    weighted_scores = precision_recall_fscore_support(
        gold, predicted, average="weighted", zero_division=1
    )
    # The fourth element (support) is dropped by zip's truncation
    report.update(zip(("precision", "recall", "f1"), weighted_scores))
    return report

In [21]:
# Start a Weights & Biases run in the "contract-nli" project; the Trainer is
# configured with report_to="wandb".
wandb.init(project="contract-nli")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjhalak151[0m ([33mjhalak-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Training configuration: a single epoch, batches of 8, evaluation at the end
# of each epoch, metrics reported to Weights & Biases.
# NOTE(review): output_dir is named "bert_base..." but model_name is a RoBERTa
# checkpoint — confirm the directory name is intended.
# NOTE(review): warmup_steps=500 covers most of the 699 optimizer steps shown
# in the logged run — confirm this warmup length is intended.
training_args = TrainingArguments(
    output_dir='./bert_base_evidence_small',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    report_to="wandb",
)


# Trainer wiring the model to the train/validation datasets; `metrics` is
# called on evaluation predictions.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=metrics,
)

In [23]:
# Train the model
train_results = trainer.train()

# Evaluate the model on the validation dataset
eval_results = trainer.evaluate()

print("Training complete.")



  0%|          | 0/699 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 0.6884, 'grad_norm': 2.3267569541931152, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}
{'loss': 0.704, 'grad_norm': 2.0237112045288086, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.03}
{'loss': 0.6937, 'grad_norm': 4.154807090759277, 'learning_rate': 3e-06, 'epoch': 0.04}
{'loss': 0.6932, 'grad_norm': 2.607322931289673, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.06}
{'loss': 0.7005, 'grad_norm': 3.1149163246154785, 'learning_rate': 5e-06, 'epoch': 0.07}
{'loss': 0.6717, 'grad_norm': 4.681314945220947, 'learning_rate': 6e-06, 'epoch': 0.09}
{'loss': 0.6357, 'grad_norm': 3.363022565841675, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.1}
{'loss': 0.6351, 'grad_norm': 2.742649793624878, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.11}
{'loss': 0.6168, 'grad_norm': 7.802003383636475, 'learning_rate': 9e-06, 'epoch': 0.13}
{'loss': 0.7109, 'grad_norm': 3.850588798522949, 'learning_rate': 1e-05, 'epoch': 0.14}
{'loss': 0.6107, 'grad_norm': 8.16097

  0%|          | 0/109 [00:00<?, ?it/s]

{'eval_loss': 0.5845615863800049, 'eval_accuracy': 0.7289504036908881, 'eval_precision': 0.7216540045902768, 'eval_recall': 0.7289504036908881, 'eval_f1': 0.7193378215484377, 'eval_runtime': 27.624, 'eval_samples_per_second': 31.386, 'eval_steps_per_second': 3.946, 'epoch': 1.0}
{'train_runtime': 550.792, 'train_samples_per_second': 10.153, 'train_steps_per_second': 1.269, 'train_loss': 0.6075321308021382, 'epoch': 1.0}


  0%|          | 0/109 [00:00<?, ?it/s]

Training complete.


In [24]:
class ContractNLIDatasetTest(torch.utils.data.Dataset):
    """Label-free dataset over tokenizer encodings, used for prediction.

    ``encodings`` maps field names to per-example sequences; the dataset
    length is the number of entries under ``"input_ids"``.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        """Return a dict with one tensor per encoding field for example ``idx``."""
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        return item


# Tokenize the sampled test split and wrap it without labels for prediction
test_encodings = tokenize_data(test_df)
test_dataset = ContractNLIDatasetTest(test_encodings)

In [25]:
# Use the Trainer.predict() method to get predictions
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(axis=1)


confusion_mat = confusion_matrix(test_labels, pred_labels)
print("Confusion Matrix:")
print(confusion_mat)


# Compute evaluation metrics
accuracy = accuracy_score(test_labels, pred_labels)
precision, recall, f1, _ = precision_recall_fscore_support(
    test_labels, pred_labels, average="weighted"
)

print("Test Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

class_names = ["EVIDENCE", "NOT-EVIDENCE"]
print("\nClassification Report:")
print(classification_report(test_labels, pred_labels, target_names=class_names))

# Error analysis

incorrect_predictions = []
for i in range(len(test_labels)):
    if test_labels[i] != pred_labels[i]:
        incorrect_predictions.append(i)

print(
    f"Number of incorrect predictions: {len(incorrect_predictions)} out of {len(test_labels)}")

  0%|          | 0/214 [00:00<?, ?it/s]

Confusion Matrix:
[[865 144]
 [361 340]]
Test Accuracy: 0.7046783625730995
Precision: 0.7042891389601819
Recall: 0.7046783625730995
F1-score: 0.6919750077957806

Classification Report:
              precision    recall  f1-score   support

    EVIDENCE       0.71      0.86      0.77      1009
NOT-EVIDENCE       0.70      0.49      0.57       701

    accuracy                           0.70      1710
   macro avg       0.70      0.67      0.67      1710
weighted avg       0.70      0.70      0.69      1710

Number of incorrect predictions: 505 out of 1710


In [26]:
# Close the Weights & Biases run started by wandb.init
wandb.finish()

VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▁
eval/f1,▁▁
eval/loss,▁▁
eval/precision,▁▁
eval/recall,▁▁
eval/runtime,▁█
eval/samples_per_second,█▁
eval/steps_per_second,█▁
test/runtime,▁
test/samples_per_second,▁

0,1
eval/accuracy,0.72895
eval/f1,0.71934
eval/loss,0.58456
eval/precision,0.72165
eval/recall,0.72895
eval/runtime,28.4086
eval/samples_per_second,30.519
eval/steps_per_second,3.837
test/runtime,58.4947
test/samples_per_second,29.233
