# Installing dependencies

In [None]:
!pip install datasets \
            evaluate
!pip install datasets --upgrade

# Set-up

To run the code as is, make sure you are logged in to Weights & Biases (wandb); otherwise, disable the wandb logging.

In [None]:
# Run the wandb CLI login in a shell; the training section below reports to
# wandb (`report_to="wandb"`) and calls `wandb.init()`.
!wandb login

Use your Hugging Face token to log in to Hugging Face so that the dataset and model can be loaded.

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never paste the token into the notebook source: it would be saved (and
# possibly committed or shared) together with the notebook. Read it from the
# HF_TOKEN environment variable when set, otherwise prompt for it without
# echoing the input.
your_hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=your_hf_token, add_to_git_credential=True)

In [None]:
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import evaluate
import wandb
import os
import torch

In [None]:
# --- Dataset / model variant selection -------------------------------------
# DEPTH is interpolated into the dataset name ("-QDep{DEPTH}" or
# "-depth{DEPTH}") and into the fine-tuned model name in the testing section.
DEPTH = 1
# QDep switches between the "-QDep{DEPTH}" and the "-depth{DEPTH}" variants.
QDep = False
# NO_RCONC appends "-NoRconc" to the dataset name; only consulted when QDep is True.
NO_RCONC = False

# --- Example filtering -------------------------------------------------------
# When ALL_EXAMPLES is True the dataset is used unfiltered; otherwise only
# examples whose 'depth' equals PROOF_DEPTH and whose 'proof_strategy' equals
# PROOF_STRATEGY are kept.
ALL_EXAMPLES = False
PROOF_DEPTH = 1
PROOF_STRATEGY = "proof"

In [None]:
# Compose the Hub dataset id from the configuration flags:
#   QDep      -> "-QDep{DEPTH}", plus "-NoRconc" when NO_RCONC is set
#   otherwise -> "-depth{DEPTH}"
if QDep:
    name_suffix = f"-QDep{DEPTH}" + ("-NoRconc" if NO_RCONC else "")
else:
    name_suffix = f"-depth{DEPTH}"
dataset_name = "andres-vs/ruletaker-Att-Noneg" + name_suffix

# Base checkpoint that gets fine-tuned in the training section.
model_name = "google-bert/bert-base-uncased"

In [None]:
# Load the dataset selected by `dataset_name`; the "train", "validation" and
# "test" splits of its tokenized form are used further down.
dataset = load_dataset(dataset_name)

# Preprocessing

In [None]:
# Tokenizer matching the base checkpoint in `model_name`; used by
# `tokenize_function` and by the padding data collator.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Keep every example when ALL_EXAMPLES is set; otherwise retain only the
# examples whose depth and proof strategy match the configured values.
filtered_dataset = (
    dataset
    if ALL_EXAMPLES
    else dataset.filter(
        lambda ex: ex['depth'] == PROOF_DEPTH and ex['proof_strategy'] == PROOF_STRATEGY
    )
)

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of examples and attach one-hot float labels.

    Args:
        examples: batch mapping with an "input" entry (passed to the
            tokenizer) and a "label" entry holding integer class ids
            (0 or 1).

    Returns:
        The tokenizer output with its "label" entry set to a list of
        two-element float lists, the one-hot encoding of each class id.
    """
    encoded = tokenizer(examples["input"], truncation=True, padding=False)

    # Integer class ids -> one-hot float vectors over the two classes,
    # converted back to plain lists so `datasets` can store them.
    class_ids = torch.tensor(examples['label'], dtype=torch.int64)
    encoded['label'] = (
        torch.nn.functional.one_hot(class_ids, num_classes=2).float().tolist()
    )
    return encoded

# Apply `tokenize_function` to the filtered dataset in batches
# (`batched=True`), producing the tokenized dataset used for training,
# validation and testing.
tokenized_datasets = filtered_dataset.map(tokenize_function, batched=True)

# Training

In [None]:
# Sequence-classification model on top of the base checkpoint, configured
# with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True)
# Accuracy metric; `compute_metrics` calls it during evaluation.
metric = evaluate.load("accuracy")

In [None]:
from transformers import DataCollatorWithPadding, EarlyStoppingCallback
# Examples are tokenized with `padding=False`, so padding is delegated to this
# collator, which is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


# Configure wandb through environment variables in a single update:
#   WANDB_PROJECT   - project where this run will be logged
#   WANDB_LOG_MODEL - save the trained model checkpoint to wandb ("end")
#   WANDB_WATCH     - turn off watch to log faster
os.environ.update({
    "WANDB_PROJECT": f"{model_name.split('/')[1]}-finetuned_{dataset_name.split('/ruletaker-')[1]}",
    "WANDB_LOG_MODEL": "end",
    "WANDB_WATCH": "false",
})

def compute_metrics(eval_pred):
    """Compute the accuracy metric for one evaluation pass.

    Args:
        eval_pred: pair of (logits, labels). Both are reduced with an argmax
            over the last axis, so the labels are expected to be one-hot
            encoded like the ones produced by `tokenize_function`.

    Returns:
        Whatever `metric.compute` returns for the predicted and reference
        class indices.
    """
    scores, one_hot_targets = eval_pred
    return metric.compute(
        predictions=np.argmax(scores, axis=-1),
        # One-hot rows -> integer class indices.
        references=np.argmax(one_hot_targets, axis=-1),
    )

# Training configuration.
# - Evaluation and checkpointing both happen once per epoch so that
#   `load_best_model_at_end` can pick among the per-epoch checkpoints.
# - fp16 mixed precision is only requested when a CUDA device is available;
#   a hard-coded `fp16=True` makes this cell fail on a CPU-only machine, while
#   the testing section of this notebook already falls back to CPU.
training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned_{dataset_name.split('/ruletaker-')[1]}_retrained3",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    max_grad_norm=1.0,
    learning_rate=1e-5,
    weight_decay=0.01,
    warmup_steps=100,
    num_train_epochs=20,
    fp16=torch.cuda.is_available(),
    push_to_hub=True,
    report_to="wandb",
)

# Early-stopping callback configuration.
# NOTE: the `callbacks=[early_stopping]` argument of the Trainer below is
# commented out, so as written this object is created but never registered.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,   # Stop if no improvement for 3 epochs
    early_stopping_threshold=0.001  # Minimum improvement threshold
)

# Wire the model, training arguments, tokenized train/validation splits,
# metric function and padding collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    # Early stopping is currently disabled; uncomment to register the callback.
    # callbacks=[early_stopping],
    data_collator=data_collator
)

In [None]:
# Start a wandb run, train, and always close the run afterwards. Without the
# error handling, an exception or a manual interrupt during training would
# skip `wandb.finish()` and leave the run open; a failed run is closed with a
# non-zero exit code so it is not reported as successful.
wandb.init()
try:
    trainer.train()
except BaseException:
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

# Testing

In [None]:
# RETRAINED selects the "_retrained" variant of the fine-tuned checkpoint name.
RETRAINED = False
# RETRAINED_NUMBER is appended after "_retrained" for the "-depth{DEPTH}"
# checkpoints only; the QDep checkpoint names do not use it.
RETRAINED_NUMBER = 1

In [None]:
# Pick the fine-tuned checkpoint to evaluate. Every name shares one prefix and
# only the variant suffix differs:
#   not QDep            -> "-depth{DEPTH}", plus "_retrained{RETRAINED_NUMBER}" if RETRAINED
#   QDep and DEPTH == 0 -> "-QDep0" (RETRAINED is not consulted)
#   QDep otherwise      -> "-QDep{DEPTH}-NoRconc", plus "_retrained" if RETRAINED
# NO_RCONC is not consulted here.
checkpoint_prefix = "andres-vs/bert-base-uncased-finetuned_Att-Noneg"
if not QDep:
    checkpoint_variant = f"-depth{DEPTH}" + (f"_retrained{RETRAINED_NUMBER}" if RETRAINED else "")
elif DEPTH == 0:
    checkpoint_variant = "-QDep0"
else:
    checkpoint_variant = f"-QDep{DEPTH}-NoRconc" + ("_retrained" if RETRAINED else "")
model_name = checkpoint_prefix + checkpoint_variant

In [None]:
from transformers import AutoTokenizer, BertForSequenceClassification
import torch

# Load the tokenizer and model.
# The tokenizer comes from the fixed "bert-base-uncased" checkpoint, while the
# model weights come from the fine-tuned checkpoint named by `model_name`.
# NOTE: this rebinds the `tokenizer` and `model` names used in the training
# section above.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(model_name)

# Define the metric
metric = evaluate.load("accuracy")

# Check if CUDA is available and move model to device (falls back to CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Batch-wise inference helper used with `Dataset.map`
def compute_predictions(batch):
    """Run the model on one batch and return its logits.

    Args:
        batch: mapping with an "input" entry holding the raw text inputs.

    Returns:
        A dict with a single "logits" entry: the model's logits for the
        batch as a NumPy array on the CPU.
    """
    # Tokenize to fixed-length (512) PyTorch tensors and move them to the
    # same device as the model.
    encoded = tokenizer(
        batch['input'],
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    encoded = encoded.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**encoded).logits

    return {"logits": logits.cpu().numpy()}

# Run inference over the test split in batches of 16 and score the result
test_split = tokenized_datasets["test"]
predictions = test_split.map(compute_predictions, batched=True, batch_size=16)

# Logits -> predicted class indices; one-hot labels -> true class indices
predicted_labels = np.argmax(predictions["logits"], axis=-1)
true_labels = np.argmax(test_split["label"], axis=1)

accuracy_result = metric.compute(predictions=predicted_labels, references=true_labels)

print(f"Accuracy on the test set: {accuracy_result['accuracy']}")

In [None]:
# Record the test accuracy in the summary of an existing wandb run.
api = wandb.Api()

# Project name: the part of `model_name` after "/" and before the first "_",
# joined by "_" with the part of `dataset_name` after "/ruletaker-".
PROJECT_NAME = model_name.split("/")[1].split("_")[0] + "_" + dataset_name.split("/ruletaker-")[1]
# NOTE(review): hard-coded run id (and hard-coded entity in the path below);
# update both to point at the run that should receive the test accuracy.
RUN_ID = "3zsbpkut"

run = api.run(f"andresvanschel/{PROJECT_NAME}/{RUN_ID}")
run.summary["test/accuracy"] = accuracy_result['accuracy']
run.summary.update()

In [None]:
# Find the indices where predictions do not match true labels
mismatched_indices = np.where(predicted_labels != true_labels)[0]

# Fetch the input column once, outside the loop. Looking it up through the
# dataset inside the loop repeats the column access for every mismatch, which
# (depending on the `datasets` version) can re-materialise the whole column
# each time.
test_inputs = tokenized_datasets['test']['input']

# Print the input sequence for the mismatched examples
print("Examples where predicted labels do not match true labels:")
for index in mismatched_indices:
    print(f"Index: {index}")
    print(f"True Label: {true_labels[index]}")
    print(f"Predicted Label: {predicted_labels[index]}")
    print(f"Input Sequence: {test_inputs[index]}")
    print("-" * 20)

Examples where predicted labels do not match true labels:
Index: 375
True Label: 1
Predicted Label: 0
Input Sequence: Fiona is quiet. Fiona is big. Harry is quiet. Fiona is blue. Harry is blue. If someone is green and big then they are cold. Fiona is white. Harry is green. Young, blue people are big. If someone is green and blue then they are cold. If someone is young then they are blue. If someone is young and blue then they are white. Fiona is green. Harry is white. Fiona is young. All cold, quiet people are green.[SEP]Harry is cold.
--------------------
