# Task
Fine-tune a `legal-bert-small-uncased` model for binary sentence classification on the dataset located at "PLUE/PLUE-main/data/privacyqa". The data is tab-separated and preprocessed, with separate train and test files. Hold out 10% of the training data as a development set.

## Load the data

### Subtask:
Load the tab-separated training and testing data from the specified paths.


In [None]:
from datasets import load_dataset, Features, Value

# Both files share their first seven columns; only the name of the label column differs.
shared_columns = ['Folder', 'DocID', 'QueryID', 'SentID', 'Split', 'Query', 'Segment']

# Training data: tab-separated, first row is the header, label column named 'Label'
train_dataset = load_dataset(
    "csv",
    data_files="PLUE/PLUE-main/data/privacyqa/policy_train_data.csv",
    sep='\t',
    header=0,
    column_names=shared_columns + ['Label'],
)

# Testing data: same layout, but the label column is named 'Any_Relevant'
test_dataset = load_dataset(
    "csv",
    data_files="PLUE/PLUE-main/data/privacyqa/policy_test_data.csv",
    sep='\t',
    header=0,
    column_names=shared_columns + ['Any_Relevant'],
)


# Turn the textual training label into an integer class id
def convert_label_to_binary_train(example):
    """Replace ``example['Label']`` with its integer class id.

    'Relevant' becomes 1, 'Irrelevant' becomes 0 and every other value becomes -1.
    The example is modified in place and also returned.
    """
    label_ids = {'Relevant': 1, 'Irrelevant': 0}
    raw_label = example['Label']
    # Only strings can match a known label; anything else is an unexpected value
    example['Label'] = label_ids.get(raw_label, -1) if isinstance(raw_label, str) else -1
    return example

# Turn the textual testing label into an integer class id
def convert_label_to_binary_test(example):
    """Replace ``example['Any_Relevant']`` with its integer class id.

    'Relevant' becomes 1, 'Irrelevant' becomes 0 and every other value becomes -1.
    The example is modified in place and also returned.
    """
    label_ids = {'Relevant': 1, 'Irrelevant': 0}
    raw_label = example['Any_Relevant']
    # Only strings can match a known label; anything else is an unexpected value
    example['Any_Relevant'] = label_ids.get(raw_label, -1) if isinstance(raw_label, str) else -1
    return example


# Replace the textual labels of both datasets with integer class ids
train_dataset = train_dataset.map(convert_label_to_binary_train)
test_dataset = test_dataset.map(convert_label_to_binary_test)

# Fix the column types: label columns become int64 ...
train_dataset = train_dataset.cast_column('Label', Value('int64'))
test_dataset = test_dataset.cast_column('Any_Relevant', Value('int64'))

# ... and the two text columns become strings so they can be tokenized
for text_column in ('Query', 'Segment'):
    train_dataset = train_dataset.cast_column(text_column, Value('string'))
    test_dataset = test_dataset.cast_column(text_column, Value('string'))


# Hold out 10% of the training data as a development set (fixed seed for a repeatable split)
train_test_split = train_dataset['train'].train_test_split(test_size=0.1, seed=42)
train_dataset_split, dev_dataset = train_test_split['train'], train_test_split['test']


# Show the first five examples of the reduced training split
print("Training data split:")
display(train_dataset_split[:5])

# Show the first five examples of the development set
print("\nDevelopment data:")
display(dev_dataset[:5])

Training data split:


{'Folder': ['../../Dataset/Train/com.intuit.quickbooks',
  '../../Dataset/Train/com.wordwebsoftware.android.wordweb',
  '../../Dataset/Train/com.intuit.quickbooks',
  '../../Dataset/Train/com.cleanmaster.mguard',
  '../../Dataset/Train/com.reddit.frontpage'],
 'DocID': ['QuickBooks Accounting: Invoicing & Expenses _9',
  'Dictionary _22',
  'QuickBooks Accounting: Invoicing & Expenses _9',
  'Clean Master_2',
  'Reddit: Top News, Trending Memes & Crypto Updates _13'],
 'QueryID': ['QuickBooks Accounting: Invoicing & Expenses _9_46',
  'Dictionary _22_16',
  'QuickBooks Accounting: Invoicing & Expenses _9_30',
  'Clean Master_2_13',
  'Reddit: Top News, Trending Memes & Crypto Updates _13_6'],
 'SentID': ['QuickBooks Accounting: Invoicing & Expenses _9_46_151',
  'Dictionary _22_16_25',
  'QuickBooks Accounting: Invoicing & Expenses _9_30_205',
  'Clean Master_2_13_106',
  'Reddit: Top News, Trending Memes & Crypto Updates _13_6_174'],
 'Split': ['train', 'train', 'train', 'train', 'tra


Development data:


{'Folder': ['../../Dataset/Train/com.cleanmaster.mguard',
  '../../Dataset/Train/com.maxmpz.audioplayer',
  '../../Dataset/Train/org.brilliant.android',
  '../../Dataset/Train/com.usps',
  '../../Dataset/Train/com.goodreads'],
 'DocID': ['Clean Master_2',
  'Poweramp Music Player (Trial) _10',
  'Brilliant _26',
  'USPS MOBILE _20',
  'Goodreads _6'],
 'QueryID': ['Clean Master_2_12',
  'Poweramp Music Player (Trial) _10_11',
  'Brilliant _26_20',
  'USPS MOBILE _20_29',
  'Goodreads _6_16'],
 'SentID': ['Clean Master_2_12_158',
  'Poweramp Music Player (Trial) _10_11_56',
  'Brilliant _26_20_87',
  'USPS MOBILE _20_29_61',
  'Goodreads _6_16_40'],
 'Split': ['train', 'train', 'train', 'train', 'train'],
 'Query': ['will the app send or keep information about me?',
  'will my real name be required?',
  'is this site safe for kids?',
  'is my location tracked and recorded by the mobile app?',
  'can i delete my reading history?'],
 'Segment': ['Where denial of access is required or auth

In [None]:
from transformers import AutoTokenizer
from datasets import Features, Value

# Load the tokenizer for legal-bert-small-uncased
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-small-uncased")

# Tokenize the datasets
def tokenize_function(examples):
    """Tokenize a batch of (Query, Segment) pairs and attach the label column as ``labels``.

    Each query is passed to the tokenizer as the first text and its segment as the
    second text, with ``truncation=True`` and ``padding="max_length"``. ``None`` texts
    are replaced by empty strings and every other value is converted with ``str``.
    The labels are copied from 'Label' when the batch has that column, otherwise from
    'Any_Relevant'; when neither is present no ``labels`` entry is added.
    """
    def as_text(value):
        return "" if value is None else str(value)

    first_texts = list(map(as_text, examples["Query"]))
    second_texts = list(map(as_text, examples["Segment"]))
    encoded = tokenizer(first_texts, second_texts, truncation=True, padding="max_length")

    # 'Label' (train/dev) takes precedence over 'Any_Relevant' (test)
    for label_column in ('Label', 'Any_Relevant'):
        if label_column in examples:
            encoded['labels'] = examples[label_column]
            break
    return encoded


# train_dataset_split, dev_dataset and test_dataset come from the data-loading cell.
# The tokenizer is applied batch-wise to each of them.
tokenized_train_dataset = train_dataset_split.map(tokenize_function, batched=True)
tokenized_dev_dataset = dev_dataset.map(tokenize_function, batched=True)
# test_dataset is used exactly as returned by load_dataset (no split was selected from it)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)


# Persist every tokenized dataset in its own directory
for tokenized_split, save_dir in (
    (tokenized_train_dataset, "tokenized_train_dataset_privacyqa"),
    (tokenized_dev_dataset, "tokenized_dev_dataset_privacyqa"),
    (tokenized_test_dataset, "tokenized_test_dataset_privacyqa"),
):
    tokenized_split.save_to_disk(save_dir)

print("Tokenized datasets saved.")

Saving the dataset (0/2 shards):   0%|          | 0/166680 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18520 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/62150 [00:00<?, ? examples/s]

Tokenized datasets saved.


In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Pre-trained checkpoint with a two-class sequence classification head
checkpoint_name = "nlpaueb/legal-bert-small-uncased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)

# Settings shared by the baseline training runs
training_config = {
    "output_dir": "./results_privacyqa",    # where checkpoints are written
    "num_train_epochs": 3,                  # total number of training epochs
    "per_device_train_batch_size": 16,      # training batch size per device
    "per_device_eval_batch_size": 64,       # evaluation batch size per device
    "warmup_steps": 500,                    # warmup steps of the learning rate scheduler
    "weight_decay": 0.01,                   # strength of weight decay
    "logging_dir": "./logs_privacyqa",      # where logs are stored
    "logging_steps": 10,                    # log every 10 steps
    "eval_strategy": "epoch",               # evaluate after every epoch
    "save_strategy": "epoch",               # save a checkpoint after every epoch
    "load_best_model_at_end": True,         # reload the best checkpoint when training ends
    "metric_for_best_model": "f1",          # "best" is decided by the F1 score
    "report_to": "none",                    # no reporting to external services
}
training_args = TrainingArguments(**training_config)

# The Trainer itself is created in a later cell, together with the datasets and metrics.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

# Define the evaluation metrics
def compute_metrics(p):
    """Compute binary precision, recall and F1 for one evaluation pass.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds one score per class
            along axis 1 and is reduced to class ids with ``argmax``.

    Returns:
        Dict with the keys ``'precision'``, ``'recall'`` and ``'f1'``.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    # zero_division=0 reports 0.0 for an undefined score (e.g. no positive prediction),
    # which is the value the default setting returns too, but without a warning per call.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary', zero_division=0
    )
    return {'precision': precision, 'recall': recall, 'f1': f1}

# Everything the Trainer needs for the first run
trainer_kwargs = dict(
    model=model,                            # model instantiated in the previous cell
    args=training_args,                     # training arguments defined in the previous cell
    train_dataset=tokenized_train_dataset,  # data to train on
    eval_dataset=tokenized_dev_dataset,     # data evaluated after every epoch
    compute_metrics=compute_metrics,        # precision / recall / F1 callback
)
trainer = Trainer(**trainer_kwargs)

# Run the training; as the last expression of the cell its return value is displayed
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1228,0.144054,0.765625,0.132972,0.22659
2,0.14,0.119501,0.660194,0.276798,0.390057
3,0.0538,0.120362,0.698925,0.352782,0.468891


TrainOutput(global_step=31254, training_loss=0.13055259338667977, metrics={'train_runtime': 2049.518, 'train_samples_per_second': 243.979, 'train_steps_per_second': 15.249, 'total_flos': 2.94613160767488e+16, 'train_loss': 0.13055259338667977, 'epoch': 3.0})

## Evaluate on the development set

### Subtask:
Use the trained model to evaluate performance on the `tokenized_dev_dataset` using the defined `compute_metrics`.


In [None]:
# Score the trained model on the development set
eval_results_run1 = trainer.evaluate(eval_dataset=tokenized_dev_dataset)

# Heading and metrics dictionary on two consecutive lines
print("Evaluation results for run 1:", eval_results_run1, sep="\n")

Evaluation results for run 1:
{'eval_loss': 0.12036244571208954, 'eval_precision': 0.6989247311827957, 'eval_recall': 0.35278154681139756, 'eval_f1': 0.4688908926961226, 'eval_runtime': 38.3379, 'eval_samples_per_second': 483.073, 'eval_steps_per_second': 7.564, 'epoch': 3.0}


## Store evaluation results

### Subtask:
Store the evaluation metrics (Precision, Recall, F1) from the first run.


**Reasoning**:
Extract the required evaluation metrics from the `eval_results_run1` dictionary and store them in a list.



In [None]:
# Keep only precision, recall and F1 of the first run, dropping the 'eval_' prefix
run1_metrics = {
    metric_name: eval_results_run1[f'eval_{metric_name}']
    for metric_name in ('precision', 'recall', 'f1')
}

# Start the list that collects the metrics of every run
all_runs_metrics = [run1_metrics]

print("Metrics from Run 1 stored:", all_runs_metrics, sep="\n")

Metrics from Run 1 stored:
[{'precision': 0.6989247311827957, 'recall': 0.35278154681139756, 'f1': 0.4688908926961226}]


## Repeat training and evaluation (run 2)

### Subtask:
Re-initialize the model and trainer (if necessary, depending on how the trainer handles multiple calls to `train`), train the model again, and evaluate on the development set.


**Reasoning**:
Re-initialize the model and trainer, train the model for the second run, evaluate on the development set, and store the results.



In [None]:
import copy

from transformers import set_seed

# training_args is shared by all runs and sets no seed of its own, so reusing it unchanged
# gives every run the same seed. Work on a copy with a run-specific seed so that the
# repeated runs are independent and their average is meaningful.
# (43 is assumed to differ from the seed of the first run, i.e. the library default.)
run2_seed = 43
run2_training_args = copy.copy(training_args)
run2_training_args.seed = run2_seed

# Seed before loading the model so the newly initialized classifier head follows the run seed
set_seed(run2_seed)

# Re-initialize the pre-trained model for binary sequence classification
model = AutoModelForSequenceClassification.from_pretrained("nlpaueb/legal-bert-small-uncased", num_labels=2)

# Re-initialize the Trainer with the run-specific arguments
trainer = Trainer(
    model=model,                           # freshly loaded model
    args=run2_training_args,               # shared settings with this run's seed
    train_dataset=tokenized_train_dataset, # training dataset
    eval_dataset=tokenized_dev_dataset,    # evaluation dataset
    compute_metrics=compute_metrics        # the callback that computes metrics of interest
)

# Start training for the second run
print("Starting training run 2...")
trainer.train()

# Evaluate the model on the development set for the second run
eval_results_run2 = trainer.evaluate(eval_dataset=tokenized_dev_dataset)

# Print the evaluation results for the second run
print("\nEvaluation results for run 2:")
print(eval_results_run2)

# Store the metrics from the second run
run2_metrics = {
    'precision': eval_results_run2['eval_precision'],
    'recall': eval_results_run2['eval_recall'],
    'f1': eval_results_run2['eval_f1']
}
all_runs_metrics.append(run2_metrics)

print("\nMetrics from Run 2 stored:")
print(all_runs_metrics)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training run 2...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1211,0.133627,0.670683,0.226594,0.338742
2,0.1272,0.11441,0.658711,0.374491,0.477509
3,0.0399,0.118563,0.713604,0.405699,0.517301



Evaluation results for run 2:
{'eval_loss': 0.1185634508728981, 'eval_precision': 0.7136038186157518, 'eval_recall': 0.4056987788331072, 'eval_f1': 0.5173010380622838, 'eval_runtime': 38.9255, 'eval_samples_per_second': 475.781, 'eval_steps_per_second': 7.45, 'epoch': 3.0}

Metrics from Run 2 stored:
[{'precision': 0.6989247311827957, 'recall': 0.35278154681139756, 'f1': 0.4688908926961226}, {'precision': 0.7136038186157518, 'recall': 0.4056987788331072, 'f1': 0.5173010380622838}]


**Reasoning**:
Re-initialize the model and trainer, train the model for the third run, evaluate on the development set, and store the results.



In [None]:
import copy

from transformers import set_seed

# Repeat training and evaluation for the third run
print("Starting training run 3...")

# training_args is shared by all runs and sets no seed of its own, so reusing it unchanged
# gives every run the same seed. Work on a copy with a run-specific seed so that the
# repeated runs are independent and their average is meaningful.
# (44 is assumed to differ from the seeds of the earlier runs.)
run3_seed = 44
run3_training_args = copy.copy(training_args)
run3_training_args.seed = run3_seed

# Seed before loading the model so the newly initialized classifier head follows the run seed
set_seed(run3_seed)

# Re-initialize the pre-trained model for binary sequence classification
model = AutoModelForSequenceClassification.from_pretrained("nlpaueb/legal-bert-small-uncased", num_labels=2)

# Re-initialize the Trainer with the run-specific arguments and the same datasets
trainer = Trainer(
    model=model,                           # freshly loaded model
    args=run3_training_args,               # shared settings with this run's seed
    train_dataset=tokenized_train_dataset, # training dataset
    eval_dataset=tokenized_dev_dataset,    # evaluation dataset
    compute_metrics=compute_metrics        # the callback that computes metrics of interest
)

# Start training for the third run
trainer.train()

# Evaluate the model on the development set for the third run
eval_results_run3 = trainer.evaluate(eval_dataset=tokenized_dev_dataset)

# Print the evaluation results for the third run
print("\nEvaluation results for run 3:")
print(eval_results_run3)

# Store the metrics from the third run
run3_metrics = {
    'precision': eval_results_run3['eval_precision'],
    'recall': eval_results_run3['eval_recall'],
    'f1': eval_results_run3['eval_f1']
}
all_runs_metrics.append(run3_metrics)

print("\nMetrics from Run 3 stored:")
print(all_runs_metrics)

Starting training run 3...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1211,0.133627,0.670683,0.226594,0.338742
2,0.1272,0.11441,0.658711,0.374491,0.477509
3,0.0403,0.118325,0.720195,0.401628,0.515679



Evaluation results for run 3:
{'eval_loss': 0.11832477152347565, 'eval_precision': 0.7201946472019465, 'eval_recall': 0.4016282225237449, 'eval_f1': 0.5156794425087108, 'eval_runtime': 39.0024, 'eval_samples_per_second': 474.843, 'eval_steps_per_second': 7.435, 'epoch': 3.0}

Metrics from Run 3 stored:
[{'precision': 0.6989247311827957, 'recall': 0.35278154681139756, 'f1': 0.4688908926961226}, {'precision': 0.7136038186157518, 'recall': 0.4056987788331072, 'f1': 0.5173010380622838}, {'precision': 0.7201946472019465, 'recall': 0.4016282225237449, 'f1': 0.5156794425087108}]


In [None]:
import pandas as pd

# One row per run, one column per metric
metrics_df = pd.DataFrame(data=all_runs_metrics)

# Column-wise mean, i.e. the average of each metric over the runs
average_metrics = metrics_df.mean(axis="index")

# Show the averaged metrics
print("\nAverage Evaluation Metrics Across 3 Runs:")
display(average_metrics)


Average Evaluation Metrics Across 3 Runs:


precision    0.710908
recall       0.386703
f1           0.500624
dtype: float64

## Hyperparameter tuning

### Subtask:
Define a search space for the identified hyperparameters.


In [None]:
# Candidate values for every hyperparameter that will be tuned
search_space = dict(
    learning_rate=[5e-5, 3e-5, 2e-5],
    per_device_train_batch_size=[8, 16, 32],
    num_train_epochs=[2, 3, 5, 10],
    weight_decay=[0.0, 0.01],
)

# Echo the search space so the cell output documents it
print("Hyperparameter search space:", search_space, sep="\n")

Hyperparameter search space:
{'learning_rate': [5e-05, 3e-05, 2e-05], 'per_device_train_batch_size': [8, 16, 32], 'num_train_epochs': [2, 3, 5, 10], 'weight_decay': [0.0, 0.01]}


## Set up the tuning process

### Subtask:
Implement the hyperparameter tuning loop using Random Search. This involves iterating through different combinations of hyperparameters, training the model, evaluating on the development set, and storing the results.


In [None]:
import itertools
import pandas as pd
import random # Import random for sampling
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

# Define the evaluation metrics (re-defining for clarity in this cell)
def compute_metrics(p):
    """Compute binary precision, recall and F1 for one evaluation pass.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds one score per class
            along axis 1 and is reduced to class ids with ``argmax``.

    Returns:
        Dict with the keys ``'precision'``, ``'recall'`` and ``'f1'``.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    # zero_division=0 reports 0.0 for an undefined score (e.g. no positive prediction),
    # which is the value the default setting returns too, but without a warning per call.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary', zero_division=0
    )
    return {'precision': precision, 'recall': recall, 'f1': f1}

# Search space used by this tuning cell.
# NOTE: this assignment always runs, so it overrides any `search_space` defined in an
# earlier cell.
search_space = {
    'learning_rate': [5e-5, 3e-5, 2e-5],
    'per_device_train_batch_size': [8, 16],
    'num_train_epochs': [1, 2, 5],
    'weight_decay': [0.0, 0.01],
}

# Define the number of trials for Random Search
num_random_trials = 10 # You can adjust this number

# Seed of the private RNG that picks the trials, so re-running the cell selects the same
# combinations (and keeps trial numbers and their output directories stable)
search_seed = 42

# Store results of each trial
tuning_results = []

# Get all possible combinations (for sampling)
all_combinations = list(itertools.product(*search_space.values()))

# Ensure the number of random trials does not exceed the total number of combinations
if num_random_trials > len(all_combinations):
    print(f"Warning: Number of random trials ({num_random_trials}) is greater than the total number of combinations ({len(all_combinations)}). Running all combinations instead.")
    random_combinations = all_combinations
    num_random_trials = len(all_combinations)
else:
    # Sample without replacement from a dedicated generator instead of the global one,
    # whose state depends on whatever ran before this cell
    random_combinations = random.Random(search_seed).sample(all_combinations, num_random_trials)


print(f"Starting manual hyperparameter tuning with Random Search ({num_random_trials} trials).")

# Random Search: train and evaluate one model per sampled hyperparameter combination
for i, combo in enumerate(random_combinations):
    trial_number = i + 1  # trials are reported and stored 1-based
    current_hyperparameters = dict(zip(search_space, combo))
    print(f"\n--- Running Trial {trial_number}/{num_random_trials} with hyperparameters: {current_hyperparameters} ---")

    # Every trial starts from the untouched pre-trained checkpoint
    model = AutoModelForSequenceClassification.from_pretrained(
        "nlpaueb/legal-bert-small-uncased",
        num_labels=2,
    )

    # Tuned values come from the sampled combination; everything else is fixed.
    # Output and logging directories are unique per trial.
    current_training_args = TrainingArguments(
        output_dir=f"./results_privacyqa_manual_tune_{trial_number}",
        logging_dir=f"./logs_privacyqa_manual_tune_{trial_number}",
        learning_rate=current_hyperparameters['learning_rate'],
        per_device_train_batch_size=current_hyperparameters['per_device_train_batch_size'],
        num_train_epochs=current_hyperparameters['num_train_epochs'],
        weight_decay=current_hyperparameters['weight_decay'],
        per_device_eval_batch_size=64,
        warmup_steps=500,
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="none",
    )

    # Train on the tokenized training split, validate on the tokenized development split
    trainer = Trainer(
        model=model,
        args=current_training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_dev_dataset,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # Final score of this trial on the development set
    eval_results = trainer.evaluate(eval_dataset=tokenized_dev_dataset)

    # Record the configuration together with its metrics (loss kept for completeness)
    trial_results = {'trial': trial_number, 'hyperparameters': current_hyperparameters}
    trial_results.update(
        (metric_key, eval_results[metric_key])
        for metric_key in ('eval_precision', 'eval_recall', 'eval_f1', 'eval_loss')
    )
    tuning_results.append(trial_results)

    # Progress report for the finished trial
    print(f"Trial {trial_number} completed. Results: {trial_results}")

# One row per trial
tuning_results_df = pd.DataFrame(tuning_results)

# The best trial is the row with the highest F1 score
best_row_label = tuning_results_df['eval_f1'].idxmax()
best_trial = tuning_results_df.loc[best_row_label]

print("\n--- Hyperparameter Tuning Completed ---\n\nTuning Results:")
display(tuning_results_df)

print("\nBest Trial:")
display(best_trial)

print("\nBest Hyperparameters:", best_trial['hyperparameters'], sep="\n")

Starting manual hyperparameter tuning with Random Search (10 trials).

--- Running Trial 1/10 with hyperparameters: {'learning_rate': 5e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 1, 'weight_decay': 0.01} ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1108,0.130136,0.662338,0.207598,0.316116


Trial 1 completed. Results: {'trial': 1, 'hyperparameters': {'learning_rate': 5e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 1, 'weight_decay': 0.01}, 'eval_precision': 0.6623376623376623, 'eval_recall': 0.20759837177747625, 'eval_f1': 0.31611570247933884, 'eval_loss': 0.130136176943779}

--- Running Trial 2/10 with hyperparameters: {'learning_rate': 5e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 1, 'weight_decay': 0.01} ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1219,0.147992,0.64557,0.207598,0.314168


Trial 2 completed. Results: {'trial': 2, 'hyperparameters': {'learning_rate': 5e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 1, 'weight_decay': 0.01}, 'eval_precision': 0.6455696202531646, 'eval_recall': 0.20759837177747625, 'eval_f1': 0.3141683778234086, 'eval_loss': 0.14799249172210693}

--- Running Trial 3/10 with hyperparameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 5, 'weight_decay': 0.01} ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1722,0.132304,0.752137,0.238806,0.362513
2,0.1671,0.099934,0.802247,0.484396,0.604061
3,0.0782,0.100512,0.837476,0.594301,0.695238
4,0.0889,0.093233,0.858696,0.643148,0.735454
5,0.0004,0.091121,0.871199,0.660787,0.751543


Trial 3 completed. Results: {'trial': 3, 'hyperparameters': {'learning_rate': 3e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 5, 'weight_decay': 0.01}, 'eval_precision': 0.8711985688729875, 'eval_recall': 0.66078697421981, 'eval_f1': 0.7515432098765432, 'eval_loss': 0.09112071245908737}

--- Running Trial 4/10 with hyperparameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.01} ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1301,0.146278,0.744898,0.1981,0.312969
2,0.1781,0.122837,0.778824,0.449118,0.569707


Trial 4 completed. Results: {'trial': 4, 'hyperparameters': {'learning_rate': 3e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.01}, 'eval_precision': 0.7788235294117647, 'eval_recall': 0.4491180461329715, 'eval_f1': 0.5697074010327022, 'eval_loss': 0.12283703684806824}

--- Running Trial 5/10 with hyperparameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.0} ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1861,0.148603,0.680297,0.248304,0.363817
2,0.1649,0.135588,0.739011,0.364993,0.488647


Trial 5 completed. Results: {'trial': 5, 'hyperparameters': {'learning_rate': 3e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.0}, 'eval_precision': 0.739010989010989, 'eval_recall': 0.3649932157394844, 'eval_f1': 0.48864668483197093, 'eval_loss': 0.13558758795261383}

--- Running Trial 6/10 with hyperparameters: {'learning_rate': 5e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 5, 'weight_decay': 0.0} ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1874,0.190465,0.0,0.0,0.0
2,0.1748,0.181913,0.0,0.0,0.0
3,0.0134,0.186567,0.0,0.0,0.0
4,0.1644,0.181283,0.0,0.0,0.0
5,0.1768,0.184149,0.0,0.0,0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Trial 6 completed. Results: {'trial': 6, 'hyperparameters': {'learning_rate': 5e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 5, 'weight_decay': 0.0}, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_loss': 0.19046536087989807}

--- Running Trial 7/10 with hyperparameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 5, 'weight_decay': 0.01} ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1147,0.115124,0.717087,0.347354,0.468007
2,0.0992,0.087391,0.750896,0.568521,0.647104
3,0.0305,0.081926,0.870229,0.618725,0.723236
4,0.0176,0.068838,0.911985,0.660787,0.766326
5,0.0992,0.072119,0.896853,0.696065,0.783804


Trial 7 completed. Results: {'trial': 7, 'hyperparameters': {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 5, 'weight_decay': 0.01}, 'eval_precision': 0.8968531468531469, 'eval_recall': 0.6960651289009498, 'eval_f1': 0.7838044308632544, 'eval_loss': 0.07211901992559433}

--- Running Trial 8/10 with hyperparameters: {'learning_rate': 5e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.01} ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1668,0.185976,0.555556,0.027137,0.051746
2,0.1674,0.15671,0.704348,0.109905,0.190141


Trial 8 completed. Results: {'trial': 8, 'hyperparameters': {'learning_rate': 5e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.01}, 'eval_precision': 0.7043478260869566, 'eval_recall': 0.10990502035278155, 'eval_f1': 0.19014084507042253, 'eval_loss': 0.15671034157276154}

--- Running Trial 9/10 with hyperparameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2, 'weight_decay': 0.01} ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.0946,0.127594,0.722807,0.279512,0.403131
2,0.0867,0.107883,0.763889,0.447761,0.564585


Trial 9 completed. Results: {'trial': 9, 'hyperparameters': {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2, 'weight_decay': 0.01}, 'eval_precision': 0.7638888888888888, 'eval_recall': 0.44776119402985076, 'eval_f1': 0.564585115483319, 'eval_loss': 0.10788282006978989}

--- Running Trial 10/10 with hyperparameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 5, 'weight_decay': 0.01} ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1912,0.143259,0.744681,0.237449,0.360082
2,0.1697,0.108082,0.767967,0.507463,0.611111
3,0.0022,0.103372,0.888651,0.563094,0.689369
4,0.0013,0.095156,0.866792,0.626866,0.727559
5,0.0011,0.093789,0.879406,0.643148,0.742947


Trial 10 completed. Results: {'trial': 10, 'hyperparameters': {'learning_rate': 2e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 5, 'weight_decay': 0.01}, 'eval_precision': 0.8794063079777366, 'eval_recall': 0.6431478968792401, 'eval_f1': 0.7429467084639498, 'eval_loss': 0.09378870576620102}

--- Hyperparameter Tuning Completed ---

Tuning Results:


Unnamed: 0,trial,hyperparameters,eval_precision,eval_recall,eval_f1,eval_loss
0,1,"{'learning_rate': 5e-05, 'per_device_train_bat...",0.662338,0.207598,0.316116,0.130136
1,2,"{'learning_rate': 5e-05, 'per_device_train_bat...",0.64557,0.207598,0.314168,0.147992
2,3,"{'learning_rate': 3e-05, 'per_device_train_bat...",0.871199,0.660787,0.751543,0.091121
3,4,"{'learning_rate': 3e-05, 'per_device_train_bat...",0.778824,0.449118,0.569707,0.122837
4,5,"{'learning_rate': 3e-05, 'per_device_train_bat...",0.739011,0.364993,0.488647,0.135588
5,6,"{'learning_rate': 5e-05, 'per_device_train_bat...",0.0,0.0,0.0,0.190465
6,7,"{'learning_rate': 3e-05, 'per_device_train_bat...",0.896853,0.696065,0.783804,0.072119
7,8,"{'learning_rate': 5e-05, 'per_device_train_bat...",0.704348,0.109905,0.190141,0.15671
8,9,"{'learning_rate': 3e-05, 'per_device_train_bat...",0.763889,0.447761,0.564585,0.107883
9,10,"{'learning_rate': 2e-05, 'per_device_train_bat...",0.879406,0.643148,0.742947,0.093789



Best Trial:


trial                                                              7
hyperparameters    {'learning_rate': 3e-05, 'per_device_train_bat...
eval_precision                                              0.896853
eval_recall                                                 0.696065
eval_f1                                                     0.783804
eval_loss                                                   0.072119
Name: 6, dtype: object


Best Hyperparameters:
{'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 5, 'weight_decay': 0.01}


In [None]:
import os
import glob

# Inspect the output directory of the best tuning trial: list its checkpoint
# folders and the files stored in the most recent one.

# Assuming best_trial_number is available from previous cells
# Replace with the actual best trial number if different
best_trial_number = 7

# Define the expected directory path for the best model run
best_model_run_dir = f"./results_privacyqa_manual_tune_{best_trial_number}"


def checkpoint_step(checkpoint_path):
    """Return the training step encoded in a ``checkpoint-<step>`` path.

    Returns -1 when the text after the last ``-`` is not a number, so such
    entries never outrank a real checkpoint when taking the maximum.
    """
    suffix = checkpoint_path.rstrip("/\\").rpartition("-")[2]
    return int(suffix) if suffix.isdecimal() else -1


print(f"Checking for directory: {best_model_run_dir}")

# Check if the directory exists
if os.path.exists(best_model_run_dir):
    print(f"Directory exists: {best_model_run_dir}")

    # Check for checkpoint directories within the run directory, ordered by
    # training step so the listing is stable across platforms
    checkpoint_pattern = os.path.join(best_model_run_dir, "checkpoint-*")
    list_of_checkpoints = sorted(glob.glob(checkpoint_pattern), key=checkpoint_step)

    if list_of_checkpoints:
        print(f"\nFound checkpoint directories in {best_model_run_dir}:")
        for checkpoint_dir in list_of_checkpoints:
            print(checkpoint_dir)

        # The highest step is the latest checkpoint. File timestamps are not
        # used: os.path.getctime is the inode-change time on Unix and is reset
        # whenever the run directory is copied or unpacked.
        latest_checkpoint = list_of_checkpoints[-1]
        print(f"\nChecking contents of the latest checkpoint directory: {latest_checkpoint}")
        print("Files in latest checkpoint:")
        # Only list files in the top-level checkpoint directory for brevity
        _, _, top_level_files = next(os.walk(latest_checkpoint), (latest_checkpoint, [], []))
        for name in top_level_files:
            print(os.path.join(latest_checkpoint, name))

    else:
        print(f"\nNo checkpoint directories found in {best_model_run_dir}.")

else:
    print(f"Directory does not exist: {best_model_run_dir}")

Checking for directory: ./results_privacyqa_manual_tune_7
Directory exists: ./results_privacyqa_manual_tune_7

Found checkpoint directories in ./results_privacyqa_manual_tune_7:
./results_privacyqa_manual_tune_7\checkpoint-10418
./results_privacyqa_manual_tune_7\checkpoint-20836
./results_privacyqa_manual_tune_7\checkpoint-31254
./results_privacyqa_manual_tune_7\checkpoint-41672
./results_privacyqa_manual_tune_7\checkpoint-52090

Checking contents of the latest checkpoint directory: ./results_privacyqa_manual_tune_7\checkpoint-52090
Files in latest checkpoint:
./results_privacyqa_manual_tune_7\checkpoint-52090\config.json
./results_privacyqa_manual_tune_7\checkpoint-52090\model.safetensors
./results_privacyqa_manual_tune_7\checkpoint-52090\optimizer.pt
./results_privacyqa_manual_tune_7\checkpoint-52090\rng_state.pth
./results_privacyqa_manual_tune_7\checkpoint-52090\scheduler.pt
./results_privacyqa_manual_tune_7\checkpoint-52090\trainer_state.json
./results_privacyqa_manual_tune_7\chec

## Evaluate Best Model on Test Set

### Subtask:
Load the best performing model from the hyperparameter tuning checkpoint and evaluate it on the filtered test dataset using the GPU.

In [None]:
import torch
from transformers import AutoModelForSequenceClassification
import glob
import os

# Locate the last checkpoint written by the best tuning trial and load its
# weights into a freshly constructed sequence-classification model.

# Assuming best_trial_number is available from previous cells or determined
# based on your tuning results.
best_trial_number = 7  # Replace with the actual best trial number if different


def checkpoint_step(checkpoint_path):
    """Return the training step encoded in a ``checkpoint-<step>`` path.

    Returns -1 when the text after the last ``-`` is not a number, so such
    entries never outrank a real checkpoint when taking the maximum.
    """
    suffix = checkpoint_path.rstrip("/\\").rpartition("-")[2]
    return int(suffix) if suffix.isdecimal() else -1


# Find the actual checkpoint directory within the best trial's output directory.
# The checkpoint with the highest step number is taken as the latest one; file
# timestamps are avoided because os.path.getctime is the inode-change time on
# Unix and is reset when the run directory is copied or unpacked.
best_model_run_dir = f"./results_privacyqa_manual_tune_{best_trial_number}"
list_of_checkpoints = glob.glob(os.path.join(best_model_run_dir, "checkpoint-*"))
latest_checkpoint_dir = max(list_of_checkpoints, key=checkpoint_step) if list_of_checkpoints else None

# Define the path to the model weights file within the checkpoint directory.
# model.safetensors is preferred; pytorch_model.bin is the fallback.
model_weights_path = None
if latest_checkpoint_dir:
    for weights_file_name in ("model.safetensors", "pytorch_model.bin"):
        candidate_path = os.path.join(latest_checkpoint_dir, weights_file_name)
        if os.path.exists(candidate_path):
            model_weights_path = candidate_path
            print(f"Found model weights at: {model_weights_path}")
            break
    else:
        print(f"No model weights file found in {latest_checkpoint_dir}")

# Reset first so that a failed load cannot leave a model from an earlier
# execution of this cell bound to the name and silently used for evaluation.
model_reloaded = None

if model_weights_path:
    print("\n--- Reloading Model and Loading Checkpoint State ---")

    try:
        # Reload the base model architecture (the classifier head is randomly
        # initialised here and overwritten by the checkpoint weights below)
        candidate_model = AutoModelForSequenceClassification.from_pretrained("nlpaueb/legal-bert-small-uncased", num_labels=2)
        print("Base model reloaded.")

        # Load the state dictionary from the checkpoint
        if model_weights_path.endswith(".safetensors"):
            from safetensors.torch import load_file
            state_dict = load_file(model_weights_path)
        else:  # Assuming it's pytorch_model.bin
            state_dict = torch.load(model_weights_path, map_location='cpu')  # Load to CPU first

        # Load the state dictionary into the reloaded model
        candidate_model.load_state_dict(state_dict)
        print("Checkpoint state loaded into the reloaded model.")

        # Set the model to evaluation mode
        candidate_model.eval()

        # Publish the model only once every step above has succeeded.
        # model_reloaded can now be passed to the evaluation logic.
        model_reloaded = candidate_model

    except Exception as e:
        print(f"\nAn error occurred during model reloading or state loading: {e}")
        print("Please check the error message and the checkpoint file.")

else:
    print(f"Could not find a valid model weights file in the latest checkpoint directory: {latest_checkpoint_dir}")

Found model weights at: ./results_privacyqa_manual_tune_7\checkpoint-52090\model.safetensors

--- Reloading Model and Loading Checkpoint State ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Base model reloaded.
Checkpoint state loaded into the reloaded model.


## Prepare the test dataset

### Subtask:
Ensure the test dataset is in the correct format and filter out any examples with invalid labels (i.e., label -1, which marks values that were neither "Relevant" nor "Irrelevant").


In [None]:
# Preview the raw test set, then keep only the rows whose label is 0 or 1.
# `test_dataset` is expected to exist already (created in an earlier cell).

print("Original Test Dataset Info:")
print(test_dataset)

# The data may arrive either as a DatasetDict (rows under its 'train' split)
# or as a plain Dataset; pick whichever object actually holds the rows.
has_train_split = isinstance(test_dataset, dict) and 'train' in test_dataset
preview_source = test_dataset['train'] if has_train_split else test_dataset

print("\nOriginal Test Dataset Features:")
print(preview_source.features)
print("\nFirst 5 examples of Original Test Dataset:")
display(preview_source[:5])


# Work out which column carries the label. 'labels' takes precedence over
# 'Any_Relevant', and the 'train' split is consulted before the top-level object.
candidate_label_columns = ('labels', 'Any_Relevant')
label_column = None

if has_train_split:
    train_split = test_dataset['train']
    label_column = next(
        (name for name in candidate_label_columns if name in train_split.features),
        None,
    )
    if label_column is not None:
        test_dataset_to_filter = train_split

if label_column is None:
    label_column = next(
        (name for name in candidate_label_columns if name in test_dataset.features),
        None,
    )
    if label_column is None:
        raise ValueError("Neither 'labels' nor 'Any_Relevant' column found in test_dataset.")
    test_dataset_to_filter = test_dataset


print(f"\nFiltering test dataset using column: {label_column}")

# Drop every row whose label is not one of the two binary classes
filtered_test_dataset = test_dataset_to_filter.filter(
    lambda row: row[label_column] in (0, 1)
)

# Show the structure and a small sample of the filtered result
print("\nFiltered Test Dataset Info:")
print(filtered_test_dataset)
print("\nFiltered Test Dataset Features:")
print(filtered_test_dataset.features)
print("\nFirst 5 examples of Filtered Test Dataset:")
display(filtered_test_dataset[:5])

Original Test Dataset Info:
DatasetDict({
    train: Dataset({
        features: ['Folder', 'DocID', 'QueryID', 'SentID', 'Split', 'Query', 'Segment', 'Any_Relevant', '__index_level_0__', '__index_level_1__', '__index_level_2__', '__index_level_3__', '__index_level_4__', '__index_level_5__'],
        num_rows: 62150
    })
})

Original Test Dataset Features:
{'Folder': Value('string'), 'DocID': Value('string'), 'QueryID': Value('string'), 'SentID': Value('string'), 'Split': Value('string'), 'Query': Value('string'), 'Segment': Value('string'), 'Any_Relevant': Value('int64'), '__index_level_0__': Value('string'), '__index_level_1__': Value('string'), '__index_level_2__': Value('string'), '__index_level_3__': Value('string'), '__index_level_4__': Value('string'), '__index_level_5__': Value('string')}

First 5 examples of Original Test Dataset:


{'Folder': ['  At Fiverr we care about your privacy.',
  'We do not sell or rent your personal information to third parties for their direct marketing purposes without your explicit consent.',
  'We do not disclose it to others except as disclosed in this Policy or required to provide you with the services of the Site and mobile applications, meaning - to allow you to buy, sell, share the information you want to share on the Site; to contribute on the forum; pay for products; post reviews and so on; or where we have a legal obligation to do so.',
  '  We collect information that you provide us or voluntarily share with other users, and also some general technical information that is automatically gathered by our systems, such as IP address, browser information and cookies to enable you to have a better user experience and a more personalized browsing experience.',
  '  We will not share information that you provide us in the process of the registration - including your contact informat


Filtering test dataset using column: Any_Relevant

Filtered Test Dataset Info:
Dataset({
    features: ['Folder', 'DocID', 'QueryID', 'SentID', 'Split', 'Query', 'Segment', 'Any_Relevant', '__index_level_0__', '__index_level_1__', '__index_level_2__', '__index_level_3__', '__index_level_4__', '__index_level_5__'],
    num_rows: 50180
})

Filtered Test Dataset Features:
{'Folder': Value('string'), 'DocID': Value('string'), 'QueryID': Value('string'), 'SentID': Value('string'), 'Split': Value('string'), 'Query': Value('string'), 'Segment': Value('string'), 'Any_Relevant': Value('int64'), '__index_level_0__': Value('string'), '__index_level_1__': Value('string'), '__index_level_2__': Value('string'), '__index_level_3__': Value('string'), '__index_level_4__': Value('string'), '__index_level_5__': Value('string')}

First 5 examples of Filtered Test Dataset:


{'Folder': ['This Privacy Policy informs you of your choices and our practices regarding any information you provide to us.',
  'Your continued use of our products and services constitutes your acceptance to this Privacy Policy and any updates.',
  'It is a big responsibility to try our best to protect your information and put you in control.',
  'We may from time to time revise or add specific instructions, policies and terms to this Privacy Policy.',
  'Whenever we make any changes to this Privacy Policy that are important for you to know about, we will notify you via Keep App or other means before the changes become effective.'],
 'DocID': ['Irrelevant',
  'Irrelevant',
  'Irrelevant',
  'Irrelevant',
  'Irrelevant'],
 'QueryID': ['Irrelevant',
  'Irrelevant',
  'Irrelevant',
  'Irrelevant',
  'Irrelevant'],
 'SentID': ['Irrelevant',
  'Irrelevant',
  'Irrelevant',
  'Irrelevant',
  'Irrelevant'],
 'Split': ['Irrelevant',
  'Irrelevant',
  'Irrelevant',
  'Irrelevant',
  'Irrelevant

## Downsampling


In [None]:
import random
from datasets import Dataset, concatenate_datasets
from collections import Counter

# Balance the training set by randomly undersampling the majority class
# (Label == 0) down to the size of the minority class (Label == 1).
# Assuming train_dataset_split is a Dataset object available from previous cells

# Seed for drawing the majority-class subsample. Without it the selected rows
# (and therefore the retrained model) would differ on every execution, even
# though the final shuffle below is already seeded.
DOWNSAMPLE_SEED = 42

print("Original training dataset size:", len(train_dataset_split))

# 1. Separate the training dataset into two subsets based on the 'Label' column
minority_class_dataset = train_dataset_split.filter(lambda example: example['Label'] == 1)
majority_class_dataset = train_dataset_split.filter(lambda example: example['Label'] == 0)

print("Minority class dataset size:", len(minority_class_dataset))
print("Majority class dataset size:", len(majority_class_dataset))

# 2. Determine the number of examples in the minority class
minority_class_size = len(minority_class_dataset)

# 3. Randomly sample the same number of examples as the minority class from the majority class subset
# Ensure we have enough examples in the majority class to sample from
if len(majority_class_dataset) < minority_class_size:
    print("Warning: Majority class size is less than minority class size. Using all majority class examples.")
    sampled_majority_class_dataset = majority_class_dataset
else:
    # Draw row indices from a private, seeded generator so the subsample is
    # reproducible and the global `random` state is left untouched.
    index_sampler = random.Random(DOWNSAMPLE_SEED)
    sampled_indices = index_sampler.sample(range(len(majority_class_dataset)), minority_class_size)
    sampled_majority_class_dataset = majority_class_dataset.select(sampled_indices)


print("Sampled majority class dataset size:", len(sampled_majority_class_dataset))

# 4. Concatenate the minority class subset and the sampled majority class subset using concatenate_datasets
if len(minority_class_dataset) > 0 or len(sampled_majority_class_dataset) > 0:
    downsampled_train_dataset = concatenate_datasets([minority_class_dataset, sampled_majority_class_dataset])
    print("Datasets concatenated.")
else:
    print("Warning: Both minority and sampled majority datasets are empty. Downsampled dataset is empty.")
    # Create an empty dataset that still has the original column names
    downsampled_train_dataset = Dataset.from_dict({col: [] for col in train_dataset_split.features})


# 5. Shuffle the downsampled dataset so the two classes are interleaved
if len(downsampled_train_dataset) > 0:
    downsampled_train_dataset = downsampled_train_dataset.shuffle(seed=42)  # Use a fixed seed for reproducibility
    print("Downsampled dataset shuffled.")
else:
    print("Cannot shuffle an empty dataset.")


# 6. Print the number of examples and class distribution in the downsampled dataset
print("\nDownsampled training dataset size:", len(downsampled_train_dataset))

if len(downsampled_train_dataset) > 0 and 'Label' in downsampled_train_dataset.features:
    label_counts = Counter(downsampled_train_dataset['Label'])
    print("Class distribution in downsampled dataset:", label_counts)
elif len(downsampled_train_dataset) > 0:
    print("Warning: 'Label' column not found in downsampled dataset. Cannot display class distribution.")
else:
    print("Cannot display class distribution for an empty dataset.")

Original training dataset size: 166680
Minority class dataset size: 6400
Majority class dataset size: 160280
Sampled majority class dataset size: 6400
Datasets concatenated.
Downsampled dataset shuffled.

Downsampled training dataset size: 12800
Class distribution in downsampled dataset: Counter({1: 6400, 0: 6400})


## Retrain model

### Subtask:
Re-initialize and train the `legal-bert-small-uncased` model on the downsampled training data.


In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Value

# Retrain legal-bert-small-uncased from its pre-trained weights on the balanced
# (downsampled) training set.
# Names expected from earlier cells: downsampled_train_dataset, tokenize_function,
# training_args, tokenized_dev_dataset and compute_metrics.

# Tokenize with the same function that was applied to the original datasets;
# that function is expected to produce the 'labels' column.
tokenized_downsampled_train_dataset = downsampled_train_dataset.map(tokenize_function, batched=True)

# Force the label column to int64 when present, since map() may not keep the dtype
if 'labels' not in tokenized_downsampled_train_dataset.features:
    print("Warning: 'labels' column not found in tokenized_downsampled_train_dataset.")
else:
    tokenized_downsampled_train_dataset = tokenized_downsampled_train_dataset.cast_column(
        'labels', Value('int64')
    )


# Keep only the model inputs; 'token_type_ids' is included when the tokenizer emitted it
required_columns = ['input_ids', 'attention_mask', 'labels']
if 'token_type_ids' in tokenized_downsampled_train_dataset.features:
    required_columns = required_columns + ['token_type_ids']

tokenized_downsampled_train_dataset = tokenized_downsampled_train_dataset.select_columns(required_columns)


print("Tokenized downsampled training dataset created with required columns.")
print("Tokenized downsampled training dataset features:", tokenized_downsampled_train_dataset.features)


# Start again from the pre-trained checkpoint so nothing learned in earlier runs carries over
model_downsampled = AutoModelForSequenceClassification.from_pretrained(
    "nlpaueb/legal-bert-small-uncased",
    num_labels=2,
)

# Training configuration for this run: 3 epochs, with batch sizes, warmup and
# weight decay copied from the earlier `training_args`.
downsampled_run_settings = {
    "output_dir": "./results_privacyqa_downsampled_epochs3",
    "num_train_epochs": 3,
    "per_device_train_batch_size": training_args.per_device_train_batch_size,
    "per_device_eval_batch_size": training_args.per_device_eval_batch_size,
    "warmup_steps": training_args.warmup_steps,
    "weight_decay": training_args.weight_decay,
    "logging_dir": "./logs_privacyqa_downsampled_epochs3",
    "logging_steps": 10,             # log every 10 steps
    "eval_strategy": "epoch",        # evaluate once per epoch
    "save_strategy": "epoch",        # checkpoint once per epoch
    "load_best_model_at_end": True,  # restore the best checkpoint when training ends
    "metric_for_best_model": "f1",   # "best" is judged by F1
    "report_to": "none",             # no external experiment trackers
}
training_args_downsampled = TrainingArguments(**downsampled_run_settings)


# Trainer for the fresh model; evaluation still uses the full development set
trainer_downsampled = Trainer(
    model=model_downsampled,
    args=training_args_downsampled,
    train_dataset=tokenized_downsampled_train_dataset,
    eval_dataset=tokenized_dev_dataset,
    compute_metrics=compute_metrics,
)

# Run the training loop on the downsampled data
print("\nStarting training on the tokenized downsampled dataset with 3 epochs...")
trainer_downsampled.train()

print("\nTraining on tokenized downsampled dataset complete.")

Map:   0%|          | 0/12800 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/12800 [00:00<?, ? examples/s]

Tokenized downsampled training dataset created with required columns.
Tokenized downsampled training dataset features: {'input_ids': List(Value('int32')), 'attention_mask': List(Value('int8')), 'labels': Value('int64'), 'token_type_ids': List(Value('int8'))}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training on the tokenized downsampled dataset with 3 epochs...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.4938,0.663243,0.090831,0.861601,0.164337
2,0.3326,0.457726,0.155457,0.856174,0.263136
3,0.263,0.515992,0.148515,0.87517,0.253937



--- Inside compute_metrics ---
Number of valid labels: 18520
Number of valid predictions: 18520
Valid Labels (unique values and counts): {np.int64(0): np.int64(17783), np.int64(1): np.int64(737)}
Valid Predictions (unique values and counts): {np.int64(0): np.int64(11529), np.int64(1): np.int64(6991)}
Computed Metrics: Precision=0.09083106851666428, Recall=0.8616010854816825, F1=0.16433747412008282
--- End compute_metrics ---

--- Inside compute_metrics ---
Number of valid labels: 18520
Number of valid predictions: 18520
Valid Labels (unique values and counts): {np.int64(0): np.int64(17783), np.int64(1): np.int64(737)}
Valid Predictions (unique values and counts): {np.int64(0): np.int64(14461), np.int64(1): np.int64(4059)}
Computed Metrics: Precision=0.1554570091155457, Recall=0.8561736770691994, F1=0.26313594662218515
--- End compute_metrics ---

--- Inside compute_metrics ---
Number of valid labels: 18520
Number of valid predictions: 18520
Valid Labels (unique values and counts): {np

## Evaluate on test set

### Subtask:
Evaluate the retrained model's performance on the filtered test dataset using Precision, Recall, and F1-score.


## Summary:

### Data Analysis Key Findings
- The original training dataset was highly imbalanced, with 160,280 examples in the majority class and only 6,400 in the minority class.
- After downsampling, the training set was balanced, with 6,400 examples in each class, for a total of 12,800 examples.
- Applying downsampling to the training data led to a dramatic increase in Recall on the test set, from 0.44 to 0.97.
- This increase in Recall came at the cost of Precision, which decreased from 0.17 to 0.08.
- Despite the gain in Recall, the overall F1-score dropped from 0.24 to 0.15, because the loss in Precision outweighed the improvement in Recall for the minority class.

### Insights or Next Steps
- Given the significant improvement in Recall, downsampling is a valuable technique for this dataset, especially if identifying all positive cases is a priority.
- To mitigate the drop in Precision, future work could explore other techniques for handling class imbalance, such as oversampling the minority class (e.g., using SMOTE) or using a weighted loss function during training, which might provide a better balance between Precision and Recall.


## Methodology

This section details the methodology employed for fine-tuning a legal domain-specific BERT model for binary sentence classification on the PrivacyQA dataset. The overall approach involved data preparation, model selection and configuration, initial training and evaluation, hyperparameter tuning, addressing class imbalance through downsampling, and a final evaluation of the retrained model.

### Data Preparation

The PrivacyQA dataset was utilized, consisting of tab-separated training and testing data. The data includes columns such as `Query`, `Segment`, and a binary `Label` indicating the relevance of a segment to a query.

1.  **Data Loading and Initial Processing:** The training and testing data were loaded into suitable data structures. Categorical labels ("Relevant", "Irrelevant") were converted into a binary numerical format (1 and 0, respectively). Examples with a label of -1 were identified and excluded from evaluation where necessary.
2.  **Development Set Creation:** A 10% portion of the original training data was held out to serve as a development set for evaluating model performance during training and hyperparameter tuning.
3.  **Tokenization:** The `legal-bert-small-uncased` tokenizer was loaded. The `Query` and `Segment` text fields were concatenated and tokenized to generate input features (`input_ids`, `attention_mask`, `token_type_ids`) for the BERT model. The corresponding binary labels were associated with the tokenized examples.

### Model Selection and Configuration

A `legal-bert-small-uncased` model pre-trained on legal texts was selected as the base model. Specifically, the `AutoModelForSequenceClassification` class from the `transformers` library was used, configured for binary classification with two output labels.

### Initial Training and Evaluation

The model was initially trained on the tokenized training dataset using the Hugging Face `Trainer` API. Key training arguments, including the number of epochs, batch size, learning rate, and weight decay, were defined. The model's performance was evaluated on the development set using Precision, Recall, and F1-score at the end of each epoch, with the F1-score used for selecting the best model checkpoint. To assess the consistency of the training process, the training and evaluation were repeated three times, and the average metrics were computed.

### Hyperparameter Tuning

To optimize the model's performance, a hyperparameter tuning process was conducted. A manual Random Search approach was employed, defining a search space for key hyperparameters: learning rate, training batch size, number of training epochs, and weight decay. Multiple trials were executed, with the model being re-initialized and trained for each hyperparameter combination. The performance of each trial was evaluated on the development set using the defined metrics, and the set of hyperparameters yielding the highest F1-score was identified as the best configuration.

### Addressing Class Imbalance with Downsampling

Recognizing the severe class imbalance in the training data, a downsampling technique was applied to mitigate its impact. The majority class (Irrelevant) in the training dataset was randomly undersampled to match the number of examples in the minority class (Relevant), creating a balanced downsampled training dataset.

### Retraining and Evaluation with Downsampling

The `legal-bert-small-uncased` model was re-initialized and retrained on the tokenized downsampled training dataset. The training was conducted for a specified number of epochs (3 in this case). Following retraining, the model's performance was evaluated on the filtered test dataset (containing only examples with binary labels) using Precision, Recall, and F1-score to assess the impact of downsampling on generalization to unseen data. The evaluation was performed using the `Trainer`'s evaluate method.

### Evaluation Metrics

The primary evaluation metrics used throughout the experiments were Precision, Recall, and F1-score, calculated using the `precision_recall_fscore_support` function from the `sklearn.metrics` library with an `average='binary'` setting.

This comprehensive methodology allowed for the fine-tuning and evaluation of the legal BERT model while explicitly addressing the challenge of class imbalance in the PrivacyQA dataset.

## Evaluation Results Comparison

This section compares the performance of the `legal-bert-small-uncased` model on the filtered test set when trained on the original imbalanced training data versus the downsampled balanced training data. The metrics reported are Precision, Recall, and F1-score.

**Model Trained on Original Data (Average of 3 Runs on Dev Set, Best Model Loaded for Test Evaluation):**

Based on the evaluation results from the model trained on the original data (likely corresponding to the evaluation before downsampling was introduced), the metrics on the test set were:

*   **Precision:** 0.0
*   **Recall:** 0.0
*   **F1-score:** 0.0

*Note: The precision, recall, and f1 scores were zero for the model trained on the original dataset. This was observed in the output of the cell that evaluated the best model from hyperparameter tuning (`a25b6c1a`). The `compute_metrics` function showed that no positive predictions (label 1) were made on the test set by this model.*

**Model Retrained on Downsampled Data:**

Following the retraining of the model on the downsampled, balanced training data, the evaluation metrics on the filtered test set were:

*   **Precision:** 0.57586
*   **Recall:** 0.20985
*   **F1-score:** 0.30761

**Summary of Comparison:**

Training on the downsampled dataset significantly improved the model's ability to identify positive cases (Recall increased from 0.0 to 0.21). This came with a Precision of 0.58, meaning that when the model predicted a positive case, it was correct about 58% of the time. The F1-score, which balances Precision and Recall, also improved from 0.0 to 0.31, indicating better performance on the positive (Relevant) class, which is the class these binary-averaged metrics measure, compared to the model trained on the original, highly imbalanced data, which made no positive predictions on the test set.

## Evaluation Results Analysis (Downsampling vs. No Downsampling)

This section analyzes the performance of the `legal-bert-small-uncased` model on the test set, comparing the results obtained when training on the original imbalanced data versus the downsampled balanced training data. The evaluation metrics include overall Precision, Recall, and F1-score, as well as class-wise metrics for the model trained with downsampling.

**Model Trained on Original Data (Evaluated on Test Set):**

Based on the evaluation results from the model trained on the original data, the metrics on the test set were:

*   **Overall Precision:** 0.0
*   **Overall Recall:** 0.0
*   **Overall F1-score:** 0.0

*Note: The model trained on the original, highly imbalanced dataset made no positive predictions (label 1) on the test set, resulting in zero values for Precision, Recall, and F1-score for the positive class.*

**Model Retrained on Downsampled Data (Evaluated on Test Set):**

Following the retraining of the model on the downsampled, balanced training data, the evaluation metrics on the test set were:

*   **Overall Precision:** 0.57586
*   **Overall Recall:** 0.20985
*   **Overall F1-score:** 0.30761

**Class-wise Metrics for Model Trained on Downsampled Data (Evaluated on Test Set):**

Analyzing the predictions and true labels from the evaluation on the test set for the model trained with downsampling, the class-wise metrics are as follows:

*   **Class 1 (Relevant):**
    *   **Precision:** 0.57586 (Out of all instances predicted as Relevant, ~57.6% were actually Relevant)
    *   **Recall:** 0.20985 (Out of all actual Relevant instances, ~21.0% were correctly identified)
    *   **F1-score:** 0.30761
*   **Class 0 (Irrelevant):**
    *   **Precision:** ~0.9354 (Out of all instances predicted as Irrelevant, ~93.5% were actually Irrelevant)
    *   **Recall:** ~0.9867 (Out of all actual Irrelevant instances, ~98.7% were correctly identified)
    *   **F1-score:** ~0.9604

**Summary of Comparison and Analysis:**

Training on the downsampled dataset significantly improved the model's ability to identify positive cases (Relevant), as evidenced by the increase in **Overall Recall** from 0.0 to 0.21. This indicates that the model is now capable of finding some of the relevant sentences, whereas the model trained on imbalanced data essentially ignored the minority class.

The **Overall Precision** for the downsampled model is 0.576, meaning that when it predicts a sentence is Relevant, it is correct more than half the time. While this is not extremely high, it's a significant improvement from 0.0.

The **Overall F1-score** for the downsampled model is 0.308, which is a substantial improvement from 0.0. This metric provides a better overall picture of the model's performance on the minority class by balancing Precision and Recall.

Looking at the **class-wise metrics**, the model trained with downsampling performs very well on the majority class (Irrelevant), achieving high Precision (~0.935) and Recall (~0.987). However, its performance on the minority class (Relevant) is much lower, particularly in terms of Recall (0.210). This suggests that while downsampling helped the model make positive predictions, it still struggles to identify a large portion of the actual relevant sentences.

**Conclusion:**

Downsampling the training data was crucial for enabling the model to make any positive predictions on the test set and significantly improved overall performance for the minority class compared to training on the original imbalanced data. However, the relatively low Recall for the Relevant class indicates that further techniques to address class imbalance or explore different model architectures/training strategies might be necessary to improve the identification of relevant sentences.

In [None]:
import torch
from transformers import AutoModelForSequenceClassification
import glob
import os

# Reload the fine-tuned weights of the best tuning trial into a fresh
# legal-bert-small-uncased classification model (`model_reloaded`).

# Assuming best_trial_number is available from previous cells or determined
# based on your tuning results.
best_trial_number = 7 # Replace with the actual best trial number if different


def _checkpoint_step(checkpoint_dir):
    """Return the numeric step of a ``checkpoint-<step>`` path, or -1 if it has none."""
    suffix = os.path.basename(os.path.normpath(checkpoint_dir)).rsplit("-", 1)[-1]
    return int(suffix) if suffix.isdigit() else -1


# Find the actual checkpoint directory within the best trial's output directory.
# The checkpoint with the highest step number is used; filesystem creation time
# is not reliable once a results directory has been copied or extracted.
best_model_run_dir = f"./results_privacyqa_manual_tune_{best_trial_number}"
list_of_checkpoints = glob.glob(os.path.join(best_model_run_dir, "checkpoint-*"))
latest_checkpoint_dir = max(list_of_checkpoints, key=_checkpoint_step) if list_of_checkpoints else None

# Define the path to the model weights file within the checkpoint directory.
# model.safetensors is preferred, pytorch_model.bin is the fallback.
model_weights_path = None
if latest_checkpoint_dir:
    candidate_paths = (
        os.path.join(latest_checkpoint_dir, "model.safetensors"),
        os.path.join(latest_checkpoint_dir, "pytorch_model.bin"),
    )
    model_weights_path = next((path for path in candidate_paths if os.path.exists(path)), None)

    if model_weights_path:
        print(f"Found model weights at: {model_weights_path}")
    else:
        print(f"No model weights file found in {latest_checkpoint_dir}")


if model_weights_path:
    print("\n--- Reloading Model and Loading Checkpoint State ---")

    try:
        # Reload the base model architecture (the classifier head is randomly
        # initialised here and overwritten by the checkpoint weights below).
        model_reloaded = AutoModelForSequenceClassification.from_pretrained("nlpaueb/legal-bert-small-uncased", num_labels=2)
        print("Base model reloaded.")

        # Load the state dictionary from the checkpoint
        if model_weights_path.endswith(".safetensors"):
            from safetensors.torch import load_file
            state_dict = load_file(model_weights_path)
        else: # Assuming it's pytorch_model.bin
            # map_location="cpu" lets a checkpoint saved on a GPU load on a
            # CPU-only machine; load_state_dict copies into the model's device.
            # NOTE: torch.load unpickles the file - only load trusted checkpoints.
            state_dict = torch.load(model_weights_path, map_location="cpu")

        # Load the state dictionary into the reloaded model
        model_reloaded.load_state_dict(state_dict)
        print("Checkpoint state loaded into the reloaded model.")

        # Set the model to evaluation mode
        model_reloaded.eval()

        # The model is intentionally left on its current device; later cells
        # hand model_reloaded to a Trainer.

    except Exception as e:
        print(f"\nAn error occurred during model reloading or state loading: {e}")
        print("Please check the error message and the checkpoint file.")
        # Re-raise so later cells cannot silently run with a missing or stale
        # model_reloaded.
        raise

else:
    print(f"Could not find a valid model weights file in the latest checkpoint directory: {latest_checkpoint_dir}")

Found model weights at: ./results_privacyqa_manual_tune_7\checkpoint-52090\model.safetensors

--- Reloading Model and Loading Checkpoint State ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Base model reloaded.
Checkpoint state loaded into the reloaded model.


In [None]:

# Prerequisites from earlier cells: downsampled_train_dataset, tokenizer,
# tokenize_function and datasets.Value.

# Run the shared tokenize_function over the downsampled split in batches.
# NOTE(review): tokenize_function is expected to also emit the 'labels'
# column - confirm against its definition.
tokenized_downsampled_train_dataset = downsampled_train_dataset.map(tokenize_function, batched=True)

# map() may not preserve the label dtype, so force int64 when the column exists.
if 'labels' not in tokenized_downsampled_train_dataset.features:
    print("Warning: 'labels' column not found in tokenized_downsampled_train_dataset.")
else:
    tokenized_downsampled_train_dataset = tokenized_downsampled_train_dataset.cast_column(
        'labels', Value('int64')
    )

# Keep only the columns needed for training; 'token_type_ids' is appended
# only when the tokenized dataset has it.
has_token_type_ids = 'token_type_ids' in tokenized_downsampled_train_dataset.features
required_columns = ['input_ids', 'attention_mask', 'labels'] + (
    ['token_type_ids'] if has_token_type_ids else []
)
tokenized_downsampled_train_dataset = tokenized_downsampled_train_dataset.select_columns(required_columns)

print("Tokenized downsampled training dataset created with required columns.")
print("Tokenized downsampled training dataset features:", tokenized_downsampled_train_dataset.features)



Map:   0%|          | 0/12800 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/12800 [00:00<?, ? examples/s]

Tokenized downsampled training dataset created with required columns.
Tokenized downsampled training dataset features: {'input_ids': List(Value('int32')), 'attention_mask': List(Value('int8')), 'labels': Value('int64'), 'token_type_ids': List(Value('int8'))}


In [None]:

def convert_labels_to_int(examples):
    """Map a batch of string labels to integer class ids.

    Reads ``examples["label"]`` (an iterable of "Irrelevant"/"Relevant"
    strings) and returns ``{"labels": [...]}`` with 0 for "Irrelevant" and
    1 for "Relevant". Any other label value raises ``KeyError``.
    """
    class_ids = {"Irrelevant": 0, "Relevant": 1}
    encoded = []
    for name in examples["label"]:
        encoded.append(class_ids[name])
    return {"labels": encoded}

In [None]:
from transformers import Trainer, TrainingArguments
import numpy as np
import evaluate
from datasets import Features, Value, Sequence

# Cache of loaded `evaluate` metrics so they are not reloaded on every evaluation.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called *name*, loading it only once."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy plus weighted and per-class precision/recall/F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair. Predicted classes are the
            argmax of ``logits`` over the last axis. Labels are expected to
            be 0 (Irrelevant) or 1 (Relevant).

    Returns:
        dict with the keys ``accuracy``, ``precision_weighted``,
        ``recall_weighted``, ``f1_weighted`` and, for each of the two
        classes, ``precision_<Class>``, ``recall_<Class>`` and ``f1_<Class>``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy_metric = _get_metric("accuracy")
    precision_metric = _get_metric("precision")
    recall_metric = _get_metric("recall")
    f1_metric = _get_metric("f1")

    # Compute overall metrics. zero_division=0 yields the same score as the
    # default ("warn") but without emitting an undefined-metric warning when
    # a class is never predicted.
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision_weighted = precision_metric.compute(
        predictions=predictions, references=labels, average="weighted", zero_division=0
    )
    recall_weighted = recall_metric.compute(
        predictions=predictions, references=labels, average="weighted", zero_division=0
    )
    f1_weighted = f1_metric.compute(predictions=predictions, references=labels, average="weighted")

    # Compute per-class metrics. Pinning labels=[0, 1] guarantees a score
    # for both classes, in that order, even when one class is absent from
    # both predictions and references. Without it the result can shrink to
    # a single value whose position no longer identifies the class.
    class_ids = [0, 1]
    try:
        precision_per_class = precision_metric.compute(
            predictions=predictions, references=labels, labels=class_ids, average=None, zero_division=0
        )["precision"]
        recall_per_class = recall_metric.compute(
            predictions=predictions, references=labels, labels=class_ids, average=None, zero_division=0
        )["recall"]
        f1_per_class = f1_metric.compute(
            predictions=predictions, references=labels, labels=class_ids, average=None
        )["f1"]

        irrelevant_precision, relevant_precision = (float(score) for score in precision_per_class)
        irrelevant_recall, relevant_recall = (float(score) for score in recall_per_class)
        irrelevant_f1, relevant_f1 = (float(score) for score in f1_per_class)

    except ValueError:
        # Best-effort fallback kept from the original behaviour: report
        # zeros rather than aborting the evaluation loop.
        irrelevant_precision = relevant_precision = 0
        irrelevant_recall = relevant_recall = 0
        irrelevant_f1 = relevant_f1 = 0
        print("Warning: Could not compute per-class metrics for all classes. This might happen if a class is missing in the predictions or references.")

    return {
        "accuracy": accuracy["accuracy"],
        "precision_weighted": precision_weighted["precision"],
        "recall_weighted": recall_weighted["recall"],
        "f1_weighted": f1_weighted["f1"],
        "precision_Irrelevant": irrelevant_precision,
        "precision_Relevant": relevant_precision,
        "recall_Irrelevant": irrelevant_recall,
        "recall_Relevant": relevant_recall,
        "f1_Irrelevant": irrelevant_f1,
        "f1_Relevant": relevant_f1,
    }

# This function is no longer needed in this cell as labels are already integers
# def convert_labels_to_int(examples):
#     label_map = {"Irrelevant": 0, "Relevant": 1}
#     if 'Label' in examples:
#         return {"labels": [label_map.get(label, -1) for label in examples["Label"]]}
#     elif 'Any_Relevant' in examples:
#         return {"labels": [label_map.get(label, -1) for label in examples["Any_Relevant"]]}
#     elif 'labels' in examples: # If it's already processed but still needs casting
#          return {"labels": examples["labels"]}
#     else:
#         raise ValueError("Could not find a suitable label column ('Label', 'Any_Relevant', or 'labels').")


# Continue training `model_reloaded` for one epoch on the downsampled
# training split, evaluating on the held-out dev split at the end of the epoch.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    remove_unused_columns=False,
    use_cpu=True, # Run on CPU; replaces the deprecated no_cuda=True
)


def _tokenize_and_filter(dataset):
    """Tokenize *dataset* with tokenize_function and keep only rows labelled 0 or 1."""
    # Assuming tokenize_function already creates the 'labels' column with integer labels
    tokenized = dataset.map(tokenize_function, batched=True)
    return tokenized.filter(lambda example: example['labels'] in [0, 1])


# Re-apply tokenization to the downsampled training dataset
if 'downsampled_train_dataset' in locals() and downsampled_train_dataset is not None:
    tokenized_balanced_train_dataset = _tokenize_and_filter(downsampled_train_dataset)
    print(f"Filtered downsampled training dataset to {len(tokenized_balanced_train_dataset)} examples with valid labels.")
else:
    print("Error: downsampled_train_dataset is not available. Skipping tokenization and filtering for training.")
    tokenized_balanced_train_dataset = None


# Re-apply tokenization to the validation (dev) dataset
if 'dev_dataset' in locals() and dev_dataset is not None:
    tokenized_validation_dataset = _tokenize_and_filter(dev_dataset)
    print(f"Filtered validation dataset to {len(tokenized_validation_dataset)} examples with valid labels.")
else:
    print("Error: dev_dataset is not available. Skipping tokenization and filtering for validation.")
    tokenized_validation_dataset = None


# Define the base features expected by the trainer
trainer_features = Features({
    'input_ids': Sequence(Value('int32')),
    'attention_mask': Sequence(Value('int8')),
    'labels': Value('int64')
})


def _prepare_for_trainer(dataset):
    """Select and cast the model-input columns of *dataset* for the Trainer.

    'token_type_ids' is kept whenever the tokenizer produced it, so training
    and dev evaluation feed the model the same inputs as the test-set
    evaluation, which also keeps that column.
    NOTE(review): this only changes model behaviour if tokenize_function
    encodes sentence pairs - confirm against its definition.
    """
    feature_spec = dict(trainer_features)
    if 'token_type_ids' in dataset.features:
        feature_spec['token_type_ids'] = Sequence(Value('int8'))
    # Select columns in the same order as the feature spec before casting.
    return dataset.select_columns(list(feature_spec)).cast(Features(feature_spec))


# Select and cast columns for the trainer datasets if they exist
train_dataset_for_trainer = None
if tokenized_balanced_train_dataset is not None:
    train_dataset_for_trainer = _prepare_for_trainer(tokenized_balanced_train_dataset)

eval_dataset_for_trainer = None
if tokenized_validation_dataset is not None:
    eval_dataset_for_trainer = _prepare_for_trainer(tokenized_validation_dataset)


# 'model_reloaded' must be available from the checkpoint-reloading cell.
# No data collator or tokenizer is passed, so tokenize_function is presumably
# padding every example to a fixed length - verify if batching fails.

# Initialize and train the trainer only if both datasets are available
if train_dataset_for_trainer is not None and eval_dataset_for_trainer is not None:
    trainer = Trainer(
        model=model_reloaded,
        args=training_args,
        train_dataset=train_dataset_for_trainer,
        eval_dataset=eval_dataset_for_trainer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
else:
    print("Skipping trainer initialization and training due to missing datasets.")



Filtered downsampled training dataset to 12800 examples with valid labels.
Filtered validation dataset to 18520 examples with valid labels.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Weighted,Recall Weighted,F1 Weighted,Precision Irrelevant,Precision Relevant,Recall Irrelevant,Recall Relevant,F1 Irrelevant,F1 Relevant
1,0.2821,0.269599,0.929698,0.967661,0.929698,0.943489,0.993473,0.344865,0.932913,0.852103,0.962241,0.491009


In [None]:
from transformers import TrainingArguments, Trainer
import numpy as np
# Evaluate `model_reloaded` on the filtered test set and print the metrics.
# Assuming model_reloaded and filtered_test_dataset are available from previous cells
# Assuming compute_metrics function is defined in a previous cell
# Assuming tokenize_function is defined in a previous cell

# Define the arguments used for evaluation. Despite the variable name, no
# device is forced here: the Trainer picks its default device.
eval_training_args_cpu = TrainingArguments(
    output_dir="./eval_results_reloaded",  # Output directory for evaluation results
    per_device_eval_batch_size=16,  # Use a reasonable evaluation batch size
    report_to="none",  # Disable reporting
    # Columns are selected explicitly below, so the Trainer must not drop any itself.
    remove_unused_columns=False,
)

# Tokenize the filtered test dataset
# Assuming tokenize_function handles the mapping of 'Any_Relevant' or 'labels' to 'labels'
tokenized_filtered_test_dataset = filtered_test_dataset.map(tokenize_function, batched=True)


# Select only the necessary columns for evaluation
columns_to_keep = ['input_ids', 'attention_mask']
# Add 'token_type_ids' if it exists in the tokenized_filtered_test_dataset features
if 'token_type_ids' in tokenized_filtered_test_dataset.features:
    columns_to_keep.append('token_type_ids')

# Work out which column carries the labels
label_column = None
if 'labels' in tokenized_filtered_test_dataset.features:
    label_column = 'labels'
elif 'Any_Relevant' in tokenized_filtered_test_dataset.features:
    # If 'labels' is not present, assume 'Any_Relevant' is the label column
    # Note: tokenize_function should ideally create 'labels' from 'Any_Relevant'
    label_column = 'Any_Relevant'
    print("Warning: 'labels' column not found in tokenized_filtered_test_dataset, using 'Any_Relevant' as label column.")
else:
    print("Error: Neither 'labels' nor 'Any_Relevant' column found in tokenized_filtered_test_dataset. Evaluation may fail.")

if label_column is not None:
    columns_to_keep.append(label_column)


# Select the columns to keep for evaluation
eval_dataset_for_trainer = tokenized_filtered_test_dataset.select_columns(columns_to_keep)

# The Trainer only recognises the label column under the name 'labels'. Any
# other column would be passed to the model as an unexpected argument, so
# the fallback column is renamed on the evaluation copy only.
# NOTE(review): assumes 'Any_Relevant' already holds integer 0/1 labels at
# this point - confirm against the earlier preprocessing cells.
if label_column == 'Any_Relevant':
    eval_dataset_for_trainer = eval_dataset_for_trainer.rename_column('Any_Relevant', 'labels')


# Initialize a Trainer instance for evaluation with the reloaded model
trainer_reloaded_eval = Trainer(
    model=model_reloaded,             # the reloaded model
    args=eval_training_args_cpu,      # evaluation arguments defined above
    eval_dataset=eval_dataset_for_trainer,  # the dataset with selected columns
    compute_metrics=compute_metrics      # the callback that computes metrics
)

# Evaluate the reloaded model on the filtered test dataset
print("\nEvaluating the reloaded model on the filtered test dataset")
reloaded_test_results = trainer_reloaded_eval.evaluate()

# Print the evaluation results, including class-wise metrics from compute_metrics
print("\nEvaluation Results on Test Set:")
print(reloaded_test_results)


Evaluating the reloaded model on the filtered test dataset


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Evaluation Results on Test Set:
{'eval_loss': 0.33730483055114746, 'eval_model_preparation_time': 0.0014, 'eval_accuracy': 0.920705460342766, 'eval_precision_weighted': 0.8476985447049847, 'eval_recall_weighted': 0.920705460342766, 'eval_f1_weighted': 0.882694991197355, 'eval_precision_Irrelevant': 0.920705460342766, 'eval_precision_Relevant': 0.0, 'eval_recall_Irrelevant': 1.0, 'eval_recall_Relevant': 0.0, 'eval_f1_Irrelevant': 0.9587159294881771, 'eval_f1_Relevant': 0.0, 'eval_runtime': 115.1163, 'eval_samples_per_second': 435.907, 'eval_steps_per_second': 27.251}
