# Fine-Tuning and Evaluating RoBERTa base model for Binary Classification on a TensorFlow Dataset

# Import Necessary Libraries

In [1]:
# Install necessary libraries
!pip install -q transformers datasets scikit-learn huggingface_hub pandas nltk

# Import libraries
import pandas as pd
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, DataCollatorWithPadding
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
import torch.nn.functional as F
import torch

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m471.0/480.6 kB[0m [31m23.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency r

# Load and Combine Datasets

In [2]:

# Source CSVs: ARB, BOH, NAM and UNK contain bugs; nonbug.csv does not
datasets = ['ARB.csv', 'BOH.csv', 'NAM.csv', 'UNK.csv', 'nonbug.csv']

# Read each file and tag every row with its class, derived from the filename:
# 0 for nonbug.csv, 1 for every other file
dfs = [
    pd.read_csv(csv_name).assign(label=0 if csv_name == 'nonbug.csv' else 1)
    for csv_name in datasets
]

# Stack the per-file frames under a fresh index, then discard rows that are
# missing any of the title / summary / comments fields
df_combined = pd.concat(dfs, ignore_index=True).dropna(
    subset=['title', 'summary', 'comments']
)

# Model input: summary and comments joined by a single space
summary_text = df_combined['summary'].astype(str)
comments_text = df_combined['comments'].astype(str)
df_combined['text'] = summary_text + " " + comments_text




# Split Dataset and Inspect Class Distribution

In [3]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both splits, and the fixed random_state makes
# the split repeatable
train_data, test_data = train_test_split(
    df_combined,
    test_size=0.2,
    random_state=42,
    stratify=df_combined['label'],
)

# Report how many rows landed in each split
n_train_rows = len(train_data)
n_test_rows = len(test_data)
print(f"\nTraining Dataset Size: {n_train_rows}")
print(f"Testing Dataset Size: {n_test_rows}")

# Count of each label value (0 / 1) within the training split
train_class_distribution = train_data['label'].value_counts()

print("\nClass Distribution in Training Dataset:")
print(train_class_distribution)


Training Dataset Size: 1572
Testing Dataset Size: 394

Class Distribution in Training Dataset:
label
0    818
1    754
Name: count, dtype: int64


# Tokenize Data

In [4]:
# Tokenizer for the checkpoint named by model_name
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Both splits are encoded with the same settings: truncate to at most
# 128 tokens and pad every sequence in the split to a common length
encode_options = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train_data['text'].tolist(), **encode_options)
test_encodings = tokenizer(test_data['text'].tolist(), **encode_options)

# Attach the target column under the key 'labels'
train_encodings['labels'] = train_data['label'].to_numpy()
test_encodings['labels'] = test_data['label'].to_numpy()

# Wrap the encoded splits as Hugging Face Dataset objects
train_dataset = Dataset.from_dict(train_encodings)
test_dataset = Dataset.from_dict(test_encodings)

# "balanced" class weights for labels 0 and 1, computed from the training
# labels only
balanced_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.array([0, 1]),
    y=train_data['label'],
)

# float32 tensor form, as consumed by the weighted cross-entropy loss
class_weights = torch.tensor(balanced_weights, dtype=torch.float32)

# Seed PyTorch before building the model. The classification head is not
# loaded from the checkpoint but newly initialized, so without a fixed seed
# its starting weights (and therefore the training run) differ on every
# execution. 42 matches the random_state used for the train/test split.
torch.manual_seed(42)

# Load the RoBERTa model with a two-class sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2
)

# Collator that pads each batch using the tokenizer's padding settings
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Define Custom Trainer Class (Class-Weighted Loss)

In [5]:
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights.

    Args:
        *args: Positional arguments forwarded unchanged to ``Trainer``.
        class_weights: Optional 1-D tensor with one weight per class, passed
            as ``weight`` to ``F.cross_entropy``. ``None`` gives the
            unweighted loss.
        **kwargs: Keyword arguments forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the (optionally class-weighted) cross-entropy loss for a batch.

        Args:
            model: Model to run; its output must expose ``.logits``.
            inputs: Batch mapping. The ``"labels"`` entry is removed from it
                and the remaining entries are passed to ``model`` as keyword
                arguments.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            **kwargs: Extra arguments supplied by the training loop; accepted
                and ignored.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple when
            ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Keep the weights on the same device as the logits so the loss does
        # not depend on the caller having moved them beforehand.
        weight = self.class_weights
        if weight is not None:
            weight = weight.to(logits.device)

        loss = F.cross_entropy(logits, labels, weight=weight)
        return (loss, outputs) if return_outputs else loss


# Define Metrics Function

In [6]:
# Metric callback handed to the Trainer
def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class of
            each row is the argmax of ``predictions`` along axis 1.

    Returns:
        Dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    scores, references = eval_pred
    hard_predictions = np.argmax(scores, axis=1)
    return {
        'accuracy': accuracy_score(references, hard_predictions),
        'f1': f1_score(references, hard_predictions, average='weighted'),
    }


# Set Training Arguments

In [7]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of this option; the former
    # `evaluation_strategy` spelling is deprecated and is rejected by newer
    # transformers releases.
    eval_strategy="epoch",
    # Must match the evaluation schedule because load_best_model_at_end=True
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_steps=10,
    weight_decay=0.01,
    # Reload the best checkpoint once training finishes
    load_best_model_at_end=True,
    # Disable external experiment-tracking integrations
    report_to="none"
)



# Initialize and Train Model

In [8]:
# Initialize Trainer
# NOTE(review): eval_dataset is the held-out test split, and the training
# arguments set load_best_model_at_end=True, so the checkpoint is selected on
# the same data that is later reported as the test score. Consider carving a
# separate validation split out of the training data.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument of
    # Trainer.__init__
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Per-class loss weights, placed on the training device
    class_weights=class_weights.to(training_args.device)
)

# Train the model
trainer.train()

  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6821,0.691226,0.543147,0.516724
2,0.7092,0.690632,0.520305,0.497298
3,0.6945,0.688793,0.538071,0.538214


TrainOutput(global_step=591, training_loss=0.6962800966906668, metrics={'train_runtime': 182.5209, 'train_samples_per_second': 25.838, 'train_steps_per_second': 3.238, 'total_flos': 310207934269440.0, 'train_loss': 0.6962800966906668, 'epoch': 3.0})

# Evaluate Model

In [9]:
# Final evaluation on the test split
def evaluate_model():
    """Predict on ``test_dataset`` with ``trainer`` and print the results.

    Prints the accuracy as a percentage followed by scikit-learn's per-class
    classification report. Returns nothing.
    """
    prediction_output = trainer.predict(test_dataset)
    y_true = prediction_output.label_ids
    # Predicted class = index of the largest score in each row
    y_pred = np.argmax(prediction_output.predictions, axis=1)

    accuracy_pct = accuracy_score(y_true, y_pred) * 100
    report = classification_report(y_true, y_pred)

    print(f"Accuracy: {accuracy_pct:.2f}%")
    print(f"Classification Report:\n{report}")

# Run the evaluation
evaluate_model()


Accuracy: 53.81%
Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.55      0.55       205
           1       0.52      0.53      0.52       189

    accuracy                           0.54       394
   macro avg       0.54      0.54      0.54       394
weighted avg       0.54      0.54      0.54       394

