# Fine-Tuning and Evaluating DistilBERT for Binary Classification on a TensorFlow Dataset

# Import Necessary Libraries


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Load and Combine Datasets

In [2]:
# Load all datasets (ARB, BOH, NAM, UNK have bugs; nonbug.csv does not have bugs)
datasets = ['ARB.csv', 'BOH.csv', 'NAM.csv', 'UNK.csv', 'nonbug.csv']
dfs = []

# Read and combine all datasets
for dataset in datasets:
    df = pd.read_csv(dataset)
    # Assign bug label based on the dataset filename
    label = 1 if dataset != 'nonbug.csv' else 0  # 1 for files with bugs, 0 for nonbug.csv
    df['label'] = label
    dfs.append(df)

# Combine all datasets into a single DataFrame
df_combined = pd.concat(dfs, ignore_index=True)


#Data Preparation


In [3]:
# Check for missing values and handle them
df_combined = df_combined.dropna(subset=['title', 'summary', 'comments'])

# Combine summary and comments into a single text column for the model input
df_combined['text'] = df_combined['summary'] + " " + df_combined['comments']

# Split dataset into training (80%) and test (20%) sets
train_data, test_data = train_test_split(df_combined, test_size=0.2, random_state=42, stratify=df_combined['label'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_combined['text'] = df_combined['summary'] + " " + df_combined['comments']


# Model and Tokenizer Setup


In [4]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

max_length = 128

def tokenize_data(data):
    return tokenizer(
        data['text'].tolist(),
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors="pt"
    )

train_encodings = tokenize_data(train_data)
test_encodings = tokenize_data(test_data)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# Custom Dataset Class


In [5]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Prepare datasets
train_dataset = CustomDataset(train_encodings, train_data['label'].tolist())
test_dataset = CustomDataset(test_encodings, test_data['label'].tolist())


# Compute Metrics Function


In [6]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    eval_accuracy = accuracy_score(labels, preds)
    eval_f1 = f1_score(labels, preds, average='weighted')

    # Print classification report only once at the end
    if pred.label_ids is not None:
        print("\nClassification Report:")
        print(classification_report(labels, preds, target_names=['No Bug', 'Bug']))

    return {
        'accuracy': eval_accuracy,
        'f1': eval_f1
    }


# Training Arguments


In [7]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_steps=10,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
    greater_is_better=True
)




# Initialize Model and Trainer


In [8]:
# Initialize model for binary classification (2 classes: Bug or No Bug)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training the Model


In [9]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6832,0.683936,0.57868,0.558609
2,0.7379,0.676111,0.586294,0.574811
3,0.6501,0.672566,0.593909,0.588438



Classification Report:
              precision    recall  f1-score   support

      No Bug       0.57      0.78      0.66       205
         Bug       0.60      0.36      0.45       189

    accuracy                           0.58       394
   macro avg       0.59      0.57      0.55       394
weighted avg       0.58      0.58      0.56       394


Classification Report:
              precision    recall  f1-score   support

      No Bug       0.58      0.74      0.65       205
         Bug       0.60      0.42      0.49       189

    accuracy                           0.59       394
   macro avg       0.59      0.58      0.57       394
weighted avg       0.59      0.59      0.57       394


Classification Report:
              precision    recall  f1-score   support

      No Bug       0.59      0.70      0.64       205
         Bug       0.60      0.48      0.53       189

    accuracy                           0.59       394
   macro avg       0.59      0.59      0.59       394
we

TrainOutput(global_step=591, training_loss=0.6716184008948493, metrics={'train_runtime': 88.9457, 'train_samples_per_second': 53.021, 'train_steps_per_second': 6.645, 'total_flos': 156179063015424.0, 'train_loss': 0.6716184008948493, 'epoch': 3.0})

# Final Evaluation on Test Set


In [10]:
print("\nFinal Evaluation on Test Set:")
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Test Results:", test_results)



Final Evaluation on Test Set:



Classification Report:
              precision    recall  f1-score   support

      No Bug       0.59      0.70      0.64       205
         Bug       0.60      0.48      0.53       189

    accuracy                           0.59       394
   macro avg       0.59      0.59      0.59       394
weighted avg       0.59      0.59      0.59       394

Test Results: {'eval_loss': 0.6725663542747498, 'eval_accuracy': 0.5939086294416244, 'eval_f1': 0.5884379132363606, 'eval_runtime': 1.4132, 'eval_samples_per_second': 278.802, 'eval_steps_per_second': 35.381, 'epoch': 3.0}


## Making Predictions on the Test Set


In [11]:
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(-1)

test_data['Predicted_Label'] = pred_labels
print(test_data[['title', 'summary', 'comments', 'label', 'Predicted_Label']].head())



Classification Report:
              precision    recall  f1-score   support

      No Bug       0.59      0.70      0.64       205
         Bug       0.60      0.48      0.53       189

    accuracy                           0.59       394
   macro avg       0.59      0.59      0.59       394
weighted avg       0.59      0.59      0.59       394

                                                  title  \
1825  Custom Optimizer keeps throwing no attribute c...   
968                        Error running example on gpu   
1053         Slow Adam sparse updates in distributed TF   
677   model.fit generator multithreading is broken i...   
74    TF2.0 Multiple calls to Keras .fit and .evalua...   

                                                summary  \
1825   System information Have I written custom code...   
968   Running bazel bin tensorflow cc tutorials exam...   
1053  I am trying to train a model with the tf.nn.em...   
677    System information Have I written custom code...   