# Fine-Tuning and Evaluating DistilBERT for Binary Bug Classification on an Augmented TensorFlow Issue Dataset

# Import Necessary Libraries

In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, f1_score, classification_report
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import random
import nltk

# Download the NLTK resources this notebook relies on: 'punkt_tab' (tokenizer data,
# presumably what word_tokenize needs) and 'wordnet' (the corpus queried for synonyms).
# Each call returns a status flag; the last one is what the cell displays.
nltk.download('punkt_tab')
nltk.download('wordnet')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Define Synonym Replacement Functions

In [6]:
# Function to replace up to n randomly chosen words in a sentence with synonyms
def synonym_replacement(sentence, n=1):
    """Return ``sentence`` with up to ``n`` distinct words swapped for synonyms.

    Args:
        sentence: Text to augment.
        n: Maximum number of distinct words to replace. Every occurrence of a
            chosen word is replaced by the same synonym.

    Returns:
        The augmented text, rebuilt by joining the tokens with single spaces.
        ``sentence`` itself is returned unchanged when it is empty or
        whitespace-only, or when ``n`` is not positive.
    """
    if not sentence.strip():  # Handle empty sentences
        return sentence
    if n <= 0:
        # Nothing may be replaced. Without this guard the loop below could
        # still perform one replacement before it checked the limit.
        return sentence

    words = word_tokenize(sentence)
    new_words = words.copy()
    # De-duplicate while keeping first-seen order. A set of strings iterates in
    # an order that changes between interpreter sessions, which would make the
    # shuffle below unrepeatable even with a seeded RNG.
    random_word_list = list(dict.fromkeys(words))
    random.shuffle(random_word_list)
    num_replaced = 0

    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if not synonyms:
            continue  # no candidate for this token; try the next one
        synonym = random.choice(synonyms)
        new_words = [synonym if word == random_word else word for word in new_words]
        num_replaced += 1
        if num_replaced >= n:
            break

    return ' '.join(new_words)

# Function to get synonyms of a word
def get_synonyms(word):
    """Collect the WordNet synonyms of ``word``.

    Walks the lemmas of every synset WordNet returns for ``word``, skips lemmas
    equal to the word itself (case-insensitively) and turns the underscores of
    multi-word lemmas into spaces.

    Args:
        word: The token to look up.

    Returns:
        A sorted list of unique synonym strings; empty when nothing is found.
    """
    target = word.lower()
    synonyms = {
        lemma.name().replace('_', ' ')
        for syn in wordnet.synsets(word)
        for lemma in syn.lemmas()
        if lemma.name().lower() != target
    }
    # Sort before returning: a set of strings iterates in an order that differs
    # between interpreter sessions, so a caller drawing from the list with a
    # seeded RNG would otherwise get different picks on every kernel restart.
    return sorted(synonyms)


# Load and Combine Datasets

In [7]:
# Source files: ARB, BOH, NAM and UNK hold bug reports; nonbug.csv holds reports without bugs
datasets = ['ARB.csv', 'BOH.csv', 'NAM.csv', 'UNK.csv', 'nonbug.csv']
dfs = []

for dataset in datasets:
    # The label comes from the file a row was read from: 0 for nonbug.csv, 1 for every other file
    label = int(dataset != 'nonbug.csv')
    df = pd.read_csv(dataset).assign(label=label)
    dfs.append(df)

# Stack the per-file frames under a fresh index, then keep only rows where
# title, summary and comments are all present
df_combined = (
    pd.concat(dfs, ignore_index=True)
    .dropna(subset=['title', 'summary', 'comments'])
)

# Model input: summary followed by comments, separated by one space
summary_text = df_combined['summary'].astype(str)
comments_text = df_combined['comments'].astype(str)
df_combined['text'] = summary_text + " " + comments_text


# Split Dataset and Augment Training Data

In [8]:
# Number of synonym-replaced copies generated for each original training row
NUM_AUGMENTATIONS = 5

# Seed Python's RNG so the random picks made during augmentation start from a known state
random.seed(42)

# Split dataset into training (80%) and test (20%) sets, keeping the label ratio in both
train_data, test_data = train_test_split(df_combined, test_size=0.2, random_state=42, stratify=df_combined['label'])

# Augment the training split only: keep each original row and add NUM_AUGMENTATIONS
# copies whose text went through synonym replacement. The test split is left untouched.
augmented_train_data = []
for _, row in train_data.iterrows():
    augmented_train_data.append(row)
    for _variant in range(NUM_AUGMENTATIONS):  # distinct name so the outer loop's `_` is not shadowed
        augmented_text = synonym_replacement(row['text'])
        new_row = row.copy()
        new_row['text'] = augmented_text
        augmented_train_data.append(new_row)

# Convert the augmented data to a DataFrame
train_data_augmented = pd.DataFrame(augmented_train_data)

# Sanity check: one original plus NUM_AUGMENTATIONS variants per training row
assert len(train_data_augmented) == len(train_data) * (NUM_AUGMENTATIONS + 1)

# Check the sizes of training and testing datasets after augmentation
print(f"Original Training Set Size: {len(train_data)}")
print(f"Augmented Training Set Size: {len(train_data_augmented)}")
print(f"Testing Set Size: {len(test_data)}")



Original Training Set Size: 1572
Augmented Training Set Size: 9432
Testing Set Size: 394


# Tokenize Data

In [9]:
# Checkpoint name; the matching tokenizer is loaded from it here
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Every sequence is truncated or padded to exactly this many tokens
max_length = 128

def tokenize_data(data):
    """Tokenize the 'text' column of ``data`` into fixed-length PyTorch tensors.

    Args:
        data: Table-like object whose 'text' entry yields the strings to encode.

    Returns:
        Whatever the tokenizer call returns for the batch of texts, with each
        sequence truncated/padded to ``max_length``.
    """
    texts = [text for text in data['text']]
    tokenizer_options = dict(
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors="pt",
    )
    return tokenizer(texts, **tokenizer_options)

# Encode both splits with identical settings
train_encodings = tokenize_data(train_data_augmented)
test_encodings = tokenize_data(test_data)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# Define Custom Dataset Class

In [10]:
# Dataset wrapper pairing tokenizer output with labels
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves one encoded example plus its label per index.

    ``encodings`` is a mapping from field name to an indexable batch of values;
    ``labels`` is an indexable sequence of integer class ids.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of labels defines the dataset size
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick row `idx` out of every encoded field, then attach the label as a long tensor
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Pull the integer labels out of each split, then pair them with the matching encodings
train_labels = train_data_augmented['label'].astype(int).tolist()
test_labels = test_data['label'].astype(int).tolist()

train_dataset = CustomDataset(train_encodings, train_labels)
test_dataset = CustomDataset(test_encodings, test_labels)


# Define Metrics Function

In [11]:
# Compute metrics function
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores; the argmax over the last axis is the predicted class).

    Returns:
        dict with the keys 'accuracy' and 'f1' (weighted average over classes).

    Side effects:
        Prints a classification report on every call, i.e. once per
        evaluation or prediction pass that uses this function.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    eval_accuracy = accuracy_score(labels, preds)
    eval_f1 = f1_score(labels, preds, average='weighted')

    # Pass the label ids explicitly: with only `target_names`, the report raises a
    # ValueError whenever just one class shows up in labels and predictions combined.
    print("\nClassification Report:")
    print(classification_report(labels, preds, labels=[0, 1], target_names=['No Bug', 'Bug']))

    return {
        'accuracy': eval_accuracy,
        'f1': eval_f1
    }


# Set Training Arguments

In [12]:
# Hyperparameters and bookkeeping options handed to the Trainer
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint
    # with the highest accuracy when training finishes
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)




# Initialize and Train Model

In [13]:
# Load the pretrained checkpoint with a two-class head (Bug / No Bug)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Wire model, data, arguments and metric function together.
# NOTE(review): the test split is also the per-epoch evaluation set, and
# training_args picks the best checkpoint by accuracy on it, so metrics later
# reported on this same split are optimistic. Consider a separate validation split.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Run fine-tuning; the returned summary is shown as the cell's result
trainer.train()


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2776,1.035136,0.621827,0.601452
2,0.2792,1.777473,0.606599,0.597038
3,0.1008,2.061319,0.576142,0.573669



Classification Report:
              precision    recall  f1-score   support

      No Bug       0.60      0.83      0.70       205
         Bug       0.69      0.39      0.50       189

    accuracy                           0.62       394
   macro avg       0.64      0.61      0.60       394
weighted avg       0.64      0.62      0.60       394


Classification Report:
              precision    recall  f1-score   support

      No Bug       0.60      0.75      0.67       205
         Bug       0.62      0.45      0.52       189

    accuracy                           0.61       394
   macro avg       0.61      0.60      0.59       394
weighted avg       0.61      0.61      0.60       394


Classification Report:
              precision    recall  f1-score   support

      No Bug       0.58      0.65      0.61       205
         Bug       0.57      0.50      0.53       189

    accuracy                           0.58       394
   macro avg       0.57      0.57      0.57       394
we

TrainOutput(global_step=3537, training_loss=0.29065332115370573, metrics={'train_runtime': 415.8844, 'train_samples_per_second': 68.038, 'train_steps_per_second': 8.505, 'total_flos': 937074378092544.0, 'train_loss': 0.29065332115370573, 'epoch': 3.0})

# Evaluate Model

In [14]:
# Run one evaluation pass over the test dataset and show the metrics it returns
print("\nFinal Evaluation on Test Set:")
test_results = trainer.evaluate(eval_dataset=test_dataset)
print(f"Test Results: {test_results}")



Final Evaluation on Test Set:



Classification Report:
              precision    recall  f1-score   support

      No Bug       0.60      0.83      0.70       205
         Bug       0.69      0.39      0.50       189

    accuracy                           0.62       394
   macro avg       0.64      0.61      0.60       394
weighted avg       0.64      0.62      0.60       394

Test Results: {'eval_loss': 1.0351359844207764, 'eval_accuracy': 0.6218274111675127, 'eval_f1': 0.6014518837929054, 'eval_runtime': 1.432, 'eval_samples_per_second': 275.137, 'eval_steps_per_second': 34.916, 'epoch': 3.0}


# Make Predictions

In [15]:
# Run inference on the test dataset; the highest-scoring class is the predicted label
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(axis=-1)

# Store the predictions next to the true labels and preview the first rows
test_data['Predicted_Label'] = pred_labels
preview_columns = ['title', 'summary', 'comments', 'label', 'Predicted_Label']
print(test_data[preview_columns].head())



Classification Report:
              precision    recall  f1-score   support

      No Bug       0.60      0.83      0.70       205
         Bug       0.69      0.39      0.50       189

    accuracy                           0.62       394
   macro avg       0.64      0.61      0.60       394
weighted avg       0.64      0.62      0.60       394

                                                  title  \
1825  Custom Optimizer keeps throwing no attribute c...   
968                        Error running example on gpu   
1053         Slow Adam sparse updates in distributed TF   
677   model.fit generator multithreading is broken i...   
74    TF2.0 Multiple calls to Keras .fit and .evalua...   

                                                summary  \
1825   System information Have I written custom code...   
968   Running bazel bin tensorflow cc tutorials exam...   
1053  I am trying to train a model with the tf.nn.em...   
677    System information Have I written custom code...   