# Fine-Tuning and Evaluating RoBERTa base model for Binary Classification on a TensorFlow Augmented Dataset

# Import Necessary Libraries

In [20]:
!pip install -q transformers datasets scikit-learn huggingface_hub pandas nltk

# Import libraries
import inspect
import random

import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from datasets import Dataset
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, DataCollatorWithPadding
# WordNet corpus: backs the wordnet.synsets() lookups used for synonym replacement
nltk.download('wordnet')
# Tokenizer data, presumably what word_tokenize() loads at runtime — TODO confirm
nltk.download('punkt_tab')

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver d

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Define Synonym Replacement Functions

In [21]:
# Function to replace up to n randomly chosen words in a sentence with a synonym
def synonym_replacement(sentence, n=1):
    """Return `sentence` with up to `n` distinct words swapped for a random synonym.

    Every occurrence of a chosen word is replaced by the same synonym. Words are
    tried in random order; words for which `get_synonyms` returns nothing are
    skipped and do not count towards `n`.

    Args:
        sentence: Input text. Returned unchanged when empty or whitespace-only.
        n: Maximum number of distinct words to replace. Values <= 0 replace nothing.

    Returns:
        The tokens of the (possibly modified) sentence joined with single spaces.
    """
    if not sentence.strip():  # Handle empty sentences
        return sentence

    words = word_tokenize(sentence)
    new_words = words.copy()

    # Nothing to replace: previously the loop could still swap one word before
    # checking the budget.
    if n <= 0:
        return ' '.join(new_words)

    # De-duplicate while keeping first-seen order. A plain set() of strings has an
    # order that can differ between interpreter runs, which would make the
    # shuffle below unrepeatable even with a seeded RNG.
    candidates = list(dict.fromkeys(words))
    random.shuffle(candidates)
    num_replaced = 0

    for candidate in candidates:
        synonyms = get_synonyms(candidate)
        if not synonyms:
            continue
        synonym = random.choice(synonyms)
        new_words = [synonym if word == candidate else word for word in new_words]
        num_replaced += 1
        if num_replaced >= n:
            break

    return ' '.join(new_words)

# Function to get synonyms of a word
def get_synonyms(word):
    """Collect WordNet lemma names for `word`, excluding the word itself.

    Lemma names are compared case-insensitively against `word`, and underscores
    in lemma names are turned into spaces.

    Args:
        word: The token to look up via `wordnet.synsets`.

    Returns:
        A sorted list of unique synonym strings (empty when none are found).
        Sorting gives a stable order, so `random.choice` on the result is
        repeatable under a fixed seed.
    """
    target = word.lower()
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            lemma_name = lemma.name()
            if lemma_name.lower() != target:
                synonyms.add(lemma_name.replace('_', ' '))
    return sorted(synonyms)




# Load and Combine Datasets

In [22]:
# Source files: ARB, BOH, NAM and UNK hold bug reports; nonbug.csv holds non-bug reports
datasets = ['ARB.csv', 'BOH.csv', 'NAM.csv', 'UNK.csv', 'nonbug.csv']

# Read every file and tag its rows by origin: 0 for nonbug.csv, 1 for any other file
dfs = [
    pd.read_csv(csv_name).assign(label=0 if csv_name == 'nonbug.csv' else 1)
    for csv_name in datasets
]

# Stack all frames, drop rows missing title/summary/comments, then build the
# model input by joining summary and comments with a single space
df_combined = (
    pd.concat(dfs, ignore_index=True)
    .dropna(subset=['title', 'summary', 'comments'])
    .assign(
        text=lambda frame: frame['summary'].astype(str) + " " + frame['comments'].astype(str)
    )
)



# Split Dataset and Augment Training Data

In [23]:
# Split dataset into training (80%) and test (20%) sets
train_data, test_data = train_test_split(df_combined, test_size=0.2, random_state=42, stratify=df_combined['label'])

# Seed Python's RNG so the random draws made during augmentation are repeatable
# when this cell is re-run (the split above is already pinned via random_state).
random.seed(42)

# Number of synonym-replaced variants generated per original training row
N_AUGMENTATIONS = 5

# Augment the training data only (the test split stays untouched): keep each
# original row and add N_AUGMENTATIONS copies whose 'text' has been perturbed
augmented_train_data = []
for _, row in train_data.iterrows():
    augmented_train_data.append(row)
    for _variant in range(N_AUGMENTATIONS):
        augmented_text = synonym_replacement(row['text'])
        new_row = row.copy()
        new_row['text'] = augmented_text
        augmented_train_data.append(new_row)

# Convert the augmented data to a DataFrame; reset the index because every
# augmented copy otherwise carries the same index label as its source row
train_data_augmented = pd.DataFrame(augmented_train_data).reset_index(drop=True)

# Check dataset sizes
print(f"\nTraining Dataset Size: {len(train_data_augmented)}")
print(f"Testing Dataset Size: {len(test_data)}")

# Check the distribution of labels (1 and 0) in the training dataset
train_class_distribution = train_data_augmented['label'].value_counts()

print("\nClass Distribution in Training Dataset:")
print(train_class_distribution)



Training Dataset Size: 9432
Testing Dataset Size: 394

Class Distribution in Training Dataset:
label
0    4908
1    4524
Name: count, dtype: int64


# Tokenize Data

In [24]:
# Load tokenizer
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize text, truncating to 128 tokens. Padding is deliberately NOT applied
# here: the DataCollatorWithPadding created below pads each batch to its own
# longest sequence, which avoids padding every example to the global maximum.
train_encodings = tokenizer(list(train_data_augmented['text']), truncation=True, max_length=128)
test_encodings = tokenizer(list(test_data['text']), truncation=True, max_length=128)

# Add labels to the tokenized data
train_encodings['labels'] = train_data_augmented['label'].values
test_encodings['labels'] = test_data['label'].values

# Convert to Hugging Face datasets
train_dataset = Dataset.from_dict(train_encodings)
test_dataset = Dataset.from_dict(test_encodings)

# Compute class weights to handle class imbalance ("balanced" weights each
# class inversely to its frequency in the augmented training labels)
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.array([0, 1]),
    y=train_data_augmented['label']
)
class_weights = torch.tensor(class_weights, dtype=torch.float32)

# Seed torch before loading: the classification head is newly initialized at
# load time, so without a seed its starting weights differ on every run.
torch.manual_seed(42)

# Load the RoBERTa model for binary classification
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2
)

# Prepare data collator (performs the per-batch dynamic padding)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Define Custom Trainer Class

In [25]:
# Custom Trainer Class with class weights handling
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights.

    Args:
        *args: Forwarded unchanged to `Trainer.__init__`.
        class_weights: Optional per-class weights passed as `weight` to
            `F.cross_entropy`. `None` means unweighted loss.
        **kwargs: Forwarded unchanged to `Trainer.__init__`.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: The model to run the forward pass with.
            inputs: Batch mapping; its "labels" entry is removed before the
                remaining entries are passed to the model.
            return_outputs: When true, return `(loss, outputs)` instead of `loss`.
            **kwargs: Accepted and ignored, so extra keyword arguments supplied
                by the caller do not raise.

        Returns:
            The loss tensor, or a `(loss, model_outputs)` tuple.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        weight = self.class_weights
        if weight is not None:
            # Align the weights with the logits so a device or dtype mismatch
            # cannot make cross_entropy fail; a no-op when they already agree.
            weight = torch.as_tensor(weight, dtype=logits.dtype, device=logits.device)

        loss = F.cross_entropy(logits, labels, weight=weight)
        return (loss, outputs) if return_outputs else loss


# Define Metrics Function

In [26]:
# Custom metric function
def compute_metrics(eval_pred):
    """Convert raw evaluation output into accuracy and weighted-F1 scores.

    Args:
        eval_pred: A `(predictions, labels)` pair; `predictions` holds one row
            of per-class scores per example.

    Returns:
        Dict with keys 'accuracy' and 'f1'.
    """
    logits, references = eval_pred
    # The highest-scoring column of each row is taken as the predicted class
    guesses = np.argmax(logits, axis=1)
    return {
        'accuracy': accuracy_score(references, guesses),
        'f1': f1_score(references, guesses, average='weighted'),
    }



# Set Training Arguments

In [27]:
# Training arguments
# The per-epoch evaluation option is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Pick whichever
# name the installed version accepts so this cell runs on both.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",          # must match the evaluation strategy for load_best_model_at_end
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_steps=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    report_to="none",
    **{_eval_strategy_kwarg: "epoch"},
)



# Initialize and Train Model

In [28]:
# Initialize Trainer
# Newer transformers releases take the tokenizer as `processing_class=` and warn
# on `tokenizer=`; older ones only know `tokenizer=`. Use whichever the
# installed Trainer accepts.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# NOTE(review): eval_dataset is the test split. Combined with
# load_best_model_at_end=True, the checkpoint is selected on the same data that
# is later reported as test performance — consider a separate validation split.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    class_weights=class_weights.to(training_args.device),
    **{_tokenizer_kwarg: tokenizer},
)

# Train the model
trainer.train()


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4909,0.926281,0.621827,0.6211
2,0.3608,1.737014,0.621827,0.621279
3,0.2087,2.244069,0.609137,0.608935


TrainOutput(global_step=3537, training_loss=0.37406487693414536, metrics={'train_runtime': 865.9014, 'train_samples_per_second': 32.678, 'train_steps_per_second': 4.085, 'total_flos': 1861247605616640.0, 'train_loss': 0.37406487693414536, 'epoch': 3.0})

# Evaluate Model

In [29]:

# Evaluate the model after training
def evaluate_model(model_trainer=None, dataset=None):
    """Predict on a dataset and print accuracy plus a classification report.

    Args:
        model_trainer: Object with a `predict(dataset)` method whose result
            exposes `predictions` and `label_ids`. Defaults to the notebook's
            global `trainer`.
        dataset: Dataset handed to `predict`. Defaults to the notebook's
            global `test_dataset`.

    Returns:
        None. Results are printed.
    """
    # Fall back to the notebook-level objects so existing `evaluate_model()`
    # calls keep working unchanged.
    if model_trainer is None:
        model_trainer = trainer
    if dataset is None:
        dataset = test_dataset

    predictions = model_trainer.predict(dataset)
    predicted_labels = predictions.predictions.argmax(axis=1)
    true_labels = predictions.label_ids

    # Print accuracy and the per-class classification report
    print(f"Accuracy: {accuracy_score(true_labels, predicted_labels) * 100:.2f}%")
    print(f"Classification Report:\n{classification_report(true_labels, predicted_labels)}")

# Run the evaluation with the trained `trainer` on `test_dataset` and print the results
evaluate_model()


Accuracy: 62.18%
Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.66      0.65       205
           1       0.61      0.58      0.59       189

    accuracy                           0.62       394
   macro avg       0.62      0.62      0.62       394
weighted avg       0.62      0.62      0.62       394

