## Fine-Tuning and Evaluating Llama3 with LoRA for Binary Classification on a TensorFlow Augmented Dataset

 # Install required libraries

In [1]:
!pip install -q transformers accelerate datasets evaluate scikit-learn huggingface_hub pandas
!pip install -U bitsandbytes
!pip install peft
!pip install nltk

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.

# Import necessary libraries

In [2]:

import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from huggingface_hub import login
import torch
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from transformers import BitsAndBytesConfig, AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report
import torch.nn.functional as F
import random
import nltk
from nltk.corpus import wordnet

# Ensure NLTK WordNet is downloaded
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

# Hugging Face authentication

In [3]:

login(token="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")


# Load and Preprocess Dataset

In [4]:
# File paths for datasets
file_paths = {
    "ARB": "ARB.csv",
    "BOH": "BOH.csv",
    "NAM": "NAM.csv",
    "UNK": "UNK.csv",
    "nonbug": "nonbug.csv"
}

# Load datasets
dfs = {file: pd.read_csv(path) for file, path in file_paths.items()}

# Add labels: 1 for bug-related datasets and 0 for non-bug dataset
for file in ["ARB", "BOH", "NAM", "UNK"]:
    dfs[file]['label'] = 1  # Label for bug datasets
dfs["nonbug"]['label'] = 0  # Label for non-bug dataset



# Combine and Preprocess Data

In [5]:
# Combine all datasets into one
combined_data = pd.concat([dfs[file] for file in dfs], ignore_index=True)

# Create 'combined_text' column using 'title', 'summary', and 'comments'
combined_data['combined_text'] = (
    combined_data['title'].fillna('') + " " +
    combined_data['summary'].fillna('') + " " +
    combined_data['comments'].fillna('')
)

# Handle missing values in the 'label' column (if any)
combined_data = combined_data.dropna(subset=['label'])


# Split Dataset into Train and Test Sets

In [6]:
# Split into training and testing sets (80% training, 20% testing)
train_df, test_df = train_test_split(
    combined_data,
    test_size=0.2,
    stratify=combined_data['label'],
    random_state=42
)

# Check original training and testing dataset sizes
print(f"Original Training Set Size: {len(train_df)}")
print(f"Original Testing Set Size: {len(test_df)}")


Original Training Set Size: 1589
Original Testing Set Size: 398


# Data Augmentation with Synonym Replacement

In [7]:
# Synonym Replacement Function for Data Augmentation
def synonym_replacement(sentence, n=1):
    words = sentence.split()

    # Skip empty sentences
    if not words:
        return sentence

    new_words = words.copy()
    words_to_replace = list(filter(lambda w: wordnet.synsets(w), words))  # Only choose words with synonyms

    if len(words_to_replace) == 0:
        return sentence  # Return the original sentence if no words have synonyms

    for _ in range(n):
        word_to_replace = random.choice(words_to_replace)  # Choose a word with synonyms
        synonyms = wordnet.synsets(word_to_replace)

        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            if synonym != word_to_replace:
                # Find all occurrences of the word and replace
                new_words = [synonym if w == word_to_replace else w for w in new_words]

                # Optional: Remove replaced word from `words_to_replace` to avoid multiple replacements
                words_to_replace.remove(word_to_replace)

                # Break if no more replaceable words remain
                if not words_to_replace:
                    break

    return ' '.join(new_words)


# Generate Augmented Data

In [8]:
# Augmenting training data with 5 variations for each row
augmented_data = []

# Loop through each row in the original training set
for _, row in train_df.iterrows():
    original_text = row['combined_text']
    label = row['label']
    # Generate 5 augmented variations for each row
    for _ in range(5):
        augmented_text = synonym_replacement(original_text, n=random.randint(1, 3))
        augmented_data.append({'combined_text': augmented_text, 'label': label})

# Combine original and augmented datasets
augmented_df = pd.DataFrame(augmented_data)
train_df = pd.concat([train_df, augmented_df]).reset_index(drop=True)

# Verify the augmented training set size
print(f"Original Training Set Size: 1589")
print(f"Augmented Training Set Size: {len(augmented_data)}")  # Should be 1589 * 5 = 7945
print(f"Final Training Set Size: {len(train_df)}")  # Should be 1589 original + 7945 augmented = 9534


Original Training Set Size: 1589
Augmented Training Set Size: 7945
Final Training Set Size: 9534


# Tokenization

In [9]:
# Tokenize the text
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set the pad token for the tokenizer and model
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Tokenize combined_text
train_df['tokenized'] = train_df['combined_text'].apply(lambda x: tokenizer(x, truncation=True, max_length=128))
train_df['input_ids'] = train_df['tokenized'].apply(lambda x: x['input_ids'])
train_df['attention_mask'] = train_df['tokenized'].apply(lambda x: x['attention_mask'])

test_df['tokenized'] = test_df['combined_text'].apply(lambda x: tokenizer(x, truncation=True, max_length=128))
test_df['input_ids'] = test_df['tokenized'].apply(lambda x: x['input_ids'])
test_df['attention_mask'] = test_df['tokenized'].apply(lambda x: x['attention_mask'])

# Drop unnecessary columns
train_df = train_df.drop(columns=['tokenized'])
test_df = test_df.drop(columns=['tokenized'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

# Dataset Preparation

In [10]:
# Create Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df[['input_ids', 'attention_mask', 'label']])
test_dataset = Dataset.from_pandas(test_df[['input_ids', 'attention_mask', 'label']])

# Rename the 'label' column to 'labels' in both train and test datasets
train_dataset = train_dataset.rename_column("label", "labels")
test_dataset = test_dataset.rename_column("label", "labels")

# Combine into a DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})


# Define Class Weights

In [11]:
# Define class weights
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.array(sorted(combined_data['label'].unique())),
    y=combined_data['label']
)
class_weights = torch.tensor(class_weights, dtype=torch.float32)


# Load and Configure Base Model

In [12]:
# Load the base LLaMA model for sequence classification
num_classes = 2
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    device_map="auto"
)


config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Configure and Attach LoRA

In [13]:
# Configure PEFT LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=8,
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"
)

# Attach LoRA adapters to the model
model_with_lora = get_peft_model(base_model, lora_config)


# Apply Quantization



In [14]:
# Apply quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = model_with_lora.to('cuda' if torch.cuda.is_available() else 'cpu')

# Set padding token ID in model config
model.config.pad_token_id = tokenizer.pad_token_id

# Prepare data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



# Define Custom Trainer

In [15]:
# Define a custom trainer
class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = F.cross_entropy(logits, labels, weight=self.class_weights)
        return (loss, outputs) if return_outputs else loss


# Set Training Arguments

In [16]:
# Training arguments
training_args = TrainingArguments(
    output_dir="classification_output",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_steps=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none"
)




 # Initialize and Train Model

In [17]:
# Initialize trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    class_weights=class_weights.to(training_args.device)
)

# Train the model
trainer.train()


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,0.2554,0.946895
2,0.0023,2.168687
3,0.0004,2.415461


TrainOutput(global_step=3576, training_loss=0.22432902710309074, metrics={'train_runtime': 4131.407, 'train_samples_per_second': 6.923, 'train_steps_per_second': 0.866, 'total_flos': 2.1451492882907136e+16, 'train_loss': 0.22432902710309074, 'epoch': 3.0})

# Evaluate Model

In [18]:
# Evaluate function
def evaluate_model(test_df, model):
    sentences = test_df['combined_text'].tolist()
    labels = test_df['label'].tolist()

    batch_size = 32
    all_outputs = []
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    for i in range(0, len(sentences), batch_size):
        batch_sentences = sentences[i:i + batch_size]
        inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            all_outputs.append(outputs.logits)

    final_outputs = torch.cat(all_outputs, dim=0)
    test_df['predictions'] = final_outputs.argmax(axis=1).cpu().numpy()

    print("Classification Report:")
    print(classification_report(labels, test_df['predictions']))

    print("Balanced Accuracy:", balanced_accuracy_score(labels, test_df['predictions']))
    print("Accuracy:", accuracy_score(labels, test_df['predictions']))

# Evaluate the model
evaluate_model(test_df, model)


Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.61      0.63       207
           1       0.60      0.63      0.62       191

    accuracy                           0.62       398
   macro avg       0.62      0.62      0.62       398
weighted avg       0.62      0.62      0.62       398

Balanced Accuracy: 0.6211017527885272
Accuracy: 0.6206030150753769
