In [32]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
import numpy as np
from tqdm import tqdm
import time
from sklearn.metrics import precision_recall_fscore_support, classification_report
import pandas as pd
from sklearn.model_selection import train_test_split

In [33]:
# Class names listed in id order; the count and both lookup tables are derived
# from this single list so they cannot drift apart.
LABEL_NAMES = ["bug", "enhancement", "question"]
NUM_LABELS = len(LABEL_NAMES)
id2label = dict(enumerate(LABEL_NAMES))
label2id = {name: idx for idx, name in id2label.items()}

# Training hyperparameters.
batch_size = 4
num_epochs = 4

In [34]:
# Load the preprocessed sample of the GitHub issue dataset
# (the file name marks it as the 0.1% sample of the 803k-row corpus).
data_set = pd.read_csv('../dataset/preprocess/github-labels-top3-803k-0.1%.csv')

# Hold out a stratified test split so every label keeps its share in both sets.
# The fraction is named once so the printed summary can never disagree with it.
TEST_FRACTION = 0.15
train_set, test_set = train_test_split(
    data_set,
    test_size=TEST_FRACTION,
    random_state=42,  # fixed seed: the same rows land in each split on every run
    stratify=data_set['issue_label'],
)

print(train_set.issue_label.value_counts())
print(f"Training set size ({1 - TEST_FRACTION:.0%} of the 0.1% sample): {len(train_set)}")
print(f"Test set size ({TEST_FRACTION:.0%} of the 0.1% sample): {len(test_set)}")

issue_label
0.0    341
1.0    283
2.0     59
Name: count, dtype: int64
Training set size (85% of 1%): 683
Test set size (15% of 1%): 121


In [35]:
# Tokenize the `text` column with `BertTokenizer` and build the label tensors.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


def _encode_texts(texts):
    """Tokenize a batch of strings into padded, truncated PyTorch tensors.

    Returns the tokenizer's batch encoding, which holds `input_ids` and
    `attention_mask`. Train and test go through the same call so their
    encoding settings cannot diverge.
    """
    return tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='longest',
        truncation=True,
        return_tensors='pt',
    )


def _encode_labels(label_column):
    """Convert an `issue_label` column into a `torch.long` tensor of class ids.

    The preprocessed CSV stores labels as numeric ids (0.0 / 1.0 / 2.0), so
    mapping them through `label2id` (whose keys are label *names*) would turn
    every value into NaN, and the cast to `torch.long` would then silently
    produce meaningless labels. Numeric columns are therefore used as-is and
    only non-numeric columns are mapped by name.

    Raises:
        ValueError: if any value is missing or is not a known class id/name.
    """
    if pd.api.types.is_numeric_dtype(label_column):
        label_ids = label_column
    else:
        label_ids = label_column.map(label2id)

    # NaN, fractional ids and unknown names all fail this membership check.
    invalid = ~label_ids.isin(list(label2id.values()))
    if invalid.any():
        bad_values = label_column[invalid].unique().tolist()
        raise ValueError(f"Unrecognised issue_label values: {bad_values}")

    return torch.tensor(label_ids.astype('int64').to_numpy(), dtype=torch.long)


encoded_data_train = _encode_texts(train_set.text.values)
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = _encode_labels(train_set['issue_label'])

encoded_data_test = _encode_texts(test_set.text.values)
input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = _encode_labels(test_set['issue_label'])

In [36]:
# Sanity check of the encoded tensors: first the container type, shape and
# dtype of a few of them, then the leading three rows of every tensor.
# NOTE(review): the summary below mixes a *test* attention mask in with the
# train tensors -- possibly intended to be `attention_masks_train`; confirm.
for encoded in (input_ids_train, attention_masks_test, labels_train):
    print(type(encoded), encoded.shape, encoded.dtype)

for encoded in (
    input_ids_train,
    attention_masks_train,
    labels_train,
    input_ids_test,
    attention_masks_test,
    labels_test,
):
    print(encoded[:3])

<class 'torch.Tensor'> torch.Size([683, 512]) torch.int64
<class 'torch.Tensor'> torch.Size([121, 512]) torch.int64
<class 'torch.Tensor'> torch.Size([683]) torch.int64
tensor([[  101,  2663,  4471,  ...,     0,     0,     0],
        [  101, 18247,  4638,  ...,     0,     0,     0],
        [  101,  5587, 28516,  ...,     0,     0,     0]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
tensor([0, 0, 0])
tensor([[  101,  2070, 13109,  ...,     0,     0,     0],
        [  101,  1996, 15882,  ...,     0,     0,     0],
        [  101,  5587,  2000,  ...,     0,     0,     0]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
tensor([0, 0, 0])


In [37]:
# Create DataLoaders for training and testing.
# Each dataset item is an (input_ids, attention_mask, label) triple.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

# Training batches are drawn in random order.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size,
)

# Evaluation iterates in dataset order: shuffling adds nothing to the metrics
# and would make the collected predictions impossible to line up with the
# rows of `test_set`.
dataloader_test = DataLoader(
    dataset_test,
    shuffle=False,
    batch_size=batch_size,
)

# Display the tensor shapes of the first three training batches.
for i, batch in enumerate(dataloader_train):
    print(f"batch {i}: {[tensor.shape for tensor in batch]}")
    if i >= 2:
        break

batch 0: [torch.Size([4, 512]), torch.Size([4, 512]), torch.Size([4])]
batch 1: [torch.Size([4, 512]), torch.Size([4, 512]), torch.Size([4])]
batch 2: [torch.Size([4, 512]), torch.Size([4, 512]), torch.Size([4])]


In [38]:
# Load pretrained BERT with a sequence-classification head sized for our labels.
# Attention maps and hidden states are not returned; only loss/logits are used.
model = BertForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    num_labels=NUM_LABELS,
    output_attentions=False,
    output_hidden_states=False,
)

# Record the id <-> name mappings on the model config.
_class_names = ("bug", "enhancement", "question")
model.config.id2label = dict(enumerate(_class_names))
model.config.label2id = {name: idx for idx, name in enumerate(_class_names)}

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Create the AdamW optimizer and a linear learning-rate scheduler

In [39]:
# The schedule spans every batch of every epoch.
steps_per_epoch = len(dataloader_train)
total_steps = steps_per_epoch * num_epochs

# AdamW with a small fine-tuning learning rate; eps guards against division by zero.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

# Linear schedule over `total_steps`, with no warmup phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

print(f"Total training steps: {total_steps}")

Total training steps: 684


In [40]:
def train_epoch(model, dataloader, optimizer, scheduler, epoch_num):
    """Run one training pass over `dataloader` and return the mean batch loss.

    Args:
        model: module called as `model(input_ids=..., attention_mask=...,
            labels=...)` whose output exposes a `.loss` tensor.
        dataloader: sized iterable of batches indexed as
            `(input_ids, attention_mask, labels)`.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        epoch_num: epoch number, used only for log and progress-bar text.

    Returns:
        Sum of the batch losses divided by `len(dataloader)`.
    """
    print(f"\n Starting Epoch {epoch_num} Training...")

    model.train()
    total_loss = 0

    # Batches are moved to the device the model lives on, so the loop also
    # works after `model.to("cuda")`. On CPU this is a no-op.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    # Progress bar
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch_num}")

    # Count steps explicitly: tqdm only refreshes its `n` attribute at display
    # intervals, so `progress_bar.n` is not a reliable per-batch counter.
    for step, batch in enumerate(progress_bar, start=1):
        if device is not None:
            batch = [tensor.to(device) for tensor in batch]

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass; passing labels makes the model return the loss
        outputs = model(
            input_ids=batch[0],
            attention_mask=batch[1],
            labels=batch[2],
        )

        loss = outputs.loss
        batch_loss = loss.item()
        total_loss += batch_loss
        loss.backward()

        # Gradient clipping: cap the global gradient norm at 1.0
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters, then advance the learning-rate schedule
        optimizer.step()
        scheduler.step()

        # Update progress bar with the batch loss and the running mean
        progress_bar.set_postfix({
            'batch_loss': f'{batch_loss:.4f}',
            'avg_loss': f'{total_loss / step:.4f}'
        })

    avg_train_loss = total_loss / len(dataloader)

    print(f" Epoch {epoch_num} Training Completed")
    print(f" Average Training Loss: {avg_train_loss:.4f}")

    return avg_train_loss

In [41]:
def complete_training(model, train_dataloader, optimizer, scheduler, num_epochs):
    """Train for `num_epochs` epochs and return the per-epoch mean losses.

    Args:
        model, train_dataloader, optimizer, scheduler: forwarded unchanged to
            `train_epoch` on every epoch.
        num_epochs: number of passes over `train_dataloader`.

    Returns:
        List with one `train_epoch` result per epoch, in order. The list is
        empty when `num_epochs` is 0 or negative.
    """
    print("Starting Complete Training...")
    train_losses = []

    for epoch in range(num_epochs):
        print(f"\n=== Epoch {epoch+1}/{num_epochs} ===")

        # Training only (no per-epoch validation)
        train_loss = train_epoch(model, train_dataloader, optimizer, scheduler, epoch+1)
        train_losses.append(train_loss)

    print(f"\nTraining Completed! Total epochs: {num_epochs}")
    # There is no final loss to report when no epoch ran; indexing [-1] on the
    # empty list would raise IndexError.
    if train_losses:
        print(f"  Final Training Loss: {train_losses[-1]:.4f}")

    return train_losses

Evaluate the model and compute micro-averaged and per-class metrics

In [42]:
def evaluate_model(model, dataloader, label_names=None):
    """Run inference over `dataloader` and compute classification metrics.

    Args:
        model: module called as `model(input_ids=..., attention_mask=...)`
            whose output exposes `.logits` with the classes on dim 1.
        dataloader: iterable of batches indexed as
            `(input_ids, attention_mask, labels)`.
        label_names: optional class names in id order. When given, per-class
            metrics are reported for exactly the ids `0..len(label_names)-1`,
            so entry `i` always belongs to `label_names[i]`, even if a class
            never occurs in the evaluated data. When omitted, names
            `Label_<i>` are generated for whatever classes were reported.

    Returns:
        Dict with `predictions` and `labels` (numpy arrays), `micro_metrics`
        and `per_class_metrics` (precision / recall / f1, plus `support` for
        the per-class entry) and `label_names`.
    """
    model.eval()  # Set model to evaluation mode

    # Inputs are moved to the device the model lives on; a no-op on CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    all_predictions = []
    all_labels = []

    print(" Starting model evaluation...")

    with torch.no_grad():  # Disable gradient computation
        for batch in tqdm(dataloader, desc="Evaluating"):
            # Unpack batch data
            input_ids = batch[0]
            attention_mask = batch[1]
            labels = batch[2]

            if device is not None:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)

            # Forward pass (no labels: only the logits are needed)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # Predicted class = index of the largest logit
            predictions = torch.argmax(outputs.logits, dim=1)

            # `.cpu()` first: `.numpy()` is only defined for CPU tensors
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Convert to numpy arrays
    all_predictions = np.array(all_predictions)
    all_labels = np.array(all_labels)

    # Compute micro-averaging metrics
    micro_precision, micro_recall, micro_f1, _ = precision_recall_fscore_support(
        all_labels, all_predictions, average='micro', zero_division=0
    )

    # Pin the class ids so the per-class arrays line up with `label_names`;
    # without this a class absent from the data is silently dropped and every
    # later entry shifts onto the wrong name.
    class_ids = list(range(len(label_names))) if label_names is not None else None

    # Compute per-class metrics
    per_class_precision, per_class_recall, per_class_f1, support = precision_recall_fscore_support(
        all_labels, all_predictions, labels=class_ids, average=None, zero_division=0
    )

    # Set default label names
    if label_names is None:
        label_names = [f'Label_{i}' for i in range(len(per_class_precision))]

    return {
        'predictions': all_predictions,
        'labels': all_labels,
        'micro_metrics': {
            'precision': micro_precision,
            'recall': micro_recall,
            'f1': micro_f1
        },
        'per_class_metrics': {
            'precision': per_class_precision,
            'recall': per_class_recall,
            'f1': per_class_f1,
            'support': support
        },
        'label_names': label_names
    }

In [43]:
def print_detailed_metrics(metrics, epoch_num=None, set_name="Test"):
    """Print a per-class table and the micro-averaged summary for `metrics`.

    Args:
        metrics: dict with `micro_metrics` (precision / recall / f1),
            `per_class_metrics` (indexable precision / recall / f1 / support)
            and `label_names`.
        epoch_num: when given, the banner is wider and names the epoch.
        set_name: name of the evaluated split, shown in the banner.
    """
    # Banner: 60 columns with an epoch prefix, otherwise 50 columns.
    if epoch_num is None:
        banner_rule = '=' * 50
        banner_title = f" {set_name} Set Detailed Metrics"
    else:
        banner_rule = '=' * 60
        banner_title = f" Epoch {epoch_num} - {set_name} Set Detailed Metrics"
    print(f"\n{banner_rule}")
    print(banner_title)
    print(banner_rule)

    micro = metrics['micro_metrics']
    per_class = metrics['per_class_metrics']
    names = metrics['label_names']

    divider = "-" * 50
    header_layout = "{:<15} {:<10} {:<10} {:<10} {:<10}"
    row_layout = "{:<15} {:<10.4f} {:<10.4f} {:<10.4f} {:<10}"

    # Per-class table
    print("\n Per-Class Metrics:")
    print(divider)
    print(header_layout.format('Label', 'Precision', 'Recall', 'F1-Score', 'Support'))
    print(divider)
    for idx, name in enumerate(names):
        print(row_layout.format(
            name,
            per_class['precision'][idx],
            per_class['recall'][idx],
            per_class['f1'][idx],
            per_class['support'][idx],
        ))

    # Micro-average row; its support column is the total over all classes
    print(divider)
    print(row_layout.format(
        'MICRO-AVG',
        micro['precision'],
        micro['recall'],
        micro['f1'],
        np.sum(per_class['support']),
    ))

    # Summary
    print("\n Global Micro-Averaging Metrics:")
    print("   • Precision: {:.4f}".format(micro['precision']))
    print("   • Recall:    {:.4f}".format(micro['recall']))
    print("   • F1-Score:  {:.4f}".format(micro['f1']))


In [44]:
# Fine-tune the model for the configured number of epochs.
print("=== TRAINING PHASE ===")
train_losses = complete_training(model, dataloader_train, optimizer, scheduler, num_epochs)

# Evaluate once on the held-out test set after training has finished.
print("\n Starting Final Test...")
metrics = evaluate_model(model, dataloader_test, label_names=LABEL_NAMES)
print_detailed_metrics(metrics, set_name="Final Test")

# Save the fine-tuned model.
model.save_pretrained('./trained_bert_model')
print("\n Model saved to './trained_bert_model'")

=== TRAINING PHASE ===
Starting Complete Training...

=== Epoch 1/4 ===

 Starting Epoch 1 Training...


Epoch 1:   0%|          | 0/171 [00:00<?, ?it/s]


KeyboardInterrupt: 