## import libraries and dataset

In [7]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

dataset = pd.read_csv(f"../dataset/preprocess/github-labels-top3-803k-100.0%.csv")
print(dataset.issue_label.value_counts())
print(dataset.info())

issue_label
0.0    402
1.0    333
2.0     69
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804 entries, 0 to 803
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0.2              804 non-null    int64  
 1   Unnamed: 0.1              804 non-null    int64  
 2   Unnamed: 0                804 non-null    int64  
 3   issue_url                 804 non-null    object 
 4   issue_label               804 non-null    float64
 5   issue_created_at          804 non-null    object 
 6   issue_author_association  804 non-null    object 
 7   repository_url            804 non-null    object 
 8   issue_title               804 non-null    object 
 9   issue_body                725 non-null    object 
 10  text                      804 non-null    object 
dtypes: float64(1), int64(3), object(7)
memory usage: 69.2+ KB
None


## preprocess the dataset

### process the dataset

In [8]:
dataset = dataset.rename(columns={"issue_label": "label"})
# label required to be int64, otherwise will get error during training
dataset['label'] = dataset['label'].astype('int64')

# pd.set_option('future.no_silent_downcasting', True)

### tokenize the data



In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

encodings = tokenizer(
    dataset.text.values.tolist(),
    max_length = 128,
    truncation = True,
    padding="max_length",
    return_attention_mask=True,
    return_token_type_ids=True,
    return_tensors='pt'
)

print(encodings)

{'input_ids': tensor([[  101,  1031,  3160,  ...,     0,     0,     0],
        [  101,  5587,  5216,  ...,     0,     0,     0],
        [  101,  1052,  4213,  ...,     0,     0,     0],
        ...,
        [  101,  4773, 29378,  ...,  1063,  1000,   102],
        [  101,  5022,  2951,  ...,  2497,  1011,   102],
        [  101,  8241,  3711,  ...,  2575,  2581,   102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}


### split the data

In [10]:
trainer_dataset = Dataset.from_dict({
    **encodings,
    "label": dataset.label.values
})

trainer_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

print(trainer_dataset)

# how to do stratify
split_data = trainer_dataset.train_test_split(test_size=0.15)
train_set = split_data['train']
test_set = split_data['test']

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 804
})


## run the training

In [11]:
model = BertForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=3)

# Define training args
training_args = TrainingArguments(
    output_dir="../dataset",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    num_train_epochs=4,
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",  # Disable WandB unless needed
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)

    return {
        "accuracy": accuracy_score(labels, predictions),
        "precision": precision_score(labels, predictions, average="weighted"),
        "recall": recall_score(labels, predictions, average="weighted"),
        "f1": f1_score(labels, predictions, average="weighted"),
    }

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7302,0.752661,0.710744,0.672465,0.710744,0.673219
2,0.4679,0.839046,0.702479,0.662015,0.702479,0.667687
3,0.2946,0.834579,0.752066,0.759216,0.752066,0.751797
4,0.2025,0.853266,0.77686,0.776626,0.77686,0.776682


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


TrainOutput(global_step=344, training_loss=0.5268996603093868, metrics={'train_runtime': 67.0045, 'train_samples_per_second': 40.773, 'train_steps_per_second': 5.134, 'total_flos': 179706464308224.0, 'train_loss': 0.5268996603093868, 'epoch': 4.0})