## Import library and dataset

In [1]:
import os
# os.environ["TOKENIZERS_PARALLELISM"] = "false"

from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.metrics import precision_score
from datasets import Dataset
from transformers import TrainingArguments, Trainer, GPT2Tokenizer, GPT2ForSequenceClassification,\
    EarlyStoppingCallback
import pandas as pd

dataset = pd.read_csv(f"../dataset/preprocess/github-labels-top3-803k-10.0%.csv")
print(dataset.issue_label.value_counts())
print(dataset.info())

2025-11-15 12:52:00.605428: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-15 12:52:00.685824: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


2025-11-15 12:52:01.969950: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


issue_label
0.0    40139
1.0    33258
2.0     6945
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80342 entries, 0 to 80341
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0.2              80342 non-null  int64  
 1   Unnamed: 0.1              80342 non-null  int64  
 2   Unnamed: 0                80342 non-null  int64  
 3   issue_url                 80342 non-null  object 
 4   issue_label               80342 non-null  float64
 5   issue_created_at          80342 non-null  object 
 6   issue_author_association  80342 non-null  object 
 7   repository_url            80342 non-null  object 
 8   issue_title               80342 non-null  object 
 9   issue_body                72354 non-null  object 
 10  text                      80342 non-null  object 
dtypes: float64(1), int64(3), object(7)
memory usage: 6.7+ MB
None


## preprocess the dataset
### process the data

In [2]:
dataset = dataset.rename(columns={"issue_label": "label"})
# label required to be int64, otherwise will get error during training
dataset['label'] = dataset['label'].astype('int64')

pd.set_option('future.no_silent_downcasting', True)

### tokenize the data

In [3]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

encodings = tokenizer(
    dataset.text.values.tolist(),
    max_length = 128,
    truncation = True,
    padding="max_length",
    return_attention_mask=True,
    return_tensors='pt'
)

print(encodings)

# dataset = Dataset.from_pandas(df)
# print(dataset)
# def preprocess(batch):
#     return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)
#
# dataset = dataset.map(preprocess, batched=True)
# dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

{'input_ids': tensor([[ 4550,  2010, 12042,  ..., 50256, 50256, 50256],
        [    1, 49601,   284,  ...,  1183,   307,  1049],
        [   32,   168,   232,  ..., 50256, 50256, 50256],
        ...,
        [24546, 39815,   287,  ...,   262, 39442,  2174],
        [16744,  8924,   352,  ..., 36638,    64,   374],
        [15307,  1353, 29056,  ..., 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]])}


### split the data

In [4]:
trainer_dataset = Dataset.from_dict({
    **encodings,
    "label": dataset.label.values
})

trainer_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

print(trainer_dataset)

# how to do stratify
split_data = trainer_dataset.train_test_split(test_size=0.15)
train_set = split_data['train']
test_set = split_data['test']

Dataset({
    features: ['input_ids', 'attention_mask', 'label'],
    num_rows: 80342
})


### run the training

In [5]:
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=3)
model.config.pad_token_id = tokenizer.pad_token_id

# Define training args
training_args = TrainingArguments(
    output_dir="../dataset",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    num_train_epochs=50,
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)

    return {
        "accuracy": accuracy_score(labels, predictions),
        "precision": precision_score(labels, predictions, average="weighted"),
        "recall": recall_score(labels, predictions, average="weighted"),
        "f1": f1_score(labels, predictions, average="weighted"),
    }

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model
trainer.train()

# save model
# trainer.save_model("trained_electra_model")

# save tokenizer
# tokenizer.save_pretrained("")


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.371,0.475362,0.829572,0.824646,0.829572,0.823354
2,0.3532,0.47188,0.830153,0.826097,0.830153,0.821494
3,0.3913,0.508258,0.829572,0.826741,0.829572,0.827924
4,0.34,0.638268,0.819034,0.815658,0.819034,0.816705
5,0.137,0.849724,0.8153,0.817364,0.8153,0.816096










TrainOutput(global_step=21345, training_loss=0.337524827800274, metrics={'train_runtime': 6167.6743, 'train_samples_per_second': 553.612, 'train_steps_per_second': 34.608, 'total_flos': 2.23051452678144e+16, 'train_loss': 0.337524827800274, 'epoch': 5.0})