https://github.com/Revanthraja/-GPT2-For-Text-Classification-using-Hugging-Face-Transformers

In [1]:
!pip install transformers



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!cp /content/drive/MyDrive/TFM-MUECIM/*.py /content
!cp /content/drive/MyDrive/TFM-MUECIM/*.txt /content
!cp /content/drive/MyDrive/TFM-MUECIM/*.dat /content
!cp /content/drive/MyDrive/TFM-MUECIM/*.tar /content
!cd /content; tar xf data.tar data
!cd /content/drive/MyDrive/TFM-MUECIM


In [4]:
import numpy as np
import os
import shutil
import sys
import torch
from datetime import datetime
from huggingface_hub import login
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from tfm_50LabelsLabelIndex import LabelIndex
from tfm_EURLEX57KDataset import EURLEX57KDataset
from torch.utils.data import random_split
from torch.utils.data import DataLoader
from transformers import GPT2Config, GPT2ForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import EvalPrediction
from tfm_50LabelsGPT2DataFrameBuilder import _50LabelsGPT2DataFrameBuilder
from tfm_ModelTrainPipeline import ModelTrainPipeline

In [5]:
baseDir = '/content/drive/My Drive/TFM-MUECIM'
sys.path.append(baseDir)

In [6]:
with open('./HF_token.txt') as fd:
    lines = fd.readlines()

hf_token = lines[0].strip()
login(hf_token)

In [7]:
baseDir = '.'
labelsIndexFile = '50LabelsLabelSetFileIndex.txt'
dataFrameFile =  '50LabelsGPT2Dataset.csv'


In [8]:
_50LabelsGPT2DataFrameBuilder(baseDir,
                              labelsIndexFile,
                              dataFrameFile)

Loading "GPT2" tokenizer.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Building reduced data frame
Index: 0
Index: 1000
Index: 2000
Index: 3000
Index: 4000
Index: 5000
Index: 6000
Index: 7000
Index: 8000
Index: 9000
Index: 10000
Index: 11000
Index: 12000
Index: 13000
Index: 14000
Index: 15000
Index: 16000
Index: 17000
Index: 18000
Index: 19000
Index: 20000
Index: 21000
Index: 22000
Index: 23000
Index: 24000
Index: 25000
Index: 26000
Index: 27000
Index: 28000
Index: 29000
Init CSV saving
50 Labels GPT2 CSV DataFrame Created


<tfm_50LabelsGPT2DataFrameBuilder._50LabelsGPT2DataFrameBuilder at 0x7a76afa36e10>

In [None]:
def countNonZeroItems(items):
    nonZero = torch.nonzero(items, as_tuple= True)
    return len(nonZero[0])

In [None]:
def multi_label_metrics(predictions, labels, ):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_true = labels
    y_pred[np.where(probs >= 0.5)] = 1
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)

    metrics = {
        'f1': f1_micro_average,
        'roc_auc': roc_auc,
        'accuracy': accuracy
    }

    return metrics

In [None]:
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)

    return result

In [None]:
def metricsForTestSet(trainer, testData):
    predictions = trainer.predict(testData)
    preds = predictions.predictions[0] if isinstance(predictions.predictions, tuple) else predictions.predictions
    labels = predictions.label_ids
    testMetrics = multi_label_metrics(predictions=preds, labels=labels)

    print(testMetrics)

In [None]:
def trainingPipeline(model, epochNum, trainer):
    # training pipeline

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Move the model to the correct device before training.
    model.to(device)

    # baseline
    print(f'Begin epoch {epochNum} train session - trainer evaluate')
    trainer.evaluate()

    print('Metrics for test set (before train)')
    metricsForTestSet(trainer, testData)

    # training
    print('Epoch train.')

    trainer.train()
    print('Epoch train done.')

    # evaluate epoch training
    print('End epoch train session - trainer evaluate')
    trainer.evaluate()

    print('Metrics for test set (after train)')
    metricsForTestSet(trainer, testData)

    print('End epoch train session')

In [None]:
def saveEpochTraining(model, epochNum, fileName, modelName):
    # save epoch training
    folderGDrive = '/content/drive/MyDrive/TFM-MUECIM'
    prefixDate = datetime.today().strftime('%Y%m%d')
    fileName = f'{prefixDate}_50L_{modelName}_epoch_{epochNum}.pt'
    modelFullPath = os.path.join(baseDir,fileName)
    destFullPath = os.path.join(folderGDrive,fileName)

    print(f'Saving model "{modelName}" after epoch {epochNum} train session')
    print(f'file name: {fileName}')
    print(f'full path: {modelFullPath}')
    print(f'full path (copy): {destFullPath}')
    torch.save(model, modelFullPath)
    shutil.copyfile(modelFullPath, destFullPath)
    print('Done!')

    return fileName

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.manual_seed(0)
ds = EURLEX57KDataset(baseDir='.', DataFrameFile=dataFrameFile)
fullSetSize = ds.__len__()
trainSetSize = int(fullSetSize * 0.8)
valSetSize = int(fullSetSize * 0.1)
testSetSize = fullSetSize - trainSetSize - valSetSize
trainData, valData, testData = random_split(
    ds, [trainSetSize, valSetSize, testSetSize])

batchSize = 10

trainDataLoader = DataLoader(trainData, batch_size=batchSize, shuffle=True)
valDataLoader = DataLoader(valData, batch_size=batchSize, shuffle=True)
testDataLoader = DataLoader(testData, batch_size=batchSize, shuffle=True)

labelIndex = LabelIndex(baseDir)

cache_dir = os.path.join(baseDir, 'tfm_cache')
os.makedirs(cache_dir, exist_ok=True)

modelName = 'gpt2'
config = GPT2Config.from_pretrained(
    modelName,
    force_download=True,
    cache_dir=cache_dir,
    num_labels=labelIndex.numLabels,
    problem_type='multi_label_classification',
    id2label=labelIndex.id2label,
    label2id=labelIndex.label2id
)


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
epochNum = 1
fileName = ''
metricName = 'f1'
learning_rate=2e-5
weight_decay=0.01

In [None]:
while epochNum <= 12:
  if epochNum == 1:
      model = GPT2ForSequenceClassification.from_pretrained(
          modelName,
          config=config,
          cache_dir=cache_dir)
  else:
      model = torch.load(os.path.join(baseDir, fileName), map_location=torch.device(device), weights_only=False)

  model.config.pad_token_id = model.config.eos_token_id

  trainArgs = TrainingArguments(
    'tfm_oputput',
    report_to = 'none',
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    learning_rate = learning_rate,
    per_device_train_batch_size = batchSize,
    per_device_eval_batch_size = batchSize,
    num_train_epochs = 1,
    weight_decay = weight_decay,
    load_best_model_at_end = True,
    metric_for_best_model=metricName
  )

  trainer = Trainer(
    model=model,
    args=trainArgs,
    train_dataset=trainData,
    eval_dataset=valData,
    compute_metrics = compute_metrics
  )

  trainingPipeline(model, epochNum, trainer)
  fileName = saveEpochTraining(model, epochNum, fileName, modelName)
  epochNum += 1


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Begin epoch 1 train session - trainer evaluate


Metrics for test set (before train)
{'f1': 0.049801351317154643, 'roc_auc': np.float64(0.4947826622698466), 'accuracy': 0.0}
Epoch train.


Epoch,Training Loss,Validation Loss,Model Preparation Time,F1,Roc Auc,Accuracy
1,0.0558,0.047938,0.0021,0.703459,0.795587,0.480385


Epoch train done.
End epoch train session - trainer evaluate


Metrics for test set (after train)
{'f1': 0.7014448669201521, 'roc_auc': np.float64(0.7954986381251202), 'accuracy': 0.4814305364511692}
End epoch train session
Saving model "gpt2" after epoch 1 train session
file name: 20250322_50L_gpt2_epoch_1.pt
full path: ./20250322_50L_gpt2_epoch_1.pt
full path (copy): /content/drive/MyDrive/TFM-MUECIM/20250322_50L_gpt2_epoch_1.pt
Done!
Begin epoch 2 train session - trainer evaluate




Metrics for test set (before train)
{'f1': 0.7014448669201521, 'roc_auc': np.float64(0.7954986381251202), 'accuracy': 0.4814305364511692}
Epoch train.


Epoch,Training Loss,Validation Loss,Model Preparation Time,F1,Roc Auc,Accuracy
1,0.0371,0.035869,0.0021,0.784796,0.862519,0.593944


Epoch train done.
End epoch train session - trainer evaluate


Metrics for test set (after train)
{'f1': 0.7941379787086963, 'roc_auc': np.float64(0.8680366143472322), 'accuracy': 0.6076341127922971}
End epoch train session
Saving model "gpt2" after epoch 2 train session
file name: 20250322_50L_gpt2_epoch_2.pt
full path: ./20250322_50L_gpt2_epoch_2.pt
full path (copy): /content/drive/MyDrive/TFM-MUECIM/20250322_50L_gpt2_epoch_2.pt
Done!
Begin epoch 3 train session - trainer evaluate




Metrics for test set (before train)
{'f1': 0.7941379787086963, 'roc_auc': np.float64(0.8680366143472322), 'accuracy': 0.6076341127922971}
Epoch train.


Epoch,Training Loss,Validation Loss,Model Preparation Time,F1,Roc Auc,Accuracy
1,0.029,0.032825,0.0026,0.804564,0.882867,0.625258


Epoch train done.
End epoch train session - trainer evaluate


Metrics for test set (after train)
{'f1': 0.8146956288549209, 'roc_auc': np.float64(0.8891978792891184), 'accuracy': 0.6433975240715268}
End epoch train session
Saving model "gpt2" after epoch 3 train session
file name: 20250322_50L_gpt2_epoch_3.pt
full path: ./20250322_50L_gpt2_epoch_3.pt
full path (copy): /content/drive/MyDrive/TFM-MUECIM/20250322_50L_gpt2_epoch_3.pt
Done!
Begin epoch 4 train session - trainer evaluate




Metrics for test set (before train)
{'f1': 0.8146956288549209, 'roc_auc': np.float64(0.8891978792891184), 'accuracy': 0.6433975240715268}
Epoch train.


Epoch,Training Loss,Validation Loss,Model Preparation Time,F1,Roc Auc,Accuracy
1,0.022,0.032458,0.0022,0.811069,0.895743,0.63627


Epoch train done.
End epoch train session - trainer evaluate


Metrics for test set (after train)
{'f1': 0.8233905860757834, 'roc_auc': np.float64(0.9020919499366126), 'accuracy': 0.6568088033012379}
End epoch train session
Saving model "gpt2" after epoch 4 train session
file name: 20250322_50L_gpt2_epoch_4.pt
full path: ./20250322_50L_gpt2_epoch_4.pt
full path (copy): /content/drive/MyDrive/TFM-MUECIM/20250322_50L_gpt2_epoch_4.pt
Done!
Begin epoch 5 train session - trainer evaluate




Metrics for test set (before train)
{'f1': 0.8233905860757834, 'roc_auc': np.float64(0.9020919499366126), 'accuracy': 0.6568088033012379}
Epoch train.


Epoch,Training Loss,Validation Loss,Model Preparation Time,F1,Roc Auc,Accuracy
1,0.0154,0.033444,0.0021,0.816108,0.903951,0.642808


Epoch train done.
End epoch train session - trainer evaluate


Metrics for test set (after train)
{'f1': 0.8229974160206718, 'roc_auc': np.float64(0.9076446913282036), 'accuracy': 0.655433287482806}
End epoch train session
Saving model "gpt2" after epoch 5 train session
file name: 20250322_50L_gpt2_epoch_5.pt
full path: ./20250322_50L_gpt2_epoch_5.pt
full path (copy): /content/drive/MyDrive/TFM-MUECIM/20250322_50L_gpt2_epoch_5.pt
Done!
Begin epoch 6 train session - trainer evaluate




Metrics for test set (before train)
{'f1': 0.8229974160206718, 'roc_auc': np.float64(0.9076446913282036), 'accuracy': 0.655433287482806}
Epoch train.


Epoch,Training Loss,Validation Loss,Model Preparation Time,F1,Roc Auc,Accuracy
1,0.0099,0.035156,0.0021,0.815789,0.90662,0.64212


Epoch train done.
End epoch train session - trainer evaluate


Metrics for test set (after train)
{'f1': 0.824967824967825, 'roc_auc': np.float64(0.9101840249277727), 'accuracy': 0.6568088033012379}
End epoch train session
Saving model "gpt2" after epoch 6 train session
file name: 20250322_50L_gpt2_epoch_6.pt
full path: ./20250322_50L_gpt2_epoch_6.pt
full path (copy): /content/drive/MyDrive/TFM-MUECIM/20250322_50L_gpt2_epoch_6.pt
Done!
Begin epoch 7 train session - trainer evaluate




Metrics for test set (before train)
{'f1': 0.824967824967825, 'roc_auc': np.float64(0.9101840249277727), 'accuracy': 0.6568088033012379}
Epoch train.


Epoch,Training Loss,Validation Loss,Model Preparation Time,F1,Roc Auc,Accuracy
1,0.0061,0.03727,0.0021,0.816044,0.909187,0.643496


Epoch train done.
End epoch train session - trainer evaluate


Metrics for test set (after train)
{'f1': 0.8234692573181643, 'roc_auc': np.float64(0.9121130298949941), 'accuracy': 0.6523383768913342}
End epoch train session
Saving model "gpt2" after epoch 7 train session
file name: 20250322_50L_gpt2_epoch_7.pt
full path: ./20250322_50L_gpt2_epoch_7.pt
full path (copy): /content/drive/MyDrive/TFM-MUECIM/20250322_50L_gpt2_epoch_7.pt
Done!
Begin epoch 8 train session - trainer evaluate




Metrics for test set (before train)
{'f1': 0.8234692573181643, 'roc_auc': np.float64(0.9121130298949941), 'accuracy': 0.6523383768913342}
Epoch train.


Epoch,Training Loss,Validation Loss,Model Preparation Time,F1,Roc Auc,Accuracy
1,0.0038,0.039405,0.0022,0.814509,0.910473,0.638679


Epoch train done.
End epoch train session - trainer evaluate


Metrics for test set (after train)
{'f1': 0.824878919194494, 'roc_auc': np.float64(0.9140157635099738), 'accuracy': 0.6540577716643742}
End epoch train session
Saving model "gpt2" after epoch 8 train session
file name: 20250322_50L_gpt2_epoch_8.pt
full path: ./20250322_50L_gpt2_epoch_8.pt
full path (copy): /content/drive/MyDrive/TFM-MUECIM/20250322_50L_gpt2_epoch_8.pt
Done!
Begin epoch 9 train session - trainer evaluate




Metrics for test set (before train)
{'f1': 0.824878919194494, 'roc_auc': np.float64(0.9140157635099738), 'accuracy': 0.6540577716643742}
Epoch train.


Epoch,Training Loss,Validation Loss,Model Preparation Time,F1,Roc Auc,Accuracy
1,0.0025,0.041705,0.0037,0.815471,0.911968,0.645217


Epoch train done.
End epoch train session - trainer evaluate


Metrics for test set (after train)
{'f1': 0.8231413346866278, 'roc_auc': np.float64(0.9149467008843388), 'accuracy': 0.656121045392022}
End epoch train session
Saving model "gpt2" after epoch 9 train session
file name: 20250322_50L_gpt2_epoch_9.pt
full path: ./20250322_50L_gpt2_epoch_9.pt
full path (copy): /content/drive/MyDrive/TFM-MUECIM/20250322_50L_gpt2_epoch_9.pt
Done!
Begin epoch 10 train session - trainer evaluate




Metrics for test set (before train)
{'f1': 0.8231413346866278, 'roc_auc': np.float64(0.9149467008843388), 'accuracy': 0.656121045392022}
Epoch train.


Epoch,Training Loss,Validation Loss,Model Preparation Time,F1,Roc Auc,Accuracy
1,0.0017,0.043651,0.0021,0.814197,0.912046,0.638679


Epoch train done.
End epoch train session - trainer evaluate


Metrics for test set (after train)
{'f1': 0.8224844562872732, 'roc_auc': np.float64(0.9145534347515235), 'accuracy': 0.6537138927097662}
End epoch train session
Saving model "gpt2" after epoch 10 train session
file name: 20250322_50L_gpt2_epoch_10.pt
full path: ./20250322_50L_gpt2_epoch_10.pt
full path (copy): /content/drive/MyDrive/TFM-MUECIM/20250322_50L_gpt2_epoch_10.pt
Done!
Begin epoch 11 train session - trainer evaluate




Metrics for test set (before train)
{'f1': 0.8224844562872732, 'roc_auc': np.float64(0.9145534347515235), 'accuracy': 0.6537138927097662}
Epoch train.


Epoch,Training Loss,Validation Loss,Model Preparation Time,F1,Roc Auc,Accuracy
1,0.0011,0.045097,0.0021,0.816215,0.913699,0.64797


Epoch train done.
End epoch train session - trainer evaluate


Metrics for test set (after train)
{'f1': 0.8283572608308982, 'roc_auc': np.float64(0.9171018286927938), 'accuracy': 0.6595598349381018}
End epoch train session
Saving model "gpt2" after epoch 11 train session
file name: 20250323_50L_gpt2_epoch_11.pt
full path: ./20250323_50L_gpt2_epoch_11.pt
full path (copy): /content/drive/MyDrive/TFM-MUECIM/20250323_50L_gpt2_epoch_11.pt
Done!
Begin epoch 12 train session - trainer evaluate




Metrics for test set (before train)
{'f1': 0.8283572608308982, 'roc_auc': np.float64(0.9171018286927938), 'accuracy': 0.6595598349381018}
Epoch train.


Epoch,Training Loss,Validation Loss,Model Preparation Time,F1,Roc Auc,Accuracy
1,0.0011,0.047885,0.0021,0.817594,0.914722,0.645905


Epoch train done.
End epoch train session - trainer evaluate


Metrics for test set (after train)
{'f1': 0.8260814410757326, 'roc_auc': np.float64(0.9165303649237829), 'accuracy': 0.6571526822558459}
End epoch train session
Saving model "gpt2" after epoch 12 train session
file name: 20250323_50L_gpt2_epoch_12.pt
full path: ./20250323_50L_gpt2_epoch_12.pt
full path (copy): /content/drive/MyDrive/TFM-MUECIM/20250323_50L_gpt2_epoch_12.pt
Done!
