In [1]:
!pip install pyarrow==15.0.2
import numpy as np


import torch

from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split



In [2]:
!pip install datasets transformers



In [3]:
import pandas as pd
train_df = pd.read_csv("replaced_entities_train.csv")
test_df = pd.read_csv("test.csv")
print(train_df.head())

                                                text  label
0  ' of paying more than ps1bn in fines to insure...      0
1  ' dead sea shrinking by [ENTITY] [ENTITY] [ENT...      1
2  a blow to the head makes an instant hero in [E...      2
3  ' ethereum release was released on [ENTITY] [E...      0
4  government sets up co - ordination panel to sp...      2


In [4]:
target_map = {'gpt2_pytorch': 0,
 'gpt2_small': 1,
 'fair_wmt19': 2,
 'pplm_gpt2': 3,
 'gpt2_large': 4,
 'ctrl': 5,
 'xlm': 6,
 'gpt3': 7,
 'xlnet_base': 8,
 'transfo_xl': 9,
 'gpt2_xl': 10,
 'pplm_distil': 11,
 'gpt2_medium': 12,
 'grover_large': 13,
 'grover_mega': 14,
 'human': 15,
 'gpt1': 16,
 'grover_base': 17,
 'xlnet_large': 18,
 'fair_wmt20': 19}
test_target = target_map

In [5]:
from datasets import load_dataset
from datasets import Dataset, DatasetDict
test_df['target'] = test_df['label'].map(test_target)



test_df = test_df[['Generation','target']]
test_df.columns = ['text','label']
train_dataset = Dataset.from_dict(train_df)
test_dataset = Dataset.from_dict(test_df)

In [6]:
target_map

{'gpt2_pytorch': 0,
 'gpt2_small': 1,
 'fair_wmt19': 2,
 'pplm_gpt2': 3,
 'gpt2_large': 4,
 'ctrl': 5,
 'xlm': 6,
 'gpt3': 7,
 'xlnet_base': 8,
 'transfo_xl': 9,
 'gpt2_xl': 10,
 'pplm_distil': 11,
 'gpt2_medium': 12,
 'grover_large': 13,
 'grover_mega': 14,
 'human': 15,
 'gpt1': 16,
 'grover_base': 17,
 'xlnet_large': 18,
 'fair_wmt20': 19}

In [7]:
dataset = DatasetDict({
    'train':train_dataset,
    'test':test_dataset
})

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 112204
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 37357
    })
})

In [9]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer

In [10]:
checkpoint = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.eos_token
def tokenize_fn(batch):
  # Use the pre-defined tokenizer for consistency
  encoded_data = tokenizer(batch['text'], truncation=True, padding="max_length")
  return encoded_data

tokenized_dataset = dataset.map(tokenize_fn, batched=True)

Map:   0%|          | 0/112204 [00:00<?, ? examples/s]

Map:   0%|          | 0/37357 [00:00<?, ? examples/s]

In [11]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = 20).to('cuda')

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
model.config.pad_token_id = tokenizer.eos_token_id
!pip install torchinfo

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [13]:
from torchinfo import summary
summary(model)

Layer (type:depth-idx)                             Param #
GPT2ForSequenceClassification                      --
â”œâ”€GPT2Model: 1-1                                   --
â”‚    â””â”€Embedding: 2-1                              38,597,376
â”‚    â””â”€Embedding: 2-2                              786,432
â”‚    â””â”€Dropout: 2-3                                --
â”‚    â””â”€ModuleList: 2-4                             --
â”‚    â”‚    â””â”€GPT2Block: 3-1                         7,087,872
â”‚    â”‚    â””â”€GPT2Block: 3-2                         7,087,872
â”‚    â”‚    â””â”€GPT2Block: 3-3                         7,087,872
â”‚    â”‚    â””â”€GPT2Block: 3-4                         7,087,872
â”‚    â”‚    â””â”€GPT2Block: 3-5                         7,087,872
â”‚    â”‚    â””â”€GPT2Block: 3-6                         7,087,872
â”‚    â”‚    â””â”€GPT2Block: 3-7                         7,087,872
â”‚    â”‚    â””â”€GPT2Block: 3-8                         7,087,872
â”‚    â”‚    â””â”€GPT2

In [14]:
!pip install 'accelerate>=0.26.0'
!pip show accelerate
!pip install --upgrade transformers

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,

                                  )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Name: accelerate
Version: 0.26.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: sylvain@huggingface.co
License: Apache
Location: /opt/conda/lib/python3.11/site-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)






In [15]:
def compute_metrics(logits_and_labels):
  logits, labels = logits_and_labels
  predictions = np.argmax(logits, axis=-1)
  acc = np.mean(predictions == labels)
  f1 = f1_score(labels, predictions, average = 'micro')
  return {'accuracy': acc, 'f1_score': f1}

In [16]:
trainer = Trainer(model,
                  training_args,
                  train_dataset = tokenized_dataset["train"],
                  eval_dataset = tokenized_dataset["test"],
                  tokenizer=tokenizer,
                  compute_metrics=compute_metrics
                  )

  trainer = Trainer(model,


In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,0.5065,0.593324,0.747357,0.747357
2,0.4165,0.600108,0.757261,0.757261
3,0.4286,0.622504,0.775785,0.775785


TrainOutput(global_step=42078, training_loss=0.5384346218502132, metrics={'train_runtime': 20145.5845, 'train_samples_per_second': 16.709, 'train_steps_per_second': 2.089, 'total_flos': 1.7593983364497408e+17, 'train_loss': 0.5384346218502132, 'epoch': 3.0})