In [1]:
import numpy as np
import seaborn as sns

import torch

from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
!pip install datasets transformers

Collecting pyarrow>=15.0.0 (from datasets)
  Using cached pyarrow-18.1.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Using cached pyarrow-18.1.0-cp311-cp311-manylinux_2_28_x86_64.whl (40.1 MB)
Installing collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 11.0.0
    Uninstalling pyarrow-11.0.0:
      Successfully uninstalled pyarrow-11.0.0
Successfully installed pyarrow-18.1.0


In [3]:
import pandas as pd
train_df = pd.read_csv("adjectives_removed.csv")
test_df = pd.read_csv("test.csv")
print(train_df.head(15))

                                                 text  label
0   ' of paying than ps1bn in fines to insurers ac...      0
1   sea shrinking by 1 meter every year the expres...      1
2   a blow to the head makes an hero in head , nat...      2
3   ' ethereum release was released on 18th , 2014...      0
4   government sets up co-ordination panel to spee...      2
5   politician killed in beirut blast the express ...      3
6   'gulzar singh ranike the expresshindustan time...      4
7   ' kids stuck in home for days with fatally sho...      5
8   'bmc pulls out 29 firemen families from staff ...      6
9   ' , continue reading .... 16 ) religare scheme...      7
10  'new norms : cos to seek clarity from sharma t...      6
11  idbi gilt fund debt information : scheme objec...      5
12  'appointments in railway minister kharge 's pr...      8
13  'robert redford compares trump to nixon in wat...      9
14  bonnie burstow , psychotherapist who rejected ...      5


In [4]:
target_map = {'gpt2_pytorch': 0,
 'gpt2_small': 1,
 'fair_wmt19': 2,
 'pplm_gpt2': 3,
 'gpt2_large': 4,
 'ctrl': 5,
 'xlm': 6,
 'gpt3': 7,
 'xlnet_base': 8,
 'transfo_xl': 9,
 'gpt2_xl': 10,
 'pplm_distil': 11,
 'gpt2_medium': 12,
 'grover_large': 13,
 'grover_mega': 14,
 'human': 15,
 'gpt1': 16,
 'grover_base': 17,
 'xlnet_large': 18,
 'fair_wmt20': 19}


In [5]:
!pip install pyarrow==11.0.0
from datasets import load_dataset
from datasets import Dataset, DatasetDict


test_df['target'] = test_df['label'].map(target_map)
print(test_df.head())

test_df = test_df[['Generation','target']]
test_df.columns = ['text','label']

train_dataset = Dataset.from_dict(train_df)
test_dataset = Dataset.from_dict(test_df)

Collecting pyarrow==11.0.0
  Using cached pyarrow-11.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Using cached pyarrow-11.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
Installing collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 18.1.0
    Uninstalling pyarrow-18.1.0:
      Successfully uninstalled pyarrow-18.1.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 3.1.0 requires pyarrow>=15.0.0, but you have pyarrow 11.0.0 which is incompatible.[0m[31m
[0mSuccessfully installed pyarrow-11.0.0
                                          Generation        label  target
0  'tim tebow says high concussion rate in footba...          xlm       6
1  'sene man held for links with underworld don t...  xlnet_large      18
2  100 years ago, the booz

In [6]:
test_dataset.select(range(5))['label']

[6, 18, 19, 16, 15]

In [7]:
train_dataset = train_dataset.map(lambda x: {'label': int(x['label'])})

Map:   0%|          | 0/112204 [00:00<?, ? examples/s]

In [8]:
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 112204
})

In [9]:
dataset = DatasetDict({
    'train':train_dataset,
    'test':test_dataset
})

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 112204
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 37357
    })
})

In [11]:
from transformers import AutoTokenizer
checkpoint = 'openai-community/gpt2' #base model


In [12]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.eos_token
def tokenize_fn(batch):
  # Use the pre-defined tokenizer for consistency
  encoded_data = tokenizer(batch['text'], truncation=True, padding="max_length")
  return encoded_data

tokenized_dataset = dataset.map(tokenize_fn, batched=True)

Map:   0%|          | 0/112204 [00:00<?, ? examples/s]

Map:   0%|          | 0/37357 [00:00<?, ? examples/s]

In [13]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

In [14]:
import gc

gc.collect()

torch.cuda.empty_cache()

In [15]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = 20).to('cuda')

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
model.config.pad_token_id = tokenizer.eos_token_id
!pip install torchinfo

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [17]:
from torchinfo import summary
summary(model)

Layer (type:depth-idx)                             Param #
GPT2ForSequenceClassification                      --
├─GPT2Model: 1-1                                   --
│    └─Embedding: 2-1                              38,597,376
│    └─Embedding: 2-2                              786,432
│    └─Dropout: 2-3                                --
│    └─ModuleList: 2-4                             --
│    │    └─GPT2Block: 3-1                         7,087,872
│    │    └─GPT2Block: 3-2                         7,087,872
│    │    └─GPT2Block: 3-3                         7,087,872
│    │    └─GPT2Block: 3-4                         7,087,872
│    │    └─GPT2Block: 3-5                         7,087,872
│    │    └─GPT2Block: 3-6                         7,087,872
│    │    └─GPT2Block: 3-7                         7,087,872
│    │    └─GPT2Block: 3-8                         7,087,872
│    │    └─GPT2Block: 3-9                         7,087,872
│    │    └─GPT2Block: 3-10                        7,08

In [18]:

!pip install accelerate==0.28.0

training_args = TrainingArguments(
    
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,

                                  )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)






In [19]:
def compute_metrics(logits_and_labels):
  logits, labels = logits_and_labels
  predictions = np.argmax(logits, axis=-1)
  acc = np.mean(predictions == labels)
  f1 = f1_score(labels, predictions, average = 'micro')
  return {'accuracy': acc, 'f1_score': f1}

In [20]:
trainer = Trainer(model,
                  training_args,
                  train_dataset = tokenized_dataset["train"],
                  eval_dataset = tokenized_dataset["test"],
                  tokenizer=tokenizer,
                  compute_metrics=compute_metrics
                  )

  trainer = Trainer(model,


In [21]:
import numpy as np

unique_labels = np.unique(test_dataset['label'])

print(unique_labels)
import numpy as np

print(np.isnan(test_dataset['label']).sum())  # This will show how many NaN values are in the 'label' column



[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
0


In [22]:


trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,0.4657,0.721859,0.720347,0.720347
2,0.5241,0.591409,0.773322,0.773322
3,0.2832,0.591609,0.788795,0.788795


TrainOutput(global_step=42078, training_loss=0.5273394390194864, metrics={'train_runtime': 21766.4215, 'train_samples_per_second': 15.465, 'train_steps_per_second': 1.933, 'total_flos': 1.7593983364497408e+17, 'train_loss': 0.5273394390194864, 'epoch': 3.0})

In [23]:
torch.cuda.memory_reserved()

17521704960

In [24]:
torch.cuda.memory_allocated()

1524545536