In [3]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification, GPT2ForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
import pandas as pd
import random
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

In [4]:
def set_seed(seed: int):
    """
    Helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch`` and/or ``tf`` (if
    installed).
 
    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    if is_torch_available():
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # ^^ safe to call this function even if cuda is not available
    if is_tf_available():
        import tensorflow as tf
 
        tf.random.set_seed(seed)
 
set_seed(1)

In [6]:
train_path = './train.csv'
train_data = pd.read_csv(train_path)

test_path = './test.csv'
test_data = pd.read_csv(test_path)

train_data.event_result = train_data.event_result.astype(str)
test_data.event_result = test_data.event_result.astype(str)

train_data = train_data[train_data['Primary code '] != 'EM']
train_data = train_data[train_data['Primary code '] != 'TD']

test_data = test_data[test_data['Primary code '] != 'EM']
test_data = test_data[test_data['Primary code '] != 'TD']


In [23]:
# the model we gonna train, base uncased BERT
# check text classification models here: https://huggingface.co/models?filter=text-classification
model_name = "gpt2"
# max sequence length for each document/sentence sample
max_length = 50

In [24]:
from sklearn import preprocessing

def read_data(test_size=0.2):
  train_texts = list(train_data.event_result)
  valid_texts = list(test_data.event_result)
  le = preprocessing.LabelEncoder()
  le.fit(train_data['Primary code '])
  train_labels = le.transform(train_data['Primary code '])
  valid_labels = le.transform(test_data['Primary code '])
  # split into training & testing a return data as well as label names
  return (train_texts, valid_texts, train_labels, valid_labels), le.classes_
  
# call the function
(train_texts, valid_texts, train_labels, valid_labels), target_names = read_data()

In [26]:
target_names

array(['CE', 'CM', 'CP', 'CRF', 'SESU', 'SMC', 'SN', 'SSI'], dtype=object)

In [28]:
# load the tokenizer
from transformers import GPT2TokenizerFast
tokenizer = GPT2TokenizerFast.from_pretrained(model_name, do_lower_case=True)

tokenizer.padding_side = "left"
# Define PAD Token = EOS Token = 50256
tokenizer.pad_token = tokenizer.eos_token

model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names)).to("cuda")

model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = model.config.eos_token_id

loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/gpt2/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/16a2f78023c8dc511294f0c97b5e10fde3ef9889ad6d11ffaa2a00714e73926e.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0
loading file https://huggingface.co/gpt2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/tokenizer_con

In [29]:
# tokenize the dataset, truncate when passed `max_length`, 
# and pad with 0's when less than `max_length`
tokenizer.pad_token = tokenizer.eos_token

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=max_length)

In [30]:
class CPS_dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

# convert our tokenized data into a torch Dataset
train_dataset = CPS_dataset(train_encodings, train_labels)
valid_dataset = CPS_dataset(valid_encodings, valid_labels)

In [31]:
from sklearn.metrics import accuracy_score, classification_report, f1_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  # calculate accuracy using sklearn's function
  acc = accuracy_score(labels, preds)
  f1_micro = f1_score(labels, preds, average='micro')
  f1_macro = f1_score(labels, preds, average='macro')
  print(classification_report(labels, preds, labels= [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],  
                              target_names=target_names))
  return {
      'accuracy': acc,
      'f1-micro': f1_micro,
      'f1-macro': f1_macro
  }

In [33]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=500,               # log & save weights each logging_steps
    evaluation_strategy="steps",     # evaluate each `logging_steps`
)

using `logging_steps` to initialize `eval_steps` to 500
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [34]:
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

In [35]:
# train the model
trainer.train()

***** Running training *****
  Num examples = 4942
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3090


Step,Training Loss,Validation Loss,Accuracy,F1-micro,F1-macro
500,1.796,1.279592,0.577453,0.577453,0.318653
1000,1.1638,0.99684,0.651257,0.651257,0.497745
1500,0.9029,0.924528,0.678021,0.678021,0.560081
2000,0.7988,0.897377,0.680454,0.680454,0.557383
2500,0.6761,0.949772,0.712895,0.712895,0.578217
3000,0.5627,0.973627,0.709651,0.709651,0.585273


***** Running Evaluation *****
  Num examples = 1233
  Batch size = 20


              precision    recall  f1-score   support

          CE       0.00      0.00      0.00        73
          CM       0.00      0.00      0.00        65
          CP       0.00      0.00      0.00         8
         CRF       0.56      0.05      0.10        94
        SESU       0.47      0.57      0.51       197
         SMC       0.61      0.63      0.62       187
          SN       0.57      0.67      0.62       151
         SSI       0.61      0.82      0.70       458

   micro avg       0.58      0.58      0.58      1233
   macro avg       0.28      0.27      0.25      1233
weighted avg       0.51      0.58      0.52      1233



  .format(len(labels), len(target_names))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1233
  Batch size = 20


              precision    recall  f1-score   support

          CE       0.40      0.45      0.43        73
          CM       0.53      0.43      0.47        65
          CP       0.00      0.00      0.00         8
         CRF       0.66      0.31      0.42        94
        SESU       0.60      0.63      0.62       197
         SMC       0.87      0.59      0.70       187
          SN       0.76      0.48      0.59       151
         SSI       0.65      0.88      0.75       458

   micro avg       0.65      0.65      0.65      1233
   macro avg       0.45      0.38      0.40      1233
weighted avg       0.66      0.65      0.64      1233



  .format(len(labels), len(target_names))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1233
  Batch size = 20


              precision    recall  f1-score   support

          CE       0.61      0.52      0.56        73
          CM       0.66      0.57      0.61        65
          CP       0.00      0.00      0.00         8
         CRF       0.42      0.57      0.48        94
        SESU       0.69      0.62      0.65       197
         SMC       0.84      0.65      0.73       187
          SN       0.70      0.70      0.70       151
         SSI       0.70      0.78      0.74       458

   micro avg       0.68      0.68      0.68      1233
   macro avg       0.46      0.44      0.45      1233
weighted avg       0.69      0.68      0.68      1233



  .format(len(labels), len(target_names))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1233
  Batch size = 20


              precision    recall  f1-score   support

          CE       0.54      0.60      0.57        73
          CM       0.52      0.58      0.55        65
          CP       0.00      0.00      0.00         8
         CRF       0.52      0.47      0.49        94
        SESU       0.65      0.66      0.66       197
         SMC       0.76      0.75      0.75       187
          SN       0.70      0.70      0.70       151
         SSI       0.74      0.74      0.74       458

   micro avg       0.68      0.68      0.68      1233
   macro avg       0.44      0.45      0.45      1233
weighted avg       0.68      0.68      0.68      1233



  .format(len(labels), len(target_names))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1233
  Batch size = 20


              precision    recall  f1-score   support

          CE       0.59      0.66      0.62        73
          CM       0.67      0.54      0.60        65
          CP       0.00      0.00      0.00         8
         CRF       0.52      0.39      0.45        94
        SESU       0.71      0.67      0.69       197
         SMC       0.81      0.73      0.77       187
          SN       0.72      0.72      0.72       151
         SSI       0.73      0.83      0.78       458

   micro avg       0.71      0.71      0.71      1233
   macro avg       0.48      0.45      0.46      1233
weighted avg       0.71      0.71      0.71      1233



  .format(len(labels), len(target_names))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/checkpoint-2500
Configuration saved in ./results/checkpoint-2500/config.json
Model weights saved in ./results/checkpoint-2500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1233
  Batch size = 20


              precision    recall  f1-score   support

          CE       0.59      0.63      0.61        73
          CM       0.66      0.57      0.61        65
          CP       0.00      0.00      0.00         8
         CRF       0.46      0.55      0.50        94
        SESU       0.75      0.64      0.69       197
         SMC       0.83      0.74      0.78       187
          SN       0.71      0.73      0.72       151
         SSI       0.74      0.80      0.77       458

   micro avg       0.71      0.71      0.71      1233
   macro avg       0.47      0.47      0.47      1233
weighted avg       0.71      0.71      0.71      1233



  .format(len(labels), len(target_names))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/checkpoint-3000
Configuration saved in ./results/checkpoint-3000/config.json
Model weights saved in ./results/checkpoint-3000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results/checkpoint-2000 (score: 0.8973768949508667).


TrainOutput(global_step=3090, training_loss=0.9719520902170719, metrics={'train_runtime': 294.4452, 'train_samples_per_second': 83.921, 'train_steps_per_second': 10.494, 'total_flos': 630565673472000.0, 'train_loss': 0.9719520902170719, 'epoch': 5.0})

In [36]:
# evaluate the current model after training
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1233
  Batch size = 20


              precision    recall  f1-score   support

          CE       0.54      0.60      0.57        73
          CM       0.52      0.58      0.55        65
          CP       0.00      0.00      0.00         8
         CRF       0.52      0.47      0.49        94
        SESU       0.65      0.66      0.66       197
         SMC       0.76      0.75      0.75       187
          SN       0.70      0.70      0.70       151
         SSI       0.74      0.74      0.74       458

   micro avg       0.68      0.68      0.68      1233
   macro avg       0.44      0.45      0.45      1233
weighted avg       0.68      0.68      0.68      1233



  .format(len(labels), len(target_names))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'epoch': 5.0,
 'eval_accuracy': 0.6804541768045418,
 'eval_f1-macro': 0.5573832394245433,
 'eval_f1-micro': 0.6804541768045418,
 'eval_loss': 0.8973768949508667,
 'eval_runtime': 2.5159,
 'eval_samples_per_second': 490.084,
 'eval_steps_per_second': 24.643}

In [None]:
# saving the fine tuned model & tokenizer
model_path = "./weights/gpt2"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Configuration saved in /content/drive/Shareddrives/La(LA)LAB/Projects/Mellon CPS/Automate CPS Classification/Amin & Nishitha/Amin/gpt2/config.json
Model weights saved in /content/drive/Shareddrives/La(LA)LAB/Projects/Mellon CPS/Automate CPS Classification/Amin & Nishitha/Amin/gpt2/pytorch_model.bin
tokenizer config file saved in /content/drive/Shareddrives/La(LA)LAB/Projects/Mellon CPS/Automate CPS Classification/Amin & Nishitha/Amin/gpt2/tokenizer_config.json
Special tokens file saved in /content/drive/Shareddrives/La(LA)LAB/Projects/Mellon CPS/Automate CPS Classification/Amin & Nishitha/Amin/gpt2/special_tokens_map.json


('/content/drive/Shareddrives/La(LA)LAB/Projects/Mellon CPS/Automate CPS Classification/Amin & Nishitha/Amin/gpt2/tokenizer_config.json',
 '/content/drive/Shareddrives/La(LA)LAB/Projects/Mellon CPS/Automate CPS Classification/Amin & Nishitha/Amin/gpt2/special_tokens_map.json',
 '/content/drive/Shareddrives/La(LA)LAB/Projects/Mellon CPS/Automate CPS Classification/Amin & Nishitha/Amin/gpt2/vocab.json',
 '/content/drive/Shareddrives/La(LA)LAB/Projects/Mellon CPS/Automate CPS Classification/Amin & Nishitha/Amin/gpt2/merges.txt',
 '/content/drive/Shareddrives/La(LA)LAB/Projects/Mellon CPS/Automate CPS Classification/Amin & Nishitha/Amin/gpt2/added_tokens.json',
 '/content/drive/Shareddrives/La(LA)LAB/Projects/Mellon CPS/Automate CPS Classification/Amin & Nishitha/Amin/gpt2/tokenizer.json')

In [None]:
# reload model/tokenizer
model = GPT2ForSequenceClassification.from_pretrained(model_path, num_labels=len(target_names))
tokenizer = GPT2TokenizerFast.from_pretrained(model_path)

loading configuration file /content/drive/Shareddrives/La(LA)LAB/Projects/Mellon CPS/Automate CPS Classification/Amin & Nishitha/Amin/gpt2/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2ForSequenceClassification"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50256,
  "resid

In [None]:
def get_prediction(text):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    # perform inference to our model
    outputs = model(**inputs)
    # get output probabilities by doing softmax
    probs = outputs[0].softmax(1)
    # executing argmax function to get the candidate label
    return target_names[probs.argmax()]