# Model Controller Tutorial: GPT2 model (Custom Single Head)

> This notebook contains some examples of how to use the GPT2-based models in this NLP library

- skip_showdoc: true
- skip_exec: true

In this series, we walk through some of the capabilities of this library: single-head classification, multi-head classification, multi-label classification, and regression. If you want a more detailed tutorial, check out [this one](https://anhquan0412.github.io/that-nlp-library/model_classification_tutorial.html).

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
import os

In [None]:
# Specify the GPU (or a list of GPUs) to use for training
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [None]:
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main import *
from that_nlp_library.utils import seed_everything

In [None]:
from underthesea import text_normalize
from functools import partial
from pathlib import Path
import pandas as pd
import numpy as np
import nlpaug.augmenter.char as nac
from datasets import load_dataset
import random
from transformers import AutoTokenizer
from datasets import Dataset

# Define the custom augmentation function

In [None]:
def nlp_aug_stochastic(x,aug=None,p=0.5):
    """Randomly augment a text, or a random subset of a list of texts.

    Args:
        x: a single text, or a list of texts.
        aug: augmenter object exposing `augment(data)`, which must return a list
            (one augmented text per input text).
        p: probability that each individual text gets augmented.

    Returns:
        The (possibly augmented) text when `x` is not a list; otherwise a new list
        with the same length and the same order as `x`, in which the selected items
        are replaced by their augmented versions.
    """
    if not isinstance(x,list):
        if random.random()<p: return aug.augment(x)[0]
        return x
    # one random draw per item, in input order
    picked = [i for i in range(len(x)) if random.random()<p]
    results = list(x)
    # only perform augmentation when needed, and in a single batched call
    if picked:
        augmented = aug.augment([x[i] for i in picked])
        # write each augmented text back to its original position, so the output
        # stays aligned with the input items instead of being reordered
        for i,new_text in zip(picked,augmented):
            results[i] = new_text
    return results

In [None]:
# character-level keyboard augmenter from nlpaug (imported above as `nac`)
aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
# bind the augmenter and an augmentation probability of 0.3 to nlp_aug_stochastic
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.3)

# Create a TextDataController object

We will reuse the data and the preprocessings in [this tutorial](https://anhquan0412.github.io/that-nlp-library/text_main.html) 

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')


In [None]:
# 'Review Text' is the input text and 'Department Name' is the (single) classification label
tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         # filtering rules: the text and the label must not be None
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         # "str.lower" is added after the augmentation because nearby_aug_func might return uppercase characters
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=0.2,
                         batch_size=1000,
                         seed=42,
                         num_proc=20,
                         verbose=False
                        )

Define our tokenizer for GPT2

In [None]:
_tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [None]:
# reuse the EOS token as the padding token, and pad on the left side of each sequence
_tokenizer.pad_token = _tokenizer.eos_token
_tokenizer.padding_side = 'left'

In [None]:
print(_tokenizer)
print(len(_tokenizer))

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)
50257


Process and tokenize our dataset

In [None]:
tdc.process_and_tokenize(_tokenizer,max_length=100,shuffle_trn=True)

Map (num_proc=20):   0%|          | 0/18102 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/18102 [00:00<?, ? examples/s]

Flattening the indices (num_proc=20):   0%|          | 0/18102 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/18102 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/4526 [00:00<?, ? examples/s]

In [None]:
tdc.main_ddict

DatasetDict({
    train: Dataset({
        features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
        num_rows: 18102
    })
    validation: Dataset({
        features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
        num_rows: 4526
    })
})

# Model Experiment: GPT2 Single-Head Classification

## Define and train a vanilla GPT2 model

In [None]:
from transformers.models.gpt2.modeling_gpt2 import GPT2Model

In [None]:
from that_nlp_library.models.roberta.classifiers import ConcatHeadSimple
from that_nlp_library.model_main import *
from that_nlp_library.models.gpt2.classifiers import *
from sklearn.metrics import f1_score, accuracy_score

comet_ml is installed but `COMET_API_KEY` is not set.


### Using HuggingFace model initialization

In [None]:
from transformers.models.gpt2.modeling_gpt2 import GPT2ForSequenceClassification

In [None]:
num_classes = len(tdc.label_lists[0])
num_classes

6

In [None]:
seed_everything(42)
model = GPT2ForSequenceClassification.from_pretrained('gpt2',num_labels=num_classes)
model = model.to('cuda:0')

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.config.pad_token_id = model.config.eos_token_id

In [None]:
model.resize_token_embeddings(len(_tokenizer))

Embedding(50257, 768)

In [None]:
metric_funcs = [partial(f1_score,average='macro'),accuracy_score]
controller = ModelController(model,tdc,seed=42)

And we can start training our model

In [None]:
lr = 8e-5
bs=32
wd=0.01
epochs= 3

controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
              )

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1 Score Department name,Accuracy Score Department name
1,No log,0.370936,0.725267,0.889085
2,0.744100,0.326659,0.727912,0.891516
3,0.744100,0.316274,0.731764,0.896376


### Using the GPT2Base model (designed for not only single-head but multi-head, multi-label ...)

In [None]:
gpt2body = GPT2Model.from_pretrained('gpt2')

In [None]:
# our model is more complex, so it's best to define some of its arguments
_model_kwargs={
    # overall model hyperparams
    'head_class_sizes':num_classes,
    # classification head hyperparams
    'classifier_dropout':0.1 
}

model = model_init_classification(model_class = GPT2BaseForSequenceClassification,
                                  cpoint_path = 'gpt2', 
                                  output_hidden_states=False, # since we are not using the 'hidden layer concatenation' technique
                                  seed=42,
                                  body_model=gpt2body,
                                  model_kwargs = _model_kwargs)

Loading body weights. This assumes the body is the very first block of your custom architecture


In [None]:
# resize token embedding
model.body_model.resize_token_embeddings(len(_tokenizer))

Embedding(50257, 768)

 Create ModelController and start training

In [None]:
metric_funcs = [partial(f1_score,average='macro'),accuracy_score]
controller = ModelController(model,tdc,seed=42)

And we can start training our model

In [None]:
lr = 8e-5
bs=32
wd=0.01
epochs= 3

controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
              )



Epoch,Training Loss,Validation Loss,F1 Score Department name,Accuracy Score Department name
1,No log,0.302667,0.744811,0.915157
2,0.735900,0.272006,0.748732,0.917366
3,0.735900,0.265745,0.74939,0.919134


## Make predictions

In [None]:
df_val = controller.predict_ddict(ds_type='validation')

-------------------- Start making predictions --------------------


Map:   0%|          | 0/4526 [00:00<?, ? examples/s]

Map:   0%|          | 0/4526 [00:00<?, ? examples/s]

In [None]:
df_val = df_val.to_pandas()
df_val.head()

Unnamed: 0,Title,Review Text,Division Name,Department Name,label,input_ids,attention_mask,pred_Department Name,pred_prob_Department Name
0,,general petite . . such a fun jacket ! great t...,general petite,Intimate,2,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Jackets,0.894356
1,simple and elegant,general petite . simple and elegant . i though...,general petite,Tops,4,"[24622, 4273, 578, 764, 2829, 290, 19992, 764,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Tops,0.999345
2,retro and pretty,general . retro and pretty . this top has a bi...,general,Tops,4,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Tops,0.999754
3,summer/fall wear,general petite . summer / fall wear . i first ...,general petite,Dresses,1,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...",Dresses,0.962247
4,perfect except slip,general petite . perfect except slip . this is...,general petite,Dresses,1,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Dresses,0.993872


You can compute the metric yourself to check that it matches the result of the last training epoch above

In [None]:
f1_score(df_val['Department Name'],df_val['pred_Department Name'],average='macro')

0.7494959154180147

# Model Experiment: GPT2 Single-Head Classification (with hidden layer concatenation)

## Define and train a custom GPT2 model

In [None]:
from transformers.models.gpt2.modeling_gpt2 import GPT2Model

In [None]:
from that_nlp_library.models.roberta.classifiers import ConcatHeadSimple
from that_nlp_library.model_main import *
from that_nlp_library.models.gpt2.classifiers import *
from sklearn.metrics import f1_score, accuracy_score

comet_ml is installed but `COMET_API_KEY` is not set.


In [None]:
num_classes = len(tdc.label_lists[0])
num_classes

6

In [None]:
gpt2body = GPT2Model.from_pretrained('gpt2')

Then we can define a classification head. One trick we can use to boost the performance of our entire model is to concatenate the outputs of the last tokens from the last four layers of the pre-trained GPT2 model (an improvised approach from this source: https://ieeexplore.ieee.org/document/9335912). We have already defined such a custom head (`ConcatHeadSimple`), and the necessary architecture to make it work (`GPT2HiddenStateConcatForSequenceClassification`)

In [None]:
# our model is more complex, so it's best to define some of its arguments
_model_kwargs={
    # overall model hyperparams
    'head_class_sizes':num_classes,
    'head_class': ConcatHeadSimple,
    # classification head hyperparams
    'layer2concat':2, # you can change the number of layers to concat (default is 4, based on the paper)
    'classifier_dropout':0.1 
}

model = model_init_classification(model_class = GPT2HiddenStateConcatForSequenceClassification,
                                  cpoint_path = 'gpt2', 
                                  output_hidden_states=True, # since we are using the 'hidden layer concatenation' technique
                                  seed=42,
                                  body_model=gpt2body,
                                  model_kwargs = _model_kwargs)

Loading body weights. This assumes the body is the very first block of your custom architecture


In [None]:
# resize token embedding
model.body_model.resize_token_embeddings(len(_tokenizer))

Embedding(50257, 768)

In [None]:
metric_funcs = [partial(f1_score,average='macro'),accuracy_score]
controller = ModelController(model,tdc,seed=42)

And we can start training our model

In [None]:
seed_everything(42)

In [None]:
# training hyperparameters
lr = 8e-5 # learning rate
bs=32 # passed to fit() as `batch_size`
wd=0.01 # passed to fit() as `weight_decay`
epochs= 3

controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
              )

# NOTE(review): the table below looks like it was pasted from an earlier run (only 2 epochs,
# and the numbers differ from this cell's actual output) -- confirm it is still relevant or remove it.
# Epoch | Training Loss | Validation Loss | F1 Score Department name | Accuracy Score Department name
# 1 | No log | 0.301476 | 0.746599 | 0.914494
# 2 | 0.400300 | 0.263080 | 0.749670 | 0.920901

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1 Score Department name,Accuracy Score Department name
1,No log,0.417906,0.73355,0.909191
2,0.843300,0.312693,0.733184,0.904772
3,0.843300,0.291497,0.741076,0.913389


In [None]:
controller.trainer.model.save_pretrained('./sample_weights/my_model')

## Make predictions

### Load trained model

In [None]:
_model_kwargs

{'head_class_sizes': 6,
 'head_class': that_nlp_library.models.roberta.classifiers.ConcatHeadSimple,
 'layer2concat': 2,
 'classifier_dropout': 0.1}

In [None]:
trained_model = model_init_classification(model_class = GPT2HiddenStateConcatForSequenceClassification,
                                          cpoint_path = Path('./sample_weights/my_model'), 
                                          output_hidden_states=True,
                                          seed=42,
                                          model_kwargs = _model_kwargs)

controller = ModelController(trained_model,tdc,seed=42)

### Predict Train/Validation set

In [None]:
df_val = controller.predict_ddict(ds_type='validation')

-------------------- Start making predictions --------------------


Map:   0%|          | 0/4526 [00:00<?, ? examples/s]

Map:   0%|          | 0/4526 [00:00<?, ? examples/s]

In [None]:
df_val = df_val.to_pandas()
df_val.head()

Unnamed: 0,Title,Review Text,Division Name,Department Name,label,input_ids,attention_mask,pred_Department Name,pred_prob_Department Name
0,,general petite . . such a fun jacket ! great t...,general petite,Intimate,2,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Jackets,0.953084
1,simple and elegant,general petite . simple and elegant . i though...,general petite,Tops,4,"[24622, 4273, 578, 764, 2829, 290, 19992, 764,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Tops,0.994079
2,retro and pretty,general . retro and pretty . this top has a bi...,general,Tops,4,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Tops,0.999968
3,summer/fall wear,general petite . summer / fall wear . i first ...,general petite,Dresses,1,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...",Dresses,0.819742
4,perfect except slip,general petite . perfect except slip . this is...,general petite,Dresses,1,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Dresses,0.997397


You can compute the metric yourself to check that it matches the result of the last training epoch above

In [None]:
f1_score(df_val['Department Name'],df_val['pred_Department Name'],average='macro')

0.7410761794742524

### Predict Test set

We will go through details on how to make a prediction on a completely new and raw dataset using our trained model. For now, let's reuse the sample csv and pretend it's our test set

In [None]:
# Take a reproducible 20% sample of the sample csv, discard the rows whose label
# ('Department Name') is NaN, and renumber the index from 0
df_test = (
    pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig')
    .sample(frac=0.2,random_state=1)
    .dropna(subset=['Department Name'])
    .reset_index(drop=True)
)

# Keep the labels for computing metrics later. Only the rows with a non-NaN 'Review Text'
# are kept, since the test set will go through the same text filtering during prediction
true_labels = df_test.loc[df_test['Review Text'].notna(),'Department Name'].values

# Remove the label column (not required, but it simulates an actual unlabeled test set)
df_test = df_test.drop(columns=['Department Name'])

In [None]:
_test_dset = Dataset.from_pandas(df_test)
_test_dset_predicted = controller.predict_raw_dset(_test_dset,
                                                   do_filtering=True, # since we have some text filtering in the processing
                                                  )

Filter (num_proc=20):   0%|          | 0/4692 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/4528 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/4528 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/4528 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/4528 [00:00<?, ? examples/s]

-------------------- Start making predictions --------------------


Map:   0%|          | 0/4528 [00:00<?, ? examples/s]

Map:   0%|          | 0/4528 [00:00<?, ? examples/s]

In [None]:
df_test_predicted = _test_dset_predicted.to_pandas()

In [None]:
df_test_predicted.head()

Unnamed: 0,Title,Review Text,Division Name,input_ids,attention_mask,pred_Department Name,pred_prob_Department Name
0,perfect for work and play,general . perfect for work and play . this shi...,general,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Tops,0.999918
1,,general petite . . i don't know why i had the ...,general petite,"[24622, 4273, 578, 764, 764, 1312, 836, 470, 7...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Bottoms,0.992391
2,great pants,general petite . great pants . thes e cords ar...,general petite,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bottoms,0.998326
3,surprisingly comfy for a button down,general petite . surprisingly comfy for a butt...,general petite,"[24622, 4273, 578, 764, 12362, 401, 24928, 329...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Tops,0.944762
4,short and small,general petite . short and small . the shirt i...,general petite,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Tops,0.999826


Let's quickly check the f1 score to make sure everything works correctly

In [None]:
f1_score(true_labels,df_test_predicted['pred_Department Name'],average='macro')

0.7507166641686256

Predict top k results

In [None]:
_test_dset = Dataset.from_pandas(df_test)
_test_dset_predicted = controller.predict_raw_dset(_test_dset,
                                                   do_filtering=True,
                                                   topk=3
                                                  )

Filter (num_proc=20):   0%|          | 0/4692 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/4528 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/4528 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/4528 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/4528 [00:00<?, ? examples/s]

-------------------- Start making predictions --------------------


Map:   0%|          | 0/4528 [00:00<?, ? examples/s]

Map:   0%|          | 0/4528 [00:00<?, ? examples/s]

In [None]:
df_test_predicted = _test_dset_predicted.to_pandas()

df_test_predicted.head()

Unnamed: 0,Title,Review Text,Division Name,input_ids,attention_mask,pred_Department Name,pred_prob_Department Name
0,perfect for work and play,general . perfect for work and play . this shi...,general,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Tops, Trend, Dresses]","[0.9999181, 3.341722e-05, 2.6476195e-05]"
1,,general petite . . i don't know why i had the ...,general petite,"[24622, 4273, 578, 764, 764, 1312, 836, 470, 7...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[Bottoms, Intimate, Dresses]","[0.99239117, 0.0037502914, 0.0020853707]"
2,great pants,general petite . great pants . thes e cords ar...,general petite,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Bottoms, Intimate, Trend]","[0.99832577, 0.0016587807, 1.4020866e-05]"
3,surprisingly comfy for a button down,general petite . surprisingly comfy for a butt...,general petite,"[24622, 4273, 578, 764, 12362, 401, 24928, 329...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[Tops, Dresses, Intimate]","[0.944762, 0.038514145, 0.006670953]"
4,short and small,general petite . short and small . the shirt i...,general petite,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Tops, Jackets, Trend]","[0.9998259, 6.142132e-05, 5.0150942e-05]"


In [None]:
# Since we have some metadatas (Title and Division Name), we need to define a dictionary containing those values
raw_content={'Review Text': 'This shirt is so comfortable I love it!',
             'Title': 'Great shirt',
             'Division Name': 'general'}

In [None]:
controller.data_store.num_proc=1

In [None]:
df_result = controller.predict_raw_text(raw_content,topk=3)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

-------------------- Start making predictions --------------------


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [None]:
df_result

Unnamed: 0,Review Text,Title,Division Name,input_ids,attention_mask,pred_Department Name,pred_prob_Department Name
0,general . great shirt . this shirt is so comfo...,great shirt,general,"[24622, 764, 1049, 10147, 764, 428, 10147, 318...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[Tops, Dresses, Trend]","[0.9969104, 0.001380199, 0.0009443495]"
