# Model Controller Tutorial: GPT2 model (Regression)

> This notebook contains some examples of how to use the GPT2-based models in this NLP library

- skip_showdoc: true
- skip_exec: true

In this series, we walk through some of the capabilities of this library: single-head classification, multi-head classification, multi-label classification, and regression. If you want a more detailed tutorial, check out [this one](https://anhquan0412.github.io/that-nlp-library/model_classification_tutorial.html).

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
import os

In [None]:
# Select the GPU(s) visible to this process for training: a single device id such as "0",
# or a comma-separated list such as "0,1". This cell runs before `torch` is imported below.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [None]:
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main import *
from that_nlp_library.utils import seed_everything

In [None]:
from underthesea import text_normalize
from functools import partial
from pathlib import Path
import pandas as pd
import numpy as np
import nlpaug.augmenter.char as nac
from datasets import load_dataset
import random
from transformers import AutoTokenizer
import datasets
from datasets import Dataset
import torch

# Define the custom augmentation function

In [None]:
def nlp_aug_stochastic(x,aug=None,p=0.5):
    """Randomly augment a text, or a batch of texts, with probability `p` per text.

    Args:
        x: A single string, or a list of strings (a batch).
        aug: Augmenter exposing `augment(data)`, which accepts a string or a list
            of strings and returns a list of augmented strings (e.g. an `nlpaug`
            augmenter). If None, `x` is returned untouched.
        p: Probability that any given text gets augmented.

    Returns:
        A string when `x` is a string. Otherwise a new list with the same length
        and the same order as `x`, in which the randomly selected items are
        replaced by their augmented versions.
    """
    if aug is None:
        # Nothing to augment with: behave as the identity instead of failing at random
        return x
    if not isinstance(x, list):
        return aug.augment(x)[0] if random.random() < p else x

    # One random draw per item, in input order; remember WHICH positions were picked
    picked = [i for i in range(len(x)) if random.random() < p]
    results = list(x)
    # only perform augmentation when needed (a single batched call for all picked items)
    if picked:
        augmented = aug.augment([x[i] for i in picked])
        # Write each augmented text back to its original position so that every
        # text stays aligned with the other columns (e.g. labels) of its row
        for i, new_text in zip(picked, augmented):
            results[i] = new_text
    return results

In [None]:
aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.3)

# Create a TextDataController object

We will reuse the data and the preprocessings in [this tutorial](https://anhquan0412.github.io/that-nlp-library/text_main.html) 

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')


In [None]:
tdc = TextDataController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         metadatas=['Title','Division Name'],
                         cols_to_keep=['Title','Division Name','Review Text','Rating'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         # add "str.lower" here because nearby_aug might return uppercase character
                         val_ratio=0.2,
                         batch_size=1000,
                         seed=42,
                         num_proc=20,
                         verbose=False
                        )

Define our tokenizer for GPT2

In [None]:
_tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [None]:
_tokenizer.pad_token = _tokenizer.eos_token
_tokenizer.padding_side = 'left'

In [None]:
print(_tokenizer)
print(len(_tokenizer))

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)
50257


Process and tokenize our dataset

In [None]:
tdc.process_and_tokenize(_tokenizer,max_length=100,shuffle_trn=True)

Map (num_proc=20):   0%|          | 0/18112 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/18112 [00:00<?, ? examples/s]

Flattening the indices (num_proc=20):   0%|          | 0/18112 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/18112 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
# the regression target column "Rating" becomes `label` in both splits
for _split in ('train', 'validation'):
    tdc.main_ddict[_split] = tdc.main_ddict[_split].rename_column("Rating", "label")

In [None]:
# cast the `label` column of both splits to float64
for _split in ('train', 'validation'):
    tdc.main_ddict[_split] = tdc.main_ddict[_split].cast_column('label', datasets.Value("float64"))

Casting the dataset:   0%|          | 0/18112 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
tdc.main_ddict

DatasetDict({
    train: Dataset({
        features: ['Title', 'Review Text', 'label', 'Division Name', 'input_ids', 'attention_mask'],
        num_rows: 18112
    })
    validation: Dataset({
        features: ['Title', 'Review Text', 'label', 'Division Name', 'input_ids', 'attention_mask'],
        num_rows: 4529
    })
})

In [None]:
tdc.main_ddict['train'].features

{'Title': Value(dtype='string', id=None),
 'Review Text': Value(dtype='string', id=None),
 'label': Value(dtype='float64', id=None),
 'Division Name': Value(dtype='string', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [None]:
pd.Series(tdc.main_ddict['train']['label']).describe()

count    18112.000000
mean         4.178887
std          1.119555
min          1.000000
25%          4.000000
50%          5.000000
75%          5.000000
max          5.000000
dtype: float64

# Model Experiment: GPT2 Regression

## Define and train a vanilla GPT2 model

In [None]:
from transformers.models.gpt2.modeling_gpt2 import GPT2Model

In [None]:
from that_nlp_library.models.roberta.classifiers import ConcatHeadSimple
from that_nlp_library.model_main import *
from that_nlp_library.models.gpt2.classifiers import *
from sklearn.metrics import mean_absolute_error,mean_squared_log_error

comet_ml is installed but `COMET_API_KEY` is not set.


### Using HuggingFace model initialization

In [None]:
from transformers.models.gpt2.modeling_gpt2 import GPT2ForSequenceClassification

In [None]:
num_classes=1

In [None]:
seed_everything(42)
model = GPT2ForSequenceClassification.from_pretrained('gpt2',num_labels=num_classes)
model = model.to('cuda:0')

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.config.pad_token_id = model.config.eos_token_id

In [None]:
model.resize_token_embeddings(len(_tokenizer))

Embedding(50257, 768)

### Using the GPT2Base model

In [None]:
gpt2body = GPT2Model.from_pretrained('gpt2')

In [None]:
num_classes=1

In [None]:
# our model is more complex, so it's best to define some of its arguments
_model_kwargs={
    # overall model hyperparams
    'head_class_sizes':num_classes, # a single output, since num_classes is 1
}

model = model_init_classification(model_class = GPT2BaseForSequenceClassification,
                                  cpoint_path = 'gpt2', 
                                  output_hidden_states=False, # per-layer hidden states are not requested (no 'hidden layer concatenation' head is configured here)
                                  seed=42,
                                  body_model=gpt2body,
                                  model_kwargs = _model_kwargs)

Loading body weights. This assumes the body is the very first block of your custom architecture


In [None]:
# resize token embedding
model.body_model.resize_token_embeddings(len(_tokenizer))

Embedding(50257, 768)

### Using the GPT2Base model with a custom head to limit the output range

In [None]:
class SigmoidRange(torch.nn.Module):
    """Regression head whose outputs are squashed into the range between `low` and `high`.

    A bias-free linear layer maps features of size `config.n_embd` to
    `config.num_labels` scores; a sigmoid then rescales every score so it lies
    between `low` and `high`.
    """
    def __init__(self, config, high, low, **kwargs):
        """
        Args:
            config: Object providing `n_embd` (input feature size) and
                `num_labels` (number of outputs).
            high: Upper end of the output range.
            low: Lower end of the output range.
            **kwargs: Accepted and ignored.
        """
        super().__init__()
        self.score = torch.nn.Linear(config.n_embd, config.num_labels, bias=False)
        self.low = low
        self.high = high

    def forward(self, inp, **kwargs):
        """Return `sigmoid(score(inp))` rescaled from (0, 1) to (low, high)."""
        span = self.high - self.low
        squashed = torch.sigmoid(self.score(inp))
        return squashed * span + self.low

In [None]:
gpt2body = GPT2Model.from_pretrained('gpt2')

In [None]:
num_classes=1

In [None]:
# our model is more complex, so it's best to define some of its arguments
_model_kwargs={
    # overall model hyperparams
    'head_class_sizes':num_classes,
    'head_class':SigmoidRange, # custom head that bounds the output between `low` and `high`
    # classification head hyperparams
    'high':5, # the maximum rating
    'low': 1, # the minimum rating
}

model = model_init_classification(model_class = GPT2BaseForSequenceClassification,
                                  cpoint_path = 'gpt2', 
                                  output_hidden_states=False, # per-layer hidden states are not requested (the 'hidden layer concatenation' technique is not used here)
                                  seed=42,
                                  body_model=gpt2body,
                                  model_kwargs = _model_kwargs)

Loading body weights. This assumes the body is the very first block of your custom architecture


In [None]:
# resize token embedding
model.body_model.resize_token_embeddings(len(_tokenizer))

Embedding(50257, 768)

### Create ModelController and start training

In [None]:
metric_funcs = [mean_absolute_error,mean_squared_log_error]
controller = ModelController(model,tdc,seed=42)

And we can start training our model

In [None]:
seed_everything(42)

In [None]:
lr = 8e-5 # learning rate
bs=32 # passed to fit() as `batch_size`
wd=0.01 # passed to fit() as `weight_decay`
epochs= 3

# NOTE(review): the target column was renamed from "Rating" to "label" earlier in this
# notebook, and the recorded metric headers end in "Label" — confirm which name
# `label_names` should carry here.
controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               label_names='Rating',
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics_regression,
              )
# Epoch	Training Loss	Validation Loss	Mean Absolute Error Label	Mean Squared Log Error Label
# 1	No log	0.452389	0.533318	0.027305
# 2	0.976200	0.402699	0.445194	0.024518
# 3	0.976200	0.345889	0.409528	0.021801

Epoch,Training Loss,Validation Loss,Mean Absolute Error Label,Mean Squared Log Error Label
1,No log,0.388347,0.426821,0.02495
2,0.630500,0.347619,0.391528,0.022543
3,0.630500,0.321101,0.387717,0.020395


In [None]:
controller.trainer.model.save_pretrained('./sample_weights/my_model')

## Predict validation

In [None]:
# Re-create the model from the checkpoint directory written by `save_pretrained`,
# reusing the same `_model_kwargs` (head class and output range) as for training.
# NOTE(review): output_hidden_states is True here but was False when the model was built
# for training — confirm whether the flag matters for GPT2BaseForSequenceClassification.
trained_model = model_init_classification(model_class = GPT2BaseForSequenceClassification,
                                          cpoint_path = Path('./sample_weights/my_model'), 
                                          output_hidden_states=True,
                                          seed=42,
                                          model_kwargs = _model_kwargs)

# Wrap the reloaded model together with the same data controller for prediction
controller = ModelController(trained_model,tdc,seed=42)

In [None]:
df_val = controller.predict_ddict_regression(ds_type='validation')

-------------------- Start making predictions --------------------


Map:   0%|          | 0/4529 [00:00<?, ? examples/s]

Map:   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
df_val = df_val.to_pandas()
df_val.head()

Unnamed: 0,Title,Review Text,label,Division Name,input_ids,attention_mask,pred
0,,general . . this picture doesn't do the skirt ...,5.0,general,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4.867458
1,,"general . . easy to wear ! cute , comfy ... wi...",4.0,general,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4.789146
2,,"general . . nice sweater , just did not look g...",3.0,general,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.90288
3,nice cropped jacket,general . nice cropped jacket . this jacket wa...,5.0,general,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4.411712
4,great dress!,general petite . great dress ! . i wasn't plan...,5.0,general petite,"[24622, 4273, 578, 764, 1049, 6576, 5145, 764,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4.971798


You can compute the metrics yourself to check that they match the results of the last training epoch above

In [None]:
mean_absolute_error(df_val['label'],df_val['pred'])

0.38774998065887845

In [None]:
mean_squared_log_error(df_val['label'],df_val['pred'])

0.020394463480786145

## Predict Test set

We will go through details on how to make a prediction on a completely new and raw dataset using our trained model. For now, let's reuse the sample csv and pretend it's our test set

In [None]:
df_test = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig').sample(frac=0.2,random_state=1)

# save the label, as we will calculate some metrics later. Rows with a missing 'Review Text'
# are excluded, because the prediction pipeline filters those rows out (do_filtering=True);
# this keeps the saved labels aligned one-to-one with the predicted rows
true_labels = df_test.loc[df_test['Review Text'].notna(),'Rating'].values

# drop the label (you don't need to, but this is necessary to simulate an actual test set)
# NOTE(review): the recorded run of the next cell raised "Test set does not have these columns
# required for preprocessings: {'Rating'}", which looks tied to 'Rating' being listed in
# `cols_to_keep` of the TextDataController — confirm before relying on this drop.
df_test = df_test.drop(columns='Rating')

In [None]:
_test_dset = Dataset.from_pandas(df_test)
_test_dset_predicted = controller.predict_raw_dset(_test_dset,
                                                   is_regression=True,
                                                   do_filtering=True, # since we have some text filtering in the processing
                                                  )

ValueError: Test set does not have these columns required for preprocessings: {'Rating'}

In [None]:
set(controller.data_store.label_names)

TypeError: 'NoneType' object is not iterable

In [None]:
df_test_predicted = _test_dset_predicted.to_pandas()

In [None]:
df_test_predicted.head()

Unnamed: 0,Title,Review Text,Division Name,input_ids,attention_mask,pred_Department Name,pred_prob_Department Name
0,perfect for work and play,general . perfect for work and play . this shi...,general,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Tops,0.999515
1,,general petite . . i don't know why i had the ...,general petite,"[24622, 4273, 578, 764, 764, 1312, 836, 470, 7...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Bottoms,0.990057
2,great pants,general petite . great pants . thes e cords ar...,general petite,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bottoms,0.996227
3,surprisingly comfy for a button down,general petite . surprisingly comfy for a butt...,general petite,"[24622, 4273, 578, 764, 12362, 401, 24928, 329...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",Tops,0.922326
4,short and small,general petite . short and small . the shirt i...,general petite,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Tops,0.998449


Let's quickly check the f1 score to make sure everything works correctly

In [None]:
# NOTE(review): this cell looks carried over from a classification tutorial — `f1_score` is
# not explicitly imported in this notebook and 'pred_Department Name' is a class-label column.
# For this regression model, a metric such as mean_absolute_error on the predicted rating
# is presumably what is intended — confirm the prediction column name before changing it.
f1_score(true_labels,df_test_predicted['pred_Department Name'],average='macro')

0.7524522904910365

Predict top k results

In [None]:
_test_dset = Dataset.from_pandas(df_test)
_test_dset_predicted = controller.predict_raw_dset(_test_dset,
                                                   do_filtering=True,
                                                   topk=3
                                                  )

Filter (num_proc=20):   0%|          | 0/4692 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/4528 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/4528 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/4528 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/4528 [00:00<?, ? examples/s]

-------------------- Start making predictions --------------------


Map:   0%|          | 0/4528 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4528 [00:00<?, ? examples/s]

In [None]:
df_test_predicted = _test_dset_predicted.to_pandas()

df_test_predicted.head()

Unnamed: 0,Title,Review Text,Division Name,input_ids,attention_mask,pred_Department Name,pred_prob_Department Name
0,perfect for work and play,general . perfect for work and play . this shi...,general,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Tops, Trend, Dresses]","[0.99951506, 0.00013868634, 0.00012767651]"
1,,general petite . . i don't know why i had the ...,general petite,"[24622, 4273, 578, 764, 764, 1312, 836, 470, 7...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[Bottoms, Intimate, Dresses]","[0.9900567, 0.004073128, 0.003412127]"
2,great pants,general petite . great pants . thes e cords ar...,general petite,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Bottoms, Intimate, Trend]","[0.99622667, 0.0036341597, 0.000104616636]"
3,surprisingly comfy for a button down,general petite . surprisingly comfy for a butt...,general petite,"[24622, 4273, 578, 764, 12362, 401, 24928, 329...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[Tops, Dresses, Jackets]","[0.92232573, 0.067089744, 0.0046479926]"
4,short and small,general petite . short and small . the shirt i...,general petite,"[50256, 50256, 50256, 50256, 50256, 50256, 502...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Tops, Jackets, Trend]","[0.99844944, 0.0009410686, 0.0002554651]"


In [None]:
# Since we have some metadatas (Title and Division Name), we need to define a dictionary containing those values
raw_content={'Review Text': 'This shirt is so comfortable I love it!',
             'Title': 'Great shirt',
             'Division Name': 'general'}

In [None]:
controller.data_store.num_proc=1

In [None]:
df_result = controller.predict_raw_text(raw_content,topk=3)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

-------------------- Start making predictions --------------------


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [None]:
df_result

Unnamed: 0,Review Text,Title,Division Name,input_ids,attention_mask,pred_Department Name,pred_prob_Department Name
0,general . great shirt . this shirt is so comfo...,great shirt,general,"[24622, 764, 1049, 10147, 764, 428, 10147, 318...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[Tops, Jackets, Dresses]","[0.99976474, 8.1968574e-05, 5.9176444e-05]"
