# Model Controller Tutorial: Training a GPT2 Language Model

> This notebook walks through an end-to-end process: preprocessing and tokenizing your text, then building language models based on the GPT architecture

- skip_showdoc: true
- skip_exec: true

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
import os

In [None]:
# Select which GPU(s) are visible for training: a single device id, or a
# comma-separated list such as "0,1". Set here, before the training libraries are imported.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [None]:
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main_lm import *
from that_nlp_library.utils import seed_everything
from that_nlp_library.model_lm_main import *
from that_nlp_library.utils import resize_model_embeddings

In [None]:
from underthesea import text_normalize
from functools import partial
from pathlib import Path
from transformers import AutoTokenizer, AutoConfig, AutoModelForMaskedLM, AutoModelForCausalLM
from datasets import load_dataset
import pandas as pd
import numpy as np
from transformers import DataCollatorForLanguageModeling
from tokenizers import processors

comet_ml is installed but `COMET_API_KEY` is not set.


# Train a GPT2 Language Model From Scratch (with token concatenation)

This is the original way GPT2 is trained

## Create a TextDataLMController object

We will reuse the data and the preprocessings in [this tutorial](https://anhquan0412.github.io/that-nlp-library/text_main_lm.html) 

In [None]:
# Load the reviews CSV as a single 'train' split.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'],
    split='train',
)

# Wrap the raw dataset in the library's language-model data controller.
tdc = TextDataLMController(
    dset,
    main_text='Review Text',
    # predicate applied to 'Review Text': the value must not be None
    filter_dict={'Review Text': lambda text: text is not None},
    metadatas='Title',
    # applied in order: normalize the text, then lowercase it
    content_transformations=[text_normalize, str.lower],
    seed=42,
    verbose=False,
)

Define our tokenizer for GPT2

In [None]:
_tokenizer = AutoTokenizer.from_pretrained('gpt2')

If you want to concatenate tokens across sentences and still let your causal LM tell the sentences apart, you can add a special token to separate them, as follows:

In [None]:
# Append the EOS token to every tokenized sentence so that sentence boundaries
# remain visible once sentences are concatenated.
# `backend_tokenizer` is the public accessor for the underlying `tokenizers.Tokenizer`
# object, used here instead of reaching into the private `_tokenizer._tokenizer` attribute.
_tokenizer.backend_tokenizer.post_processor = processors.TemplateProcessing(
    single=f"$A {_tokenizer.eos_token}",
    special_tokens=[(_tokenizer.eos_token, _tokenizer.eos_token_id)],
)
# Reuse EOS as the padding token.
_tokenizer.pad_token = _tokenizer.eos_token

In [None]:
_tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

Process and tokenize our dataset

In [None]:
# Maximum length (in tokens) passed to the tokenization step and reused for the model config.
block_size = 112

# line_by_line=False: this section trains with token concatenation.
tdc.process_and_tokenize(
    _tokenizer,
    line_by_line=False,
    max_length=block_size,
)

In [None]:
tdc.main_ddict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 12741
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 3235
    })
})

And set the data collator

In [None]:
tdc.set_data_collator(is_mlm=False)

## Initialize and train GPT2 Model from scratch

In [None]:
len(_tokenizer)

50257

In [None]:
_tokenizer.bos_token_id,_tokenizer.eos_token_id

(50256, 50256)

In [None]:
# Start from the stock 'gpt2' configuration and override a few fields.
_config = AutoConfig.from_pretrained(
    'gpt2',
    n_ctx=block_size,
    # just in case: keep vocabulary size and special-token ids in sync with the tokenizer
    vocab_size=len(_tokenizer),
    bos_token_id=_tokenizer.bos_token_id,
    eos_token_id=_tokenizer.eos_token_id,
)
# NOTE(review): the config printed by this cell still reports n_positions=1024, so
# n_ctx alone does not appear to change the positional-embedding size. Confirm
# whether n_positions should also be overridden when training from scratch.
_config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 112,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.31.0",
  "use_cache": true,
  "vocab_size": 50257
}

In [None]:
# Build a causal LM from the config only; no checkpoint is loaded.
_model = language_model_init(
    AutoModelForCausalLM,
    config=_config,
    cpoint_path=None,  # leave this as None to get a non-pretrained model
    seed=42,
)

Initiate a new language model from scratch
Total parameters: 124439808
Total trainable parameters: 124439808


In [None]:
_model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
_model = resize_model_embeddings(_model,_tokenizer)

Create a model controller

In [None]:
controller = ModelLMController(_model,data_store=tdc,seed=42)

And we can start training our model

In [None]:
# Training hyper-parameters
lr = 1e-4            # learning rate
bs = 32              # batch size
wd = 0.01            # weight decay
epochs = 4
warmup_ratio = 0.25

# Run training; no checkpoint is saved during the run.
controller.fit(
    epochs,
    lr,
    batch_size=bs,
    weight_decay=wd,
    warmup_ratio=warmup_ratio,
    save_checkpoint=False,
)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,4.697953,0.200376
2,5.551800,4.097337,0.24722
2,5.551800,3.833579,0.275745
3,3.821300,3.779644,0.282777


Perplexity on validation set: 43.800


In [None]:
controller.trainer.model.save_pretrained('./sample_weights/lm_model')

## Generate text using model

In [None]:
# Appears to illustrate the lowercased "title . review text" prompt format seen in the
# generated samples. These two variables are not referenced by the later cells shown
# in this notebook (generation is driven by `inp`).
sentence1 = 'major problem . this is by far one of the '
sentence2 = 'flattering . this is by far one of the '

In [None]:
# Reload the weights that were just written with `save_pretrained`.
trained_model = language_model_init(
    AutoModelForCausalLM,
    cpoint_path='./sample_weights/lm_model',
)

Total parameters: 124439808
Total trainable parameters: 124439808


In [None]:
controller2 = ModelLMController(trained_model,data_store=tdc,seed=42)

You can input several raw texts

In [None]:
# Raw inputs keyed by the same column names given to TextDataLMController
# (`metadatas='Title'`, `main_text='Review Text'`); one entry per prompt.
inp = {
    'Title': ['Major Problem', 'Flattering'],
    'Review Text': [
        "This is by far one of the worst ",
        "This is by far one of the best ",
    ],
}

In [None]:
# reference for the keyword arguments: 
# https://huggingface.co/docs/transformers/v4.33.2/en/main_classes/text_generation#transformers.GenerationMixin.generate

In [None]:
# Generate 3 sampled continuations (up to 50 new tokens each) per prompt and print them.
controller2.predict_raw_text(
    inp,
    print_result=True,
    # huggingface text generation kwargs:
    num_return_sequences=3,
    max_new_tokens=50,
    num_beams=1,
    do_sample=True,
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


>>> major problem . this is by far one of the worst of its shape. i ordered the tuck in the store and loved it but it is gorgeous and so beautiful - the fabric falls nicely. the material is flattering and the sleeves are comfortable. the color and fit is true to size, like a dress
>>> major problem . this is by far one of the worst in the picture. the colors are so cute i ordered a size medium, but went back for a size 6. i like the top. it was a little too big at least fit. i will wear a size 2 top, but in the shoulder
>>> major problem . this is by far one of the worst color one of those of the blue blue. the design is soft and warm - very soft! it's the perfect summer piece. very pretty, too. i am 5'4 ", but i am 5'4 p / and it fits perfectly
--------------------
>>> flattering . this is by far one of the best! i am 5'9 ", 34 c bust and 34 c. it fits just beautifully. no stretchy fabric is very good quality but not like i expected and the skirt hit above my waist. i do have to wear 

In [None]:
# Same generation settings, but with print_result=False the results are shown
# as the cell's return value instead of being printed.
controller2.predict_raw_text(
    inp,
    print_result=False,
    # huggingface text generation kwargs:
    num_return_sequences=3,
    max_new_tokens=50,
    num_beams=1,
    do_sample=True,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[[{'generated_text': "major problem . this is by far one of the worst i got it in the store yesterday and i have it for my wardrobe! my husband can't wear this dress if it had been a nude color? but i wanted to love this and the cut ( i'm short arms ). you could dress it"},
  {'generated_text': 'major problem . this is by far one of the worst color. highly recommend. perfect piece for summer or spring and for fall days. but i can wear it to a tunic underneath. not to make it a flattering shirt, but this runs a little large. i am 5\'10 " and usually'},
  {'generated_text': 'major problem . this is by far one of the worst size small to be loose on me. the material is very boxy and the cut of the dress, which i will get to go back. the cut of this to a little more than the picture, which makes you feel special on.. i'}],
 [{'generated_text': 'flattering . this is by far one of the best purchase that is that the slip is light and looks great with a cami underneath. super cute all. this top

# Finetune GPT2 Language Model (with token concatenation)

## Create a TextDataLMController object

We will reuse the data and the preprocessings in [this tutorial](https://anhquan0412.github.io/that-nlp-library/text_main_lm.html) 

In [None]:
# Load the reviews CSV as a single 'train' split.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'],
    split='train',
)

# Fresh data controller for the fine-tuning run.
tdc = TextDataLMController(
    dset,
    main_text='Review Text',
    # predicate applied to 'Review Text': the value must not be None
    filter_dict={'Review Text': lambda text: text is not None},
    metadatas='Title',
    # applied in order: normalize the text, then lowercase it
    content_transformations=[text_normalize, str.lower],
    seed=42,
    verbose=False,
)

Define our tokenizer for GPT2

In [None]:
_tokenizer = AutoTokenizer.from_pretrained('gpt2')

If you want to concatenate tokens across sentences and still let your causal LM tell the sentences apart, you can add a special token to separate them, as follows:

In [None]:
# Append the EOS token to every tokenized sentence so that sentence boundaries
# remain visible once sentences are concatenated.
# `backend_tokenizer` is the public accessor for the underlying `tokenizers.Tokenizer`
# object, used here instead of reaching into the private `_tokenizer._tokenizer` attribute.
_tokenizer.backend_tokenizer.post_processor = processors.TemplateProcessing(
    single=f"$A {_tokenizer.eos_token}",
    special_tokens=[(_tokenizer.eos_token, _tokenizer.eos_token_id)],
)
# Reuse EOS as the padding token.
_tokenizer.pad_token = _tokenizer.eos_token

In [None]:
_tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

Process and tokenize our dataset

In [None]:
# Maximum length (in tokens) passed to the tokenization step and reused for the model config.
block_size = 112

# line_by_line=False: this section fine-tunes with token concatenation.
tdc.process_and_tokenize(
    _tokenizer,
    line_by_line=False,
    max_length=block_size,
)

In [None]:
tdc.main_ddict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 12741
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 3235
    })
})

And set the data collator

In [None]:
tdc.set_data_collator(is_mlm=False)

## Initialize and train GPT2 Model

In [None]:
len(_tokenizer)

50257

In [None]:
_tokenizer.bos_token_id,_tokenizer.eos_token_id

(50256, 50256)

In [None]:
# Start from the stock 'gpt2' configuration and override a few fields.
_config = AutoConfig.from_pretrained(
    'gpt2',
    n_ctx=block_size,
    # just in case: keep vocabulary size and special-token ids in sync with the tokenizer
    vocab_size=len(_tokenizer),
    bos_token_id=_tokenizer.bos_token_id,
    eos_token_id=_tokenizer.eos_token_id,
)
# NOTE(review): the config printed by this cell keeps n_positions=1024 even though
# n_ctx is set to block_size. Confirm that n_ctx has the intended effect.
_config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 112,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.31.0",
  "use_cache": true,
  "vocab_size": 50257
}

In [None]:
# Build the causal LM from the 'gpt2' checkpoint (fine-tuning rather than from scratch).
_model = language_model_init(
    AutoModelForCausalLM,
    config=_config,
    cpoint_path='gpt2',
    seed=42,
)

Total parameters: 124439808
Total trainable parameters: 124439808


In [None]:
_model = resize_model_embeddings(_model,_tokenizer)

Create a model controller

In [None]:
controller = ModelLMController(_model,data_store=tdc,seed=42)

And we can start training our model

In [None]:
# Training hyper-parameters
lr = 1e-4            # learning rate
bs = 32              # batch size
wd = 0.01            # weight decay
epochs = 4
warmup_ratio = 0.25

# Run fine-tuning; no checkpoint is saved during the run.
controller.fit(
    epochs,
    lr,
    batch_size=bs,
    weight_decay=wd,
    warmup_ratio=warmup_ratio,
    save_checkpoint=False,
)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,3.113538,0.351026
2,3.303700,2.983198,0.365869
2,3.303700,2.939725,0.370865
3,2.886300,2.934201,0.37126


Perplexity on validation set: 18.806


In [None]:
controller.trainer.model.save_pretrained('./sample_weights/lm_model')

## Generate text using model

In [None]:
# Appears to illustrate the lowercased "title . review text" prompt format seen in the
# generated samples. These two variables are not referenced by the later cells shown
# in this notebook (generation is driven by `inp`).
sentence1 = 'major problem . this is by far one of the '
sentence2 = 'flattering . this is by far one of the '

In [None]:
# Reload the fine-tuned weights that were just written with `save_pretrained`.
trained_model = language_model_init(
    AutoModelForCausalLM,
    cpoint_path='./sample_weights/lm_model',
)

Total parameters: 124439808
Total trainable parameters: 124439808


In [None]:
controller2 = ModelLMController(trained_model,data_store=tdc,seed=42)

You can input several raw texts

In [None]:
# Raw inputs keyed by the same column names given to TextDataLMController
# (`metadatas='Title'`, `main_text='Review Text'`); one entry per prompt.
inp = {
    'Title': ['Major Problem', 'Flattering'],
    'Review Text': [
        "This is by far one of the worst ",
        "This is by far one of the best ",
    ],
}

In [None]:
# reference for the keyword arguments: 
# https://huggingface.co/docs/transformers/v4.33.2/en/main_classes/text_generation#transformers.GenerationMixin.generate

In [None]:
# Generate 3 sampled continuations (up to 50 new tokens each) per prompt and print them.
controller2.predict_raw_text(
    inp,
    print_result=True,
    # huggingface text generation kwargs:
    num_return_sequences=3,
    max_new_tokens=50,
    num_beams=1,
    do_sample=True,
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


>>> major problem . this is by far one of the worst top i have ever owned. the fit is terrible and it doesn't look as good as on line. i am 5'4 " with a fairly slim upper body.. that doesn't help. i got three other items but am sending the rest to
>>> major problem . this is by far one of the worst blouses i've ever shopped at retailer. the cutout design looks horrible on the model. if you can't tell from the other reviews, the front looks totally ripped on me. it seems like it's supposed to be just in front of
>>> major problem . this is by far one of the worst retailer bras ever. i got this in purple and felt so badly that it had made me blush. the color of the purple was very pretty, but then the purple looked awful on me from sitting on my chair. i have to go back... there
--------------------
>>> flattering . this is by far one of the best shorts i own in years. it's super soft and lays really nicely. it does have a bit of weight to it though, which i like because it tends to have

# Finetune GPT2 Language Model (line-by-line, without token concatenation)

## Create a TextDataLMController object

We will reuse the data and the preprocessings in [this tutorial](https://anhquan0412.github.io/that-nlp-library/text_main_lm.html) 

In [None]:
# Load the reviews CSV as a single 'train' split.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'],
    split='train',
)

# Fresh data controller for the line-by-line fine-tuning run.
tdc = TextDataLMController(
    dset,
    main_text='Review Text',
    # predicate applied to 'Review Text': the value must not be None
    filter_dict={'Review Text': lambda text: text is not None},
    metadatas='Title',
    # applied in order: normalize the text, then lowercase it
    content_transformations=[text_normalize, str.lower],
    seed=42,
    verbose=False,
)

Define our tokenizer for GPT2

In [None]:
# Load the stock GPT2 tokenizer. No EOS post-processor is installed in this section.
_tokenizer = AutoTokenizer.from_pretrained('gpt2')
# Reuse EOS as the padding token.
_tokenizer.pad_token = _tokenizer.eos_token

In [None]:
_tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

Process and tokenize our dataset

In [None]:
# Maximum length (in tokens) passed to the tokenization step and reused for the model config.
block_size = 112

# line_by_line=True: unlike the previous sections, tokens are not concatenated
# across reviews (presumably each review becomes its own example; confirm).
tdc.process_and_tokenize(
    _tokenizer,
    line_by_line=True,
    max_length=block_size,
)

In [None]:
tdc.main_ddict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 18112
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 4529
    })
})

And set the data collator

In [None]:
tdc.set_data_collator(is_mlm=False)

## Initialize and train GPT2 Model

In [None]:
len(_tokenizer)

50257

In [None]:
_tokenizer.bos_token_id,_tokenizer.eos_token_id

(50256, 50256)

In [None]:
# Start from the stock 'gpt2' configuration and override a few fields.
_config = AutoConfig.from_pretrained(
    'gpt2',
    n_ctx=block_size,
    # just in case: keep vocabulary size and special-token ids in sync with the tokenizer
    vocab_size=len(_tokenizer),
    bos_token_id=_tokenizer.bos_token_id,
    eos_token_id=_tokenizer.eos_token_id,
)
# NOTE(review): the config printed by this cell keeps n_positions=1024 even though
# n_ctx is set to block_size. Confirm that n_ctx has the intended effect.
_config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 112,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.31.0",
  "use_cache": true,
  "vocab_size": 50257
}

In [None]:
# Build the causal LM from the 'gpt2' checkpoint (fine-tuning rather than from scratch).
_model = language_model_init(
    AutoModelForCausalLM,
    config=_config,
    cpoint_path='gpt2',
    seed=42,
)

Total parameters: 124439808
Total trainable parameters: 124439808


In [None]:
_model = resize_model_embeddings(_model,_tokenizer)

Create a model controller

In [None]:
controller = ModelLMController(_model,data_store=tdc,seed=42)

And we can start training our model

In [None]:
# Training hyper-parameters
lr = 1e-4            # learning rate
bs = 32              # batch size
wd = 0.01            # weight decay
epochs = 4
warmup_ratio = 0.25

# Run fine-tuning; no checkpoint is saved during the run.
controller.fit(
    epochs,
    lr,
    batch_size=bs,
    weight_decay=wd,
    warmup_ratio=warmup_ratio,
    save_checkpoint=False,
)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,3.01341,0.24856
2,3.194200,2.878801,0.259095
3,3.194200,2.830053,0.263616
4,2.764700,2.824914,0.264219


Perplexity on validation set: 16.859


In [None]:
controller.trainer.model.save_pretrained('./sample_weights/lm_model')

## Generate text using model

In [None]:
# Appears to illustrate the lowercased "title . review text" prompt format seen in the
# generated samples. These two variables are not referenced by the later cells shown
# in this notebook (generation is driven by `inp`).
sentence1 = 'major problem . this is by far one of the '
sentence2 = 'flattering . this is by far one of the '

In [None]:
# Reload the fine-tuned weights that were just written with `save_pretrained`.
trained_model = language_model_init(
    AutoModelForCausalLM,
    cpoint_path='./sample_weights/lm_model',
)

Total parameters: 124439808
Total trainable parameters: 124439808


In [None]:
controller2 = ModelLMController(trained_model,data_store=tdc,seed=42)

You can input several raw texts

In [None]:
# Raw inputs keyed by the same column names given to TextDataLMController
# (`metadatas='Title'`, `main_text='Review Text'`); one entry per prompt.
inp = {
    'Title': ['Major Problem', 'Flattering'],
    'Review Text': [
        "This is by far one of the worst ",
        "This is by far one of the best ",
    ],
}

In [None]:
# Generate 3 sampled continuations (up to 50 new tokens each) per prompt and print them.
controller2.predict_raw_text(
    inp,
    print_result=True,
    # huggingface text generation kwargs:
    num_return_sequences=3,
    max_new_tokens=50,
    num_beams=1,
    do_sample=True,
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


>>> major problem . this is by far one of the worst things i have ever wore. it fits weird and is basically a small dress at the shoulders. i ordered xs and it was way too big. i could carry on wearing the dress but then the waist would be too big on me. would have
>>> major problem . this is by far one of the worst bottoms i've ever used in a long time. i was worried it would be too short, but it sits way too high across the back. still the material is great, i like color and design. also, i can barely see my bra
>>> major problem . this is by far one of the worst purchases my boyfriend made yet. i have had to return it. it is poorly made and it was poorly made. there are also too many of the metallic threads which are extremely long, so i do not realize how much quality this item has. for
--------------------
>>> flattering . this is by far one of the best purchases i've made in a very long while. the fabric, design, and fit is great. the only con is the material is fairly heavy and 