# Model Controller Tutorial: Training a GPT2 Language Model

> This notebook contains an end-to-end process of preprocess + tokenizing your text, and build language models based on GPT architecture

- skip_showdoc: true
- skip_exec: true

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
import os

In [None]:
# Choose which GPU(s) are used for training: a single device id or a list of ids.
gpu_ids = "0"
os.environ['CUDA_VISIBLE_DEVICES'] = gpu_ids

In [None]:
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main_lm import *
from that_nlp_library.utils import seed_everything
from that_nlp_library.model_lm_main import *
from that_nlp_library.utils import resize_model_embeddings

In [None]:
from underthesea import text_normalize
from functools import partial
from pathlib import Path
from transformers import AutoTokenizer, AutoConfig, AutoModelForMaskedLM, AutoModelForCausalLM
from datasets import load_dataset
import pandas as pd
import numpy as np
from transformers import DataCollatorForLanguageModeling
from tokenizers import processors

# Train a GPT2 Language Model From Scratch (with token concatenation)

This is the original way GPT2 is trained

## Create a TextDataLMController object

We will reuse the data and the preprocessings in [this tutorial](https://anhquan0412.github.io/that-nlp-library/text_main_lm.html) 

In [None]:
# Load the sample reviews CSV as a single 'train' split
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'],
    split='train',
)

# Wrap the raw dataset in a language-model data controller
tdc = TextDataLMController(
    dset,
    main_text='Review Text',
    metadatas='Title',
    # predicate on 'Review Text': the value must not be None
    filter_dict={'Review Text': lambda x: x is not None},
    content_transformations=[text_normalize, str.lower],
    seed=42,
    verbose=False,
)

Define our tokenizer for GPT2

In [None]:
_tokenizer = AutoTokenizer.from_pretrained('gpt2')

If you want to perform token concatenation and you want your causal LM to differentiate between sentences, you can add a special token to separate them, as follows:

In [None]:
# Append the EOS token to every tokenized text, so that texts remain
# separated by <|endoftext|> once their tokens are concatenated.
# `backend_tokenizer` is the public accessor for the underlying `tokenizers`
# object (the same object as the private `_tokenizer._tokenizer`).
_tokenizer.backend_tokenizer.post_processor = processors.TemplateProcessing(
    single="$A " + _tokenizer.eos_token,
    special_tokens=[(_tokenizer.eos_token, _tokenizer.eos_token_id)],
)
# Reuse the EOS token as the padding token
_tokenizer.pad_token = _tokenizer.eos_token

In [None]:
_tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

Process and tokenize our dataset

In [None]:
# Value passed as max_length to the tokenization step (and reused for the model config)
block_size = 112

tdc.process_and_tokenize(
    _tokenizer,
    max_length=block_size,
    line_by_line=False,  # token-concatenation mode used in this section
)

In [None]:
tdc.main_ddict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 12741
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 3235
    })
})

And set the data collator

In [None]:
tdc.set_data_collator(is_mlm=False)

## Initialize and train GPT2 Model from scratch

In [None]:
len(_tokenizer)

50257

In [None]:
_tokenizer.bos_token_id,_tokenizer.eos_token_id

(50256, 50256)

In [None]:
_config = AutoConfig.from_pretrained(
    'gpt2',
    # keep vocabulary size and special-token ids in sync with the tokenizer
    vocab_size=len(_tokenizer),
    bos_token_id=_tokenizer.bos_token_id,
    eos_token_id=_tokenizer.eos_token_id,
    # NOTE(review): the printed config still reports n_positions=1024, so n_ctx
    # may not actually limit the context length -- confirm against GPT2Config.
    n_ctx=block_size,
)
_config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 112,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.36.2",
  "use_cache": true,
  "vocab_size": 50257
}

In [None]:
_model = language_model_init(
    AutoModelForCausalLM,
    config=_config,
    seed=42,
    # None -> no checkpoint is loaded, giving a non-pretrained model
    cpoint_path=None,
)

Initiate a new language model from scratch
Total parameters: 124439808
Total trainable parameters: 124439808


In [None]:
_model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
_model = resize_model_embeddings(_model,_tokenizer)

Create a model controller

In [None]:
controller = ModelLMController(_model,data_store=tdc,seed=42)

And we can start training our model

In [None]:
# Training hyperparameters
epochs = 4
bs = 32
lr = 1e-4
wd = 0.01
warmup_ratio = 0.25

controller.fit(
    epochs,
    lr,
    batch_size=bs,
    weight_decay=wd,
    warmup_ratio=warmup_ratio,
    save_checkpoint=False,
)

Epoch,Training Loss,Validation Loss,Accuracy
0,No log,4.692613,0.19875
2,5.485900,3.824931,0.277717
3,3.813100,3.77144,0.283966


Perplexity on validation set: 43.443


In [None]:
controller.trainer.model.save_pretrained('./sample_weights/lm_gpt_model')

## Generate text using model

In [None]:
# Example prompts written as "<title> . <review text>", all lowercase.
# NOTE(review): these names are not referenced by the later cells visible here.
sentence1, sentence2 = (
    'major problem . this is by far one of the ',
    'flattering . this is by far one of the ',
)

In [None]:
# Reload the weights that were saved to disk after training
trained_model = language_model_init(
    AutoModelForCausalLM,
    cpoint_path='./sample_weights/lm_gpt_model',
)

Total parameters: 124439808
Total trainable parameters: 124439808


In [None]:
controller2 = ModelLMController(trained_model,data_store=tdc,seed=42)

You can input several raw texts

In [None]:
# Raw inputs keyed by column name; the i-th title goes with the i-th review text
inp = {
    'Title': ['Major Problem', 'Flattering'],
    'Review Text': [
        "This is by far one of the worst ",
        "This is by far one of the best ",
    ],
}

In [None]:
# reference for the keyword arguments: 
# https://huggingface.co/docs/transformers/v4.33.2/en/main_classes/text_generation#transformers.GenerationMixin.generate

In [None]:
controller2.predict_raw_text(
    inp,
    print_result=True,
    # Hugging Face text-generation kwargs:
    do_sample=True,
    num_beams=1,
    max_new_tokens=50,
    num_return_sequences=3,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


>>> major problem . this is by far one of the worst color you can't wait to wear it! it is a beautiful color. it is not comfortable, although the fit was just as thin as the model, as what is on. i am usually a 0 in this dress, and it fits a bit
>>> major problem . this is by far one of the worst dresses. i took a black in the store and have seen the red one. unfortunately, for more of a very flattering cut for my 5'8 " ) and it was great. the sweater in the back is very nice and a taupe
>>> major problem . this is by far one of the worst " i don't recommend it but it's not great to wear at all. as far as you are looking so i wear the black for a summer, dress it looks beautiful on. can't wait for a piece but the fabric is pretty good quality
--------------------
>>> flattering . this is by far one of the best piece. it's very comfortable and flattering. i think the colors are a nice cut. the design is gorgeous at the chest area. i don't buy this one. it's a perfect fall dress!!!! - i 

In [None]:
controller2.predict_raw_text(
    inp,
    print_result=False,
    # Hugging Face text-generation kwargs:
    do_sample=True,
    num_beams=1,
    max_new_tokens=50,
    num_return_sequences=3,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[[{'generated_text': 'major problem . this is by far one of the worst color and quality is quite cute - not for those more of a ". however, the sweater is fitted across your waist. it fits perfectly, and the slip buttons have mentioned, it\'s a little snug, but still not a little too much.'},
  {'generated_text': "major problem . this is by far one of the worst. love it and not worth it!!! i have a size 4 since so i'm so the sleeves were a bit large and then i'm glad i was a size 4. i'm curvy and the medium was pretty much too tight for"},
  {'generated_text': "major problem . this is by far one of the worst retailer is no way to keep it. i purchased the green and it is comfortable and well made. it is so soft and comfortable. the colors are nice, and the front fits better, and it's a very flattering look. the length was slightly"}],
 [{'generated_text': "flattering . this is by far one of the best thing is not worth the price but i'm a curvy girls and i could wear a cinch day, but so 

# Finetune GPT2 Language Model (with token concatenation)

## Create a TextDataLMController object

We will reuse the data and the preprocessings in [this tutorial](https://anhquan0412.github.io/that-nlp-library/text_main_lm.html) 

In [None]:
# Load the sample reviews CSV as a single 'train' split
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'],
    split='train',
)

# Wrap the raw dataset in a language-model data controller
tdc = TextDataLMController(
    dset,
    main_text='Review Text',
    metadatas='Title',
    # predicate on 'Review Text': the value must not be None
    filter_dict={'Review Text': lambda x: x is not None},
    content_transformations=[text_normalize, str.lower],
    seed=42,
    verbose=False,
)

Define our tokenizer for GPT2

In [None]:
_tokenizer = AutoTokenizer.from_pretrained('gpt2')

If you want to perform token concatenation and you want your causal LM to differentiate between sentences, you can add a special token to separate them, as follows:

In [None]:
# Append the EOS token to every tokenized text, so that texts remain
# separated by <|endoftext|> once their tokens are concatenated.
# `backend_tokenizer` is the public accessor for the underlying `tokenizers`
# object (the same object as the private `_tokenizer._tokenizer`).
_tokenizer.backend_tokenizer.post_processor = processors.TemplateProcessing(
    single="$A " + _tokenizer.eos_token,
    special_tokens=[(_tokenizer.eos_token, _tokenizer.eos_token_id)],
)
# Reuse the EOS token as the padding token
_tokenizer.pad_token = _tokenizer.eos_token

In [None]:
_tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

Process and tokenize our dataset

In [None]:
# Value passed as max_length to the tokenization step (and reused for the model config)
block_size = 112

tdc.process_and_tokenize(
    _tokenizer,
    max_length=block_size,
    line_by_line=False,  # token-concatenation mode used in this section
)

In [None]:
tdc.main_ddict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 12741
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 3235
    })
})

And set the data collator

In [None]:
tdc.set_data_collator(is_mlm=False)

## Initialize and train GPT2 Model

In [None]:
len(_tokenizer)

50257

In [None]:
_tokenizer.bos_token_id,_tokenizer.eos_token_id

(50256, 50256)

In [None]:
_config = AutoConfig.from_pretrained(
    'gpt2',
    # keep vocabulary size and special-token ids in sync with the tokenizer
    vocab_size=len(_tokenizer),
    bos_token_id=_tokenizer.bos_token_id,
    eos_token_id=_tokenizer.eos_token_id,
    # NOTE(review): the printed config still reports n_positions=1024, so n_ctx
    # may not actually limit the context length -- confirm against GPT2Config.
    n_ctx=block_size,
)
_config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 112,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.36.2",
  "use_cache": true,
  "vocab_size": 50257
}

In [None]:
_model = language_model_init(
    AutoModelForCausalLM,
    config=_config,
    seed=42,
    # load the 'gpt2' checkpoint as the starting point for finetuning
    cpoint_path='gpt2',
)

Total parameters: 124439808
Total trainable parameters: 124439808


In [None]:
_model = resize_model_embeddings(_model,_tokenizer)

Create a model controller

In [None]:
controller = ModelLMController(_model,data_store=tdc,seed=42)

And we can start training our model

In [None]:
# Training hyperparameters
epochs = 4
bs = 32
lr = 1e-4
wd = 0.01
warmup_ratio = 0.25

controller.fit(
    epochs,
    lr,
    batch_size=bs,
    weight_decay=wd,
    warmup_ratio=warmup_ratio,
    save_checkpoint=False,
)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,3.113104,0.35156
2,3.302400,2.938881,0.371071
3,2.884900,2.933563,0.371826


Perplexity on validation set: 18.794


In [None]:
controller.trainer.model.save_pretrained('./sample_weights/lm_gpt_model')

## Generate text using model

In [None]:
# Example prompts written as "<title> . <review text>", all lowercase.
# NOTE(review): these names are not referenced by the later cells visible here.
sentence1, sentence2 = (
    'major problem . this is by far one of the ',
    'flattering . this is by far one of the ',
)

In [None]:
# Reload the weights that were saved to disk after training
trained_model = language_model_init(
    AutoModelForCausalLM,
    cpoint_path='./sample_weights/lm_gpt_model',
)

Total parameters: 124439808
Total trainable parameters: 124439808


In [None]:
controller2 = ModelLMController(trained_model,data_store=tdc,seed=42)

You can input several raw texts

In [None]:
# Raw inputs keyed by column name; the i-th title goes with the i-th review text
inp = {
    'Title': ['Major Problem', 'Flattering'],
    'Review Text': [
        "This is by far one of the worst ",
        "This is by far one of the best ",
    ],
}

In [None]:
# reference for the keyword arguments: 
# https://huggingface.co/docs/transformers/v4.33.2/en/main_classes/text_generation#transformers.GenerationMixin.generate

In [None]:
controller2.predict_raw_text(
    inp,
    print_result=True,
    # Hugging Face text-generation kwargs:
    do_sample=True,
    num_beams=1,
    max_new_tokens=50,
    num_return_sequences=3,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


>>> major problem . this is by far one of the worst top i have ever owned. the fit is terrible and it doesn't look right. the buttons go up at the crotch area which is a shame the material is very thin. the neckline is incredibly tight. another negative is the pockets area is super
>>> major problem . this is by far one of the worst blouses i've ever tried on. the blouse lays flat and looks horrible on the model. if you can't put a drape to this top it looks totally weird on someone. it seems like it's supposed to be on in the product
>>> major problem . this is by far one of the worst retailer bras ever i have tried! i wore to work and thought it would be very soft! it was the perfect length and the material was extremely thick. i am 5'4 ", but i am 5'4 " and usually wear a 6
--------------------
>>> flattering . this is by far one of the best blouses i have bought from retailer, the color is beautiful and is cut high enough to be flattering on the figure. the material is so soft and 

# Finetune GPT2 Language Model (line-by-line, without token concatenation)

## Create a TextDataLMController object

We will reuse the data and the preprocessings in [this tutorial](https://anhquan0412.github.io/that-nlp-library/text_main_lm.html) 

In [None]:
# Load the sample reviews CSV as a single 'train' split
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'],
    split='train',
)

# Wrap the raw dataset in a language-model data controller
tdc = TextDataLMController(
    dset,
    main_text='Review Text',
    metadatas='Title',
    # predicate on 'Review Text': the value must not be None
    filter_dict={'Review Text': lambda x: x is not None},
    content_transformations=[text_normalize, str.lower],
    seed=42,
    verbose=False,
)

Define our tokenizer for GPT2

In [None]:
# Load the pretrained GPT2 tokenizer
_tokenizer = AutoTokenizer.from_pretrained('gpt2')
# Reuse the EOS token as the padding token
_tokenizer.pad_token = _tokenizer.eos_token

In [None]:
_tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

Process and tokenize our dataset

In [None]:
# Value passed as max_length to the tokenization step (and reused for the model config)
block_size = 112

tdc.process_and_tokenize(
    _tokenizer,
    max_length=block_size,
    # NOTE(review): presumably each row is tokenized on its own here; the resulting
    # dataset keeps the 'Title' and 'Review Text' columns alongside the token fields.
    line_by_line=True,
)

In [None]:
tdc.main_ddict

DatasetDict({
    train: Dataset({
        features: ['Title', 'Review Text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 18112
    })
    validation: Dataset({
        features: ['Title', 'Review Text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 4529
    })
})

And set the data collator

In [None]:
tdc.set_data_collator(is_mlm=False)

## Initialize and train GPT2 Model

In [None]:
len(_tokenizer)

50257

In [None]:
_tokenizer.bos_token_id,_tokenizer.eos_token_id

(50256, 50256)

In [None]:
_config = AutoConfig.from_pretrained(
    'gpt2',
    # keep vocabulary size and special-token ids in sync with the tokenizer
    vocab_size=len(_tokenizer),
    bos_token_id=_tokenizer.bos_token_id,
    eos_token_id=_tokenizer.eos_token_id,
    # NOTE(review): the printed config still reports n_positions=1024, so n_ctx
    # may not actually limit the context length -- confirm against GPT2Config.
    n_ctx=block_size,
)
_config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 112,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.36.2",
  "use_cache": true,
  "vocab_size": 50257
}

In [None]:
_model = language_model_init(
    AutoModelForCausalLM,
    config=_config,
    seed=42,
    # load the 'gpt2' checkpoint as the starting point for finetuning
    cpoint_path='gpt2',
)

Total parameters: 124439808
Total trainable parameters: 124439808


In [None]:
_model = resize_model_embeddings(_model,_tokenizer)

Create a model controller

In [None]:
controller = ModelLMController(_model,data_store=tdc,seed=42)

And we can start training our model

In [None]:
# Training hyperparameters
epochs = 4
bs = 32
lr = 1e-4
wd = 0.01
warmup_ratio = 0.25

controller.fit(
    epochs,
    lr,
    batch_size=bs,
    weight_decay=wd,
    warmup_ratio=warmup_ratio,
    save_checkpoint=False,
)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,3.011946,0.249354
2,3.193000,2.878799,0.259648
3,3.193000,2.829699,0.263861
4,2.763700,2.824313,0.264406


Perplexity on validation set: 16.849


In [None]:
controller.trainer.model.save_pretrained('./sample_weights/lm_gpt_model')

## Generate text using model

In [None]:
# Example prompts written as "<title> . <review text>", all lowercase.
# NOTE(review): these names are not referenced by the later cells visible here.
sentence1, sentence2 = (
    'major problem . this is by far one of the ',
    'flattering . this is by far one of the ',
)

In [None]:
# Reload the weights that were saved to disk after training
trained_model = language_model_init(
    AutoModelForCausalLM,
    cpoint_path='./sample_weights/lm_gpt_model',
)

Total parameters: 124439808
Total trainable parameters: 124439808


In [None]:
controller2 = ModelLMController(trained_model,data_store=tdc,seed=42)

You can input several raw texts

In [None]:
# Raw inputs keyed by column name; the i-th title goes with the i-th review text
inp = {
    'Title': ['Major Problem', 'Flattering'],
    'Review Text': [
        "This is by far one of the worst ",
        "This is by far one of the best ",
    ],
}

In [None]:
controller2.predict_raw_text(
    inp,
    print_result=True,
    # Hugging Face text-generation kwargs:
    do_sample=True,
    num_beams=1,
    max_new_tokens=50,
    num_return_sequences=3,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


>>> major problem . this is by far one of the worst things i have ever wore. it fits weird and is very unflattering. at first i thought i was wearing a dress with flats but realized it wasn't! just a very unflattering tee on me. sadly, i returned it for a better fit
>>> major problem . this is by far one of the worst bottoms i've ever tried on. the top is very sheer. it looks nice tucked in the back but the bottoms on me are not. they are baggy in the shoulders. even if they fit in the back, i like that.
>>> major problem . this is by far one of the worst purchases my boyfriend made yet. i have had to return it. it is poorly made and will only go back. i need to have my baby sized down 2 sizes because the shoulder buttons are terrible. the quality is very average. if you are a
--------------------
>>> flattering . this is by far one of the best purchases i've made since i saw it online. the fabric, design, and fit is great. the only con is the material is fairly heavy and delicate for 