# Model Controller Tutorial: Training a GPT2 Language Model

> This notebook walks through an end-to-end process: preprocessing and tokenizing your text, then building language models based on the GPT architecture

- skip_showdoc: true
- skip_exec: true

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
import os

In [None]:
# Select the GPU (or a comma-separated list of GPUs, e.g. "0,1") to use for training
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [None]:
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main_lm import *
from that_nlp_library.utils import seed_everything
from that_nlp_library.model_lm_main import *
from that_nlp_library.utils import resize_model_embeddings

In [None]:
from underthesea import text_normalize
from functools import partial
from pathlib import Path
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from datasets import load_dataset
import pandas as pd
import numpy as np
from transformers import DataCollatorForLanguageModeling
from tokenizers import processors

# Train a GPT2 Language Model From Scratch (with token concatenation)

This is the original way GPT2 is trained

## Create a TextDataLMController object

We will reuse the data and the preprocessing steps from [this tutorial](https://anhquan0412.github.io/that-nlp-library/text_main_lm.html)

In [None]:
# Read the reviews CSV from the local sample_data folder as a single 'train' split.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'],
    split='train',
)

# Build the language-model data controller: 'Review Text' is the main text and
# 'Title' is passed as metadata. The filter predicate is True only for rows whose
# review text is not None; each text then goes through text_normalize and str.lower.
tdc = TextDataLMController(
    dset,
    main_text='Review Text',
    metadatas='Title',
    filter_dict={'Review Text': lambda x: x is not None},
    content_transformations=[text_normalize, str.lower],
    seed=42,
    verbose=False,
)

Define our tokenizer for GPT2

In [None]:
# Load the tokenizer that belongs to the 'gpt2' checkpoint.
_tokenizer = AutoTokenizer.from_pretrained('gpt2')



If you want to perform token concatenation and you want your causal LM to differentiate between sentences, you can add a special token to separate them, as follows:

In [None]:
# Install a post-processing template that appends the EOS token after every
# single-sequence encoding, so sentences stay separated once tokens are concatenated.
_tokenizer.backend_tokenizer.post_processor = processors.TemplateProcessing(
    single=f"$A {_tokenizer.eos_token}",
    special_tokens=[(_tokenizer.eos_token, _tokenizer.eos_token_id)],
)

# Reuse the EOS token as the padding token.
_tokenizer.pad_token = _tokenizer.eos_token

In [None]:
_tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

Process and tokenize our dataset

In [None]:
# block_size is passed as max_length here and reused as n_ctx when the model config is built.
block_size=112
# line_by_line=False: presumably selects the token-concatenation mode named in this
# section's title — confirm against the that_nlp_library documentation.
tdc.process_and_tokenize(_tokenizer,line_by_line=False,max_length=block_size)

In [None]:
tdc.main_ddict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 12741
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 3235
    })
})

And set the data collator

In [None]:
# is_mlm=False: presumably a causal (next-token) LM collator rather than a masked-LM one — TODO confirm.
tdc.set_data_collator(is_mlm=False)

## Initialize and train GPT2 Model from scratch

In [None]:
len(_tokenizer)

50257

In [None]:
_tokenizer.bos_token_id,_tokenizer.eos_token_id

(50256, 50256)

In [None]:
# Start from the 'gpt2' configuration and override a few fields.
# NOTE(review): the config printed below still reports n_positions=1024 next to
# n_ctx=112, and the model repr further down shows wpe as Embedding(1024, 768), so
# n_ctx does not appear to shrink the position embeddings. If a shorter context is
# intended for this from-scratch model, n_positions may be the field to set —
# confirm against the GPT2Config documentation.
_config = AutoConfig.from_pretrained(
    'gpt2',
    n_ctx=block_size,
    vocab_size=len(_tokenizer),  # just in case...
    bos_token_id=_tokenizer.bos_token_id,
    eos_token_id=_tokenizer.eos_token_id,
)
_config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 112,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.40.1",
  "use_cache": true,
  "vocab_size": 50257
}

In [None]:
# Build the causal LM from the config only; no checkpoint is loaded.
_model = language_model_init(
    AutoModelForCausalLM,
    config=_config,
    cpoint_path=None,  # leave this as None to get a non-pretrained model
    seed=42,
)

Initiate a new language model from scratch
Total parameters: 124439808
Total trainable parameters: 124439808


In [None]:
_model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# The config already sets vocab_size=len(_tokenizer), so this is presumably a
# no-op safeguard here — TODO confirm what resize_model_embeddings does.
_model = resize_model_embeddings(_model,_tokenizer)

Create a model controller

In [None]:
# Pair the model with the tokenized data held by tdc.
controller = ModelLMController(_model,data_store=tdc,seed=42)

And we can start training our model

In [None]:
# Training hyperparameters.
epochs = 3
lr = 1e-4
bs = 32
wd = 0.01
warmup_ratio = 0.25

# save_checkpoint=False here; the weights are written explicitly with
# save_pretrained in the next code cell.
controller.fit(
    epochs,
    lr,
    batch_size=bs,
    weight_decay=wd,
    warmup_ratio=warmup_ratio,
    save_checkpoint=False,
)

Epoch,Training Loss,Validation Loss,Accuracy
0,No log,4.586693,0.204113
2,5.402300,3.972607,0.262826


Perplexity on validation set: 53.123


In [None]:
# Save the trained weights. NOTE: the later sections of this notebook save to this
# same directory, so running them overwrites the from-scratch weights.
controller.trainer.model.save_pretrained('./sample_weights/lm_gpt_model')

## Generate text using model

In [None]:
# Lower-cased "title . review" style prompts.
# NOTE(review): these two variables are not referenced by the generation cells
# visible below, which build their input from `inp` instead.
sentence1 = 'major problem . this is by far one of the '
sentence2 = 'flattering . this is by far one of the '

In [None]:
# Reload the model from the directory written by save_pretrained above.
trained_model = language_model_init(
    AutoModelForCausalLM,
    cpoint_path='./sample_weights/lm_gpt_model',
)

Total parameters: 124439808
Total trainable parameters: 124439808


In [None]:
# Second controller, wrapping the reloaded model; used below for text generation.
controller2 = ModelLMController(trained_model,data_store=tdc,seed=42)

You can input several raw texts

In [None]:
# Raw prompts, keyed by the same column names that were given to TextDataLMController.
inp = {
    'Title': ['Major Problem', 'Flattering'],
    'Review Text': [
        "This is by far one of the worst ",
        "This is by far one of the best ",
    ],
}

In [None]:
# reference for the keyword arguments: 
# https://huggingface.co/docs/transformers/v4.33.2/en/main_classes/text_generation#transformers.GenerationMixin.generate

In [None]:
# Sample three continuations per prompt and print them.
controller2.predict_raw_text(
    inp,
    print_result=True,
    # huggingface text generation kwargs:
    num_return_sequences=3,
    max_new_tokens=50,
    num_beams=1,
    do_sample=True,
)

>>> major problem . this is by far one of the worst color around or the chest detail, i did not want it to the side. it looks great on me. it's pretty short - definitely be too casual in a size i think it is a little higher length on me. it runs small, but
>>> major problem . this is by far one of the worst and will be very flattering. i'm 5'7 " and 120 lbs. it was too small in the regular size 4. this top was huge all the material, so i have looked more like the picture. the fit is the right length and
>>> major problem . this is by far one of the worst of the top. but the blue is a very low quality and the bottom is very unflattering. overall, the material is beautiful and the cut is a little roomy than i have to pull it in the top, but i did. i am
--------------------
>>> flattering . this is by far one of the best. i went to buy it at first but it was huge and to be going back. i am 5'3 " 140 # and purchased a 6. that it goes with a keeper. i am very thin and it fits perfectly. in 

In [None]:
# Same generation settings, but with print_result=False the cell displays the
# returned value (nested lists of {'generated_text': ...} dicts) instead.
controller2.predict_raw_text(
    inp,
    print_result=False,
    # huggingface text generation kwargs:
    num_return_sequences=3,
    max_new_tokens=50,
    num_beams=1,
    do_sample=True,
)

[[{'generated_text': 'major problem . this is by far one of the worst is very lightweight and made of the sleeves do not be great to wear. the fit is perfect for the spring. the length part has a bit to be. the top of the blue is very pretty but i am in blue. i got the x'},
  {'generated_text': "major problem . this is by far one of the worst and the dress was super high-colored green. great shirt that you can wear with a medium bra and it is very flattering. it's soft i had to have to wear it all fall or winter. i ordered the 4 but i love this top"},
  {'generated_text': "major problem . this is by far one of the worst, which is the photo. it is very pretty and i found that it arrived with the top. i took me to return it off. the dress is not a casual style, and the fabric looks as a bit shorter though, it's really pretty"}],
 [{'generated_text': "flattering . this is by far one of the best retailer ( so disappointing ). i'm wearing a fan of a medium, but with a large side but i am ve

# Finetune GPT2 Language Model (with token concatenation)

## Create a TextDataLMController object

We will reuse the data and the preprocessing steps from [this tutorial](https://anhquan0412.github.io/that-nlp-library/text_main_lm.html)

In [None]:
# Read the reviews CSV from the local sample_data folder as a single 'train' split.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'],
    split='train',
)

# Build the language-model data controller: 'Review Text' is the main text and
# 'Title' is passed as metadata. The filter predicate is True only for rows whose
# review text is not None; each text then goes through text_normalize and str.lower.
tdc = TextDataLMController(
    dset,
    main_text='Review Text',
    metadatas='Title',
    filter_dict={'Review Text': lambda x: x is not None},
    content_transformations=[text_normalize, str.lower],
    seed=42,
    verbose=False,
)

Define our tokenizer for GPT2

In [None]:
# Load the tokenizer that belongs to the 'gpt2' checkpoint.
_tokenizer = AutoTokenizer.from_pretrained('gpt2')



If you want to perform token concatenation and you want your causal LM to differentiate between sentences, you can add a special token to separate them, as follows:

In [None]:
# Install a post-processing template that appends the EOS token after every
# single-sequence encoding, so sentences stay separated once tokens are concatenated.
_tokenizer.backend_tokenizer.post_processor = processors.TemplateProcessing(
    single=f"$A {_tokenizer.eos_token}",
    special_tokens=[(_tokenizer.eos_token, _tokenizer.eos_token_id)],
)

# Reuse the EOS token as the padding token.
_tokenizer.pad_token = _tokenizer.eos_token

In [None]:
_tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

Process and tokenize our dataset

In [None]:
# block_size is passed as max_length here and reused as n_ctx when the model config is built.
block_size=112
# line_by_line=False: presumably selects the token-concatenation mode named in this
# section's title — confirm against the that_nlp_library documentation.
tdc.process_and_tokenize(_tokenizer,line_by_line=False,max_length=block_size)

In [None]:
tdc.main_ddict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 12741
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 3235
    })
})

And set the data collator

In [None]:
# is_mlm=False: presumably a causal (next-token) LM collator rather than a masked-LM one — TODO confirm.
tdc.set_data_collator(is_mlm=False)

## Initialize and train GPT2 Model

In [None]:
len(_tokenizer)

50257

In [None]:
_tokenizer.bos_token_id,_tokenizer.eos_token_id

(50256, 50256)

In [None]:
# Start from the 'gpt2' configuration and override a few fields.
# NOTE(review): the config printed below reports n_positions=1024 alongside
# n_ctx=112, so n_ctx does not appear to change the position-embedding size —
# confirm against the GPT2Config documentation whether this override is still needed.
_config = AutoConfig.from_pretrained(
    'gpt2',
    n_ctx=block_size,
    vocab_size=len(_tokenizer),  # just in case...
    bos_token_id=_tokenizer.bos_token_id,
    eos_token_id=_tokenizer.eos_token_id,
)
_config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 112,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.40.1",
  "use_cache": true,
  "vocab_size": 50257
}

In [None]:
# Finetuning: cpoint_path points at the 'gpt2' checkpoint instead of None, so the
# model starts from that checkpoint rather than from a fresh initialisation.
_model = language_model_init(
    AutoModelForCausalLM,
    config=_config,
    cpoint_path='gpt2',
    seed=42,
)

Total parameters: 124439808
Total trainable parameters: 124439808


In [None]:
# The config already sets vocab_size=len(_tokenizer), so this is presumably a
# no-op safeguard here — TODO confirm what resize_model_embeddings does.
_model = resize_model_embeddings(_model,_tokenizer)

Create a model controller

In [None]:
# Pair the model with the tokenized data held by tdc.
controller = ModelLMController(_model,data_store=tdc,seed=42)

And we can start training our model

In [None]:
# Training hyperparameters.
epochs = 3
lr = 1e-4
bs = 32
wd = 0.01
warmup_ratio = 0.25

# save_checkpoint=False here; the weights are written explicitly with
# save_pretrained in the next code cell.
controller.fit(
    epochs,
    lr,
    batch_size=bs,
    weight_decay=wd,
    warmup_ratio=warmup_ratio,
    save_checkpoint=False,
)

Epoch,Training Loss,Validation Loss,Accuracy
0,No log,3.088598,0.353749
2,3.269600,2.959936,0.369057


Perplexity on validation set: 19.297


In [None]:
# Save the finetuned weights. NOTE: the other sections of this notebook save to
# this same directory, so each run overwrites the previously saved weights.
controller.trainer.model.save_pretrained('./sample_weights/lm_gpt_model')

## Generate text using model

In [None]:
# Lower-cased "title . review" style prompts.
# NOTE(review): these two variables are not referenced by the generation cells
# visible below, which build their input from `inp` instead.
sentence1 = 'major problem . this is by far one of the '
sentence2 = 'flattering . this is by far one of the '

In [None]:
# Reload the model from the directory written by save_pretrained above.
trained_model = language_model_init(
    AutoModelForCausalLM,
    cpoint_path='./sample_weights/lm_gpt_model',
)

Total parameters: 124439808
Total trainable parameters: 124439808


In [None]:
# Second controller, wrapping the reloaded model; used below for text generation.
controller2 = ModelLMController(trained_model,data_store=tdc,seed=42)

You can input several raw texts

In [None]:
# Raw prompts, keyed by the same column names that were given to TextDataLMController.
inp = {
    'Title': ['Major Problem', 'Flattering'],
    'Review Text': [
        "This is by far one of the worst ",
        "This is by far one of the best ",
    ],
}

In [None]:
# reference for the keyword arguments: 
# https://huggingface.co/docs/transformers/v4.33.2/en/main_classes/text_generation#transformers.GenerationMixin.generate

In [None]:
# Sample three continuations per prompt and print them.
controller2.predict_raw_text(
    inp,
    print_result=True,
    # huggingface text generation kwargs:
    num_return_sequences=3,
    max_new_tokens=50,
    num_beams=1,
    do_sample=True,
)

>>> major problem . this is by far one of the worst shorts i've seen in a long time. i want it to work, but what about the other pants. they seem to be washed out and need to be rewashed twice????. i don't have a pair of these yet,
>>> major problem . this is by far one of the worst pants i have ever owned. i'm 5'7 " 105 lbs, 34 b with long arms and very short legs. i don't have large hips, so the " top " looked a bit wider. at first i was surprised it was so
>>> major problem . this is by far one of the worst romper i have ever purchased. the cut is low, and the bottom hem is unflattering. overall, this is definitely a dress and the quality is excellent, though a nice touch. for reference, i am 5'6 ", 125 lbs
--------------------
>>> flattering . this is by far one of the best top i have bought. it runs large but is easy to pull up and it is thick enough that i am comfortable wearing it with sandals under it. i usually wear an xs in tops, and i ordered a petite. in person,
>>> flatter

# Finetune GPT2 Language Model (line-by-line, without token concatenation)

## Create a TextDataLMController object

We will reuse the data and the preprocessing steps from [this tutorial](https://anhquan0412.github.io/that-nlp-library/text_main_lm.html)

In [None]:
# Read the reviews CSV from the local sample_data folder as a single 'train' split.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'],
    split='train',
)

# Build the language-model data controller: 'Review Text' is the main text and
# 'Title' is passed as metadata. The filter predicate is True only for rows whose
# review text is not None; each text then goes through text_normalize and str.lower.
tdc = TextDataLMController(
    dset,
    main_text='Review Text',
    metadatas='Title',
    filter_dict={'Review Text': lambda x: x is not None},
    content_transformations=[text_normalize, str.lower],
    seed=42,
    verbose=False,
)

Define our tokenizer for GPT2

In [None]:
# Load the tokenizer that belongs to the 'gpt2' checkpoint.
_tokenizer = AutoTokenizer.from_pretrained('gpt2')
# Reuse the EOS token as the padding token. Unlike the token-concatenation
# sections, no EOS-appending post-processor is installed here.
_tokenizer.pad_token = _tokenizer.eos_token



In [None]:
_tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

Process and tokenize our dataset

In [None]:
# block_size is passed as max_length here and reused as n_ctx when the model config is built.
block_size=112
# line_by_line=True: each review appears to stay its own example — the main_ddict
# printed below keeps the 'Title' and 'Review Text' columns next to the token fields.
tdc.process_and_tokenize(_tokenizer,line_by_line=True,max_length=block_size)

In [None]:
tdc.main_ddict

DatasetDict({
    train: Dataset({
        features: ['Title', 'Review Text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 18112
    })
    validation: Dataset({
        features: ['Title', 'Review Text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 4529
    })
})

And set the data collator

In [None]:
# is_mlm=False: presumably a causal (next-token) LM collator rather than a masked-LM one — TODO confirm.
tdc.set_data_collator(is_mlm=False)

## Initialize and train GPT2 Model

In [None]:
len(_tokenizer)

50257

In [None]:
_tokenizer.bos_token_id,_tokenizer.eos_token_id

(50256, 50256)

In [None]:
# Start from the 'gpt2' configuration and override a few fields.
# NOTE(review): the config printed below reports n_positions=1024 alongside
# n_ctx=112, so n_ctx does not appear to change the position-embedding size —
# confirm against the GPT2Config documentation whether this override is still needed.
_config = AutoConfig.from_pretrained(
    'gpt2',
    n_ctx=block_size,
    vocab_size=len(_tokenizer),  # just in case...
    bos_token_id=_tokenizer.bos_token_id,
    eos_token_id=_tokenizer.eos_token_id,
)
_config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 112,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.40.1",
  "use_cache": true,
  "vocab_size": 50257
}

In [None]:
# Finetuning: cpoint_path points at the 'gpt2' checkpoint instead of None, so the
# model starts from that checkpoint rather than from a fresh initialisation.
_model = language_model_init(
    AutoModelForCausalLM,
    config=_config,
    cpoint_path='gpt2',
    seed=42,
)

Total parameters: 124439808
Total trainable parameters: 124439808


In [None]:
# The config already sets vocab_size=len(_tokenizer), so this is presumably a
# no-op safeguard here — TODO confirm what resize_model_embeddings does.
_model = resize_model_embeddings(_model,_tokenizer)

Create a model controller

In [None]:
# Pair the model with the tokenized data held by tdc.
controller = ModelLMController(_model,data_store=tdc,seed=42)

And we can start training our model

In [None]:
# Training hyperparameters.
epochs = 3
lr = 1e-4
bs = 32
wd = 0.01
warmup_ratio = 0.25

# save_checkpoint=False here; the weights are written explicitly with
# save_pretrained in the next code cell.
controller.fit(
    epochs,
    lr,
    batch_size=bs,
    weight_decay=wd,
    warmup_ratio=warmup_ratio,
    save_checkpoint=False,
)

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.985998,0.250999
2,3.160300,2.870805,0.260794
3,3.160300,2.851692,0.262393


Perplexity on validation set: 17.317


In [None]:
# Save the finetuned weights. NOTE: the earlier sections of this notebook save to
# this same directory, so this overwrites whatever they stored there.
controller.trainer.model.save_pretrained('./sample_weights/lm_gpt_model')

## Generate text using model

In [None]:
# Lower-cased "title . review" style prompts.
# NOTE(review): these two variables are not referenced by the generation cells
# visible below, which build their input from `inp` instead.
sentence1 = 'major problem . this is by far one of the '
sentence2 = 'flattering . this is by far one of the '

In [None]:
# Reload the model from the directory written by save_pretrained above.
trained_model = language_model_init(
    AutoModelForCausalLM,
    cpoint_path='./sample_weights/lm_gpt_model',
)

Total parameters: 124439808
Total trainable parameters: 124439808


In [None]:
# Second controller, wrapping the reloaded model; used below for text generation.
controller2 = ModelLMController(trained_model,data_store=tdc,seed=42)

You can input several raw texts

In [None]:
# Raw prompts, keyed by the same column names that were given to TextDataLMController.
inp = {
    'Title': ['Major Problem', 'Flattering'],
    'Review Text': [
        "This is by far one of the worst ",
        "This is by far one of the best ",
    ],
}

In [None]:
# Sample three continuations per prompt and print them.
controller2.predict_raw_text(
    inp,
    print_result=True,
    # huggingface text generation kwargs:
    num_return_sequences=3,
    max_new_tokens=50,
    num_beams=1,
    do_sample=True,
)

>>> major problem . this is by far one of the worst things i've ever bought at retailer. the material can be uncomfortable. i wasn't comfortable with it on the hanger. i ordered a small, which was still tight in all the wrong places. the small was way too big. it fit perfectly
>>> major problem . this is by far one of the worst quality dress i have ever purchased byron lars and had to return it. i am usually a size six in retailer tops - 6 and this one is just so huge. the length is really good - it was a bit low-cut and it
>>> major problem . this is by far one of the worst clothes i have bought in years. in my opinion the pattern is just not on par with what the rest of the clothing looks like. the fabric is not even cotton... it was thick, synthetic fabric that made me feel like i was in a hospital
--------------------
>>> flattering . this is by far one of the best jeans i've gotten since i was in highschool. i just ordered blue jeans as well. i've been looking for a pair of jeans 