# Huggingface GPT2 example

In [1]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

In [2]:
# 'gpt2' checkpoint: OpenAI GPT-2 English model,
# 12-layer, 768-hidden, 12-heads, 117M parameters.
pretrained_weights = 'gpt2'
# The tokenizer and the language-model-head model are both loaded from the same checkpoint name.
tokenizer = GPT2TokenizerFast.from_pretrained(pretrained_weights)
model = GPT2LMHeadModel.from_pretrained(pretrained_weights)

In [6]:
# Encode a sample sentence so that `ids` is defined when the notebook is run
# top to bottom (previously both encode calls were commented out and the cell
# only worked thanks to leftover kernel state).
# Shorter alternative: ids = tokenizer.encode('This is an example of text, and')
ids = tokenizer.encode('This is an example of text, this is another example of text. :), :/')
ids

[1212,
 318,
 281,
 1672,
 286,
 2420,
 11,
 428,
 318,
 1194,
 1672,
 286,
 2420,
 13,
 1058,
 828,
 1058,
 14]

In [7]:
tokenizer.decode(ids)

'This is an example of text, this is another example of text. :), :/'

In [8]:
# Decode every id on its own to see where the token boundaries fall.
print([tokenizer.decode([i]) for i in ids])
# A lone ',' and the ',' right after ')' end up in different tokens (',' vs '),').
# The emoticons ':)' and ':/' have no token of their own and are split into pieces.

['This', ' is', ' an', ' example', ' of', ' text', ',', ' this', ' is', ' another', ' example', ' of', ' text', '.', ' :', '),', ' :', '/']


In [20]:
text = "Who was Jim Henson? Jim Henson was a puppeteer who is one of the best person to"
ids = tokenizer.encode(text)
tokenizer.decode(ids)

'Who was Jim Henson? Jim Henson was a puppeteer who is one of the best person to'

In [18]:
import torch

In [21]:
# Wrap the token ids in a LongTensor and prepend a batch dimension of size 1.
t = torch.LongTensor(ids).unsqueeze(0)
t.shape

torch.Size([1, 22])

In [26]:
# Generate a continuation of the prompt. max_length counts the prompt tokens too:
# the 22-token prompt plus 18 newly generated tokens gives the 40 columns shown below.
# The run also prints a notice that `pad_token_id` is set to `eos_token_id` (50256).
preds = model.generate(t,max_length=40) # default length is 20
preds.shape

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


torch.Size([1, 40])

In [27]:
preds[0]

tensor([ 8241,   373,  5395,   367, 19069,    30,  5395,   367, 19069,   373,
          257, 13595, 14471,   263,   508,   318,   530,   286,   262,  1266,
         1048,   284,   670,   351,   287,   262,  2106,   286,   262,   995,
           13,   679,   373,   257,  1049,  8674,    11,   257,  1049,  6260])

In [28]:
tokenizer.decode(preds[0].numpy())

'Who was Jim Henson? Jim Henson was a puppeteer who is one of the best person to work with in the history of the world. He was a great actor, a great writer'

# Small note on different ways to tokenize/encode using Hugging Face

In [None]:
# A small batch of sentences used to compare the tokenizer entry points below.
text = [
    "Hello I'm a single sentence",
    "And anot`her sentence",
    "And the very very last one",
]

# Separate tokenizer instance, loaded from the same checkpoint name as `tokenizer`.
tmp_token = GPT2TokenizerFast.from_pretrained(pretrained_weights)

In [38]:
# function __call__: calling the tokenizer object directly encodes the whole list at once;
# the printed result holds 'input_ids' and 'attention_mask', one entry per sentence.
batch = tmp_token(text) 
print(batch)

{'input_ids': [[15496, 314, 1101, 257, 2060, 6827], [1870, 281, 313, 63, 372, 6827], [1870, 262, 845, 845, 938, 530]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}


In [55]:
# function encode, one at a time
# tmp_token.encode(text[0],truncation=True,max_length=4)
tmp_token.encode(text[0])

[15496, 314, 1101, 257, 2060, 6827]

In [58]:
# function tokenize, one at a time: splits a single string into token strings (no ids yet).
# In the printed tokens, a leading 'Ġ' appears on tokens that follow a space in the text.
# tmp_toks = tmp_token.tokenize(text[0],truncation=True,max_length=4)
tmp_toks = tmp_token.tokenize(text[0])
print(tmp_toks)
# convert to ids using convert_tokens_to_ids; the ids printed here are the same
# as the ones returned by tmp_token.encode(text[0])
print(tmp_token.convert_tokens_to_ids(tmp_toks))

['Hello', 'ĠI', "'m", 'Ġa', 'Ġsingle', 'Ġsentence']
[15496, 314, 1101, 257, 2060, 6827]


# Prepare data for training GPT2 model with fastai

In [29]:
from fastai.text.all import *

In [30]:
path = untar_data(URLs.WIKITEXT_TINY)
path.ls()

(#2) [Path('/home/quan/.fastai/data/wikitext-2/test.csv'),Path('/home/quan/.fastai/data/wikitext-2/train.csv')]

In [31]:
df_train = pd.read_csv(path/'train.csv', header=None)
df_valid = pd.read_csv(path/'test.csv', header=None)
df_train.head()

Unnamed: 0,0
0,"\n = 2013 – 14 York City F.C. season = \n \n The 2013 – 14 season was the <unk> season of competitive association football and 77th season in the Football League played by York City Football Club , a professional football club based in York , North Yorkshire , England . Their 17th @-@ place finish in 2012 – 13 meant it was their second consecutive season in League Two . The season ran from 1 July 2013 to 30 June 2014 . \n Nigel Worthington , starting his first full season as York manager , made eight permanent summer signings . By the turn of the year York were only above the relegation z..."
1,"\n = Big Boy ( song ) = \n \n "" Big Boy "" <unk> "" I 'm A Big Boy Now "" was the first single ever recorded by the Jackson 5 , which was released by Steeltown Records in January 1968 . The group played instruments on many of their Steeltown compositions , including "" Big Boy "" . The song was neither a critical nor commercial success , but the Jackson family were delighted with the outcome nonetheless . \n The Jackson 5 would release a second single with Steeltown Records before moving to Motown Records . The group 's recordings at Steeltown Records were thought to be lost , but they were re..."
2,"\n = The Remix ( Lady Gaga album ) = \n \n The Remix is a remix album by American recording artist Lady Gaga . Released in Japan on March 3 , 2010 , it contains remixes of the songs from her first studio album , The Fame ( 2008 ) , and her third extended play , The Fame Monster ( 2009 ) . A revised version of the track list was prepared for release in additional markets , beginning with Mexico on May 3 , 2010 . A number of recording artists have produced the songs , including Pet Shop Boys , Passion Pit and The Sound of Arrows . The remixed versions feature both uptempo and <unk> composit..."
3,"\n = New Year 's Eve ( Up All Night ) = \n \n "" New Year 's Eve "" is the twelfth episode of the first season of the American comedy television series Up All Night . The episode originally aired on NBC in the United States on January 12 , 2012 . It was written by Erica <unk> and was directed by Beth McCarthy @-@ Miller . The episode also featured a guest appearance from Jason Lee as Chris and Reagan 's neighbor and Ava 's boyfriend , Kevin . \n During Reagan ( Christina Applegate ) and Chris 's ( Will <unk> ) first New Year 's Eve game night , Reagan 's competitiveness comes out causing Ch..."
4,"\n = Geopyxis carbonaria = \n \n Geopyxis carbonaria is a species of fungus in the genus Geopyxis , family <unk> . First described to science in 1805 , and given its current name in 1889 , the species is commonly known as the charcoal loving elf @-@ cup , dwarf <unk> cup , <unk> <unk> cup , or pixie cup . The small , <unk> @-@ shaped fruitbodies of the fungus are reddish @-@ brown with a whitish fringe and measure up to 2 cm ( 0 @.@ 8 in ) across . They have a short , tapered stalk . Fruitbodies are commonly found on soil where brush has recently been burned , sometimes in great numbers ...."


In [33]:
# Put the train texts first and the validation texts right after them in one array.
train_texts = df_train[0].values
valid_texts = df_valid[0].values
all_texts = np.concatenate([train_texts, valid_texts])
len(all_texts)

662

In [37]:
# print(all_texts[0])

## Create Fastai tokenizer using HuggingFace tokenizer

Build a fastai ``Transform`` that will be applied **lazily**: it wraps the Hugging Face tokenizer so that it can be used as a fastai tokenizer.

In a fastai Transform you can define:

- an encodes method that is applied when you call the transform (a bit like the forward method in a nn.Module)
- a decodes method that is applied when you call the decode method of the transform, if you need to decode anything for showing purposes (like converting ids to a text here)
- a setups method that sets some inner state of the Transform (not needed here so we skip it)

In [48]:
class TransformersTokenizer(Transform):
    "Lazy fastai `Transform` that turns raw text into token ids with a Hugging Face tokenizer."
    def __init__(self, tokenizer):
        # Keep a handle on the Hugging Face tokenizer used for both directions.
        self.tokenizer = tokenizer

    def encodes(self, x):
        # tokenizer.encode is deliberately not used: it runs extra post-processing
        # on top of tokenizing + numericalizing, which is not needed here.
        token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(x))
        return tensor(token_ids)

    def decodes(self, x):
        # Wrap the decoded text in fastai's TitledStr for showing purposes.
        ids_np = x.cpu().numpy()
        return TitledStr(self.tokenizer.decode(ids_np))
    

In [62]:
tmp = TitledStr('haha')

In [64]:
tmp.capitalize()

'Haha'

In [65]:
tmp.show()

haha


## TfmdLists

In [72]:
# Train rows come first in `all_texts`, validation rows follow them.
train_idxs = range_of(df_train)
valid_idxs = list(range(len(df_train), len(all_texts)))
# splits[0] holds the train-set indices, splits[1] the validation-set indices
splits = [train_idxs, valid_idxs]

In [69]:
range_of(df_train)[-10:]

[605, 606, 607, 608, 609, 610, 611, 612, 613, 614]

In [70]:
type(all_texts)

numpy.ndarray

In [73]:
tls = TfmdLists(all_texts, TransformersTokenizer(tokenizer), splits=splits, dl_type=LMDataLoader)

### Review tfmdlists

In [77]:
len(tls),len(tls.train),len(tls.valid)

(662, 615, 47)

In [85]:
# 2 different pipelines with same transformation type for train and valid
tls.train.tfms,hex(id(tls.train.tfms)),tls.valid.tfms,hex(id(tls.valid.tfms))

(Pipeline: TransformersTokenizer,
 '0x7fe413a6e7f0',
 Pipeline: TransformersTokenizer,
 '0x7fe413a6ec10')

In [92]:
tls.valid.items[0][:100]

' \n = Tropical Storm <unk> ( 2008 ) = \n \n Tropical Storm <unk> was the tenth tropical storm of the 20'

In [99]:
tls.train[0] # when indexed, items in tfmdlist are transformed

tensor([220, 198, 796,  ..., 198, 220, 198])

In [90]:
tls.valid[0] # when indexed, items in tfmdlist are transformed

tensor([220, 198, 796,  ..., 198, 220, 198])

In [98]:
# Turn the first and last 10 token ids of the first training item back into text,
# using the `decodes` method defined on TransformersTokenizer above.
tls.decodes(tls.train[0][:10]),tls.decodes(tls.train[0][-10:])

(' \n = 2013 – 14 York City F.', ' ; <unk> – Forward \n \n')

In [96]:
tls.decodes(tls.valid[0][:10]),tls.decodes(tls.valid[0][-10:])

(' \n = Tropical Storm <unk> ( 2008', ' was caused by the flood. \n \n')

In [103]:
# you can also use show_at to decode
# show_at(tls.train,0)

## Dataloaders

In [104]:
# Batch size and sequence length.
bs,sl = 16,1024
# GPT2 model was trained with sequences of size 1024
dls = tls.dataloaders(bs=bs, seq_len=sl) 
# Note: seq_len is an NLP-specific argument, yet it can be passed straight to `dataloaders`
# (presumably it is forwarded to the LMDataLoader set as dl_type on `tls` -- TODO confirm).

In [106]:
type(dls)

fastai.data.core.DataLoaders

In [107]:
dls.show_batch(max_n=2)

Unnamed: 0,text,text_
0,"\n = George N. Briggs = \n \n George Nixon Briggs ( April 12, 1796 – September 12, 1861 ) was an American lawyer and politician from Massachusetts. A Whig, Briggs served for twelve years in the United States House of Representatives, and served seven one @-@ year terms as the 19th Governor of Massachusetts, from 1844 to 1851. \n <unk> in rural <unk> New York, Briggs studied law in western Massachusetts, where his civic involvement and successful legal practice preceded <unk> political activity. He was elected to Congress in 1830, where he supported the conservative Whig agenda, serving on the Committee on the Post Office and Post Roads. He was also a regular advocate of temperance, <unk> from all alcohol consumption. \n He was nominated by the Whigs in 1843 to run against Democratic Governor Marcus Morton as part of a Whig bid for more rural votes, and","\n = George N. Briggs = \n \n George Nixon Briggs ( April 12, 1796 – September 12, 1861 ) was an American lawyer and politician from Massachusetts. A Whig, Briggs served for twelve years in the United States House of Representatives, and served seven one @-@ year terms as the 19th Governor of Massachusetts, from 1844 to 1851. \n <unk> in rural <unk> New York, Briggs studied law in western Massachusetts, where his civic involvement and successful legal practice preceded <unk> political activity. He was elected to Congress in 1830, where he supported the conservative Whig agenda, serving on the Committee on the Post Office and Post Roads. He was also a regular advocate of temperance, <unk> from all alcohol consumption. \n He was nominated by the Whigs in 1843 to run against Democratic Governor Marcus Morton as part of a Whig bid for more rural votes, and easily"
1,"covered the song with guitar and violin accompaniment, for her 2007 album In The City + In The Woods. British close harmony trio The <unk> Sisters covered "" Crazy in Love "" for their 2007 album The Rise and Fall of Ruby <unk> ; this was remixed by the electronica jazz outfit The Real Tuesday <unk>. Indie artist <unk> recorded an electronic cover of the song. In 2009, Pattern Is Movement recorded a cover of "" Crazy in Love "", which they claimed was inspired by <unk>'s version ; this cover was included on their 4 / 9 / 2009 <unk> session. Antony and the <unk> released an orchestral version of the song as the b @-@ side to their 2009 single "" <unk> "". \n German group The <unk> covered the song in rockabilly style for their debut album Strike! Back in August 2010. "" Crazy in Love","the song with guitar and violin accompaniment, for her 2007 album In The City + In The Woods. British close harmony trio The <unk> Sisters covered "" Crazy in Love "" for their 2007 album The Rise and Fall of Ruby <unk> ; this was remixed by the electronica jazz outfit The Real Tuesday <unk>. Indie artist <unk> recorded an electronic cover of the song. In 2009, Pattern Is Movement recorded a cover of "" Crazy in Love "", which they claimed was inspired by <unk>'s version ; this cover was included on their 4 / 9 / 2009 <unk> session. Antony and the <unk> released an orchestral version of the song as the b @-@ side to their 2009 single "" <unk> "". \n German group The <unk> covered the song in rockabilly style for their debut album Strike! Back in August 2010. "" Crazy in Love """


## Second way to create TfmdLists and DataLoaders: preprocess everything beforehand (once and for all)

In [108]:
def tokenize(text):
    "Tokenize `text` with the notebook-level `tokenizer` and return its token ids as a tensor."
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    return tensor(token_ids)

# Tokenize every text once up front, showing a progress bar over all_texts.
tokenized = [tokenize(t) for t in progress_bar(all_texts)]

In [109]:
class TransformersTokenizer(Transform):
    """fastai `Transform` for items that were tokenized ahead of time.

    Shadows the lazily-tokenizing class of the same name defined earlier in the
    notebook. Tensors are passed through untouched; anything else is tokenized
    with the tokenizer given to the constructor, so encoding and decoding
    always go through the same tokenizer.
    """
    def __init__(self, tokenizer): self.tokenizer = tokenizer

    def encodes(self, x):
        # Pre-tokenized items already arrive as tensors: nothing to do.
        if isinstance(x, Tensor): return x
        # We still account for the case where we get something that's not already
        # tokenized, in case a dataset is built from new raw texts with this transform.
        # Use self.tokenizer (not the module-level one) to stay consistent with decodes.
        toks = self.tokenizer.tokenize(x)
        return tensor(self.tokenizer.convert_tokens_to_ids(toks))

    def decodes(self, x): return TitledStr(self.tokenizer.decode(x.cpu().numpy()))

In [110]:
tls = TfmdLists(tokenized, TransformersTokenizer(tokenizer), splits=splits, dl_type=LMDataLoader)
dls = tls.dataloaders(bs=bs, seq_len=sl)

# Fine-tuning model

TODO: https://docs.fast.ai/tutorial.transformers.html#Fine-tuning-the-model