In [None]:
!pip install fastai
!pip install transformers

In [None]:
from fastai.text.all import *
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

In [None]:
# Name of the pretrained checkpoint; the same identifier is passed to both
# the tokenizer and the model so they stay in sync.
pretrained_weights = 'gpt2'
# Load the tokenizer and the language-modelling model for that checkpoint.
tokenizer = GPT2TokenizerFast.from_pretrained(pretrained_weights)
model = GPT2LMHeadModel.from_pretrained(pretrained_weights)


In [None]:
# Root directory passed to `get_text_files` in the cells below.
path = './poems'

In [None]:
# Collect the text files found in the 'verse' and 'tyburn' folders under `path`
# and report how many were found.
poems = get_text_files(path, folders = ['verse','tyburn'])
print("There are",len(poems),"poems in the dataset")

In [None]:
# Collect the text files found in the 'ballad' folder under `path`
# and report how many were found.
ballads = get_text_files(path, folders = ['ballad'])
print("There are",len(ballads),"ballads in the dataset")

In [None]:
# Read the first poem as a quick check that the files load as plain text.
# `read_text` opens and closes the file itself, so no handle is left open.
txt = poems[0].read_text()
print(txt)

## Prepare the data

In [None]:
# Replace every ballad path with the text of that file, so the tokenizer
# further down receives strings rather than paths.
# Entries that are already text are kept unchanged, which lets this cell be
# re-run without failing on the strings produced by a previous run.
ballads = L(o.read_text() if isinstance(o, Path) else o for o in ballads)

In [None]:
def flatten(A):
    """Return a flat list with the items of `A`, expanding nested lists.

    Only values that are `list` instances are expanded (recursively, in
    order); every other value, including tuples and strings, is kept as a
    single item.
    """
    flat = []
    for element in A:
        flat += flatten(element) if isinstance(element, list) else [element]
    return flat
  
# Gather the items of `ballads` into one plain Python list, expanding any nested lists.
all_ballads = flatten(ballads)

In [None]:
class TransformersTokenizer(Transform):
    "Transform that turns text into token ids with the wrapped tokenizer, and ids back into text."
    # NOTE: `encodes`/`decodes` are deliberately left without type annotations.

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def encodes(self, x):
        "Tokenize `x` and return the corresponding token ids as a tensor."
        token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(x))
        return tensor(token_ids)

    def decodes(self, x):
        "Decode a tensor of token ids back into a `TitledStr`."
        ids = x.cpu().numpy()
        return TitledStr(self.tokenizer.decode(ids))

In [None]:
# 70/30 split by position: the first 70% of the ballads are used for training
# and the remaining 30% for validation. The two index lists are disjoint, so no
# validation text is also seen during training, and they are derived from the
# dataset size instead of being hard-coded.
n_train = int(0.7 * len(all_ballads))
splits = [list(range(n_train)), list(range(n_train, len(all_ballads)))]
tls = TfmdLists(all_ballads, TransformersTokenizer(tokenizer), splits=splits, dl_type=LMDataLoader)

In [None]:
# Show the item at index 0 of the training split as a sanity check.
show_at(tls.train, 0)

In [None]:
# Batch size and sequence length handed to the dataloaders.
bs = 4
sl = 256
dls = tls.dataloaders(bs=bs, seq_len=sl)

In [None]:
# Preview at most two samples from a batch.
dls.show_batch(max_n=2)

## Fine-tuning the model

In [None]:
class DropOutput(Callback):
    "Callback that keeps only the first element of the model output as the prediction."

    def after_pred(self):
        # The model output is indexable; only its first element is kept as the
        # learner's prediction (the one later handed to the loss function).
        first_output = self.pred[0]
        self.learn.pred = first_output

In [None]:
# Build the Learner: flattened cross-entropy loss, perplexity as the metric,
# the DropOutput callback to unwrap the model output, and mixed precision.
learn = Learner(
    dls,
    model,
    loss_func=CrossEntropyLossFlat(),
    cbs=[DropOutput],
    metrics=Perplexity(),
).to_fp16()

In [None]:
# Evaluate the pretrained model before any fine-tuning to get a baseline.
learn.validate()

In [None]:
# Run the learning-rate finder to help pick a learning rate for fine-tuning.
learn.lr_find()

In [None]:
# Fine-tune for one epoch with the one-cycle policy, using 1e-4 as the maximum learning rate.
learn.fit_one_cycle(1, 1e-4)

## Poem Generation

In [None]:
prompt = 'love is ridiculous' # create an initial text prompt to start your generated text
prompt_ids = tokenizer.encode(prompt)
# Add a leading batch dimension and put the ids on whatever device the model
# lives on, so the cell also works on machines without a GPU.
inp = tensor(prompt_ids)[None].to(learn.model.device)
inp.shape

In [None]:
# Generate a continuation of the prompt with beam search, then decode and
# print the first returned sequence.
preds = learn.model.generate(
    inp,
    max_length=60,
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True,
)
print("Output:\n" + 100 * '-')
print(tokenizer.decode(preds[0].cpu().numpy(), skip_special_tokens=True))

In [None]:
def generate_poem(prompt, max_length=60, num_beams=5, no_repeat_ngram_size=2):
    """Continue `prompt` with the fine-tuned model and return the decoded text.

    Args:
        prompt: Text used to start the generation.
        max_length: Maximum length, in tokens, passed to `generate`.
        num_beams: Number of beams used for beam search.
        no_repeat_ngram_size: N-gram size that may not be repeated in the output.

    Returns:
        The first generated sequence decoded to a string, without special tokens.
    """
    prompt_ids = tokenizer.encode(prompt)
    # Add a batch dimension and use the model's own device instead of assuming
    # CUDA, so generation also works on CPU-only machines.
    inp = tensor(prompt_ids)[None].to(learn.model.device)
    preds = learn.model.generate(
        inp,
        max_length=max_length,
        num_beams=num_beams,
        no_repeat_ngram_size=no_repeat_ngram_size,
        early_stopping=True,
    )
    return tokenizer.decode(preds[0].cpu().numpy(), skip_special_tokens=True)

prompt = "I don't know what I would do"
print("Output:\n" + 100 * '-')
print(generate_poem(prompt))