## XLnet rap generator

*(based on https://mccormickml.com/2019/09/19/XLNet-fine-tuning/)*

---


Setup dependencies

In [None]:
!pip install transformers
import torch
import transformers
from transformers import AutoModelWithLMHead, AutoTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split



from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/99/84/7bc03215279f603125d844bf81c3fb3f2d50fe8e511546eb4897e4be2067/transformers-4.0.0-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 5.7MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 38.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 44.1MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893257 sha256=f7878eba64f9e

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


Ingest data


In [None]:
df = pd.read_csv("lyrics.csv", delimiter='\t', header=None, names=['sentence_source'], encoding='latin-1')
df.shape
#df.sample(10)
#remove commas from end of lines
df2 = df.replace({',':''}, regex=True)
df2.size
#FOR TESTING
test_df = df2.sample(10000)


preprocess data for xl-net


In [None]:
sentences = test_df.sentence_source.values
sentences = [sentence + " [SEP] [CLS]" for sentence in sentences]
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased", do_lower_case=True)

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
#test_text = [tokenizer(sent, return_tensors="pt", add_special_tokens=True) for sent in sentences]
print("Tokenize the first sentence:")
print(tokenized_texts[1])
MAX_LEN = 64
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

#train_inputs = torch.tensor(test_text)
train_input_ids = torch.tensor(input_ids)
train_masks = torch.tensor(attention_masks)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798011.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1382015.0, style=ProgressStyle(descript…


Tokenize the first sentence:
['▁', 'Shit', '▁was', '▁do', 'per', '▁than', '▁Whitney', '▁Houston', "'", 's', '▁needs', '▁[', 'S', 'EP', ']', '▁[', 'CL', 'S', ']']


In [None]:
batch_size = 32
#train_data = TensorDataset(tokenized_texts, train_input_ids, train_masks)
train_data = TensorDataset(train_input_ids, train_masks)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)


Train loop

In [None]:
model = AutoModelWithLMHead.from_pretrained("xlnet-base-cased", return_dict=True)
#model.cuda()




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467042463.0, style=ProgressStyle(descri…




In [None]:
from transformers import AdamW
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]
# This variable contains all of the hyperparemeter information our training loop needs
optimizer = AdamW(optimizer_grouped_parameters,
                     lr=2e-5)

In [None]:
# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  
  
  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()
  
  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    print("Step: %s"%(step))
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    # outputs = model(**inputs, labels=inputs["input_ids"])
    outputs = model(input_ids=b_input_ids, labels=b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    #print(outputs)
    loss = outputs.loss
    logits = outputs.logits
    if(loss is not None):
      train_loss_set.append(loss.item())    
      # Backward pass
      loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    
    
    # Update tracking variables
    if(loss is not None):
      tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Step: 0, batch: [tensor([[   17, 25254, 15707,  ...,     0,     0,     0],
        [  336, 15683,   220,  ...,     0,     0,     0],
        [  209,    42,    85,  ...,     0,     0,     0],
        ...,
        [  839,    44,   343,  ...,     0,     0,     0],
        [16822, 14378,  5305,  ...,     0,     0,     0],
        [ 3039,  4132,   113,  ...,     0,     0,     0]]), tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.]])]
Step: 1, batch: [tensor([[   84,    24,   153,  ...,     0,     0,     0],
        [  130,    35,    26,  ...,     0,     0,     0],
        [  306,   138,     9,  ...,     0,     0,     0],
        ...,
        [   17,  4119, 26234,  ...,     0,     0,     0],
        [ 8711,   110,   611,  ...,     0,     0,     0],
        [   17,    26, 10367,  ...,     0,   

Epoch:  25%|██▌       | 1/4 [2:28:21<7:25:03, 8901.01s/it]

Train loss: 0.025705808474601934
Step: 0, batch: [tensor([[22073,   106,    92,  ...,     0,     0,     0],
        [  238,    26,    23,  ...,     0,     0,     0],
        [  209,   608,  3067,  ...,     0,     0,     0],
        ...,
        [  183,    44,    26,  ...,     0,     0,     0],
        [ 2797,    24,   153,  ...,     0,     0,     0],
        [   17,    10,   936,  ...,     0,     0,     0]]), tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.]])]
Step: 1, batch: [tensor([[ 346,   35,  182,  ...,    0,    0,    0],
        [5008,  497,   24,  ...,    0,    0,    0],
        [  84,   26,   23,  ...,    0,    0,    0],
        ...,
        [  35,   26,   98,  ...,    0,    0,    0],
        [ 169,   17, 7007,  ...,    0,    0,    0],
        [ 330, 9049,   56,  ...,    0,    

Epoch:  50%|█████     | 2/4 [4:56:44<4:56:43, 8901.61s/it]

Train loss: 0.0015958524734033242
Step: 0, batch: [tensor([[   17, 19001, 14045,  ...,     0,     0,     0],
        [   35,    39, 12872,  ...,     0,     0,     0],
        [   32,   205,   252,  ...,     0,     0,     0],
        ...,
        [  200,    71,   144,  ...,     0,     0,     0],
        [  130,    73,   356,  ...,     0,     0,     0],
        [   35,   287,  3134,  ...,     0,     0,     0]]), tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.]])]
Step: 1, batch: [tensor([[   35,    26,    98,  ...,     0,     0,     0],
        [ 2797,    17,    12,  ...,     0,     0,     0],
        [  346,   195,     9,  ...,     0,     0,     0],
        ...,
        [16837,   166,  1606,  ...,     0,     0,     0],
        [   35,   101,     0,  ...,     0,     0,     0],
        [  

Epoch:  75%|███████▌  | 3/4 [7:24:40<2:28:13, 8893.98s/it]

Train loss: 0.0007786145884288511
Step: 0, batch: [tensor([[11910,   126, 11016,  ...,     0,     0,     0],
        [  147,   318,    22,  ...,     0,     0,     0],
        [  448,     9,   323,  ...,     0,     0,     0],
        ...,
        [11870,   110,    31,  ...,     0,     0,     0],
        [ 7705,   597,  4571,  ...,     0,     0,     0],
        [   35,   435,  1907,  ...,     0,     0,     0]]), tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.]])]
Step: 1, batch: [tensor([[ 1370,    94,  6692,  ...,     0,     0,     0],
        [   35,    26,    98,  ...,     0,     0,     0],
        [  169,  1266,  3998,  ...,     0,     0,     0],
        ...,
        [  169,    17,  7007,  ...,     0,     0,     0],
        [   35,   210,    24,  ...,     0,     0,     0],
        [  

Epoch: 100%|██████████| 4/4 [9:52:24<00:00, 8886.17s/it]

Train loss: 0.0004398366964857143





In [None]:
#DEBUG TRAIN LOOP
print(tokenized_texts)



Generate using finetuned model


In [None]:
# prompt = tokenizer("<|endoftext|> All my friends are ", return_tensors="pt", add_special_tokens=True)
# generate_id = model.generate(
#     input_ids = prompt.input_ids,
#     do_sample = True,
#     max_length = 50,
#     num_beams = None,
#     top_k = 5,
#     top_p = 0.85,
#     eos_token_id = None,
#     temperature = 0.9,
#     repetition_penalty = 1.2
# )
# print(tokenizer.decode(generate_id.reshape(-1)))

PADDING_TEXT = """They tryna be cray (Mm, yeah)
They tryna be cray (Mm, yeah)
She wanna meet Carti (Carti)
That bitch is a Barbie (Yeah)
I'ma fuck these hoes (Ooh)
I'm on 730
Got a brand new pack like Kid Cudi (Brand new)
I smoke dope like Kid Cudi
Push up and get the slugs from me (Slime slime)
I'm with all the shits (Slime slime)
She wanna meet Carti (Carti)
That bitch is a Barbie (Yeah)
I'ma fuck these hoes (Ooh)
I'm on 730
Got a brand new pack like Kid Cudi (Brand new)
I smoke dope like Kid Cudi
Push up and get the<eod> </s> <eos>"""

prompt = "All my friends are "
inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")

prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.99, top_k=25)
generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
print(generated)


All my friends are iends are pretty (Mo) I see that I can be (Oh) and they are (Oh) I want to get to (Oh) I need to know (Oh). (..). I don't


In [None]:
PADDING_TEXT = """They tryna be cray (Mm, yeah)
They tryna be cray (Mm, yeah)
She wanna meet Carti (Carti)
That bitch is a Barbie (Yeah)
I'ma fuck these hoes (Ooh)
I'm on 730
Got a brand new pack like Kid Cudi (Brand new)
I smoke dope like Kid Cudi
Push up and get the slugs from me (Slime slime)
I'm with all the shits (Slime slime)
She wanna meet Carti (Carti)
That bitch is a Barbie (Yeah)
I'ma fuck these hoes (Ooh)
I'm on 730
Got a brand new pack like Kid Cudi (Brand new)
I smoke dope like Kid Cudi
Push up and get the<eod> </s> <eos>"""

for i in range(9):
  prompt = "This is a test " + str(i) 
  inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")

  prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
  outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.99, top_k=25)
  generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
  print(generated)

This is a test 0 a test 0.................................................
This is a test 1 a test 1. I get the (!). I do. I want to meet Carti (Carti) I want to be cray. I want to get to this (!). I want to be cray
This is a test 2 a test 2. I smoke the () I like that () That bitch is a bitch () The () I'm on 730. I smoke a group of () (). I smoke ()
This is a test 3 a test 3, a (see). It is on a boy (Oh) It is on the (".....) We are on 730. The bitch is a Barbie ("....) She
This is a test 4 a test 4. But I'm on 730. I can see that they have the hoes (...) They donna be cray (...) They tryna be (() They could be
This is a test 5 a test 5. I have the (!!) to get the (?) We are on 730. I (?) (?) (?) That girl is a (?) I shoot that (?)
This is a test 6 a test 6-------------------------------------------------
This is a test 7 a test 730. I'm on 700 and can get the (()) I'm on (). The () that () is a Barbie (Yes). I smoke dope like Kid Cudi 
This is a test 8 a test 8. He was a new one (Li

Generate with out the box xl-net

In [None]:
from transformers import AutoModelWithLMHead, AutoTokenizer

model = AutoModelWithLMHead.from_pretrained("xlnet-base-cased", return_dict=True)
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
# Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology

PADDING_TEXT = """They tryna be cray (Mm, yeah)
They tryna be cray (Mm, yeah)
She wanna meet Carti (Carti)
That bitch is a Barbie (Yeah)
I'ma fuck these hoes (Ooh)
I'm on 730
Got a brand new pack like Kid Cudi (Brand new)
I smoke dope like Kid Cudi
Push up and get the slugs from me (Slime slime)
I'm with all the shits (Slime slime)
She wanna meet Carti (Carti)
That bitch is a Barbie (Yeah)
I'ma fuck these hoes (Ooh)
I'm on 730
Got a brand new pack like Kid Cudi (Brand new)
I smoke dope like Kid Cudi
Push up and get the<eod> </s> <eos>"""

prompt = "All my friends are "
inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")

prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.99, top_k=25)
generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]



In [None]:
print(generated)

All my friends are iends are a Barbie. So I'm with all the craps because they ((in the past) had) been with a (un)) ((un))) and the (in))) ((un
