In [1]:
''' imports '''

import os
import pickle
import random
import re
import shutil
from typing import Dict, List, Tuple

import numpy as np
import torch

from torch.nn.utils.rnn import pad_sequence

from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler

from transformers import (
    WEIGHTS_NAME,
    AdamW,
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)


In [2]:

class LineByLineTextDataset(Dataset):

    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size = 512):

        # no feature cache
        with open(file_path, encoding = 'utf-8') as f:

            lines = [line for line in f.read().splitlines()
                     if (len(line) > 0 and not line.isspace())]

        self.examples = tokenizer.batch_encode_plus(
            lines, add_special_tokens = True, max_length = block_size)['input_ids']

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i], dtype=torch.long)


In [3]:
''' config parameters '''

# Optional input sequence length after tokenization.
# The training dataset will be truncated in block of this size for training
# Default to the model max input length for single sentence inputs (take into account special tokens)
block_size: int = -1

# Number of updates steps to accumulate before performing a backward/update pass
gradient_accumulation_steps: int = 1

# Max gradient norm
max_grad_norm: float = 1.

# If > 0: set total number of training steps to perform. Override num_train_epochs
max_steps: int = -1

# Linear warmup over warmup_steps
warmup_steps: int = 0


''' init train env '''

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = 1


# Set seed
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if n_gpu > 0:
    torch.cuda.manual_seed_all(seed)
    


In [4]:
''' Load pretrained model and tokenizer '''

# load pretrained model and tokenizer
config_class, model_class, tokenizer_class = GPT2Config, GPT2LMHeadModel, GPT2Tokenizer

# init pretrained tokeniser
tokenizer = tokenizer_class.from_pretrained('gpt2', cache_dir = None)


# Our input block size will be the max possible for the model
if block_size <= 0:
    block_size = tokenizer.max_len
else:
    block_size = min(block_size, tokenizer.max_len)


# init config
config = config_class()

# Training new model from scratch
#model = model_class(config = config)

model = model_class.from_pretrained('gpt2', config = config)

# push model to device
model.to(device)



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [5]:
''' init dataset '''

# get training dataset
#file_path = '../data/lyrics/eurovision-lyrics-lines.txt'
#file_path = '../data/lyrics/wikitext-2/wiki.train.tokens'
#file_path = '../data/lyrics/eurovision-lyrics-en-lines'
#file_path = '../data/lyrics/eurovision-lyrics-lines-full'
file_path = '../data/lyrics/eurovision-lyrics-en-lines-lrg'

train_dataset = LineByLineTextDataset(
    tokenizer, file_path = file_path, block_size = block_size)
#train_dataset = TextDataset(tokenizer, args, file_path=file_path, block_size = block_size)


In [6]:
""" init dataloader, optimiser """

train_batch_size = 1

def collate(examples: List[torch.Tensor]):

    if tokenizer._pad_token is None:

        return pad_sequence(examples, batch_first = True)

    return pad_sequence(examples, batch_first = True, padding_value = tokenizer.pad_token_id)

# init random sampler on dataset
train_sampler = RandomSampler(train_dataset)

# init dataloader
train_dataloader = DataLoader(
    train_dataset, sampler = train_sampler, batch_size = train_batch_size, collate_fn = collate)


In [7]:
''' init optimiser, learning rate scheduler '''

# The initial learning rate for Adam
learning_rate: float = 5e-5

# Weight decay if we apply some
weight_decay: float = 0.

# Epsilon for Adam optimizer
adam_epsilon: float = 1e-8


# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": weight_decay,
    },
    {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]

# init optimiser on model parameters
optimizer = AdamW(optimizer_grouped_parameters, lr = learning_rate, eps = adam_epsilon)



# set training epochs
num_train_epochs = 3

# get total steps
if max_steps > 0:
    t_total = max_steps
    num_train_epochs = max_steps // (len(train_dataloader) // gradient_accumulation_steps) + 1
else:
    t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs
    

# init learning rate scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps = warmup_steps, num_training_steps = t_total)


In [8]:
''' perform training '''

global_step = 0
epochs_trained = 0
steps_trained_in_current_epoch = 0

tr_loss, logging_loss = 0.0, 0.0

model.resize_token_embeddings(len(tokenizer))

# zero gradients
model.zero_grad()

# iterate epochs
for epoch in range(num_train_epochs):

    # get data batch from dataloader
    for step, batch in enumerate(train_dataloader):

        # Skip past any already trained steps if resuming training
        if steps_trained_in_current_epoch > 0:
            steps_trained_in_current_epoch -= 1
            continue


        # unpack batch data
        inputs, labels = (batch, batch)

        # push data to device
        inputs = inputs.to(device)
        labels = labels.to(device)


        # set model to train
        model.train()

        try:
        
            # perform forward pass through model
            outputs = model(inputs, labels=labels)

            # obtain loss; model outputs are always tuple in transformers
            loss = outputs[0]

            # gradient accumulation
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            # backprop loss
            loss.backward()

            # store loss
            tr_loss += loss.item()


            if (step + 1) % gradient_accumulation_steps == 0:

                # perform gradient normalisation
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                # step optimiser
                optimizer.step()

                # Update learning rate schedule
                scheduler.step()


                model.zero_grad()
                global_step += 1

        except:
            pass
            
        if global_step % 100 == 0:
            print(global_step, tr_loss / global_step)

        if max_steps > 0 and global_step > max_steps:
            break
    if max_steps > 0 and global_step > max_steps:
        break
    

100 4.8972459363937375
100 4.8972459363937375
200 4.803398907184601
300 4.768976623217265
400 4.656124718338251
500 4.624998973727227
600 4.584139998654525
700 4.567651105182511
800 4.5199873229116205
900 4.469655289848646
1000 4.4295827458500865
1100 4.395757151625373
1200 4.381694662968318
1300 4.352846398170178
1400 4.322022956780025
1500 4.290419712305069
1600 4.260088528208435
1700 4.252467391455875
1800 4.238117224607203
1900 4.2141116218504155
2000 4.190335895985365
2100 4.17041817341532
2200 4.148430817113681
2300 4.1250156825132995
2400 4.115036492608487
2500 4.094824813306332
2600 4.094347468282168
2700 4.082521997149344
2800 4.078683818908674
2900 4.060866205312054
3000 4.048018341590961
3100 4.0334402210770115
3200 4.019374075257219
3300 4.0105228100626755
3400 3.997905661897624
3500 3.987107959708997
3600 3.9669236762118008
3700 3.9563943721474826
3800 3.9418052757256907
3900 3.9322917248958196
4000 3.923089219920337
4100 3.9204962643958265
4200 3.9098714473608527
4300 3.9

KeyboardInterrupt: 

In [10]:
''' saving '''

model.save_pretrained('../data/lyrics/model-lyrics-best')
#tokenizer.save_pretrained(output_dir)


In [8]:
# Load a trained model and vocabulary that you have fine-tuned
model = model_class.from_pretrained('../data/lyrics/model-en-lyrics-03/')
#tokenizer = tokenizer_class.from_pretrained(args.output_dir)
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [145]:
''' perform text generation '''

seeds = ['love', 'peace', 'power', 'flight', 'universe', 'the world', 'imagination', 'passion', 'dreams', 'open']

store = []

for seed in seeds:

    # input prompt to model
    #prompt: str = 'magic.'
    prompt: str = '{}!'.format(seed)

    # max length desired output
    length: int = 15

    # Token at which text generation is stopped
    #stop_token: str = None
    stop_token: str = ','

    # temperature of 1.0 has no effect, lower tend toward greedy sampling
    temperature:float = 1.


    # primarily useful for CTRL model; in that case, use 1.2
    repetition_penalty: float = 1.


    k: int = 0
    p: float = 0.9


    # The number of samples to generate
    num_return_sequences: int = 64


    # Initialize the model and tokenizer
    #model_class, tokenizer_class = (GPT2LMHeadModel, GPT2Tokenizer)

    #tokenizer = tokenizer_class.from_pretrained('gpt2')
    #model = model_class.from_pretrained('../data/lyrics/model-en-lyrics-02')
    #model.to(device)


    MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop

    def adjust_length_to_model(length, max_sequence_length):
        if length < 0 and max_sequence_length > 0:
            length = max_sequence_length
        elif 0 < max_sequence_length < length:
            length = max_sequence_length  # No generation bigger than model size
        elif length < 0:
            length = MAX_LENGTH  # avoid infinite loop
        return length

    length = adjust_length_to_model(
        length, max_sequence_length = model.config.max_position_embeddings)


    # encode prompt text, push to device
    prompt_text = prompt
    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens = False, return_tensors = "pt")
    encoded_prompt = encoded_prompt.to(device)


    # generate output sequence
    output_sequences = model.generate(

        input_ids = encoded_prompt,

        max_length = length + len(encoded_prompt[0]),
        temperature = temperature,
        top_k = k,
        top_p = p,
        repetition_penalty = repetition_penalty,
        do_sample = True,
        num_return_sequences = num_return_sequences,
    )


    # Remove the batch dimension when returning multiple sequences
    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()

    generated_sequences = []

    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):

        generated_sequence = generated_sequence.tolist()

        # Decode text
        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces = True)

        # Remove all text after the stop token
        text = text[: text.find(stop_token) if stop_token else None]

        # Add the prompt at the beginning of the sequence.
        # Remove the excess text that was used for pre-processing
        total_sequence = (
            prompt_text + text[len(tokenizer.decode(
                encoded_prompt[0], clean_up_tokenization_spaces = True)) :] )

        generated_sequences.append(total_sequence[len(prompt):])
        #print(total_sequence)

    #return generated_sequences
    
    store.append(generated_sequences)
    
    for line in generated_sequences:
        #store.append(line)
        print(line)


 Thank you
 We're the only ones left to rock and roll forever more. Oh yea
 I’ll fly over it once again
 Love your love
 Love! (Love!) Love! Love! Love! Love! Love
 (You're the one I love) Oh
 Rock! Rock! Rock! Rock! Rock! Rock! Rock! Roc
 What are you doing? We’re going up! Why do w
 I'm gonna shine a light in every corner of my dreams
 Beautiful
 What a wonderful world we've got to have a love about? – W
 – Hallelujah for everybody! (Chorus 2000) (Fl
 Love! Love! Love! (Love!) - Aphrodite Lis
 love! love! love! love! love! love! love! lov
 Love! Love! Love! Love! Love! Love! (Let
 Thank you
 I love you! I love you! I love you! You love m
 Love! Love! Love! Love! Love! Love! Love! Lov
 I need you
 Love! Love! Love! Love! Love! Love! Love! Lov
 – rainbow (Ukraine 1984) – Kingdom of Love (Sloveni
 We've never seen them separation! (And it has never been…) Di
 I'm gonna go 'cause I just wanna sing you… To lov
 I’m a puppet on a string
 Alone
 Love will survive; love will survive;...we’ll

In [146]:

for i in range(len(seeds)):

    with open('../data/output/lyrics-{}.txt'.format(seeds[i]), 'w', encoding='utf-8') as file:
        file.writelines('\n'.join(store[i]))


In [109]:
''' perform text generation '''

# input prompt to model
prompt: str = 'fly'

# max length desired output
length: int = 20

# Token at which text generation is stopped
#stop_token: str = None
stop_token: str = ','

# temperature of 1.0 has no effect, lower tend toward greedy sampling
temperature:float = 1.


# primarily useful for CTRL model; in that case, use 1.2
repetition_penalty: float = 1.


k: int = 0
p: float = 0.9


# The number of samples to generate
num_return_sequences: int = 1

    
for i in range(4):

    # Initialize the model and tokenizer
    #model_class, tokenizer_class = (GPT2LMHeadModel, GPT2Tokenizer)

    #tokenizer = tokenizer_class.from_pretrained('gpt2')
    #model = model_class.from_pretrained('../data/lyrics/model-en-lyrics-02')
    #model.to(device)


    MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop

    def adjust_length_to_model(length, max_sequence_length):
        if length < 0 and max_sequence_length > 0:
            length = max_sequence_length
        elif 0 < max_sequence_length < length:
            length = max_sequence_length  # No generation bigger than model size
        elif length < 0:
            length = MAX_LENGTH  # avoid infinite loop
        return length

    length = adjust_length_to_model(
        length, max_sequence_length = model.config.max_position_embeddings)


    # encode prompt text, push to device
    prompt_text = prompt
    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens = False, return_tensors = "pt")
    encoded_prompt = encoded_prompt.to(device)


    # generate output sequence
    output_sequences = model.generate(

        input_ids = encoded_prompt,

        max_length = length + len(encoded_prompt[0]),
        temperature = temperature,
        top_k = k,
        top_p = p,
        repetition_penalty = repetition_penalty,
        do_sample = True,
        num_return_sequences = num_return_sequences,
    )


    # Remove the batch dimension when returning multiple sequences
    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()

    generated_sequences = []

    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):

        generated_sequence = generated_sequence.tolist()

        # Decode text
        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces = True)

        # Remove all text after the stop token
        text = text[: text.find(stop_token) if stop_token else None]

        # Add the prompt at the beginning of the sequence.
        # Remove the excess text that was used for pre-processing
        total_sequence = (
            prompt_text + text[len(tokenizer.decode(
                encoded_prompt[0], clean_up_tokenization_spaces = True)) :] )

        generated_sequences.append(total_sequence)
        print(total_sequence)
        
        #print(prompt)
        prompt = total_sequence[-5:]

    #return generated_sequences


fly in the interior of your mind and you decide what I should say?’s happened?
ened? Was it a good way to play? Was it just a little nerve? We should have stayed u
yed uidama
idama comand soe comand


In [None]:
# 'I said in Dutch' words to stay together all the time, jeg er viæm kren inte din!