In [2]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

In [4]:
# Load the pretrained GPT-2 tokenizer and language-model head.
# NOTE: no extra start/end tokens are added here; the stock GPT-2 vocabulary is used.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
# Use the GPU when one is available; fall back to CPU instead of crashing.
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')
# Keep the embedding matrix the same size as the tokenizer vocabulary.
model.resize_token_embeddings(len(tokenizer))

Downloading: 100%|██████████| 1.04M/1.04M [00:00<00:00, 6.20MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 4.63MB/s]
Downloading: 100%|██████████| 665/665 [00:00<00:00, 111kB/s]
Downloading: 100%|██████████| 548M/548M [00:37<00:00, 14.5MB/s] 


In [None]:
"""
Prepare and load dataset
"""

In [7]:
"""
Train model
"""
def train_model(
    dataset,
    model,
    tokenizer,
    batch_size=16,
    epochs=5,
    lr=2e-5,
    max_seq_len=400,
    warmup_steps=200,
    gpt2_type='gpt2',
    output_dir='',
    output_prefix='wreckgar',
    test_mode=False,
    save_model_on_epoch=False):
    """Fine-tune a causal language model on ``dataset`` using gradient accumulation.

    Samples are fed to the model one at a time (DataLoader batch size 1) and
    gradients are accumulated over ``batch_size`` samples before each optimizer
    step, so ``batch_size`` is the *effective* batch size.

    Args:
        dataset: Map-style dataset (needs ``__len__``) whose items are passed
            to the model as both inputs and labels.
            Assumes each item is a 1-D tensor of token ids -- TODO confirm.
        model: Model called as ``model(inp, labels=inp)`` whose first output is
            the loss.
        tokenizer: Not used by this function; kept for interface compatibility.
        batch_size: Number of samples accumulated per optimizer step.
        epochs: Number of passes over ``dataset``.
        lr: Peak learning rate for AdamW.
        max_seq_len: Not used by this function; kept for interface compatibility.
        warmup_steps: Number of optimizer steps of linear learning-rate warmup.
        gpt2_type, output_dir, output_prefix, test_mode, save_model_on_epoch:
            Not used by this function; kept for interface compatibility.

    Returns:
        The same model instance, left in training mode on the training device.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Use the GPU when one is available; otherwise train on CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.train()

    # One sample per DataLoader batch (sequences may differ in length);
    # reshuffled every epoch.
    dataload = DataLoader(dataset, batch_size=1, shuffle=True)

    # Accumulated gradients are flushed at the end of every epoch, so each
    # epoch performs ceil(len(dataload) / batch_size) optimizer steps.
    steps_per_epoch = (len(dataload) + batch_size - 1) // batch_size
    total_steps = steps_per_epoch * epochs

    opt = AdamW(model.parameters(), lr=lr)

    # Linear warmup for `warmup_steps`, then linear decay to 0 at the last
    # optimizer step. The schedule must span the whole run: a shorter horizon
    # drives the learning rate to 0 long before training ends.
    sched = get_linear_schedule_with_warmup(
        opt,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps,
    )

    opt.zero_grad()

    for e in range(epochs):
        print(f'Training epoch {e}')

        epoch_loss = 0.0
        seen = 0
        pending = 0  # samples whose gradients have not been applied yet

        # Iterate the loader directly so tqdm can show the total.
        for inp in tqdm(dataload):
            inp = inp.to(device)

            out = model(inp, labels=inp)
            loss = out[0]

            # Scale so the accumulated gradient is the mean over the
            # effective batch rather than the sum.
            (loss / batch_size).backward()

            epoch_loss += loss.item()
            seen += 1
            pending += 1

            # Step only once a full effective batch has been accumulated.
            if pending == batch_size:
                opt.step()
                sched.step()
                opt.zero_grad()
                pending = 0

        # Apply gradients from a trailing partial batch instead of dropping
        # them or leaking them into the next epoch.
        if pending:
            opt.step()
            sched.step()
            opt.zero_grad()

        # Report the mean loss as a plain float.
        print('loss: ', epoch_loss / max(seen, 1))

    return model


# Fine-tune the pretrained model with the default hyperparameters.
# NOTE(review): `dataset` must already be defined by the data-preparation cell.
model=train_model(dataset, model, tokenizer)

In [8]:
"""
Generate Recipes
"""

def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=30, # maximum number of tokens sampled per entry
    top_p=0.8,
    temperature=1.,
):
    """Sample ``entry_count`` continuations of ``prompt`` with nucleus (top-p) sampling.

    Each entry starts from the encoded prompt and is extended one token at a
    time until the end-of-text token is sampled or ``entry_length`` tokens have
    been added.

    Args:
        model: Callable returning next-token logits as its ``logits`` attribute
            or as its first element. It is switched to eval mode.
        tokenizer: Object providing ``encode(str) -> list[int]`` and
            ``decode(list[int]) -> str``.
        prompt: Text every generated entry starts from.
        entry_count: Number of entries to generate.
        entry_length: Maximum number of tokens sampled per entry.
        top_p: Cumulative-probability threshold for nucleus sampling.
        temperature: Divisor applied to the logits; non-positive values are
            treated as 1.0.

    Returns:
        List of ``entry_count`` decoded strings. Entries that hit
        ``entry_length`` without sampling end-of-text get ``<|endoftext|>``
        appended.

    Raises:
        ValueError: If ``prompt`` encodes to zero tokens.
    """
    model.eval()

    # Build inputs on the model's device so GPU-resident models work as well.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    prompt_ids = tokenizer.encode(prompt)
    if len(prompt_ids) == 0:
        raise ValueError("prompt must encode to at least one token")
    prompt_tensor = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)

    # Loop-invariant values, computed once.
    eos_ids = set(tokenizer.encode("<|endoftext|>"))
    scale = temperature if temperature > 0 else 1.0
    filter_value = -float("Inf")

    generated_list = []

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False
            generated = prompt_tensor.clone()

            for i in range(entry_length):
                # No labels: only the logits are needed, so no loss is computed.
                outputs = model(generated)
                logits = outputs.logits if hasattr(outputs, "logits") else outputs[0]
                logits = logits[:, -1, :] / scale

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                # Mask tokens outside the nucleus. The mask is shifted right by
                # one so the first token crossing top_p is kept, and the most
                # likely token is never removed.
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = False

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token.item() in eos_ids:
                    entry_finished = True
                    break

            # Index the batch dimension (instead of squeeze()) so one-token
            # sequences stay 1-D; tolist() also works for tensors on the GPU.
            output_text = tokenizer.decode(generated[0].tolist())
            if not entry_finished:
                output_text = f"{output_text}<|endoftext|>"
            generated_list.append(output_text)

    return generated_list

#Function to generate multiple sentences. Test data should be a dataframe
def text_generation(test_data):
    """Generate one continuation for every prompt in ``test_data['Lyric']``.

    Uses the module-level ``model`` and ``tokenizer``. The model is moved to
    the CPU before generation.

    Args:
        test_data: DataFrame with a ``'Lyric'`` column of prompt strings.

    Returns:
        A list with one element per row, in row order; each element is the
        list returned by ``generate`` for that prompt (``entry_count=1``).
    """
    # Move the model once instead of on every iteration.
    cpu_model = model.to('cpu')

    # Iterate over the column values rather than looking rows up by label, so
    # frames whose index is not 0..n-1 (e.g. after a train/test split) work.
    return [
        generate(cpu_model, tokenizer, lyric, entry_count=1)
        for lyric in test_data['Lyric']
    ]

#Run the functions to generate the lyrics
# NOTE(review): `test_set` is not defined in the visible notebook (this cell raised
# NameError); text_generation expects a DataFrame with a 'Lyric' column.
generated_lyrics = text_generation(test_set)

NameError: name 'test_set' is not defined

In [9]:
"""
Performance Evaluation
"""

''