In [23]:
import comet_ml
import os
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
 
)
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
import datasets
import numpy as np
import re
from dotenv import load_dotenv

In [2]:
load_dotenv()
COMET_API_KEY = os.getenv("COMET_API_KEY")
os.environ["COMET_LOG_ASSETS"] = "True"

In [3]:
experiment = comet_ml.Experiment(api_key=COMET_API_KEY, project_name='rap-lyrics-generator-llm')

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/artaasd95/rap-lyrics-generator-llm/d6e564e1191c4988b5c3fd2254aa96ec



In [4]:
experiment.log_parameters({
    "num_train_epochs": 5,
    "per_device_train_batch_size": 2,
    # Add any other relevant hyperparameters here
})

In [5]:
rap_lyrics_train_dataset = datasets.load_dataset("nateraw/rap-lyrics-v2", split='train')
#rap_lyrics_train_dataset = rap_lyrics_train_dataset[:int(len(rap_lyrics_train_dataset)*0.7)]
#rap_lyrics_test_dataset = datasets.load_dataset("nateraw/rap-lyrics-v2", split='train')[int(len(rap_lyrics_train_dataset)*0.7):]



In [7]:
model_name = "gpt2"  # You could use a larger model like gpt2-medium for better performance
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = AutoModelForCausalLM.from_pretrained(model_name)
#model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = model.config.eos_token_id


In [8]:
def merge_prompt_and_completion(example):
    prompt = example["text"]
    completion = example["completion"]
    # You can optionally add special tokens or formatting
    merged_text = f"User: {prompt}\nAssistant: {completion}"
    return {"merged_text": merged_text}

merged_dataset = rap_lyrics_train_dataset.map(merge_prompt_and_completion)

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples["merged_text"], truncation=True, max_length=512)

In [10]:
tokenized_dataset = merged_dataset.map(tokenize_function, batched=True)


In [24]:
instruction_template = "### Human:"
response_template = "### Assistant:"
collator = DataCollatorForCompletionOnlyLM(instruction_template=instruction_template, response_template=response_template, tokenizer=tokenizer, mlm=False)



In [20]:
def transform_to_sft_structure(example):
    merged_text = example['merged_text']
    # Split the User and Assistant text
    user_text = merged_text.split('User: ')[1].split('\nAssistant: ')[0].strip()
    assistant_text = merged_text.split('\nAssistant: ')[1].strip()
    
    # Format in the desired template structure
    formatted_text = (
        f"### Human: {user_text}\n"
        f"### Assistant: {assistant_text}"
    )
    return {'text': formatted_text}


In [21]:
transformed_dataset = merged_dataset.map(transform_to_sft_structure, remove_columns=['text', 'num_tokens', 'completion', 'merged_text'])

Map:   0%|          | 0/7319 [00:00<?, ? examples/s]

In [None]:
sft_trainer = SFTTrainer(
    model=model,
    train_dataset=transformed_dataset,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    data_collator=collator,
    
)

sft_trainer.train()

sft_trainer.save_pretrained("checkpoints/gpt2-sft-checkpoints")


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/7319 [00:00<?, ? examples/s]

  super().__init__(
[1;38;5;39mCOMET INFO:[0m An experiment with the same configuration options is already running and will be reused.


  0%|          | 0/2745 [00:00<?, ?it/s]