In [5]:
import comet_ml
import os
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, DataCollatorWithPadding
from trl import SFTTrainer
import datasets
from transformers import logging, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from itertools import combinations
import numpy as np
import re
from dotenv import load_dotenv

In [None]:
# Load variables from a local .env file into the process environment first,
# so the Comet key can be read from the environment on the next line.
load_dotenv()
COMET_API_KEY = os.environ.get("COMET_API_KEY")  # None when the variable is unset

# Set before any Comet/Trainer object is created.
# NOTE(review): presumably read by the Comet integration to enable asset
# logging — confirm against the integration docs.
os.environ["COMET_LOG_ASSETS"] = "True"

In [None]:
# Create the Comet experiment that the hyperparameters are logged to.
# COMET_API_KEY is the string returned by os.getenv (or None when unset), so it
# is passed directly; the previous `.comet` attribute access raised
# AttributeError because str/None have no such attribute.
experiment = comet_ml.Experiment(
    api_key=COMET_API_KEY,
    project_name='rap-lyrics-generator-llm',
)

In [None]:
# Record the run's hyperparameters on the Comet experiment.
# These values mirror the training configuration used in the SFTTrainer cell
# of this notebook (1 epoch, batch size 4); keep the two in sync so the
# experiment record describes the run that was actually executed.
experiment.log_parameters({
    "num_train_epochs": 1,
    "per_device_train_batch_size": 4,
    # Add any other relevant hyperparameters here
})

In [6]:
# Fetch the 'train' split of the rap-lyrics corpus from the Hugging Face Hub.
rap_lyrics_train_dataset = datasets.load_dataset(
    "nateraw/rap-lyrics-v2",
    split='train',
)
# NOTE(review): no hold-out set is created; the whole split is used for
# training. A 70/30 split was sketched here earlier via slicing — if it is
# revived, `Dataset.train_test_split` is presumably the better tool, since
# slicing a Dataset is not expected to return a Dataset (TODO confirm).

README.md:   0%|          | 0.00/518 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [8]:
# Base checkpoint used for both the tokenizer and the causal LM.
model_name = "gpt2"  # You could use a larger model like gpt2-medium for better performance

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Register a dedicated '[PAD]' token when the tokenizer has no padding token,
# so that batches can be padded.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Resize the embedding matrix to the (possibly enlarged) vocabulary; the
# returned embedding module is the cell's displayed output.
model.resize_token_embeddings(len(tokenizer))


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(50258, 768)

In [19]:
def merge_prompt_and_completion(example):
    """Combine one record's prompt and completion into a single training string.

    Args:
        example: Mapping with a "text" entry (used as the prompt) and a
            "completion" entry.

    Returns:
        A dict with one key, "merged_text", holding the prompt and completion
        laid out as a "User: ..." line followed by an "Assistant: ..." line.

    Raises:
        KeyError: If "text" or "completion" is missing from ``example``.
    """
    user_turn, assistant_turn = example["text"], example["completion"]
    # Special tokens or other formatting could optionally be added here.
    return {"merged_text": f"User: {user_turn}\nAssistant: {assistant_turn}"}

# Apply the merge per example; the returned "merged_text" field is added to
# each record of the new dataset.
merged_dataset = rap_lyrics_train_dataset.map(
    merge_prompt_and_completion,
)

Map:   0%|          | 0/7319 [00:00<?, ? examples/s]

In [22]:
# The tokenizer created together with the model is reused here on purpose.
# Re-loading it from `model_name` at this point would throw away the '[PAD]'
# token that the model's embedding matrix was resized for, leaving the
# tokenizer and the model out of sync. That tokenizer already has a pad token,
# so no eos-as-pad fallback is needed.

def tokenize_function(examples):
    """Tokenize the "merged_text" field of a batch of examples.

    Args:
        examples: Batch mapping with a "merged_text" entry, as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer's output for the batch, with each sequence truncated to
        at most 512 tokens. No padding is applied here; that is left to the
        data collator.
    """
    return tokenizer(examples["merged_text"], truncation=True, max_length=512)

In [23]:
# Tokenize the merged text in batches; the tokenizer's output fields are added
# as new columns alongside the existing ones.
tokenized_dataset = merged_dataset.map(
    tokenize_function,
    batched=True,
)


Map:   0%|          | 0/7319 [00:00<?, ? examples/s]

In [24]:
# Collator for causal-LM fine-tuning: pads each batch and, with mlm=False,
# builds `labels` from `input_ids` so the model can return a loss.
# DataCollatorWithPadding only pads and produces no labels, which leaves the
# trainer without a loss to optimize. Truncation to 512 tokens already
# happens at tokenization time, so no max_length is needed here.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Training configuration. SFTTrainer takes this through `args`; it has no
# `train_kwargs` parameter.
# No evaluation strategy is set because no eval dataset is supplied —
# requesting step-wise evaluation without one makes the trainer fail.
training_args = TrainingArguments(
    output_dir="./trl-sft-checkpoints",
    per_device_train_batch_size=4,
    num_train_epochs=1,
    logging_steps=50,
    report_to=["comet_ml"],  # <-- Send logs to Comet.ml
    run_name="my-comet-run",  # Optional name for your run
)

# Fine-tune the model object loaded (and resized) earlier in the notebook.
# SFTTrainer expects the model via `model=`; `model_name=` is not a parameter.
# NOTE(review): `dataset_text_field`, `max_seq_length` and `tokenizer` are kept
# as originally written. Newer TRL releases are believed to move the first two
# into SFTConfig and rename `tokenizer` to `processing_class` — confirm against
# the installed TRL version. The dataset is already tokenized, so the text
# field should not be needed for preprocessing.
sft_trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    dataset_text_field="merged_text",  # Tells SFTTrainer which field to read
    max_seq_length=512,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

sft_trainer.train()

# Trainer objects expose `save_model`, not `save_pretrained`.
sft_trainer.save_model("checkpoints/gpt2-sft-checkpoints")