In [7]:
import comet_ml
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
import datasets
from transformers import logging, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from itertools import combinations
import numpy as np
import api_keys
import re

In [8]:
experiment = comet_ml.Experiment(api_key=api_keys.comet, project_name='rap-lyrics-generator-llm')

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : poised_hamburger_607
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/artaasd95/rap-lyrics-generator-llm/13404e1c0d7241b4baf92d2c6d045d77
[1;38;5;39mCOMET INFO:[0m   Parameters:
[1;38;5;39mCOMET INFO:[0m     num_train_epochs            : 3
[1;38;5;39mCOMET INFO:[0m     per_device_train_batch_size : 2
[1;38;5;39mCOMET INFO:[0m   Uploads:
[1;38;5;39mCOMET INFO:[0m     environment details      : 1
[1;38;5;39mCOMET INFO:[0m     filename                 : 1
[1;38;5;39mCOMET INFO:[0m     git metadata             : 1
[1



In [9]:
experiment.log_parameters({
    "num_train_epochs": 5,
    "per_device_train_batch_size": 2,
    # Add any other relevant hyperparameters here
})

In [11]:
rap_lyrics_train_dataset = datasets.load_dataset("nateraw/rap-lyrics-v2", split='train')
#rap_lyrics_train_dataset = rap_lyrics_train_dataset[:int(len(rap_lyrics_train_dataset)*0.7)]
#rap_lyrics_test_dataset = datasets.load_dataset("nateraw/rap-lyrics-v2", split='train')[int(len(rap_lyrics_train_dataset)*0.7):]

In [15]:
model_name = "gpt2"  # You could use a larger model like gpt2-medium for better performance
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = AutoModelForCausalLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(50258, 768)

In [16]:
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=8092)

# Tokenize dataset
tokenized_train_dataset = rap_lyrics_train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Create a data collator for dynamic batching
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# tokenized_test_dataset = rap_lyrics_train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
# data_collator_test = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map:   0%|          | 0/7319 [00:00<?, ? examples/s]

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=100,
    report_to="comet_ml",  # Ensure training logs are sent to Comet
)

# Instantiate Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    #eval_dataset=tokenized_test_dataset,
    compute_metrics=None,  # Add compute_metrics function if needed
)

# Fine-tune the model
trainer.train()

[1;38;5;39mCOMET INFO:[0m An experiment with the same configuration options is already running and will be reused.


  0%|          | 0/18300 [00:00<?, ?it/s]

{'loss': 3.2603, 'grad_norm': 15.812793731689453, 'learning_rate': 4.9726775956284156e-05, 'epoch': 0.03}
{'loss': 2.988, 'grad_norm': 11.727437019348145, 'learning_rate': 4.945355191256831e-05, 'epoch': 0.05}
{'loss': 3.1803, 'grad_norm': 7.573428630828857, 'learning_rate': 4.918032786885246e-05, 'epoch': 0.08}


In [None]:
# Save the fine-tuned model
model.save_pretrained('./fine_tuned_rap_model')
tokenizer.save_pretrained('./fine_tuned_rap_model')

In [None]:
# metrics = trainer.evaluate(eval_dataset=tokenized_test_dataset)
# experiment.log_metrics(metrics)

In [None]:
experiment.end()