In [4]:
import os

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW

In [5]:
# Define a custom dataset class
class LyricsDataset(Dataset):
    """Turn rows of a lyrics DataFrame into causal-LM training examples.

    Each row is rendered as a single prompt-like string
    ``"Genre: ..., Release_Year: ..., Artist: ..., Lyrics: ..."`` and tokenized
    to a fixed length.

    Args:
        df: DataFrame with the columns ``Genre``, ``Release_Year``, ``Artist``
            and ``Lyrics``.
        tokenizer: Callable tokenizer that accepts ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` and returns a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_length: Number of tokens every example is padded/truncated to.
    """

    # Label value skipped by the language-model loss (the default
    # ``ignore_index`` of PyTorch's cross-entropy loss).
    IGNORE_INDEX = -100

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (one training example per row)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Build the tokenized example for the row at position ``idx``.

        Returns:
            dict with 1-D tensors ``input_ids``, ``attention_mask`` and
            ``labels``. ``labels`` equals ``input_ids`` on real tokens and
            ``IGNORE_INDEX`` on padding positions.
        """
        # Positional lookup, so a non-contiguous index (e.g. after dropna) is fine.
        row = self.df.iloc[idx]
        genre = row['Genre']
        year = row['Release_Year']
        artist = row['Artist']
        lyrics = row['Lyrics']

        # Format the input text
        input_text = f"Genre: {genre}, Release_Year: {year}, Artist: {artist}, Lyrics: {lyrics}"
        encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        # For text generation the labels are the input ids, but padding must
        # not contribute to the loss. The mask is used (rather than comparing
        # against the pad id) because the pad token may be the same id as a
        # genuine token such as EOS. clone() keeps input_ids untouched.
        labels = input_ids.clone()
        labels[attention_mask == 0] = self.IGNORE_INDEX

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

In [16]:
# Pretrained GPT-2 weights and the matching tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# A pad token is needed for padding='max_length'; reuse the EOS token for it
tokenizer.pad_token = tokenizer.eos_token

In [32]:
# Location of the lyrics CSV. Set the LYRICS_CSV environment variable to run
# the notebook on another machine; the default is the original local path.
lyrics_csv_path = os.environ.get(
    'LYRICS_CSV',
    r'C:\Users\carlf\Documents\GitHub\lyrics_generator\data\05_lyrics_genius_\lyrics_genius.csv',
)
# Rows without lyrics cannot be turned into training text, so drop them
df = pd.read_csv(lyrics_csv_path).dropna(subset=['Lyrics'])

In [18]:
# Wrap the DataFrame as a torch Dataset (128 tokens per example) and
# serve it in shuffled batches of 8
dataset = LyricsDataset(df=df, tokenizer=tokenizer, max_length=128)
dataloader = DataLoader(dataset=dataset, batch_size=8, shuffle=True)

In [19]:
# Set up the optimizer
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are passed explicitly to mirror what are believed to be
# the deprecated class's defaults (eps=1e-6, weight_decay=0.0), so the update
# rule stays the same; torch's own defaults differ. TODO confirm against the
# installed transformers version.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)



In [20]:
# Prefer CUDA when it is available, otherwise fall back to the CPU,
# then place the model on that device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [21]:
# Fine-tuning: one optimizer step per batch, mean loss reported per epoch
for epoch in range(3):  # Adjust the number of epochs as needed
    model.train()
    running_loss = 0
    for batch in dataloader:
        # Move the three tensors the model consumes onto the training device
        model_inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        optimizer.zero_grad()
        loss = model(**model_inputs).loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    mean_loss = running_loss / len(dataloader)
    print(f"Epoch {epoch + 1}, Loss: {mean_loss}")

KeyboardInterrupt: 

In [None]:
# Persist the fine-tuned weights and the tokenizer side by side in one folder
for artifact in (model, tokenizer):
    artifact.save_pretrained("./lyrics_generator")

In [None]:

# Function to generate lyrics
def generate_lyrics(genre=None, year=None, artist=None, max_length=100):
    """Generate lyrics from the fine-tuned model, optionally conditioned on metadata.

    The prompt reproduces the training text layout used by ``LyricsDataset``
    (``"Genre: ..., Release_Year: ..., Artist: ..., Lyrics:"``); any field that
    is ``None`` or an empty string is left out.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        genre: Genre to condition on, or None to omit.
        year: Release year to condition on, or None to omit.
        artist: Artist to condition on, or None to omit.
        max_length: Total length in tokens of the returned sequence,
            prompt included.

    Returns:
        The decoded text (prompt followed by the generated continuation).
    """
    # Build the input prompt dynamically, keeping the fields in the same order
    # as the training text so the model sees a prefix it was trained on.
    fields = (("Genre", genre), ("Release_Year", year), ("Artist", artist))
    parts = [
        f"{label}: {value}"
        for label, value in fields
        if value is not None and value != ""
    ]
    parts.append("Lyrics:")
    input_text = ", ".join(parts)

    # Tokenize the input text; the attention mask is passed on explicitly
    # because the pad token is the EOS token and cannot be inferred reliably.
    encoded = tokenizer(input_text, return_tensors='pt')
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    # The training loop leaves the model in train mode; switch to eval so
    # dropout does not perturb generation.
    model.eval()

    # Generate lyrics
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode and return the output
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Demo: sample lyrics conditioned on genre, year and artist
sample_lyrics = generate_lyrics(genre="Rock", year=1990, artist="Nirvana")
print(sample_lyrics)