# Style-Based Rap Lyrics Generation

This notebook implements style-based rap lyrics generation that can mimic specific rappers' styles based on reference lyrics.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import numpy as np
from tqdm import tqdm
import os
import json

In [None]:
class RapStyleDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes "style reference + prompt" examples.

    The base class is spelled out as ``torch.utils.data.Dataset`` on purpose:
    the notebook imports a ``Dataset`` name from both ``torch.utils.data`` and
    ``datasets``, and the later import wins, so a bare ``Dataset`` would not
    be the PyTorch class.

    Args:
        dataset_path: Path to a JSON file. It is indexed by integer position
            and each record must provide 'style_reference' and 'prompt'
            (presumably a list of dicts, as written by the notebook's
            preparation step).
        tokenizer: Callable tokenizer invoked with ``truncation``,
            ``max_length``, ``padding`` and ``return_tensors`` keyword
            arguments; it must return 'input_ids' and 'attention_mask'.
        max_length: Length every example is truncated/padded to.
    """

    def __init__(self, dataset_path, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

        # padding='max_length' needs a pad token. Tokenizers that ship without
        # one fall back to EOS here, the same choice the generation code makes
        # with pad_token_id=eos_token_id. Note this mutates the tokenizer.
        if getattr(tokenizer, 'pad_token', '') is None:
            tokenizer.pad_token = getattr(tokenizer, 'eos_token', None)

        # Read as UTF-8 explicitly so lyrics with non-ASCII characters load
        # the same way regardless of the platform's default locale encoding.
        with open(dataset_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)

    def __len__(self):
        """Return the number of records loaded from the JSON file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx``.

        Returns:
            A dict with 'input_ids' and 'attention_mask' (the tokenizer output
            with its leading batch axis removed) plus the raw
            'style_reference' and 'prompt' strings.
        """
        item = self.data[idx]

        style_reference = item['style_reference']
        prompt = item['prompt']

        # Same text layout the generator uses at inference time.
        input_text = f"Style Reference: {style_reference}\nPrompt: {prompt}"

        tokens = self.tokenizer(
            input_text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )

        return {
            # squeeze(0) removes only the batch axis; a bare squeeze() would
            # also drop the sequence axis when max_length == 1.
            'input_ids': tokens['input_ids'].squeeze(0),
            'attention_mask': tokens['attention_mask'].squeeze(0),
            'style_reference': style_reference,
            'prompt': prompt
        }

In [None]:
class StyleBasedGenerator:
    """Generate lyrics from a causal language model conditioned on a style.

    Args:
        model_path: Name or path passed to
            ``AutoModelForCausalLM.from_pretrained``.
        tokenizer_path: Name or path passed to
            ``AutoTokenizer.from_pretrained``.
    """

    def __init__(self, model_path, tokenizer_path):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        self.model = AutoModelForCausalLM.from_pretrained(model_path)

    def generate_lyrics(self, style_reference, prompt, max_length=200, num_return_sequences=1):
        """Sample continuations of ``prompt`` in the described style.

        Args:
            style_reference: Text describing the style to imitate.
            prompt: Opening lines the model should continue.
            max_length: Forwarded to ``generate`` as ``max_length``; in
                Transformers this bounds the total sequence length, prompt
                tokens included.
            num_return_sequences: Number of samples to draw.

        Returns:
            A list of ``num_return_sequences`` strings holding only the newly
            generated text, stripped of surrounding whitespace.
        """
        input_text = f"Style Reference: {style_reference}\nPrompt: {prompt}"

        inputs = self.tokenizer(input_text, return_tensors="pt")
        input_ids = inputs['input_ids']
        prompt_token_count = input_ids.shape[-1]

        outputs = self.model.generate(
            input_ids,
            # Pass the mask explicitly: pad and EOS share an id here, so the
            # model cannot reliably infer it from the ids alone.
            attention_mask=inputs.get('attention_mask'),
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id
        )

        # A causal LM returns prompt + continuation, so drop the prompt by
        # token position. Splitting the decoded text on `prompt` loses output
        # whenever the lyrics repeat the prompt, and returns the whole input
        # when decoding does not reproduce the prompt string exactly.
        return [
            self.tokenizer.decode(
                sequence[prompt_token_count:], skip_special_tokens=True
            ).strip()
            for sequence in outputs
        ]

In [None]:
def prepare_style_dataset(lyrics_data, output_path):
    """Turn raw lyric entries into prompt/style records and save them as JSON.

    Each output record keeps the entry's 'style_reference' and 'artist' and
    uses the first two lines of its 'lyrics' as the 'prompt'.

    Args:
        lyrics_data: Iterable of dicts with 'artist', 'lyrics' and 'style_reference' keys
        output_path: File the JSON list of records is written to
    """
    def to_record(entry):
        # Only the first two newline-separated lines become the prompt
        opening_lines = entry['lyrics'].split('\n', 2)[:2]
        return {
            'style_reference': entry['style_reference'],
            'prompt': '\n'.join(opening_lines),
            'artist': entry['artist']
        }

    records = [to_record(entry) for entry in lyrics_data]

    # Records are built in full before the output file is opened
    with open(output_path, 'w') as out_file:
        json.dump(records, out_file, indent=2)

In [None]:
# Example usage of the style-based generator
def generate_style_based_rap(
    style_reference,
    prompt,
    model_path='checkpoints/dpo_trained_model',
    tokenizer_path=None,
    max_length=200,
    num_return_sequences=3,
):
    """Load a checkpoint and sample style-conditioned lyrics from it.

    The optional arguments default to the values that used to be hard-coded,
    so ``generate_style_based_rap(style_reference, prompt)`` behaves as before.

    Args:
        style_reference: Text describing the style to imitate.
        prompt: Opening lines the model should continue.
        model_path: Checkpoint handed to ``StyleBasedGenerator``.
        tokenizer_path: Tokenizer location; ``None`` reuses ``model_path``.
        max_length: Forwarded to ``StyleBasedGenerator.generate_lyrics``.
        num_return_sequences: How many alternative versions to sample.

    Returns:
        The list of generated lyric strings returned by ``generate_lyrics``.
    """
    if tokenizer_path is None:
        tokenizer_path = model_path

    # NOTE: the model is loaded from disk on every call; callers that generate
    # repeatedly may prefer to build one StyleBasedGenerator and reuse it.
    generator = StyleBasedGenerator(
        model_path=model_path,
        tokenizer_path=tokenizer_path
    )

    return generator.generate_lyrics(
        style_reference=style_reference,
        prompt=prompt,
        max_length=max_length,
        num_return_sequences=num_return_sequences
    )

In [None]:
# Example usage
# Describe the target style and give the model two opening lines to continue
style_reference = "Eminem's style: Fast-paced, complex wordplay, aggressive tone, and dark humor"
prompt = "Look, I was gonna go easy on you not to hurt your feelings\nBut I'm only going to get this one chance"

# Loads the checkpoint and returns the sampled versions
generated_lyrics = generate_style_based_rap(style_reference=style_reference, prompt=prompt)

# Print each sampled version under a numbered heading, counting from 1
print("Generated Lyrics:")
for i, verse in enumerate(generated_lyrics, start=1):
    print(f"\nVersion {i}:")
    print(verse)