In [16]:
!pip install tqdm



In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import os

# Alphabet kept during cleaning: lowercase letters, digits, space and basic punctuation.
valid_chars = {*"abcdefghijklmnopqrstuvwxyz0123456789 .,?!'-"}
print(f"Valid characters: {sorted(list(valid_chars))}, Total: {len(valid_chars)}")

# Directory containing the lyrics files; each .txt file is treated as one song.
lyrics_dir = '/kaggle/input/poetry/'

# Load every .txt file, lowercase it and keep only characters from valid_chars.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the order of `songs` (and of everything derived from it) reproducible.
songs = []
for filename in sorted(os.listdir(lyrics_dir)):
    if not filename.endswith('.txt'):
        continue
    file_path = os.path.join(lyrics_dir, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        lyrics = f.read().lower()
    # Filter out invalid characters. Newlines are not in valid_chars, so each
    # song collapses into a single line of text.
    cleaned_lyrics = ''.join(ch for ch in lyrics if ch in valid_chars)
    if cleaned_lyrics:  # Only add non-empty songs
        songs.append(cleaned_lyrics)

print(f"Loaded {len(songs)} songs")

# Build the vocabulary from the characters that actually occur in the cleaned songs.
# Sorting the characters makes the index assignment independent of set ordering.
all_chars = set().union(*songs)
chars = sorted(all_chars)
idx_to_char = dict(enumerate(chars))
char_to_idx = {ch: idx for idx, ch in idx_to_char.items()}
vocab_size = len(idx_to_char)
print(f"Vocabulary size: {vocab_size}, Characters: {chars}")

# Slice every encoded song into overlapping windows (stride 1).
# The target of a window is the same window shifted one position to the right,
# and windows never span two songs. Songs that are not longer than
# sequence_length contribute no windows.
sequence_length = 50
sequences = []
targets = []
for song in songs:
    encoded_song = [char_to_idx[ch] for ch in song]
    window_starts = range(len(encoded_song) - sequence_length)
    sequences.extend(
        encoded_song[start:start + sequence_length] for start in window_starts
    )
    targets.extend(
        encoded_song[start + 1:start + sequence_length + 1] for start in window_starts
    )

# Convert to PyTorch tensors
sequences = torch.tensor(sequences, dtype=torch.long)
targets = torch.tensor(targets, dtype=torch.long)

# Custom Dataset
class LyricsDataset(Dataset):
    """Map-style dataset that pairs each input sequence with its target sequence."""

    def __init__(self, sequences, targets):
        """Keep references to the two parallel, index-aligned collections."""
        self.sequences, self.targets = sequences, targets

    def __len__(self):
        """Number of samples, taken from the length of ``sequences``."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the ``(sequence, target)`` pair stored at position ``idx``."""
        inputs = self.sequences[idx]
        labels = self.targets[idx]
        return inputs, labels

# Wrap the tensors in a dataset and a shuffling loader of 32-sample batches.
dataset = LyricsDataset(sequences=sequences, targets=targets)
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    drop_last=True,  # discard the final incomplete batch
)

print(f"Dataset size: {len(dataset)} sequences")

Valid characters: [' ', '!', "'", ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'], Total: 43
Loaded 49 songs
Vocabulary size: 43, Characters: [' ', '!', "'", ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [None]:
import torch.nn as nn

class RNNModel(nn.Module):
    """Character-level language model: embedding, then an RNN, then a linear head.

    Args:
        vocab_size: Number of distinct token ids; also the size of the output logits.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the RNN hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        # The attribute names below become the state_dict keys of saved weights.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.hidden_dim = hidden_dim

    def forward(self, x, hidden=None):
        """Compute next-token logits for every position of every sequence.

        Args:
            x: Long tensor of token ids, shape (batch, seq_len).
            hidden: Initial hidden state of shape (num_layers, batch, hidden_dim),
                e.g. from ``init_hidden``. When None, nn.RNN starts from zeros.

        Returns:
            Tuple ``(logits, hidden)`` where logits has shape
            (batch, seq_len, vocab_size) and hidden is the final RNN state.
        """
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded, hidden)
        output = self.fc(output)
        return output, hidden

    def init_hidden(self, batch_size):
        """Return an all-zero initial hidden state for ``batch_size`` sequences.

        The state is created with the device and dtype of the model's
        parameters, so it stays usable after ``.to(device)``, ``.double()``
        or ``.half()``.
        """
        reference = next(self.parameters())
        return reference.new_zeros(self.rnn.num_layers, batch_size, self.hidden_dim)

In [None]:
# Model hyperparameters: character-embedding width and recurrent hidden-state width.
embedding_dim, hidden_dim = 128, 256

In [None]:
# Instantiate the character-level RNN from the vocabulary size and layer widths.
rnn_model = RNNModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
)

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm
import os

def train(model, dataloader, epochs=10, lr=0.001):
    """Train a next-character model with Adam and cross-entropy loss.

    The model is moved to CUDA when available (CPU otherwise) and wrapped in
    nn.DataParallel when more than one GPU is visible. After every second
    epoch the weights are written to
    /kaggle/working/rnn_model_weights_epoch_{epoch}.pt.

    Args:
        model: Module called as ``model(seq, hidden)`` that returns
            ``(output, hidden)`` and exposes ``init_hidden(batch_size)``.
            ``output`` may be per-position logits (3-D) or logits for the
            last position only (2-D).
        dataloader: Iterable yielding ``(seq, target)`` batches of token ids.
        epochs: Number of passes over ``dataloader``.
        lr: Learning rate for Adam.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        print(f"Using {num_gpus} GPUs!")
        model = nn.DataParallel(model)

    # The unwrapped module owns init_hidden() and yields state_dict keys
    # without the "module." prefix that nn.DataParallel adds.
    base_model = model.module if isinstance(model, nn.DataParallel) else model

    model.train()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        total_loss = 0

        progress_bar = tqdm(dataloader, desc=f'Epoch {epoch+1}/{epochs}', leave=True)
        for seq, target in progress_bar:
            batch_size = seq.size(0)
            if num_gpus > 1:
                # Hidden state sized for one per-GPU chunk: ceil(batch / num_gpus).
                # NOTE(review): nn.DataParallel scatters inputs along dim 0, and the
                # hidden state has size 1 there, so it looks like only the first
                # chunk of each batch is actually run and the rest is dropped
                # (hence the target truncation below) - confirm, and consider
                # creating the hidden state inside forward() instead.
                hidden_batch_size = (batch_size + num_gpus - 1) // num_gpus
            else:
                hidden_batch_size = batch_size

            # A fresh zero state per batch: windows are shuffled, so no state is carried over.
            hidden = base_model.init_hidden(hidden_batch_size)

            seq, target = seq.to(device), target.to(device)

            # Tuple states (e.g. LSTM-style (h, c)) are detached element-wise.
            if isinstance(hidden, tuple):
                hidden = tuple(h.detach() for h in hidden)
            else:
                hidden = hidden.detach()

            optimizer.zero_grad()
            output, hidden = model(seq, hidden)

            if output.dim() == 2:
                # Model emitted logits for the last position only.
                loss = criterion(output, target[:, -1])
            else:
                # Align targets with however many rows the model actually returned.
                target = target[:output.size(0)]
                # Flatten (batch, seq, vocab) to (batch * seq, vocab); the vocabulary
                # size is read from the logits rather than from a global.
                loss = criterion(output.reshape(-1, output.size(-1)), target.reshape(-1))

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})

        # Checkpoint once per even epoch (not once per batch), from the unwrapped
        # module so the file loads into a plain, non-DataParallel model.
        if (epoch + 1) % 2 == 0:
            torch.save(base_model.state_dict(), f'/kaggle/working/rnn_model_weights_epoch_{epoch+1}.pt')

        avg_loss = total_loss / len(dataloader)
        print(f'Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}')

# Relies on rnn_model, dataset and vocab_size having been defined by earlier cells.
# This loader replaces the earlier one and keeps the final partial batch.
dataloader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)
print("Training RNN...")
train(model=rnn_model, dataloader=dataloader)

In [None]:
# Persist the current weights of rnn_model to the Kaggle working directory.
torch.save(rnn_model.state_dict(), '/kaggle/working/rnn.pt')

In [None]:
def generate_lyrics(model, start_letter, max_length=100):
    """Sample text from the model one character at a time.

    Args:
        model: Model exposing ``init_hidden(batch_size)`` and callable as
            ``model(input_ids, hidden)``, returning ``(logits, hidden)``.
            It is moved to CUDA when available and switched to eval mode.
        start_letter: Seed text. Usually a single character; a longer string
            is also accepted and is fed through the model first to prime the
            hidden state. Every character must be a key of ``char_to_idx``.
        max_length: Maximum number of characters sampled after the seed.

    Returns:
        The seed followed by the sampled characters, as one string.

    Raises:
        KeyError: If ``start_letter`` is empty or contains a character that
            is not in the vocabulary.
    """
    unknown = sorted({ch for ch in start_letter if ch not in char_to_idx})
    if not start_letter or unknown:
        raise KeyError(
            f"start_letter must be a non-empty string of vocabulary characters; "
            f"got {start_letter!r} (unknown characters: {unknown})"
        )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    # One batch holding the whole seed: shape (1, len(start_letter)).
    seed_ids = [char_to_idx[ch] for ch in start_letter]
    input_seq = torch.tensor([seed_ids], dtype=torch.long, device=device)
    hidden = model.init_hidden(1)
    generated = [start_letter]

    with torch.no_grad():
        for _ in range(max_length):
            output, hidden = model(input_seq, hidden)
            # Take the logits of the single batch element: its last time step
            # for per-position (3-D) output, or the row itself when the model
            # only returns logits for the last position (2-D).
            logits = output[0] if output.dim() == 2 else output[0, -1]
            probs = torch.softmax(logits, dim=-1)
            next_char_idx = torch.multinomial(probs, 1).item()
            next_char = idx_to_char[next_char_idx]
            generated.append(next_char)

            # Feed the sampled character back in; the hidden state carries the context.
            input_seq = torch.tensor([[next_char_idx]], dtype=torch.long, device=device)

            # Stop at newline (optional); only reachable if '\n' is in the vocabulary.
            if next_char == '\n':
                break

    return ''.join(generated)

In [None]:
# Sample text from rnn_model, seeding generation with the letter 'h'.
start_letter = 'h'
print("RNN Generated Lyrics:")
print(generate_lyrics(rnn_model, start_letter))

In [None]:
# Sample text from rnn_model again, this time seeding with the letter 'a'.
start_letter = 'a'
print("RNN Generated Lyrics:")
print(generate_lyrics(rnn_model, start_letter))

In [None]:
# Show the vocabulary size (number of distinct characters found in the songs).
print(vocab_size)

In [2]:
import pickle

def save_char_mappings(char_to_idx, idx_to_char, filepath):
    """Pickle both lookup tables into ``filepath``.

    The file holds a single dict with the keys ``'char_to_idx'`` and
    ``'idx_to_char'``. The file is opened in ``'wb'`` mode, so an existing
    file at that path is overwritten.
    """
    payload = dict(char_to_idx=char_to_idx, idx_to_char=idx_to_char)
    with open(filepath, mode='wb') as out_file:
        pickle.dump(payload, out_file)

In [3]:
# Destination file for the pickled character mappings (Kaggle working directory).
filepath = '/kaggle/working/char_mappings.pkl'

In [4]:
# Write the char/index lookup tables to filepath as a single pickle.
save_char_mappings(char_to_idx, idx_to_char, filepath)