In [None]:
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
from collections import Counter
import nltk
nltk.download('punkt')
from tqdm import tqdm
import time

In [None]:
# Record the wall-clock start; a later cell subtracts this to report total run time.
start_time = time.time()

In [None]:
# Load the lyrics table; the first CSV column becomes the DataFrame index.
lines = pd.read_csv('eminem_lyrics.csv',index_col=0)

In [None]:
# Preview the first rows to sanity-check the load.
lines.head()

In [None]:
# (rows, columns) of the loaded table.
lines.shape

In [None]:
# Tokenization
def tokenize_lines(df):
    """Split every entry of ``df['lines']`` into tokens with ``word_tokenize``.

    Args:
        df: Object whose ``'lines'`` item iterates over the texts to tokenize
            (in this notebook, the lyrics DataFrame).

    Returns:
        A list holding one token list per entry, in iteration order.
    """
    return [word_tokenize(text) for text in df['lines']]

# Tokenize the lyrics once; the vocabulary and sequence cells below reuse this result.
tokenized_lines = tokenize_lines(lines)

In [None]:
# Creating Vocabulary
def create_vocab(tokenized_lines):
    """Assign an integer id to every distinct word, in order of first appearance.

    Ids start at 1, so id 0 is never assigned to a word.

    Args:
        tokenized_lines: Iterable of token lists.

    Returns:
        A ``(word_to_index, vocab_size)`` tuple where ``vocab_size`` is the
        number of distinct words plus one (to account for the unused id 0).
    """
    word_to_index = {}
    for tokens in tokenized_lines:
        for word in tokens:
            if word not in word_to_index:
                # The next free id is always "how many words so far" + 1.
                word_to_index[word] = len(word_to_index) + 1
    return word_to_index, len(word_to_index) + 1

# Build the word -> id map (ids start at 1) and the inverse id -> word map used for decoding.
word2idx, vocab_size = create_vocab(tokenized_lines)
idx2word = {idx: word for word, idx in word2idx.items()}


In [None]:
# Preparing sequences for training
def create_sequences(tokenized_lines, word2idx, seq_length=50):
    """Encode each token list and cut it into sliding windows of ids.

    Every window holds ``seq_length + 1`` consecutive ids taken from a single
    token list. Words missing from ``word2idx`` are dropped before windowing,
    and a token list with too few known words for one window contributes nothing.

    Args:
        tokenized_lines: Iterable of token lists.
        word2idx: Mapping from word to integer id.
        seq_length: Number of ids preceding the final id of each window.

    Returns:
        A flat list of windows (each a list of ids), in encounter order.
    """
    window_size = seq_length + 1
    sequences = []
    for tokens in tokenized_lines:
        ids = [word2idx[token] for token in tokens if token in word2idx]
        # One window per valid start offset; the range is empty for short lists.
        sequences.extend(
            ids[start:start + window_size]
            for start in range(len(ids) - seq_length)
        )
    return sequences

# Build the training windows with the default seq_length of 50 (51 ids per window).
sequences = create_sequences(tokenized_lines, word2idx)

In [None]:
class LyricsDataset(Dataset):
    """Dataset of id windows that yields shifted (input, target) tensor pairs."""

    def __init__(self, sequences):
        # Indexable collection of id windows; stored as given, not copied.
        self.sequences = sequences

    def __len__(self):
        """Return the number of stored windows."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(x, y)`` for window ``idx``.

        ``x`` is the window without its last id and ``y`` is the window
        without its first id, so ``y[i]`` is the id that follows ``x[i]``.
        """
        window = self.sequences[idx]
        inputs = torch.tensor(window[:-1])
        targets = torch.tensor(window[1:])
        return inputs, targets

In [None]:
# Creating Dataset and DataLoader
dataset = LyricsDataset(sequences)
total_sequences = len(sequences)  # Number of training windows
batch = 32  # Batch size passed to the DataLoader below
steps_per_epoch = -(-total_sequences // batch)  # Ceiling division: a final partial batch still counts as a step
print(steps_per_epoch)
# shuffle=True: the windows are drawn in a new random order every epoch.
data_loader = DataLoader(dataset, batch_size=batch, shuffle=True)

In [None]:
class LyricsGeneratorModel(nn.Module):
    """Embedding -> GRU -> linear model producing vocabulary logits per position."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        """Build the three layers.

        Args:
            vocab_size: Number of embedding rows and of output logits.
            embedding_dim: Size of each token embedding.
            hidden_dim: Size of the GRU hidden state.
        """
        super().__init__()
        # Id 0 is declared as the padding index of the embedding table.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Map a batch-first tensor of token ids to per-position logits."""
        embedded = self.embedding(x)
        # Only the per-step outputs are used; the final hidden state is discarded.
        hidden_states, _final_state = self.gru(embedded)
        return self.fc(hidden_states)

In [None]:
# Model hyperparameters.
embedding_dim = 128
hidden_dim = 1024
# NOTE(review): the training cell instantiates the model again, so this instance is replaced before training.
model = LyricsGeneratorModel(vocab_size, embedding_dim, hidden_dim)

In [None]:
def train_model(model, data_loader, epochs=10):
    """Train ``model`` for next-token prediction and return the mean loss per epoch.

    The model is moved in place to CUDA when available, otherwise to the CPU,
    and optimised with Adam at its default settings.

    Args:
        model: Module mapping a (batch, seq) tensor of token ids to
            (batch, seq, vocab) logits.
        data_loader: Iterable yielding ``(inputs, targets)`` tensor batches.
        epochs: Number of passes over ``data_loader``.

    Returns:
        A list with one float per epoch: the average of that epoch's
        per-batch losses.

    Raises:
        ValueError: If ``data_loader`` yields no batches during an epoch
            (previously this surfaced as a bare ZeroDivisionError).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Targets equal to 0 (the padding id) are excluded from the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.Adam(model.parameters())

    epoch_losses = []  # Average loss of each completed epoch

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        progress_bar = tqdm(data_loader, desc=f'Epoch {epoch+1}/{epochs}', leave=False)

        for inputs, targets in progress_bar:
            inputs, targets = inputs.to(device), targets.to(device)

            # Forward pass. CrossEntropyLoss expects the class dimension at
            # position 1, hence (batch, seq, vocab) -> (batch, vocab, seq).
            outputs = model(inputs)
            loss = criterion(outputs.transpose(1, 2), targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Read the scalar once and reuse it for the total and the progress bar.
            batch_loss = loss.item()
            total_loss += batch_loss
            num_batches += 1
            progress_bar.set_postfix(loss=f'{batch_loss:.4f}')

        if num_batches == 0:
            raise ValueError(
                "data_loader yielded no batches; cannot compute an average loss"
            )

        epoch_losses.append(total_loss / num_batches)

    return epoch_losses

In [None]:
# A new model is created here, so re-running this cell starts from freshly initialised weights.
model = LyricsGeneratorModel(vocab_size, embedding_dim, hidden_dim)
# epoch_losses holds one average loss per epoch and is plotted at the end of the notebook.
epoch_losses = train_model(model, data_loader, epochs=10)

In [None]:
# Saving model for future deployment
#torch.save(model.state_dict(), 'rnn_lyrics_model_10e.pth')

# Loading a saved model (load_state_dict updates the model in place; do not assign its return value back to `model`)
#model.load_state_dict(torch.load('rnn_lyrics_model_10e.pth'))

In [None]:
def generate_lyrics(model, seed_text, word2idx, idx2word, sequence_length=50, num_words=100):
    """Greedily extend ``seed_text`` with ``num_words`` words predicted by ``model``.

    At every step the model sees the last ``sequence_length`` *tokens* of the
    text so far (left-padded with id 0 when fewer are available) and the
    highest-scoring id at the final position is appended. Decoding is
    argmax-based, so the output is deterministic for a given model and seed.

    Args:
        model: Module mapping a (1, seq) tensor of token ids to
            (1, seq, vocab) logits.
        seed_text: Starting text; it is tokenized with ``word_tokenize``.
        word2idx: Mapping from word to id. Unknown seed words map to id 0.
        idx2word: Mapping from id to word. Ids without an entry are rendered
            as ``'<UNK>'``.
        sequence_length: Number of token ids fed to the model per step.
        num_words: Number of words to append.

    Returns:
        ``seed_text`` followed by the predicted words, separated by spaces.
    """
    model.eval()  # Set the model to evaluation mode

    # Run on the device the model already lives on, so a CPU-resident model
    # also works on a machine where CUDA is available.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Encode the seed once and keep the running context as ids. Slicing the
    # text itself would count characters rather than tokens.
    token_ids = [word2idx.get(word, 0) for word in word_tokenize(seed_text)]
    generated = seed_text

    for _ in range(num_words):
        # Keep only the most recent tokens as context.
        context = token_ids[-sequence_length:]

        # Pad sequence on the left if shorter than required
        if len(context) < sequence_length:
            context = [0] * (sequence_length - len(context)) + context

        # Convert to tensor and add batch dimension
        input_tensor = torch.tensor([context], dtype=torch.long, device=device)

        # Predict the next word
        with torch.no_grad():
            prediction = model(input_tensor)

        # The logits at the last position score the next token.
        predicted_idx = prediction[0, -1].argmax().item()
        predicted_word = idx2word.get(predicted_idx, '<UNK>')

        # Extend both the id context and the output text.
        token_ids.append(predicted_idx)
        generated += ' ' + predicted_word

    return generated


In [None]:
# Generating the output: extend the seed by 100 words using the trained model.

seed_text = "I remember"
generated_lyrics = generate_lyrics(model, seed_text, word2idx, idx2word, sequence_length=50, num_words=100)
print(generated_lyrics)

In [None]:
# Report the wall-clock time elapsed since start_time was recorded.
end_time = time.time()

execution_time_seconds = end_time - start_time
execution_time_minutes = execution_time_seconds / 60  # Convert seconds to minutes

print(f"Execution time RNN model: {execution_time_minutes:.2f} minutes")

In [None]:
import matplotlib.pyplot as plt

# Plot the average training loss of each epoch.
# The x-axis is derived from len(epoch_losses) rather than hard-coded, so the
# plot stays valid when the number of training epochs is changed
# (plt.plot requires x and y to have the same length).
epoch_numbers = range(1, len(epoch_losses) + 1)

plt.figure(figsize=(10, 6))
plt.plot(epoch_numbers, epoch_losses, marker='o', label='Training Loss')
plt.title("Loss per Epoch")
plt.xlabel("Epoch")
plt.ylabel("Average Loss")
plt.legend()
plt.grid(True)
plt.show()