In [1]:
import torch
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, embedding_weights, hidden_dim, vocab_size, num_layers=1):
        """
        Initializes the LSTM model.
        
        Parameters:
        - embedding_weights: Pre-trained Word2Vec embeddings.
        - hidden_dim: The number of features in the hidden state `h` of the LSTM.
        - vocab_size: The size of the vocabulary.
        - num_layers: Number of recurrent layers (default=1).
        
        The input to the model is expected to be a batch of word indices,
        and the output is a batch of predictions for the next word.
        """
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Embedding layer with pre-trained weights
        self.word_embeddings = nn.Embedding.from_pretrained(embedding_weights, freeze=True)

        # The LSTM takes word embeddings as inputs and outputs hidden states
        self.lstm = nn.LSTM(embedding_weights.shape[1], hidden_dim, num_layers, batch_first=True)

        # The linear layer maps from hidden state space to vocabulary space
        self.linear = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_word_indices):
        """
        Defines the forward pass of the model.
        
        Parameters:
        - input_word_indices: A batch of word indices as input.
        
        Returns:
        - output: The model's predictions for the next word.
        """
        embeddings = self.word_embeddings(input_word_indices)
        lstm_out, _ = self.lstm(embeddings)
        output = self.linear(lstm_out)
        return output


In [2]:
from gensim.models import KeyedVectors

# Load pre-trained Word2Vec embeddings
word2vec_path = 'models\GoogleNews-vectors-negative300.bin.gz'  # Update this path
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Prepare embedding weights in the format expected by PyTorch
vocab_size = len(word2vec_model.key_to_index)
embedding_dim = word2vec_model.vector_size
embedding_weights = torch.zeros(vocab_size, embedding_dim)

for word, idx in word2vec_model.key_to_index.items():
    embedding_weights[idx] = torch.tensor(word2vec_model[word])



In [5]:
# save embeddings model
torch.save(embedding_weights, 'word2vec_weights.pt')

In [6]:
# extracting stoi and itos
stoi = word2vec_model.key_to_index
itos = word2vec_model.index_to_key

# saving stoi and itos4
import pickle
with open('models/stoi.pkl', 'wb') as f:
    pickle.dump(stoi, f)
with open('models/itos.pkl', 'wb') as f:
    pickle.dump(itos, f)

In [9]:
import torch
import torch.nn.functional as F
import numpy as np

class LyricsGenerator:
    def __init__(self, model, vocab, device='cpu'):
        """
        Initializes the LyricsGenerator.

        Parameters:
        - model: The trained LSTM model for lyrics generation.
        - vocab: A mapping from words to indices and indices to words (vocab.stoi and vocab.itos).
        - device: The device to run the generation on ('cpu' or 'cuda').
        """
        self.model = model
        self.vocab = vocab
        self.device = device

    def sample_next_word(self, logits, temperature=1.0):
        """
        Samples the next word from the logits with a given temperature.

        Parameters:
        - logits: The logits output by the model.
        - temperature: Controls the randomness of the sampling. Higher values lead to more random outputs.

        Returns:
        - index of the sampled word.
        """
        probabilities = F.softmax(logits / temperature, dim=-1)
        word_index = torch.multinomial(probabilities, 1).item()
        return word_index

    def generate(self, start_word, max_words=50, max_words_per_line=10, temperature=1.0):
        """
        Generates lyrics starting from a given word.

        Parameters:
        - start_word: The word to start generating from.
        - max_words: The maximum number of words in the generated lyrics.
        - max_words_per_line: The maximum number of words per line.
        - temperature: Controls the randomness of the sampling.

        Returns:
        - A string containing the generated lyrics.
        """
        self.model.eval()  # Set the model to evaluation mode
        words = [start_word]
        current_word_index = torch.tensor([self.vocab.stoi[start_word]], device=self.device)

        for _ in range(max_words - 1):
            with torch.no_grad():
                logits = self.model(current_word_index.unsqueeze(0))[:, -1, :]
                next_word_index = self.sample_next_word(logits, temperature)
                next_word = self.vocab.itos[next_word_index]
                words.append(next_word)
                current_word_index = torch.tensor([next_word_index], device=self.device)

                if len(words) % max_words_per_line == 0:
                    words.append('\n')

            if words[-1] == '<eos>':  # Assuming <eos> is the end-of-sentence token
                break

        return ' '.join(words).replace(' \n ', '\n')


In [None]:
# load stoi and itos
with open('models/stoi.pkl', 'rb') as f:
    stoi = pickle.load(f)
with open('models/itos.pkl', 'rb') as f:
    itos = pickle.load(f)

# load pre-trained Word2Vec embeddings
embedding_weights = torch.load('models/word2vec_weights.pt')
vocab_size, embedding_dim = embedding_weights.shape

In [14]:
class Vocabulary:
    """
    A mapping from words to indices and indices to words.
    """
    def __init__(self, stoi, itos):
        self.stoi = stoi
        self.itos = itos
    
    def __call__(self, word):
        if word in self.stoi:
            return self.stoi[word]
        else:
            return self.stoi['<unk>']
        
    def __len__(self):
        return len(self.stoi)


vocab = Vocabulary(stoi, itos)

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
# Initialize the LSTM model with Word2Vec weights and corrected vocabulary size
lstm_model = LSTMModel(embedding_weights=embedding_weights, hidden_dim=256, vocab_size=vocab_size, num_layers=1)
lstm_model.to(device)

# Instantiate the LyricsGenerator with the corrected mappings
generator = LyricsGenerator(lstm_model, vocab, device)
initial_word = 'the'  # Starting word for song generation
song = generator.generate(initial_word)
print(song)


the Norway_Statoil_STL.OL profoundly_relearn_penitence KnowledgeTree_repository http://www.fnbcorporation.com Madrid_vibrant_nightlife Eeny_meeny Mizer Coniglio lights_hash_Spinning
Inc._NYSE_TPX Fujitsu_Siemens Adm._Horatio_Nelson setting CNW_QLT Jimmey Seaman_Apprentice ROBERT_LOPEZ By_KIM_BRIGGEMAN
Astute_readers George_Foulidis luxurious_lifestyles Fred_Kaweesi AAMI_Hobart sophisticated_cyberattacks Nuhu_Gagara Diesel_Emission Other_Monroe_abilia
Pelonomi Ballintubber Jillie_Cooper Bon_Marche Rordam Snow_Aaron_McKie Mwebesa YMF DANS
Commissioner_Kevin_MacCurtain Pigeon_Detectives dropped_GBU_##s Graysmith WADB Broadcom_BCM####_SoC Chashma_barrage CV_Mosby Bifoss
Assemblyman_Morse_Arberry NVIDIA_GeForce_GPUs Mysteel_Research Paul_Sancya_FILE


In [None]:
# Summary writer will output to ./runs/ directory by default
from torch.utils.tensorboard import SummaryWriter

# Initialize TensorBoard writer
writer = SummaryWriter('runs/lyrics_generator_experiment')


In [38]:
config = {
    "epochs": 100,
    "batch_size": 64,
    "learning_rate": 0.001,
    "hidden_dim": 256,
    "num_layers": 1,
    "embedding_dim": 100,  # Assuming we know the embedding dimension
    "vocab_size": len(vocab),  # Make sure this matches the actual vocab size
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "batch_size": 64,
    "shuffle": False,
}


In [16]:
import torch
import torch.nn as nn
from torch.optim import Adam
from tqdm.auto import tqdm

def train_model(model, train_loader, config):
    """
    Trains the LSTM model on the given dataset.
    
    Parameters:
    - model: The LSTM model to train.
    - train_loader: DataLoader for the training dataset.
    - config: Dictionary containing configuration parameters.
    """
    model.train()  # Switch model to training mode
    optimizer = Adam(model.parameters(), lr=config["learning_rate"])
    criterion = nn.CrossEntropyLoss()
    model.to(config["device"])
    
    for epoch in range(config["epochs"]):
        total_loss = 0
        progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)
        for i, (input_words, target_words) in progress_bar:
            input_words, target_words = input_words.to(config["device"]), target_words.to(config["device"])
            
            # Forward pass
            output = model(input_words)
            loss = criterion(output.view(-1, config["vocab_size"]), target_words.view(-1))
            
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            progress_bar.set_description(f"Epoch {epoch+1} Loss: {total_loss/(i+1):.4f}")
            
        # Log the average loss for the epoch
        avg_loss = total_loss / len(train_loader)
        writer.add_scalar('training_loss', avg_loss, epoch+1)
        
        print(f"Epoch {epoch+1} Completed. Avg Loss: {avg_loss:.4f}")


In [None]:
# save model weights
torch.save(lstm_model.state_dict(), 'models/lstm_model_weights.pth')

In [33]:
import pandas as pd
import re
import nltk
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import strip_punctuation, strip_numeric
from tqdm.auto import tqdm

# nltk.download('punkt')  # Uncomment if nltk's punkt tokenizer hasn't been downloaded yet

word2vec_path = 'models/GoogleNews-vectors-negative300.bin.gz'  # Adjust path as necessary
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

bos_token = 'BOS'
eos_token = 'EOS'
eof_token = 'EOF'


In [34]:
def preprocess_and_tokenize_lyrics(lyrics):
    """
    Preprocesses and tokenizes lyrics, reduces OOV by leveraging Word2Vec's vocabulary.

    Parameters:
    lyrics (str): The raw lyrics as a single string.

    Returns:
    tuple: A tuple containing the list of word indices, the list of vectors, and the OOV rate.
    """
    # Preprocess lyrics
    lyrics = lyrics.lower().replace('&', eos_token)
    lyrics = f'{bos_token} {lyrics} {eof_token}'
    lyrics = strip_punctuation(lyrics)
    lyrics = strip_numeric(lyrics)
    lyrics = re.sub(r'\(.*?\)|\[.*?\]', '', lyrics)  # Remove text inside parentheses and brackets
    lyrics = lyrics.split()

    # Tokenization and vectorization
    word_ids = [word2vec_model.key_to_index.get(word, word2vec_model.key_to_index.get('UNK')) for word in lyrics]
    vectors = [word2vec_model.get_vector(word, None) for word in lyrics if word in word2vec_model]
    oov_rate = sum(1 for word in lyrics if word not in word2vec_model) / len(lyrics)

    return word_ids, vectors, oov_rate


In [35]:
def parse_lyrics_to_dataset(csv_path):
    """
    Parses the CSV containing lyrics into a structured dataset with minimal OOV.

    Parameters:
    csv_path (str): Path to the CSV file with lyrics.

    Returns:
    dict: A dictionary where keys are 'song_name artist' and values are word indices lists.
    list: A list containing OOV rates for each song.
    """
    train = pd.read_csv(csv_path)
    train['artist'] = train['artist'].str.strip()
    train['song'] = train['song'].str.strip()

    lyrics_dict = {}
    oov_rates = []

    for i, row in tqdm(train.iterrows(), total=len(train)):
        word_ids, vectors, oov_rate = preprocess_and_tokenize_lyrics(row['lyrics'])
        key = f"{row['song']} {row['artist']}"
        lyrics_dict[key] = word_ids
        oov_rates.append(oov_rate)

    return lyrics_dict, oov_rates


In [36]:
csv_path = 'data/lyrics_train_set2.csv'  # Update to your CSV path
lyrics_dict, oov_rates = parse_lyrics_to_dataset(csv_path)

# Analyze OOV Rates
average_oov_rate = sum(oov_rates) / len(oov_rates)
print(f"Average OOV Rate: {average_oov_rate * 100:.2f}%")


  0%|          | 0/600 [00:00<?, ?it/s]

Average OOV Rate: 6.94%


In [None]:
# save lyrics dict
import pickle
with open('models/lyrics_dict.pkl', 'wb') as f:
    pickle.dump(lyrics_dict, f)

In [37]:
from torch.utils.data import Dataset, DataLoader
import torch

class LyricsDataset(Dataset):
    def __init__(self, lyrics_dict):
        """
        Initializes the dataset with preprocessed lyrics.
        
        Parameters:
        lyrics_dict (dict): A dictionary where keys are 'song_name artist' and values are lists of word indices.
        """
        self.lyrics_indices = [indices for indices in lyrics_dict.values()]
        self.all_indices = [idx for sublist in self.lyrics_indices for idx in sublist]
    
    def __len__(self):
        """Returns the total number of word indices in the dataset."""
        return len(self.all_indices) - 1  # Subtract 1 because we use a look-ahead of 1 for targets
    
    def __getitem__(self, index):
        """
        Returns a tuple (current_word_index, next_word_index) for training.
        
        Parameters:
        index (int): The index of the current word.
        
        Returns:
        tuple: A tuple of tensors (current_word_index, next_word_index).
        """
        return (torch.tensor(self.all_indices[index], dtype=torch.long), 
                torch.tensor(self.all_indices[index + 1], dtype=torch.long))


In [39]:
dataset = LyricsDataset(lyrics_dict)
train_loader = DataLoader(dataset, batch_size=config['batch_size'], shuffle=config['shuffle'])


<torch.utils.data.dataloader.DataLoader at 0x1c2aab40490>

In [42]:
# Train the model
train_model(lstm_model, train_loader, config)

  0%|          | 0/3021 [00:00<?, ?it/s]

KeyboardInterrupt: 