In [8]:
import torch
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, embedding_weights, hidden_dim, vocab_size, num_layers=1):
        """
        Initializes the LSTM model.
        
        Parameters:
        - embedding_weights: Pre-trained Word2Vec embeddings.
        - hidden_dim: The number of features in the hidden state `h` of the LSTM.
        - vocab_size: The size of the vocabulary.
        - num_layers: Number of recurrent layers (default=1).
        
        The input to the model is expected to be a batch of word indices,
        and the output is a batch of predictions for the next word.
        """
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Embedding layer with pre-trained weights
        self.word_embeddings = nn.Embedding.from_pretrained(embedding_weights, freeze=True)

        # The LSTM takes word embeddings as inputs and outputs hidden states
        self.lstm = nn.LSTM(embedding_weights.shape[1], hidden_dim, num_layers, batch_first=True)

        # The linear layer maps from hidden state space to vocabulary space
        self.linear = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_word_indices):
        """
        Defines the forward pass of the model.
        
        Parameters:
        - input_word_indices: A batch of word indices as input.
        
        Returns:
        - output: The model's predictions for the next word.
        """
        embeddings = self.word_embeddings(input_word_indices)
        lstm_out, _ = self.lstm(embeddings)
        output = self.linear(lstm_out)
        return output


In [1]:
from gensim.models import KeyedVectors

# # Load pre-trained Word2Vec embeddings
# word2vec_path = 'models\GoogleNews-vectors-negative300.bin.gz'  # Update this path
# word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
word2vec_path = "models/word2vec-google-news-300.model"
word2vec_model = KeyedVectors.load(word2vec_path)

# Prepare embedding weights in the format expected by PyTorch
vocab_size = len(word2vec_model.key_to_index)
embedding_dim = word2vec_model.vector_size
embedding_weights = torch.zeros(vocab_size, embedding_dim)

for word, idx in word2vec_model.key_to_index.items():
    embedding_weights[idx] = torch.tensor(word2vec_model[word])



In [4]:
# save embeddings model
torch.save(embedding_weights, 'models/word2vec_weights.pt')

In [5]:
# extracting stoi and itos
stoi = word2vec_model.key_to_index
itos = word2vec_model.index_to_key

# saving stoi and itos4
import pickle
with open('models/stoi.pkl', 'wb') as f:
    pickle.dump(stoi, f)
with open('models/itos.pkl', 'wb') as f:
    pickle.dump(itos, f)

In [10]:
import torch
import torch.nn.functional as F
import numpy as np

class LyricsGenerator:
    def __init__(self, model, vocab, device='cpu'):
        """
        Initializes the LyricsGenerator.

        Parameters:
        - model: The trained LSTM model for lyrics generation.
        - vocab: A mapping from words to indices and indices to words (vocab.stoi and vocab.itos).
        - device: The device to run the generation on ('cpu' or 'cuda').
        """
        self.model = model
        self.vocab = vocab
        self.device = device

    def sample_next_word(self, logits, temperature=1.0):
        """
        Samples the next word from the logits with a given temperature.

        Parameters:
        - logits: The logits output by the model.
        - temperature: Controls the randomness of the sampling. Higher values lead to more random outputs.

        Returns:
        - index of the sampled word.
        """
        probabilities = F.softmax(logits / temperature, dim=-1)
        word_index = torch.multinomial(probabilities, 1).item()
        return word_index

    def generate(self, start_word, max_words=50, max_words_per_line=10, temperature=1.0):
        """
        Generates lyrics starting from a given word.

        Parameters:
        - start_word: The word to start generating from.
        - max_words: The maximum number of words in the generated lyrics.
        - max_words_per_line: The maximum number of words per line.
        - temperature: Controls the randomness of the sampling.

        Returns:
        - A string containing the generated lyrics.
        """
        self.model.eval()  # Set the model to evaluation mode
        words = [start_word]
        current_word_index = torch.tensor([self.vocab.stoi[start_word]], device=self.device)

        for _ in range(max_words - 1):
            with torch.no_grad():
                logits = self.model(current_word_index.unsqueeze(0))[:, -1, :]
                next_word_index = self.sample_next_word(logits, temperature)
                next_word = self.vocab.itos[next_word_index]
                words.append(next_word)
                current_word_index = torch.tensor([next_word_index], device=self.device)

                if len(words) % max_words_per_line == 0:
                    words.append('\n')

            if words[-1] == '<eos>':  # Assuming <eos> is the end-of-sentence token
                break

        return ' '.join(words).replace(' \n ', '\n')


In [12]:
import pickle

In [13]:
# load stoi and itos
with open('models/stoi.pkl', 'rb') as f:
    stoi = pickle.load(f)
with open('models/itos.pkl', 'rb') as f:
    itos = pickle.load(f)

# load pre-trained Word2Vec embeddings
embedding_weights = torch.load('models/word2vec_weights.pt')
vocab_size, embedding_dim = embedding_weights.shape

In [14]:
class Vocabulary:
    """
    A mapping from words to indices and indices to words.
    """
    def __init__(self, stoi, itos):
        self.stoi = stoi
        self.itos = itos
    
    def __call__(self, word):
        if word in self.stoi:
            return self.stoi[word]
        else:
            return self.stoi['<unk>']
        
    def __len__(self):
        return len(self.stoi)


vocab = Vocabulary(stoi, itos)

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Initialize the LSTM model with Word2Vec weights and corrected vocabulary size
lstm_model = LSTMModel(embedding_weights=embedding_weights, hidden_dim=256, vocab_size=vocab_size, num_layers=1)
lstm_model.to(device)

# Instantiate the LyricsGenerator with the corrected mappings
generator = LyricsGenerator(lstm_model, vocab, device)
initial_word = 'the'  # Starting word for song generation
song = generator.generate(initial_word)
print(song)


In [None]:
# Summary writer will output to ./runs/ directory by default
from torch.utils.tensorboard import SummaryWriter

# Initialize TensorBoard writer
writer = SummaryWriter('runs/lyrics_generator_experiment')


In [None]:
config = {
    "epochs": 100,
    "batch_size": 64,
    "learning_rate": 0.001,
    "hidden_dim": 256,
    "num_layers": 1,
    "embedding_dim": 100,  # Assuming we know the embedding dimension
    "vocab_size": len(vocab),  # Make sure this matches the actual vocab size
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "batch_size": 64,
    "shuffle": False,
}


In [None]:
import torch
import torch.nn as nn
from torch.optim import Adam
from tqdm.auto import tqdm

def train_model(model, train_loader, config):
    """
    Trains the LSTM model on the given dataset.
    
    Parameters:
    - model: The LSTM model to train.
    - train_loader: DataLoader for the training dataset.
    - config: Dictionary containing configuration parameters.
    """
    model.train()  # Switch model to training mode
    optimizer = Adam(model.parameters(), lr=config["learning_rate"])
    criterion = nn.CrossEntropyLoss()
    model.to(config["device"])
    
    for epoch in range(config["epochs"]):
        total_loss = 0
        progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)
        for i, (input_words, target_words) in progress_bar:
            input_words, target_words = input_words.to(config["device"]), target_words.to(config["device"])
            
            # Forward pass
            output = model(input_words)
            loss = criterion(output.view(-1, config["vocab_size"]), target_words.view(-1))
            
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            progress_bar.set_description(f"Epoch {epoch+1} Loss: {total_loss/(i+1):.4f}")
            
        # Log the average loss for the epoch
        avg_loss = total_loss / len(train_loader)
        writer.add_scalar('training_loss', avg_loss, epoch+1)
        
        print(f"Epoch {epoch+1} Completed. Avg Loss: {avg_loss:.4f}")


In [None]:
# save model weights
torch.save(lstm_model.state_dict(), 'models/lstm_model_weights.pth')

In [33]:
import pandas as pd
import re
import nltk
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import strip_punctuation, strip_numeric
from tqdm.auto import tqdm

# nltk.download('punkt')  # Uncomment if nltk's punkt tokenizer hasn't been downloaded yet

word2vec_path = 'models/GoogleNews-vectors-negative300.bin.gz'  # Adjust path as necessary
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

bos_token = 'BOS'
eos_token = 'EOS'
eof_token = 'EOF'


In [34]:
def preprocess_and_tokenize_lyrics(lyrics):
    """
    Preprocesses and tokenizes lyrics, reduces OOV by leveraging Word2Vec's vocabulary.

    Parameters:
    lyrics (str): The raw lyrics as a single string.

    Returns:
    tuple: A tuple containing the list of word indices, the list of vectors, and the OOV rate.
    """
    # Preprocess lyrics
    lyrics = lyrics.lower().replace('&', eos_token)
    lyrics = f'{bos_token} {lyrics} {eof_token}'
    lyrics = strip_punctuation(lyrics)
    lyrics = strip_numeric(lyrics)
    lyrics = re.sub(r'\(.*?\)|\[.*?\]', '', lyrics)  # Remove text inside parentheses and brackets
    lyrics = lyrics.split()

    # Tokenization and vectorization
    word_ids = [word2vec_model.key_to_index.get(word, word2vec_model.key_to_index.get('UNK')) for word in lyrics]
    #vectors = [word2vec_model.get_vector(word, None) for word in lyrics if word in word2vec_model]
    oov_rate = sum(1 for word in lyrics if word not in word2vec_model) / len(lyrics)

    return word_ids, vectors, oov_rate


In [35]:
def parse_lyrics_to_dataset(csv_path):
    """
    Parses the CSV containing lyrics into a structured dataset with minimal OOV.

    Parameters:
    csv_path (str): Path to the CSV file with lyrics.

    Returns:
    dict: A dictionary where keys are 'song_name artist' and values are word indices lists.
    list: A list containing OOV rates for each song.
    """
    train = pd.read_csv(csv_path)
    train['artist'] = train['artist'].str.strip()
    train['song'] = train['song'].str.strip()

    lyrics_dict = {}
    oov_rates = []

    for i, row in tqdm(train.iterrows(), total=len(train)):
        word_ids, vectors, oov_rate = preprocess_and_tokenize_lyrics(row['lyrics'])
        key = f"{row['song']} {row['artist']}"
        lyrics_dict[key] = word_ids
        oov_rates.append(oov_rate)

    return lyrics_dict, oov_rates


In [36]:
csv_path = 'data/lyrics_train_set2.csv'  # Update to your CSV path
lyrics_dict, oov_rates = parse_lyrics_to_dataset(csv_path)

# Analyze OOV Rates
average_oov_rate = sum(oov_rates) / len(oov_rates)
print(f"Average OOV Rate: {average_oov_rate * 100:.2f}%")


  0%|          | 0/600 [00:00<?, ?it/s]

Average OOV Rate: 6.94%


In [None]:
# save lyrics dict
import pickle
with open('data/lyrics_dict.pkl', 'wb') as f:
    pickle.dump(lyrics_dict, f)

In [37]:
from torch.utils.data import Dataset, DataLoader
import torch

class LyricsDataset(Dataset):
    def __init__(self, lyrics_dict):
        """
        Initializes the dataset with preprocessed lyrics.
        
        Parameters:
        lyrics_dict (dict): A dictionary where keys are 'song_name artist' and values are lists of word indices.
        """
        self.lyrics_indices = [indices for indices in lyrics_dict.values()]
        self.all_indices = [idx for sublist in self.lyrics_indices for idx in sublist]
    
    def __len__(self):
        """Returns the total number of word indices in the dataset."""
        return len(self.all_indices) - 1  # Subtract 1 because we use a look-ahead of 1 for targets
    
    def __getitem__(self, index):
        """
        Returns a tuple (current_word_index, next_word_index) for training.
        
        Parameters:
        index (int): The index of the current word.
        
        Returns:
        tuple: A tuple of tensors (current_word_index, next_word_index).
        """
        return (torch.tensor(self.all_indices[index], dtype=torch.long), 
                torch.tensor(self.all_indices[index + 1], dtype=torch.long))


In [None]:
# create dataset and dataloader
lyrics_dataset = LyricsDataset(lyrics_dict)
train_loader = DataLoader(lyrics_dataset, batch_size=config["batch_size"], shuffle=True)

In [None]:
# Train the model
train_model(lstm_model, train_loader, config)

In [18]:
# save model weights
torch.save(lstm_model.state_dict(), 'models/lstm_model_weights_x.pth')

In [22]:
# evaluatiing the model
# load model weights
lstm_model = LSTMModel(embedding_weights=embedding_weights, hidden_dim=256, vocab_size=vocab_size, num_layers=1)
lstm_model.load_state_dict(torch.load('models/lstm_model_weights_x.pth'))
lstm_model.to(device)

LSTMModel(
  (word_embeddings): Embedding(3000000, 300)
  (lstm): LSTM(300, 256, batch_first=True)
  (linear): Linear(in_features=256, out_features=3000000, bias=True)
)

In [23]:
# create a new instance of the LyricsGenerator
vocab = Vocabulary(stoi, itos)
generator = LyricsGenerator(lstm_model, vocab, device)


In [24]:
# generate a song
initial_word = 'BOS'  # Starting word for song generation
song = generator.generate(initial_word)

In [39]:
import gensim
import nltk
from nltk.translate.bleu_score import sentence_bleu
from scipy.spatial.distance import cosine
import numpy as np
from gensim.models import KeyedVectors
from gensim.similarities import WmdSimilarity

# Load your pre-trained Word2Vec model
word2vec_model = KeyedVectors.load_word2vec_format('models/GoogleNews-vectors-negative300.bin.gz', binary=True)

def calculate_similarity_metrics(text1, text2, word2vec_model):
    """
    Calculates cosine similarity, WMD, and BLEU score between two texts.
    
    Parameters:
    text1 (str): The first text.
    text2 (str): The second text.
    word2vec_model (gensim.models.KeyedVectors): Pre-trained Word2Vec model.
    
    Returns:
    dict: A dictionary containing the cosine similarity, WMD, and BLEU score.
    """
    # Tokenize and vectorize texts
    tokens1 = text1.lower().split()
    tokens2 = text2.lower().split()
    
    vectors1 = [word2vec_model[word] for word in tokens1 if word in word2vec_model]
    vectors2 = [word2vec_model[word] for word in tokens2 if word in word2vec_model]
    
    # Cosine similarity
    # Avoid division by zero and ensure valid vectors for cosine similarity calculation
    if len(vectors1) > 0 and len(vectors2) > 0:
        mean_vector1 = np.mean(vectors1, axis=0)
        mean_vector2 = np.mean(vectors2, axis=0)
        cosine_sim = 1 - cosine(mean_vector1, mean_vector2)
    else:
        cosine_sim = float('nan')
    
    # WMD
    wmd = word2vec_model.wmdistance(tokens1, tokens2)
    
    # BLEU score
    # Note: `sentence_bleu` expects a list of reference sentences, where each reference is tokenized
    bleu_score = sentence_bleu([tokens1], tokens2)
    
    return {
        'Cosine Similarity': cosine_sim,
        'WMD': wmd,
        'BLEU Score': bleu_score
    }


In [36]:
import pandas as pd
# load trainset
train = pd.read_csv('data/lyrics_train_set2.csv')

# get lyrics of a song
lyrics = train.loc[0, 'lyrics']
lyrics

'goodbye norma jean & though i never knew you at all & you had the grace to hold yourself & while those around you crawled & they crawled out of the woodwork & and they whispered into your brain & they set you on the treadmill & and they made you change your name & and it seems to me you lived your life & like a candle in the wind & never knowing who to cling to & when the rain set in & and i would liked to have known you & but i was just a kid & your candle burned out long before & your legend ever did & loneliness was tough & the toughest role you ever played & hollywood created a superstar & and pain was the price you paid & even when you died & oh the press still hounded you & all the papers had to say & was that marilyn was found in the nude & and it seems to me you lived your life & like a candle in the wind & never knowing who to cling to & when the rain set in & and i would liked to have known you & but i was just a kid & your candle burned out long before & your legend ever di

In [None]:
# calculate similarity metrics
cos, bleu = calculate_similarity_metrics(lyrics, song)
print(f"Cosine Similarity: {cos:.4f}")
print(f"BLEU Score: {bleu:.4f}")


In [41]:
import numpy as np
import pretty_midi
import os
import pickle

def extract_features(midi_path, max_length=2048):
    """
    Extracts musical features from a MIDI file.

    Parameters:
    midi_path (str): Path to the MIDI file.
    max_length (int): Maximum length for normalized pitch and velocity arrays.

    Returns:
    np.ndarray: A combined feature vector including normalized pitches, velocities, and mean chroma.
    """
    midi_data = pretty_midi.PrettyMIDI(midi_path)

    # Normalized Pitches
    pitches = [note.pitch for instrument in midi_data.instruments for note in instrument.notes]
    pitches_normalized = np.array(pitches) / 127.0  # MIDI pitch range
    pitches_feature = np.zeros(max_length)
    pitches_feature[:len(pitches_normalized)] = pitches_normalized[:max_length]

    # Normalized Velocities
    velocities = [note.velocity for instrument in midi_data.instruments for note in instrument.notes]
    velocities_normalized = np.array(velocities) / 127.0  # MIDI velocity range
    velocities_feature = np.zeros(max_length)
    velocities_feature[:len(velocities_normalized)] = velocities_normalized[:max_length]

    # Mean Chroma
    chroma = midi_data.get_chroma()
    chroma_mean = np.mean(chroma, axis=1)

    # Combine features into a single vector
    feature_vector = np.concatenate([pitches_feature, velocities_feature, chroma_mean])

    return feature_vector

def process_midi_files(directory, output_file):
    """
    Processes all MIDI files in a directory, extracting features and saving them.

    Parameters:
    directory (str): Directory containing MIDI files.
    output_file (str): File path to save the extracted feature vectors.
    """
    feature_vectors = {}

    for filename in os.listdir(directory):
        if filename.endswith(".mid"):
            midi_path = os.path.join(directory, filename)
            try:
                # Parsing artist and song names from the filename
                artist, song = os.path.basename(filename).replace('.mid', '').lower().split('_-_')
                song = f"{song} {artist}".replace('_', ' ')
                feature_vector = extract_features(midi_path)
                feature_vectors[song] = feature_vector
            except Exception as e:
                print(f"Error processing {filename}: {e}")

    # Save the feature vectors to a file
    with open(output_file, 'wb') as f:
        pickle.dump(feature_vectors, f)

# Example usage
midi_directory = 'data/midi_files'
output_file = 'midi_feature_vectors.pkl'
process_midi_files(midi_directory, output_file)




Error processing Aaron_Neville_-_Tell_It_Like_It_Is.mid: data byte must be in range 0..127
Error processing Beastie_Boys_-_Girls.mid: Could not decode key with 1 flats and mode 255
Error processing Billy_Joel_-_Movin'_Out.mid: data byte must be in range 0..127
Error processing Billy_Joel_-_Pressure.mid: data byte must be in range 0..127
Error processing Brian_McKnight_-_On_The_Down_Low.mid: 
Error processing Dan_Fogelberg_-_Leader_of_the_Band.mid: Could not decode key with 4 flats and mode 255
Error processing David_Bowie_-_Lazarus.mid: Could not decode key with 16 sharps and mode 1
Error processing Ed_Sheeran_-_Thinking_Out_Loud_-_Violin.mid: too many values to unpack (expected 2)
Error processing Eric_Clapton_-_wonderful_tonight_-_live_extnd_version_@jiji@.mid: too many values to unpack (expected 2)



### Processing MIDI Files (Outside the Code)

Before feeding MIDI files into the feature extraction code, consider the following preprocessing steps to ensure the data is clean and consistent:

2. **Normalization**: Ensure MIDI files are using a standard format and quantization to maintain consistency in timing and note representation.

3. **Instrument Filtering**: Optionally, filter out specific instruments or tracks (e.g., drums) that may not contribute to the desired features.

4. **Handling Polyphony**: Decide on how to handle polyphonic music, where multiple notes are played simultaneously. This could influence pitch and chroma feature extraction.


### Process Overview for Training the Autoencoder with MIDI Data

1. **Feature Extraction**: MIDI files are preprocessed to extract meaningful features, such as normalized pitches, velocities, and chroma mean. These features are stored in a dictionary, mapping song identifiers to feature vectors.

2. **Dataset Preparation**: A custom `Dataset` class, `VectorDataset`, is created to

 handle the feature vectors. This class facilitates loading the data into a PyTorch `DataLoader` for efficient batch processing during training.

3. **Autoencoder Architecture**: The autoencoder comprises an encoder and a decoder. The encoder compresses the input feature vectors into a smaller embedding, capturing the essential information. The decoder then attempts to reconstruct the original feature vector from this embedding.

4. **Training Loop**: The autoencoder is trained using a mean squared error (MSE) loss function to minimize the difference between the original feature vectors and their reconstructions. The training process involves forward propagation to compute the loss, followed by backward propagation to update the model's weights.

5. **Embedding Extraction**: After training, the encoder part of the autoencoder can be used independently to convert input feature vectors into compact embeddings. These embeddings will serve as inputs to the LSTM model for lyric generation, providing a musical context based on the MIDI data.


In [47]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pickle
from torch.utils.tensorboard import SummaryWriter

class Autoencoder(nn.Module):
    def __init__(self, input_size, embedding_dim=1024):
        """
        Initializes the Autoencoder model with a specified input size and embedding dimension.

        Parameters:
        - input_size (int): The size of the input feature vector.
        - embedding_dim (int): The size of the embedding vector.
        """
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, 4096),
            nn.ReLU(True),
            nn.Linear(4096, 2048),
            nn.ReLU(True),
            nn.Linear(2048, embedding_dim),
            nn.ReLU(True),
        )
        self.decoder = nn.Sequential(
            nn.Linear(embedding_dim, 2048),
            nn.ReLU(True),
            nn.Linear(2048, 4096),
            nn.ReLU(True),
            nn.Linear(4096, input_size),
            nn.Sigmoid(),  # Sigmoid activation to ensure output values are between 0 and 1
        )

    def forward(self, x):
        """
        Forward pass of the Autoencoder. Encodes and then decodes the input.

        Parameters:
        - x (torch.Tensor): Input tensor.

        Returns:
        - torch.Tensor: Reconstructed input tensor.
        """
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

    def encode(self, x):
        """
        Encodes the input into a lower-dimensional embedding.

        Parameters:
        - x (torch.Tensor): Input tensor.

        Returns:
        - torch.Tensor: Encoded (embedded) tensor.
        """
        return self.encoder(x)

class VectorDataset(Dataset):
    def __init__(self, vector_dict):
        """
        Initializes the dataset with vectors extracted from MIDI files.

        Parameters:
        - vector_dict (dict): Dictionary containing feature vectors.
        """
        self.vectors = list(vector_dict.values())

    def __len__(self):
        """
        Returns the number of items in the dataset.
        """
        return len(self.vectors)

    def __getitem__(self, idx):
        """
        Retrieves an item by its index.

        Parameters:
        - idx (int): Index of the item.

        Returns:
        - torch.Tensor: Feature vector as a tensor.
        """
        vector = self.vectors[idx]
        return torch.tensor(vector, dtype=torch.float)

def train_autoencoder(autoencoder, dataloader, criterion, optimizer, num_epochs=100, device=torch.device("cpu")):
    """
    Trains the autoencoder model.

    Parameters:
    - autoencoder (Autoencoder): The autoencoder model.
    - dataloader (DataLoader): DataLoader for the dataset.
    - criterion (torch.nn.modules.loss): Loss function.
    - optimizer (torch.optim.Optimizer): Optimizer.
    - num_epochs (int): Number of epochs to train.
    - device (torch.device): Device to train on.
    """
    # write to tensorboard
    writer = SummaryWriter('runs/autoencoder_midi')
    autoencoder.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for data in dataloader:
            inputs = data.to(device)
            optimizer.zero_grad()
            outputs = autoencoder(inputs)
            loss = criterion(outputs, inputs)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Log the average loss for the epoch
        epoch_loss = running_loss / len(dataloader)
        writer.add_scalar('training_loss', epoch_loss, epoch+1)
        if (epoch+1) % 1000 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")
   




In [None]:
# Load the feature vectors
with open('data/midi_feature_vectors.pkl', 'rb') as f:
    feature_vectors = pickle.load(f)

# Preparing dataset and dataloader
dataset = VectorDataset(feature_vectors)
dataloader = DataLoader(dataset, batch_size=2048, shuffle=True)

# Model, loss, and optimizer setup
input_size = len(next(iter(feature_vectors.values())))
autoencoder = Autoencoder(input_size=input_size).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=1e-4)

# Training the model
train_autoencoder(autoencoder, dataloader, criterion, optimizer, num_epochs=1_000_000, device=device)

In [49]:
# Save the model
torch.save(autoencoder.state_dict(), 'models/midi_autoencoder_weights.pth')

In [50]:
# generate the embeddings
autoencoder.eval()
embeddings = {}
for key, vector in feature_vectors.items():
    vector = torch.tensor([vector], dtype=torch.float).to(device)
    embedding = autoencoder.encoder(vector).detach().cpu().numpy()
    embeddings[key] = embedding

# save embeddings
with open('data/midi_embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings, f)

  vector = torch.tensor([vector], dtype=torch.float).to(device)


In [1]:
import torch
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, vocab_size, num_layers=1):
        """
        Initializes the modified LSTM model to accept concatenated word and MIDI embeddings as input.

        Parameters:
        - input_dim (int): The dimensionality of the concatenated input vector (word embedding + MIDI embedding).
        - hidden_dim (int): The number of features in the hidden state `h` of the LSTM.
        - vocab_size (int): The size of the vocabulary, used for the output layer dimension.
        - num_layers (int): Number of recurrent layers (default=1).
        
        The input to the model is expected to be a batch of concatenated word and MIDI embeddings,
        and the output is a batch of predictions for the next word.
        """
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # LSTM layer takes concatenated embeddings as inputs
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)

        # Linear layer that maps from hidden state space to vocabulary space
        self.linear = nn.Linear(hidden_dim, vocab_size)

    def forward(self, concatenated_embeddings):
        """
        Defines the forward pass of the model using concatenated word and MIDI embeddings.
        
        Parameters:
        - concatenated_embeddings: A batch of concatenated word and MIDI embeddings.
        
        Returns:
        - output: The model's predictions for the next word.
        """
        lstm_out, _ = self.lstm(concatenated_embeddings)
        output = self.linear(lstm_out)
        return output


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from tqdm.auto import tqdm
from torch.utils.tensorboard import SummaryWriter


In [3]:
def train_model(model, train_loader, config):
    """
    Trains the LSTM model on the given dataset, using preprocessed concatenated embeddings.

    Parameters:
    - model (torch.nn.Module): The LSTM model to be trained.
    - train_loader (DataLoader): DataLoader for the training dataset, providing batches of preprocessed inputs and targets.
    - config (dict): Configuration parameters including epochs, learning rate, device, etc.

    The DataLoader is expected to yield batches of (input_features, target_words), where:
    - input_features are the concatenated word and MIDI embeddings,
    - target_words are the indices of the target words to predict.
    """
    writer = SummaryWriter(f"runs/{config['experiment_name']}")
    model.train()  # Ensure the model is in training mode
    optimizer = Adam(model.parameters(), lr=config["learning_rate"])
    criterion = CrossEntropyLoss()  # Appropriate for classification tasks
    model.to(config["device"])  # Move model to configured device (CPU/GPU)

    for epoch in range(config["epochs"]):
        total_loss = 0.0
        # Enable or disable the progress bar based on verbosity setting
        progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), leave=False, disable=not config.get("verbosity", False))
        
        for i, (input_features, target_words) in progress_bar:
            input_features, target_words = input_features.to(config["device"]), target_words.to(config["device"])
            
            optimizer.zero_grad()  # Clear gradients
            outputs = model(input_features)  # Forward pass
            loss = criterion(outputs.view(-1, config["vocab_size"]), target_words.view(-1))  # Compute loss
            loss.backward()  # Backward pass
            optimizer.step()  # Update weights

            total_loss += loss.item()  # Accumulate loss
            progress_bar.set_description(f"Epoch {epoch+1} Loss: {total_loss/(i+1):.4f}")
        
        avg_loss = total_loss / len(train_loader)  # Calculate average loss
        writer.add_scalar('training_loss', avg_loss, epoch+1)  # Log to TensorBoard
        print(f"Epoch {epoch+1} Completed. Avg Loss: {avg_loss:.4f}")


In [2]:
import pickle
from gensim.models import KeyedVectors
import numpy as np

# Load the pre-trained Word2Vec model
# Make sure to adjust the path to where your Word2Vec model is stored
word2vec_model_path = 'models/word2vec-google-news-300.model'
word2vec_model = KeyedVectors.load(word2vec_model_path)

# Load the lyrics dictionary
with open('data/lyrics_dict.pkl', 'rb') as f:
    lyrics_dict = pickle.load(f)

# Initialize the dictionary to store embeddings
lyrics_embeddings = {}

# Convert word indices to embeddings
for song_artist, word_ids in lyrics_dict.items():
    # Convert each word id to its corresponding embedding
    word_embeddings = []
    for word_id in word_ids:
        word = word2vec_model.index_to_key[word_id]  # Convert index to word
        if word in word2vec_model:
            word_embeddings.append(word2vec_model[word])
        else:
            print('.', end='')
            # Handle out-of-vocabulary words
            # Here, you can choose to skip or use a zero vector; let's use a zero vector
            word_embeddings.append(np.zeros(word2vec_model.vector_size))

    lyrics_embeddings[song_artist] = word_embeddings

# Save the embeddings dictionary
with open('data/lyrics_embeddings.pkl', 'wb') as f:
    pickle.dump(lyrics_embeddings, f)

print("Lyrics embeddings saved successfully.")


Lyrics embeddings saved successfully.


In [4]:
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np

class LyricsMIDIDataset(Dataset):
    def __init__(self, lyrics_dict=None, midi_embeddings=None, word2vec_model=None, preloaded_inputs=None, preloaded_targets=None):
        """
        Initializes the dataset with either raw data to be processed or preloaded processed data.
        
        Parameters:
        - lyrics_dict (dict): Raw lyrics data.
        - midi_embeddings (dict): Raw MIDI embeddings.
        - word2vec_model: The Word2Vec model.
        - preloaded_inputs (torch.Tensor): Preloaded inputs tensor.
        - preloaded_targets (torch.Tensor): Preloaded targets tensor.
        """
        if preloaded_inputs is not None and preloaded_targets is not None:
            self.inputs = preloaded_inputs
            self.targets = preloaded_targets
        else:
            self.inputs = []
            self.targets = []
            # get all songs that arent in keys of midi_embeddings or lyrics_dict
            self.missing_songs = set(lyrics_dict.keys()) ^ set(midi_embeddings.keys())

            for song_key, word_indices in lyrics_dict.items():
                if song_key not in midi_embeddings:
                    continue  # Skip songs without a corresponding MIDI embedding
                midi_embedding = midi_embeddings[song_key]  # MIDI embedding for the current song

                for i in range(len(word_indices) - 1):
                    # Convert word indices to embeddings
                    word_embedding_current = word2vec_model[word_indices[i]]
                    next_word_indice = word_indices[i + 1]

                    # Concatenate word embedding with MIDI embedding for the input
                    word_embedding_current = word_embedding_current.reshape(1, -1)  # Reshape to (1, embedding_dim)
                    input_feature = np.concatenate([word_embedding_current, midi_embedding], axis=1)
                    self.inputs.append(input_feature)
                    self.targets.append(next_word_indice)

            # Convert lists to tensors for PyTorch compatibility
            self.inputs = torch.tensor(self.inputs, dtype=torch.float)
            self.targets = torch.tensor(self.targets, dtype=torch.long)

    def __len__(self):
        """Returns the total number of input-target pairs."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """
        Returns an input-target pair by index.
        
        Parameters:
        - idx (int): The index of the input-target pair.
        
        Returns:
        - tuple: A tuple containing the input feature tensor and target tensor.
        """
        return self.inputs[idx], self.targets[idx]

In [3]:
from gensim.models import KeyedVectors

word2vec_model_path = 'models/word2vec-google-news-300.model'  # Adjust this path
word2vec_model = KeyedVectors.load(word2vec_model_path)


In [4]:
import pickle

# Adjust these paths to where your files are stored
lyrics_dict_path = 'data/lyrics_dict.pkl'
midi_embeddings_path = 'data/midi_embeddings.pkl'

with open(lyrics_dict_path, 'rb') as f:
    lyrics_dict = pickle.load(f)

with open(midi_embeddings_path, 'rb') as f:
    midi_embeddings = pickle.load(f)


In [8]:
config = {
    "batch_size": 8,
    "learning_rate": 0.001,
    "hidden_dim": 128,
    "num_layers": 2,
    "embedding_dim": 300,  # Assuming we know the embedding dimension
    "vocab_size": 3000000,  # Make sure this matches the actual vocab size
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "shuffle": False,
    "epochs": 10,
    'verbosity': True
}

# clear cached memory if config['device'] is cuda
if config['device'] == torch.device('cuda'):
    torch.cuda.empty_cache()


config["experiment_name"] = f"lstm_{config['hidden_dim']}_{config['num_layers']}_{config['learning_rate']}"

In [None]:
dataset = LyricsMIDIDataset(lyrics_dict=lyrics_dict, midi_embeddings=midi_embeddings, word2vec_model=word2vec_model)

torch.save(dataset.inputs, 'data/dataset_saved_inputs.pt')
torch.save(dataset.targets, 'data/dataset_saved_targets.pt')


In [14]:
preloaded_inputs = torch.load('data/dataset_saved_inputs.pt')
preloaded_targets = torch.load('data/dataset_saved_targets.pt')
preloaded_targets = torch.tensor(preloaded_targets, dtype=torch.long)  # Ensure targets are long

# Initialize the dataset with the preloaded data
dataset = LyricsMIDIDataset(preloaded_inputs=preloaded_inputs, preloaded_targets=preloaded_targets)


  preloaded_targets = torch.tensor(preloaded_targets, dtype=torch.long)  # Ensure targets are long


In [15]:
from torch.utils.data import DataLoader

train_loader = DataLoader(dataset, batch_size=config['batch_size'], shuffle=config['shuffle'])

In [16]:
# get the input dimension
input_dim = dataset.inputs.shape[2]
input_dim

1324

In [17]:
# train the model, using config to set the number of epochs, learning rate, etc.
lstm_model = LSTMModel(input_dim=input_dim, hidden_dim=config['hidden_dim'], vocab_size=config['vocab_size'], num_layers=config['num_layers'])


In [None]:
train_model(lstm_model, train_loader, config)